diff --git a/.gitattributes b/.gitattributes
index 6b281f33f737d..142d6689f1088 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,3 +1,5 @@
+clang/bindings/python/.git_archival.txt  export-subst
+
 libcxx/src/**/*.cpp     merge=libcxx-reformat
 libcxx/include/**/*.h   merge=libcxx-reformat
 
diff --git a/.github/workflows/bazel-checks.yml b/.github/workflows/bazel-checks.yml
index dc9dcb97ce0a8..aa318569532ec 100644
--- a/.github/workflows/bazel-checks.yml
+++ b/.github/workflows/bazel-checks.yml
@@ -22,7 +22,7 @@ jobs:
     if: github.repository == 'llvm/llvm-project'
     steps:
       - name: Fetch LLVM sources
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
       - name: Setup Buildifier
         run: |
           sudo curl -L https://github.com/bazelbuild/buildtools/releases/download/v8.2.1/buildifier-linux-amd64 -o /usr/bin/buildifier --fail
@@ -41,7 +41,7 @@ jobs:
     if: github.repository == 'llvm/llvm-project'
     steps:
       - name: Fetch LLVM sources
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         # TODO(boomanaiden154): We should use a purpose built container for this. Move
         # over when we have fixed the issues with using custom containers with Github
         # ARC in GKE.
diff --git a/.github/workflows/build-ci-container-tooling.yml b/.github/workflows/build-ci-container-tooling.yml
index 531da2ccbd446..c75d84829bba6 100644
--- a/.github/workflows/build-ci-container-tooling.yml
+++ b/.github/workflows/build-ci-container-tooling.yml
@@ -41,7 +41,7 @@ jobs:
             target: abi-tests
     steps:
       - name: Checkout LLVM
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           sparse-checkout: |
             .github/workflows/containers/github-action-ci-tooling/
@@ -67,7 +67,7 @@ jobs:
     runs-on: ubuntu-24.04
     steps:
       - name: Checkout LLVM
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           sparse-checkout: |
             .github/actions/push-container
diff --git a/.github/workflows/build-ci-container-windows.yml b/.github/workflows/build-ci-container-windows.yml
index 3996948bb44e0..3fd1c73d0ee40 100644
--- a/.github/workflows/build-ci-container-windows.yml
+++ b/.github/workflows/build-ci-container-windows.yml
@@ -25,7 +25,7 @@ jobs:
       container-filename: ${{ steps.vars.outputs.container-filename }}
     steps:
       - name: Checkout LLVM
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           sparse-checkout: .github/workflows/containers/github-action-ci-windows
       - name: Write Variables
diff --git a/.github/workflows/build-ci-container.yml b/.github/workflows/build-ci-container.yml
index ddb803fb969ff..e17e852166cee 100644
--- a/.github/workflows/build-ci-container.yml
+++ b/.github/workflows/build-ci-container.yml
@@ -36,7 +36,7 @@ jobs:
           - cd $HOME && printf '#include <iostream>\nint main(int argc, char **argv) { std::cout << "Hello\\n"; }' | clang++ -x c++ - && ./a.out | grep Hello
     steps:
       - name: Checkout LLVM
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           sparse-checkout: |
             .github/workflows/containers/github-action-ci/
@@ -62,7 +62,7 @@ jobs:
       GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
     steps:
       - name: Checkout LLVM
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           sparse-checkout: |
             .github/actions/push-container
diff --git a/.github/workflows/build-metrics-container.yml b/.github/workflows/build-metrics-container.yml
index e1407a29cc295..0436e6ec82fda 100644
--- a/.github/workflows/build-metrics-container.yml
+++ b/.github/workflows/build-metrics-container.yml
@@ -23,7 +23,7 @@ jobs:
     runs-on: ubuntu-24.04
     steps:
       - name: Checkout LLVM
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           sparse-checkout: |
             .ci/metrics/
@@ -46,7 +46,7 @@ jobs:
       GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
     steps:
       - name: Checkout LLVM
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           sparse-checkout: |
             .github/actions/push-container
diff --git a/.github/workflows/check-ci.yml b/.github/workflows/check-ci.yml
index 7fecb010a64ff..914ead803181e 100644
--- a/.github/workflows/check-ci.yml
+++ b/.github/workflows/check-ci.yml
@@ -22,7 +22,7 @@ jobs:
     if: github.repository == 'llvm/llvm-project'
     steps:
       - name: Fetch LLVM sources
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           sparse-checkout: .ci
       - name: Setup Python
diff --git a/.github/workflows/ci-post-commit-analyzer.yml b/.github/workflows/ci-post-commit-analyzer.yml
index 59df0b68a8ad7..abd64809f9dc9 100644
--- a/.github/workflows/ci-post-commit-analyzer.yml
+++ b/.github/workflows/ci-post-commit-analyzer.yml
@@ -41,7 +41,7 @@ jobs:
       LLVM_VERSION: 18
     steps:
       - name: Checkout Source
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
 
       - name: Setup ccache
         uses: hendrikmuhs/ccache-action@bfa03e1de4d7f7c3e80ad9109feedd05c4f5a716 # v1.2.19
diff --git a/.github/workflows/commit-access-greeter.yml b/.github/workflows/commit-access-greeter.yml
index f31cd015642e2..d11bdec6b2ea8 100644
--- a/.github/workflows/commit-access-greeter.yml
+++ b/.github/workflows/commit-access-greeter.yml
@@ -18,7 +18,7 @@ jobs:
       github.event.label.name == 'infra:commit-access-request'
     runs-on: ubuntu-24.04
     steps:
-      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+      - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           sparse-checkout: llvm/utils/git/
 
diff --git a/.github/workflows/commit-access-review.yml b/.github/workflows/commit-access-review.yml
index 7cdcfca532990..f6ec0bc9d4a9a 100644
--- a/.github/workflows/commit-access-review.yml
+++ b/.github/workflows/commit-access-review.yml
@@ -15,7 +15,7 @@ jobs:
     runs-on: ubuntu-24.04
     steps:
       - name: Fetch LLVM sources
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
       
       - name: Install dependencies
         run: |
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 3eb146d21dc40..ef1b727dc00a8 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -55,7 +55,7 @@ jobs:
     if: github.repository == 'llvm/llvm-project'
     steps:
       - name: Fetch LLVM sources
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           fetch-depth: 2
       - name: Get subprojects that have doc changes
diff --git a/.github/workflows/email-check.yaml b/.github/workflows/email-check.yaml
index ba625b2b3b062..28cd2905d816d 100644
--- a/.github/workflows/email-check.yaml
+++ b/.github/workflows/email-check.yaml
@@ -14,7 +14,7 @@ jobs:
     if: github.repository == 'llvm/llvm-project'
     steps:
       - name: Fetch LLVM sources
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           ref: ${{ github.event.pull_request.head.sha }}
 
diff --git a/.github/workflows/gha-codeql.yml b/.github/workflows/gha-codeql.yml
index 689bdf408522b..9c6c23e34758c 100644
--- a/.github/workflows/gha-codeql.yml
+++ b/.github/workflows/gha-codeql.yml
@@ -24,7 +24,7 @@ jobs:
       security-events: write
     steps:
       - name: Checkout LLVM
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           sparse-checkout: |
             .github/
diff --git a/.github/workflows/hlsl-test-all.yaml b/.github/workflows/hlsl-test-all.yaml
index 6e5bfd5b870a3..cc4b52ae6a728 100644
--- a/.github/workflows/hlsl-test-all.yaml
+++ b/.github/workflows/hlsl-test-all.yaml
@@ -29,25 +29,25 @@ jobs:
     runs-on: ${{ inputs.SKU }}
     steps:
       - name: Checkout DXC
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           repository: Microsoft/DirectXShaderCompiler
           ref: main
           path: DXC
           submodules: true
       - name: Checkout LLVM
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           ref: ${{ inputs.LLVM-branch }}
           path: llvm-project
       - name: Checkout OffloadTest
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           repository: llvm/offload-test-suite
           ref: main
           path: OffloadTest
       - name: Checkout Golden Images
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           repository: llvm/offload-golden-images
           ref: main
diff --git a/.github/workflows/issue-release-workflow.yml b/.github/workflows/issue-release-workflow.yml
index 7fd0280b2eedf..9aa1276e10e92 100644
--- a/.github/workflows/issue-release-workflow.yml
+++ b/.github/workflows/issue-release-workflow.yml
@@ -42,7 +42,7 @@ jobs:
       contains(github.event.action == 'opened' && github.event.issue.body || github.event.comment.body, '/cherry-pick')
     steps:
       - name: Fetch LLVM sources
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           repository: llvm/llvm-project
           # GitHub stores the token used for checkout and uses it for pushes
diff --git a/.github/workflows/issue-subscriber.yml b/.github/workflows/issue-subscriber.yml
index afcd17c757b39..0006b982aeae6 100644
--- a/.github/workflows/issue-subscriber.yml
+++ b/.github/workflows/issue-subscriber.yml
@@ -14,7 +14,7 @@ jobs:
     if: github.repository == 'llvm/llvm-project'
     steps:
       - name: Checkout Automation Script
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           sparse-checkout: llvm/utils/git/
           ref: main
diff --git a/.github/workflows/issue-write.yml b/.github/workflows/issue-write.yml
index 4f8fd7a48aff6..ece6081ce9ba6 100644
--- a/.github/workflows/issue-write.yml
+++ b/.github/workflows/issue-write.yml
@@ -27,7 +27,7 @@ jobs:
       )
     steps:
       - name: Fetch Sources
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           sparse-checkout: |
             .github/workflows/unprivileged-download-artifact/action.yml
diff --git a/.github/workflows/libc-fullbuild-tests.yml b/.github/workflows/libc-fullbuild-tests.yml
index 01fd895cce7e8..13c0c2b82ab42 100644
--- a/.github/workflows/libc-fullbuild-tests.yml
+++ b/.github/workflows/libc-fullbuild-tests.yml
@@ -88,7 +88,7 @@ jobs:
           # - c_compiler: gcc
           #   cpp_compiler: g++
     steps:
-    - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+    - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
     
     # Libc's build is relatively small comparing with other components of LLVM.
     # A fresh fullbuild takes about 190MiB of uncompressed disk space, which can
diff --git a/.github/workflows/libc-overlay-tests.yml b/.github/workflows/libc-overlay-tests.yml
index df9a20dce8eae..29bcd0f600490 100644
--- a/.github/workflows/libc-overlay-tests.yml
+++ b/.github/workflows/libc-overlay-tests.yml
@@ -41,7 +41,7 @@ jobs:
               cpp_compiler: clang++
     
     steps:
-    - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+    - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
     
     # Libc's build is relatively small comparing with other components of LLVM.
     # A fresh linux overlay takes about 180MiB of uncompressed disk space, which can
diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml
index 0d3f9fe3f69ea..b8be6a42e0fa8 100644
--- a/.github/workflows/libclang-abi-tests.yml
+++ b/.github/workflows/libclang-abi-tests.yml
@@ -38,7 +38,7 @@ jobs:
       LLVM_VERSION_PATCH: ${{ steps.version.outputs.patch }}
     steps:
       - name: Checkout source
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           fetch-depth: 250
 
@@ -102,7 +102,7 @@ jobs:
             repo: ${{ github.repository }}
     steps:
       - name: Download source code
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           ref: ${{ matrix.ref }}
           repository: ${{ matrix.repo }}
diff --git a/.github/workflows/libclang-python-tests.yml b/.github/workflows/libclang-python-tests.yml
index 0d66f5d595e0e..69d281b65b5fb 100644
--- a/.github/workflows/libclang-python-tests.yml
+++ b/.github/workflows/libclang-python-tests.yml
@@ -32,7 +32,7 @@ jobs:
       matrix:
         python-version: ["3.8", "3.13"]
     steps:
-      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+      - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
       - name: Setup Python
         uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
         with:
diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml
index 7dad30f994fd1..8e6dc48f4c495 100644
--- a/.github/workflows/libcxx-build-and-test.yaml
+++ b/.github/workflows/libcxx-build-and-test.yaml
@@ -54,7 +54,7 @@ jobs:
             cc: 'gcc-15'
             cxx: 'g++-15'
     steps:
-      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+      - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
       - name: ${{ matrix.config }}.${{ matrix.cxx }}
         run: libcxx/utils/ci/run-buildbot ${{ matrix.config }}
         env:
@@ -99,7 +99,7 @@ jobs:
             cc: 'clang-20'
             cxx: 'clang++-20'
     steps:
-      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+      - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
       - name: ${{ matrix.config }}
         run: libcxx/utils/ci/run-buildbot ${{ matrix.config }}
         env:
@@ -163,7 +163,7 @@ jobs:
           machine: llvm-premerge-libcxx-runners
     runs-on: ${{ matrix.machine }}
     steps:
-      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+      - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
       - name: ${{ matrix.config }}
         run: libcxx/utils/ci/run-buildbot ${{ matrix.config }}
         env:
@@ -211,7 +211,7 @@ jobs:
           os: macos-15
     runs-on: ${{ matrix.os }}
     steps:
-      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+      - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
       - uses: maxim-lobanov/setup-xcode@60606e260d2fc5762a71e64e74b2174e8ea3c8bd # v1.6.0
         with:
           # https://github.com/actions/runner-images/blob/main/images/macos/macos-15-Readme.md
@@ -253,7 +253,7 @@ jobs:
         - { config: mingw-static, mingw: true, runner: windows-11-arm }
     runs-on: ${{ matrix.runner != '' && matrix.runner || 'windows-2022' }}
     steps:
-      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+      - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
       - name: Install dependencies
         run: |
           pip install psutil
diff --git a/.github/workflows/libcxx-build-containers.yml b/.github/workflows/libcxx-build-containers.yml
index eee7bb8913944..a49ad812606e8 100644
--- a/.github/workflows/libcxx-build-containers.yml
+++ b/.github/workflows/libcxx-build-containers.yml
@@ -30,7 +30,7 @@ jobs:
       packages: write
 
     steps:
-    - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+    - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
 
     # The default Docker storage location for GitHub Actions doesn't have
     # enough disk space, so change it to /mnt, which has more disk space.
diff --git a/.github/workflows/libcxx-check-generated-files.yml b/.github/workflows/libcxx-check-generated-files.yml
index ba97ccc64ba70..a25dc8b70001d 100644
--- a/.github/workflows/libcxx-check-generated-files.yml
+++ b/.github/workflows/libcxx-check-generated-files.yml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-24.04
     steps:
       - name: Fetch LLVM sources
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
 
       - name: Install dependencies
         uses: aminya/setup-cpp@9bc9b8cd8a8d678f920e4e1e73f29da8010ced51 # v1.7.2
diff --git a/.github/workflows/libcxx-run-benchmarks.yml b/.github/workflows/libcxx-run-benchmarks.yml
index e2ca940d2f0b3..64a902482f9a3 100644
--- a/.github/workflows/libcxx-run-benchmarks.yml
+++ b/.github/workflows/libcxx-run-benchmarks.yml
@@ -56,7 +56,7 @@ jobs:
           BENCHMARKS=$(echo "$COMMENT_BODY" | sed -nE 's/\/libcxx-bot benchmark (.+)/\1/p')
           echo "benchmarks=${BENCHMARKS}" >> ${GITHUB_OUTPUT}
 
-      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+      - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           ref: ${{ steps.vars.outputs.pr_head }}
           fetch-depth: 0
diff --git a/.github/workflows/llvm-abi-tests.yml b/.github/workflows/llvm-abi-tests.yml
index 7bd4a471bfd3a..4b0268b0ad9f2 100644
--- a/.github/workflows/llvm-abi-tests.yml
+++ b/.github/workflows/llvm-abi-tests.yml
@@ -38,7 +38,7 @@ jobs:
       LLVM_VERSION_PATCH: ${{ steps.version.outputs.patch }}
     steps:
       - name: Checkout source
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           fetch-depth: 250
 
@@ -90,7 +90,7 @@ jobs:
             repo: ${{ github.repository }}
     steps:
       - name: Download source code
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           ref: ${{ matrix.ref }}
           repository: ${{ matrix.repo }}
diff --git a/.github/workflows/merged-prs.yml b/.github/workflows/merged-prs.yml
index 107bbc51b5314..22786dc10b1ea 100644
--- a/.github/workflows/merged-prs.yml
+++ b/.github/workflows/merged-prs.yml
@@ -21,7 +21,7 @@ jobs:
       (github.event.pull_request.merged == true)
     steps:
       - name: Checkout Automation Script
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           sparse-checkout: llvm/utils/git/
           ref: main
diff --git a/.github/workflows/mlir-spirv-tests.yml b/.github/workflows/mlir-spirv-tests.yml
index 5bb16c739cdde..e9b0cddb391be 100644
--- a/.github/workflows/mlir-spirv-tests.yml
+++ b/.github/workflows/mlir-spirv-tests.yml
@@ -28,7 +28,7 @@ jobs:
     container:
       image: ghcr.io/llvm/ci-ubuntu-24.04:latest
     steps:
-      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+      - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
       - name: Setup ccache
         uses: hendrikmuhs/ccache-action@bfa03e1de4d7f7c3e80ad9109feedd05c4f5a716 # v1.2.19
         with:
diff --git a/.github/workflows/new-prs.yml b/.github/workflows/new-prs.yml
index f5826728d2c7b..01032230c3883 100644
--- a/.github/workflows/new-prs.yml
+++ b/.github/workflows/new-prs.yml
@@ -34,7 +34,7 @@ jobs:
       (github.event.pull_request.author_association != 'OWNER')
     steps:
       - name: Checkout Automation Script
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           sparse-checkout: llvm/utils/git/
           ref: main
diff --git a/.github/workflows/pr-code-format.yml b/.github/workflows/pr-code-format.yml
index dc253e4fbae98..441f94960bc0a 100644
--- a/.github/workflows/pr-code-format.yml
+++ b/.github/workflows/pr-code-format.yml
@@ -21,7 +21,7 @@ jobs:
     if: github.repository == 'llvm/llvm-project'
     steps:
       - name: Fetch LLVM sources
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           fetch-depth: 2
 
diff --git a/.github/workflows/pr-code-lint.yml b/.github/workflows/pr-code-lint.yml
index 5444a29c22205..ea4f8217cd003 100644
--- a/.github/workflows/pr-code-lint.yml
+++ b/.github/workflows/pr-code-lint.yml
@@ -27,7 +27,7 @@ jobs:
       cancel-in-progress: true
     steps:
       - name: Fetch LLVM sources
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           fetch-depth: 2
       
diff --git a/.github/workflows/pr-request-release-note.yml b/.github/workflows/pr-request-release-note.yml
index c2dc2de65f133..ac80347d00ab4 100644
--- a/.github/workflows/pr-request-release-note.yml
+++ b/.github/workflows/pr-request-release-note.yml
@@ -19,7 +19,7 @@ jobs:
       # We need to pull the script from the main branch, so that we ensure
       # we get the latest version of this script.
       - name: Checkout Scripts
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           sparse-checkout: |
             llvm/utils/git/requirements.txt
diff --git a/.github/workflows/pr-subscriber.yml b/.github/workflows/pr-subscriber.yml
index 23c7a679185ee..eac93be196356 100644
--- a/.github/workflows/pr-subscriber.yml
+++ b/.github/workflows/pr-subscriber.yml
@@ -14,7 +14,7 @@ jobs:
     if: github.repository == 'llvm/llvm-project'
     steps:
       - name: Checkout Automation Script
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           sparse-checkout: llvm/utils/git/
           ref: main
diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml
index daf88b5b22125..252d7fbe8e67f 100644
--- a/.github/workflows/premerge.yaml
+++ b/.github/workflows/premerge.yaml
@@ -58,7 +58,7 @@ jobs:
         shell: bash
     steps:
       - name: Checkout LLVM
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           fetch-depth: 2
       - name: Build and Test
@@ -139,7 +139,7 @@ jobs:
         shell: bash
     steps:
       - name: Checkout LLVM
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           fetch-depth: 2
       - name: Compute Projects
@@ -196,7 +196,7 @@ jobs:
       (github.event_name != 'pull_request' || github.event.action != 'closed')
     steps:
       - name: Checkout LLVM
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           fetch-depth: 2
       - name: Setup ccache
diff --git a/.github/workflows/release-asset-audit.yml b/.github/workflows/release-asset-audit.yml
index b658167d1db36..66ea3537a9162 100644
--- a/.github/workflows/release-asset-audit.yml
+++ b/.github/workflows/release-asset-audit.yml
@@ -23,7 +23,7 @@ jobs:
     if: github.repository == 'llvm/llvm-project'
     steps:
       - name: Checkout LLVM
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           sparse-checkout: |
             .github/workflows/release-asset-audit.py
diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml
index 64f371e9f8db8..104d37db8a28d 100644
--- a/.github/workflows/release-binaries.yml
+++ b/.github/workflows/release-binaries.yml
@@ -71,7 +71,7 @@ jobs:
         python-version: '3.14'
 
     - name: Checkout LLVM
-      uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+      uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
 
     - name: Install Dependencies
       shell: bash
@@ -184,7 +184,7 @@ jobs:
     steps:
 
     - name: Checkout LLVM
-      uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+      uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
       with:
         ref: ${{ needs.prepare.outputs.ref }}
 
@@ -246,7 +246,7 @@ jobs:
 
     steps:
     - name: Checkout Release Scripts
-      uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+      uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
       with:
         sparse-checkout: |
           llvm/utils/release/github-upload-release.py
diff --git a/.github/workflows/release-documentation.yml b/.github/workflows/release-documentation.yml
index c09ad57066711..23bc0aed4a546 100644
--- a/.github/workflows/release-documentation.yml
+++ b/.github/workflows/release-documentation.yml
@@ -38,7 +38,7 @@ jobs:
       upload: ${{ inputs.upload && !contains(inputs.release-version, 'rc') }}
     steps:
       - name: Checkout LLVM
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
 
       - name: Setup Python env
         uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
@@ -70,7 +70,7 @@ jobs:
 
       - name: Clone www-releases
         if: env.upload
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           repository: ${{ github.repository_owner }}/www-releases
           ref: main
diff --git a/.github/workflows/release-doxygen.yml b/.github/workflows/release-doxygen.yml
index c31319e47833d..6e6ea883ef1d0 100644
--- a/.github/workflows/release-doxygen.yml
+++ b/.github/workflows/release-doxygen.yml
@@ -40,7 +40,7 @@ jobs:
       upload: ${{ inputs.upload && !contains(inputs.release-version, 'rc') }}
     steps:
       - name: Checkout LLVM
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
 
       - name: Setup Python env
         uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
diff --git a/.github/workflows/release-lit.yml b/.github/workflows/release-lit.yml
index f2fd4a52328a7..e7a94572429d0 100644
--- a/.github/workflows/release-lit.yml
+++ b/.github/workflows/release-lit.yml
@@ -28,7 +28,7 @@ jobs:
     runs-on: ubuntu-24.04
     steps:
       - name: Checkout LLVM
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           ref: "llvmorg-${{ inputs.release-version }}"
 
diff --git a/.github/workflows/release-sources.yml b/.github/workflows/release-sources.yml
index 4c47bd7575d99..9b21d2adfd27a 100644
--- a/.github/workflows/release-sources.yml
+++ b/.github/workflows/release-sources.yml
@@ -71,7 +71,7 @@ jobs:
       attestations: write
     steps:
       - name: Checkout LLVM
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           ref: ${{ needs.inputs.outputs.ref }}
           fetch-tags: true
diff --git a/.github/workflows/release-tasks.yml b/.github/workflows/release-tasks.yml
index d4c2a55fcc9d7..199017a4a7b27 100644
--- a/.github/workflows/release-tasks.yml
+++ b/.github/workflows/release-tasks.yml
@@ -38,7 +38,7 @@ jobs:
           sudo apt-get install python3-github
 
       - name: Checkout LLVM
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
 
       - name: Create Release
         env:
@@ -132,7 +132,7 @@ jobs:
           sudo apt-get install python3-github
 
       - name: Checkout LLVM
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           sparse-checkout: llvm/utils/release/github-upload-release.py
           sparse-checkout-cone-mode: false
diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml
index 597b6f0c0e0f0..4ce3df621bb76 100644
--- a/.github/workflows/scorecard.yml
+++ b/.github/workflows/scorecard.yml
@@ -31,7 +31,7 @@ jobs:
 
     steps:
       - name: "Checkout code"
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           persist-credentials: false
 
diff --git a/.github/workflows/spirv-tests.yml b/.github/workflows/spirv-tests.yml
index 69374ae563306..5ede9df3b006d 100644
--- a/.github/workflows/spirv-tests.yml
+++ b/.github/workflows/spirv-tests.yml
@@ -24,7 +24,7 @@ jobs:
     container:
       image: ghcr.io/llvm/ci-ubuntu-24.04:latest
     steps:
-      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+      - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
       - name: Setup ccache
         uses: hendrikmuhs/ccache-action@bfa03e1de4d7f7c3e80ad9109feedd05c4f5a716 # v1.2.19
         with:
diff --git a/.github/workflows/test-unprivileged-download-artifact.yml b/.github/workflows/test-unprivileged-download-artifact.yml
index a9c0912b0f44e..39ac3d57a3879 100644
--- a/.github/workflows/test-unprivileged-download-artifact.yml
+++ b/.github/workflows/test-unprivileged-download-artifact.yml
@@ -38,7 +38,7 @@ jobs:
     needs: [ upload-test-artifact ]
     steps:
       - name: Chekcout LLVM
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           sparse-checkout: |
             .github/workflows/unprivileged-download-artifact/action.yml
diff --git a/.github/workflows/version-check.yml b/.github/workflows/version-check.yml
index 7e451880f4cfa..b3fc1f49db56a 100644
--- a/.github/workflows/version-check.yml
+++ b/.github/workflows/version-check.yml
@@ -17,7 +17,7 @@ jobs:
     runs-on: ubuntu-24.04
     steps:
       - name: Fetch LLVM sources
-        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
+        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
         with:
           fetch-depth: 0
 
diff --git a/amd/comgr/src/comgr-compiler.cpp b/amd/comgr/src/comgr-compiler.cpp
index 358e8af87f4c8..822e4b3ce512f 100644
--- a/amd/comgr/src/comgr-compiler.cpp
+++ b/amd/comgr/src/comgr-compiler.cpp
@@ -1129,7 +1129,7 @@ amd_comgr_status_t AMDGPUCompiler::addDeviceLibraries() {
   SmallString<256> ClangBinaryPath(env::getLLVMPath());
   sys::path::append(ClangBinaryPath, "bin", "clang");
 
-  std::string ClangResourceDir = GetResourcesPath(ClangBinaryPath);
+  std::string ClangResourceDir = Driver::GetResourcesPath(ClangBinaryPath);
 
   SmallString<256> DeviceLibPath(ClangResourceDir);
   sys::path::append(DeviceLibPath, "lib");
diff --git a/clang-tools-extra/clangd/CompileCommands.cpp b/clang-tools-extra/clangd/CompileCommands.cpp
index 4eda330716f21..7990f2719e9a0 100644
--- a/clang-tools-extra/clangd/CompileCommands.cpp
+++ b/clang-tools-extra/clangd/CompileCommands.cpp
@@ -132,7 +132,8 @@ std::optional<std::string> detectSysroot() {
 
 std::string detectStandardResourceDir() {
   static int StaticForMainAddr; // Just an address in this process.
-  return GetResourcesPath("clangd", (void *)&StaticForMainAddr);
+  return CompilerInvocation::GetResourcesPath("clangd",
+                                              (void *)&StaticForMainAddr);
 }
 
 // The path passed to argv[0] is important:
diff --git a/clang-tools-extra/clangd/Compiler.cpp b/clang-tools-extra/clangd/Compiler.cpp
index 9ea7df139382a..6ebc2eac25745 100644
--- a/clang-tools-extra/clangd/Compiler.cpp
+++ b/clang-tools-extra/clangd/Compiler.cpp
@@ -9,7 +9,6 @@
 #include "Compiler.h"
 #include "support/Logger.h"
 #include "clang/Basic/TargetInfo.h"
-#include "clang/Driver/CreateInvocationFromArgs.h"
 #include "clang/Frontend/CompilerInvocation.h"
 #include "clang/Lex/PreprocessorOptions.h"
 #include "clang/Serialization/PCHContainerOperations.h"
diff --git a/clang/bindings/python/.git_archival.txt b/clang/bindings/python/.git_archival.txt
new file mode 100644
index 0000000000000..7876d4af4c620
--- /dev/null
+++ b/clang/bindings/python/.git_archival.txt
@@ -0,0 +1,3 @@
+node: $Format:%H$
+node-date: $Format:%cI$
+describe-name: $Format:%(describe:tags=true,match=llvmorg-*[0-9]*)$
diff --git a/clang/bindings/python/.gitignore b/clang/bindings/python/.gitignore
new file mode 100644
index 0000000000000..1641a745fb682
--- /dev/null
+++ b/clang/bindings/python/.gitignore
@@ -0,0 +1,21 @@
+# setuptools_scm auto-generated version file
+_version.py
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Distribution / packaging
+build/
+dist/
+*.egg-info/
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
diff --git a/clang/bindings/python/pyproject.toml b/clang/bindings/python/pyproject.toml
new file mode 100644
index 0000000000000..a7ea9f2860ac2
--- /dev/null
+++ b/clang/bindings/python/pyproject.toml
@@ -0,0 +1,43 @@
+[build-system]
+requires = ["hatchling>=1.27", "hatch-vcs>=0.4"]
+build-backend = "hatchling.build"
+
+[project]
+name = "clang"
+description = "clang python bindings"
+readme = {file = "README.txt", content-type = "text/plain"}
+
+license = "Apache-2.0 WITH LLVM-exception"
+authors = [
+    { name = "LLVM" }
+]
+keywords = ["llvm", "clang", "libclang"]
+classifiers = [
+    "Intended Audience :: Developers",
+    "Development Status :: 5 - Production/Stable",
+    "Topic :: Software Development :: Compilers",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3",
+]
+requires-python = ">=3.8"
+dynamic = ["version"]
+
+[project.urls]
+Homepage = "https://clang.llvm.org/"
+Download = "https://llvm.org/releases/download.html"
+Discussions = "https://discourse.llvm.org/"
+"Issue Tracker" = "https://github.com/llvm/llvm-project/issues"
+"Source Code" = "https://github.com/llvm/llvm-project/tree/main/clang/bindings/python"
+
+[tool.hatch.version]
+source = "vcs"
+version-scheme = "no-guess-dev"
+# The "version" capture group matches x.y.z with an optional -rcN suffix; any other trailing text (e.g. -init) is not captured
+tag-pattern = "^llvmorg-(?P<version>\\d+(?:\\.\\d+)*(?:-rc\\d+)?)"
+
+[tool.hatch.build.hooks.vcs]
+version-file = "clang/_version.py"
+
+[tool.hatch.version.raw-options]
+search_parent_directories = true
+version_scheme = "no-guess-dev"
diff --git a/clang/docs/HIPSupport.rst b/clang/docs/HIPSupport.rst
index ab9ea110e6d54..92ea07974373e 100644
--- a/clang/docs/HIPSupport.rst
+++ b/clang/docs/HIPSupport.rst
@@ -210,6 +210,95 @@ Host Code Compilation
 - These relocatable objects are then linked together.
 - Host code within a TU can call host functions and launch kernels from another TU.
 
+HIP Fat Binary Registration and Unregistration
+==============================================
+
+When compiling HIP for AMD GPUs, Clang embeds device code into HIP "fat
+binaries" and generates host-side helper functions that register these
+fat binaries with the HIP runtime at program start and unregister them at
+program exit. In non-RDC mode (``-fno-gpu-rdc``), each compilation unit
+typically produces its own HIP fat binary: a container that holds, for every
+enabled GPU architecture, a fully linked offloading device image (for example,
+a GPU code object) that can be loaded directly by the HIP runtime. In RDC mode
+(``-fgpu-rdc``), each compilation unit contributes device code in a relocatable
+form (for example, GPU object files or LLVM IR). A later device-link step links
+those relocatable inputs into fully linked device images per GPU architecture
+and then packages those images into a HIP fat binary container.
+
+Registering a HIP fat binary allows the runtime to discover the kernels and
+device variables defined in that container and to associate host-side addresses
+and symbols with the corresponding GPU-side entities. For example, when a
+host-side kernel launch stub is called, the HIP runtime uses information
+established during registration (and the fat binary handle it returned) to
+identify which GPU kernel symbol to launch from which device image.
+
+At the LLVM IR level, Clang/LLVM typically create an internal module
+constructor (for example ``__hip_module_ctor`` or a ``.hip.fatbin_reg``
+function) and add it to ``@llvm.global_ctors``. This constructor is called by
+the C runtime before ``main`` and it:
+
+* calls ``__hipRegisterFatBinary`` with a pointer to an internal wrapper
+  object that describes the HIP fat binary;
+* stores the returned handle in an internal global variable;
+* calls an internal helper such as ``__hip_register_globals`` to register
+  kernels, device variables and other metadata associated with the fat binary;
+* registers a corresponding module destructor with ``atexit`` so it will run
+  during program termination and use the stored handle to unregister the fat
+  binary from the HIP runtime.
+
+The module destructor (for example ``__hip_module_dtor`` or a
+``.hip.fatbin_unreg`` function) loads the stored handle, checks that it is
+non-null, calls ``__hipUnregisterFatBinary`` to unregister the fat binary from
+the HIP runtime, and then clears the handle. This ensures that the HIP runtime
+sees each fat binary registered exactly once and that it is unregistered once
+at exit, even when multiple translation units contribute HIP kernels to the
+same host program.
+
+These registration/unregistration helpers are implementation details of Clang's
+HIP code generation; user code should not call ``__hipRegisterFatBinary`` or
+``__hipUnregisterFatBinary`` directly.
+
+Implications for HIP Application Developers
+-------------------------------------------
+
+From the point of view of HIP application code, Clang and the HIP runtime
+provide the following guarantees:
+
+* Kernels and device variables defined in HIP code will be registered with the
+  HIP runtime before ``main`` begins execution.
+* Fat binaries will be unregistered via an ``atexit``-registered module
+  destructor after ``main`` returns (or after ``exit`` is called).
+
+Beyond these points, the detailed ordering of fat binary registration and
+unregistration relative to user-defined global constructors, destructors and
+other ``atexit`` handlers is not specified and should not be relied upon.
+Applications should avoid depending on HIP kernels or device variables being
+usable from global constructors or destructors, and instead perform HIP
+initialization and teardown that touches device state in ``main`` (or in
+functions called from ``main``).
+
+Implications for HIP Runtime Developers
+---------------------------------------
+
+HIP runtime implementations that are linked with Clang-generated host code
+must handle registration and unregistration in the presence of uncertain
+global ctor/dtor ordering:
+
+* ``__hipRegisterFatBinary`` must accept a pointer to the compiler-generated
+  wrapper object and return an opaque handle that remains valid for as long as
+  the fat binary may be used.
+* ``__hipUnregisterFatBinary`` must accept the handle previously returned by
+  ``__hipRegisterFatBinary`` and perform any necessary cleanup. It may be
+  called late in process teardown, after other parts of the runtime have
+  started shutting down, so it should be robust in the presence of partially
+  torn-down state.
+* Runtimes should use appropriate synchronization and guards so that fat
+  binary registration does not observe uninitialized resources and
+  unregistration does not release resources that are still required by other
+  runtime components. In particular, registration and unregistration routines
+  should be written to be safe under repeated calls and in the presence of
+  concurrent or overlapping initialization/teardown logic.
+
 Syntax Difference with CUDA
 ===========================
 
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 3b67ee3819507..7ee7a9ea8fa1f 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -84,8 +84,6 @@ Potentially Breaking Changes
 - Downstream projects that previously linked only against ``clangDriver`` may
   now (also) need to link against the new ``clangOptions`` library, since
   options-related code has been moved out of the Driver into a separate library.
-- The ``clangFrontend`` library no longer depends on ``clangDriver``, which may
-  break downstream projects that relied on this transitive dependency.
 
 C/C++ Language Potentially Breaking Changes
 -------------------------------------------
diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index 52360b67b306c..76a6463881c6f 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -270,6 +270,8 @@ CODEGENOPT(SanitizeMemoryUseAfterDtor, 1, 0, Benign) ///< Enable use-after-delet
 CODEGENOPT(SanitizeCfiCrossDso, 1, 0, Benign) ///< Enable cross-dso support in CFI.
 CODEGENOPT(SanitizeMinimalRuntime, 1, 0, Benign) ///< Use "_minimal" sanitizer runtime for
                                                  ///< diagnostics.
+CODEGENOPT(SanitizeHandlerPreserveAllRegs, 1, 0, Benign) ///< Use "_preserve" sanitizer runtime for
+                                                 ///< diagnostics.
 CODEGENOPT(SanitizeCfiICallGeneralizePointers, 1, 0, Benign) ///< Generalize pointer types in
                                                              ///< CFI icall function signatures
 CODEGENOPT(SanitizeCfiICallNormalizeIntegers, 1, 0, Benign) ///< Normalize integer types in
diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index 39af84c8d0872..1f5932225d31e 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -1848,6 +1848,9 @@ class TargetInfo : public TransferrableTargetInfo,
     }
   }
 
+  /// Set features that depend on other features.
+  virtual void setDependentOpenCLOpts();
+
   /// Get supported OpenCL extensions and optional core features.
   llvm::StringMap<bool> &getSupportedOpenCLOpts() {
     return getTargetOpts().OpenCLFeaturesMap;
diff --git a/clang/include/clang/Driver/CommonArgs.h b/clang/include/clang/Driver/CommonArgs.h
index 464a4b335b620..56c2a65ce8c7b 100644
--- a/clang/include/clang/Driver/CommonArgs.h
+++ b/clang/include/clang/Driver/CommonArgs.h
@@ -312,6 +312,16 @@ void handleVectorizeLoopsArgs(const llvm::opt::ArgList &Args,
 void handleVectorizeSLPArgs(const llvm::opt::ArgList &Args,
                             llvm::opt::ArgStringList &CmdArgs);
 
+// Parse -mprefer-vector-width=. Return the Value string if well-formed.
+// Otherwise, return an empty string and issue a diagnostic message if needed.
+StringRef parseMPreferVectorWidthOption(clang::DiagnosticsEngine &Diags,
+                                        const llvm::opt::ArgList &Args);
+
+// Parse -mrecip. Return the Value string if well-formed.
+// Otherwise, return an empty string and issue a diagnostic message if needed.
+StringRef parseMRecipOption(clang::DiagnosticsEngine &Diags,
+                            const llvm::opt::ArgList &Args);
+
 // Convert ComplexRangeKind to a string that can be passed as a frontend option.
 std::string complexRangeKindToStr(LangOptions::ComplexRangeKind Range);
 
diff --git a/clang/include/clang/Driver/CreateASTUnitFromArgs.h b/clang/include/clang/Driver/CreateASTUnitFromArgs.h
deleted file mode 100644
index 30575cc04ca7c..0000000000000
--- a/clang/include/clang/Driver/CreateASTUnitFromArgs.h
+++ /dev/null
@@ -1,80 +0,0 @@
-//===-- CreateInvocationFromArgs.h - Create an ASTUnit from Args-*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Utility for creating an ASTUnit from a vector of command line arguments.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CLANG_DRIVER_CREATEASTUNITFROMARGS_H
-#define LLVM_CLANG_DRIVER_CREATEASTUNITFROMARGS_H
-
-#include "clang/Frontend/ASTUnit.h"
-
-namespace clang {
-
-/// Create an ASTUnit from a vector of command line arguments, which must
-/// specify exactly one source file.
-///
-/// \param ArgBegin - The beginning of the argument vector.
-///
-/// \param ArgEnd - The end of the argument vector.
-///
-/// \param PCHContainerOps - The PCHContainerOperations to use for loading and
-/// creating modules.
-///
-/// \param Diags - The diagnostics engine to use for reporting errors; its
-/// lifetime is expected to extend past that of the returned ASTUnit.
-///
-/// \param ResourceFilesPath - The path to the compiler resource files.
-///
-/// \param StorePreamblesInMemory - Whether to store PCH in memory. If false,
-/// PCH are stored in temporary files.
-///
-/// \param PreambleStoragePath - The path to a directory, in which to create
-/// temporary PCH files. If empty, the default system temporary directory is
-/// used. This parameter is ignored if \p StorePreamblesInMemory is true.
-///
-/// \param ModuleFormat - If provided, uses the specific module format.
-///
-/// \param ErrAST - If non-null and parsing failed without any AST to return
-/// (e.g. because the PCH could not be loaded), this accepts the ASTUnit
-/// mainly to allow the caller to see the diagnostics.
-///
-/// \param VFS - A llvm::vfs::FileSystem to be used for all file accesses.
-/// Note that preamble is saved to a temporary directory on a RealFileSystem,
-/// so in order for it to be loaded correctly, VFS should have access to
-/// it(i.e., be an overlay over RealFileSystem). RealFileSystem will be used
-/// if \p VFS is nullptr.
-///
-// FIXME: Move OnlyLocalDecls, UseBumpAllocator to setters on the ASTUnit, we
-// shouldn't need to specify them at construction time.
-std::unique_ptr<ASTUnit> CreateASTUnitFromCommandLine(
-    const char **ArgBegin, const char **ArgEnd,
-    std::shared_ptr<PCHContainerOperations> PCHContainerOps,
-    std::shared_ptr<DiagnosticOptions> DiagOpts,
-    IntrusiveRefCntPtr<DiagnosticsEngine> Diags, StringRef ResourceFilesPath,
-    bool StorePreamblesInMemory = false,
-    StringRef PreambleStoragePath = StringRef(), bool OnlyLocalDecls = false,
-    CaptureDiagsKind CaptureDiagnostics = CaptureDiagsKind::None,
-    ArrayRef<ASTUnit::RemappedFile> RemappedFiles = {},
-    bool RemappedFilesKeepOriginalName = true,
-    unsigned PrecompilePreambleAfterNParses = 0,
-    TranslationUnitKind TUKind = TU_Complete,
-    bool CacheCodeCompletionResults = false,
-    bool IncludeBriefCommentsInCodeCompletion = false,
-    bool AllowPCHWithCompilerErrors = false,
-    SkipFunctionBodiesScope SkipFunctionBodies = SkipFunctionBodiesScope::None,
-    bool SingleFileParse = false, bool UserFilesAreVolatile = false,
-    bool ForSerialization = false, bool RetainExcludedConditionalBlocks = false,
-    std::optional<StringRef> ModuleFormat = std::nullopt,
-    std::unique_ptr<ASTUnit> *ErrAST = nullptr,
-    IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS = nullptr);
-
-} // namespace clang
-
-#endif // LLVM_CLANG_DRIVER_CREATEASTUNITFROMARGS_H
diff --git a/clang/include/clang/Driver/CreateInvocationFromArgs.h b/clang/include/clang/Driver/CreateInvocationFromArgs.h
deleted file mode 100644
index 0e0f67373ce87..0000000000000
--- a/clang/include/clang/Driver/CreateInvocationFromArgs.h
+++ /dev/null
@@ -1,76 +0,0 @@
-//===--- CreateInvocationFromArgs.h - CompilerInvocation from Args --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Utility for creating a CompilerInvocation from command-line arguments, for
-// tools to use in preparation to parse a file.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CLANG_DRIVER_CREATEINVOCATIONFROMARGS_H
-#define LLVM_CLANG_DRIVER_CREATEINVOCATIONFROMARGS_H
-
-#include "clang/Basic/Diagnostic.h"
-#include "clang/Basic/LLVM.h"
-#include "llvm/Support/VirtualFileSystem.h"
-#include <memory>
-#include <string>
-#include <vector>
-
-namespace clang {
-
-class CompilerInvocation;
-class DiagnosticsEngine;
-
-/// Optional inputs to createInvocation.
-struct CreateInvocationOptions {
-  /// Receives diagnostics encountered while parsing command-line flags.
-  /// If not provided, these are printed to stderr.
-  IntrusiveRefCntPtr<DiagnosticsEngine> Diags = nullptr;
-  /// Used e.g. to probe for system headers locations.
-  /// If not provided, the real filesystem is used.
-  /// FIXME: the driver does perform some non-virtualized IO.
-  IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS = nullptr;
-  /// Whether to attempt to produce a non-null (possibly incorrect) invocation
-  /// if any errors were encountered.
-  /// By default, always return null on errors.
-  bool RecoverOnError = false;
-  /// Allow the driver to probe the filesystem for PCH files.
-  /// This is used to replace -include with -include-pch in the cc1 args.
-  /// FIXME: ProbePrecompiled=true is a poor, historical default.
-  /// It misbehaves if the PCH file is from GCC, has the wrong version, etc.
-  bool ProbePrecompiled = false;
-  /// If set, the target is populated with the cc1 args produced by the driver.
-  /// This may be populated even if createInvocation returns nullptr.
-  std::vector<std::string> *CC1Args = nullptr;
-};
-
-/// Interpret clang arguments in preparation to parse a file.
-///
-/// This simulates a number of steps Clang takes when its driver is invoked:
-/// - choosing actions (e.g compile + link) to run
-/// - probing the system for settings like standard library locations
-/// - spawning a cc1 subprocess to compile code, with more explicit arguments
-/// - in the cc1 process, assembling those arguments into a CompilerInvocation
-///   which is used to configure the parser
-///
-/// This simulation is lossy, e.g. in some situations one driver run would
-/// result in multiple parses. (Multi-arch, CUDA, ...).
-/// This function tries to select a reasonable invocation that tools should use.
-///
-/// Args[0] should be the driver name, such as "clang" or "/usr/bin/g++".
-/// Absolute path is preferred - this affects searching for system headers.
-///
-/// May return nullptr if an invocation could not be determined.
-/// See CreateInvocationOptions::RecoverOnError to try harder!
-std::unique_ptr<CompilerInvocation>
-createInvocation(ArrayRef<const char *> Args,
-                 CreateInvocationOptions Opts = {});
-
-} // namespace clang
-
-#endif // LLVM_CLANG_DRIVER_CREATEINVOCATIONFROMARGS_H
diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h
index f13a0dd439f22..ed0048a507d71 100644
--- a/clang/include/clang/Driver/Driver.h
+++ b/clang/include/clang/Driver/Driver.h
@@ -412,6 +412,10 @@ class Driver {
                               SmallString<128> &CrashDiagDir);
 
 public:
+  /// Takes the path to a binary that's either in bin/ or lib/ and returns
+  /// the path to clang's resource directory.
+  static std::string GetResourcesPath(StringRef BinaryPath);
+
   Driver(StringRef ClangExecutable, StringRef TargetTriple,
          DiagnosticsEngine &Diags, std::string Title = "clang LLVM compiler",
          IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS = nullptr);
diff --git a/clang/include/clang/Driver/SanitizerArgs.h b/clang/include/clang/Driver/SanitizerArgs.h
index 08e3c147d0557..84fb66e16bee3 100644
--- a/clang/include/clang/Driver/SanitizerArgs.h
+++ b/clang/include/clang/Driver/SanitizerArgs.h
@@ -68,6 +68,7 @@ class SanitizerArgs {
   bool TsanAtomics = true;
   bool MinimalRuntime = false;
   bool TysanOutlineInstrumentation = true;
+  bool HandlerPreserveAllRegs = false;
   // True if cross-dso CFI support if provided by the system (i.e. Android).
   bool ImplicitCfiRuntime = false;
   bool NeedsMemProfRt = false;
diff --git a/clang/include/clang/Frontend/ASTUnit.h b/clang/include/clang/Frontend/ASTUnit.h
index 341460e1962cb..e585933a5c8be 100644
--- a/clang/include/clang/Frontend/ASTUnit.h
+++ b/clang/include/clang/Frontend/ASTUnit.h
@@ -23,13 +23,11 @@
 #include "clang/Basic/SourceManager.h"
 #include "clang/Basic/TargetOptions.h"
 #include "clang/Frontend/PrecompiledPreamble.h"
-#include "clang/Frontend/StandaloneDiagnostic.h"
 #include "clang/Lex/HeaderSearchOptions.h"
 #include "clang/Lex/ModuleLoader.h"
 #include "clang/Lex/PreprocessingRecord.h"
 #include "clang/Sema/CodeCompleteConsumer.h"
 #include "clang/Serialization/ASTBitCodes.h"
-#include "clang/Serialization/ASTWriter.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/IntrusiveRefCntPtr.h"
@@ -38,7 +36,6 @@
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/iterator_range.h"
-#include "llvm/Bitstream/BitstreamWriter.h"
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
@@ -91,6 +88,25 @@ enum class CaptureDiagsKind { None, All, AllWithoutNonErrorsFromIncludes };
 
 /// Utility class for loading a ASTContext from an AST file.
 class ASTUnit {
+public:
+  struct StandaloneFixIt {
+    std::pair<unsigned, unsigned> RemoveRange;
+    std::pair<unsigned, unsigned> InsertFromRange;
+    std::string CodeToInsert;
+    bool BeforePreviousInsertions;
+  };
+
+  struct StandaloneDiagnostic {
+    unsigned ID;
+    DiagnosticsEngine::Level Level;
+    std::string Message;
+    std::string Filename;
+    unsigned LocOffset;
+    std::vector<std::pair<unsigned, unsigned>> Ranges;
+    std::vector<StandaloneFixIt> FixIts;
+  };
+
+private:
   std::unique_ptr<LangOptions> LangOpts;
   std::unique_ptr<CodeGenOptions> CodeGenOpts;
   // FIXME: The documentation on \c LoadFrom* member functions states that the
@@ -113,15 +129,7 @@ class ASTUnit {
   bool HadModuleLoaderFatalFailure = false;
   bool StorePreamblesInMemory = false;
 
-  /// Utility struct for managing ASTWriter and its associated data streams.
-  struct ASTWriterData {
-    SmallString<128> Buffer;
-    llvm::BitstreamWriter Stream;
-    ASTWriter Writer;
-
-    ASTWriterData(ModuleCache &ModCache, const CodeGenOptions &CGOpts)
-        : Stream(Buffer), Writer(Stream, Buffer, ModCache, CGOpts, {}) {}
-  };
+  struct ASTWriterData;
   std::unique_ptr<ASTWriterData> WriterData;
 
   FileSystemOptions FileSystemOpts;
@@ -263,6 +271,11 @@ class ASTUnit {
   static void ConfigureDiags(IntrusiveRefCntPtr<DiagnosticsEngine> Diags,
                              ASTUnit &AST, CaptureDiagsKind CaptureDiagnostics);
 
+  void
+  TranslateStoredDiagnostics(FileManager &FileMgr, SourceManager &SrcMan,
+                             const SmallVectorImpl<StandaloneDiagnostic> &Diags,
+                             SmallVectorImpl<StoredDiagnostic> &Out);
+
   void clearFileLevelDecls();
 
 public:
@@ -821,24 +834,65 @@ class ASTUnit {
       bool IncludeBriefCommentsInCodeCompletion = false,
       bool UserFilesAreVolatile = false);
 
-  friend std::unique_ptr<ASTUnit> CreateASTUnitFromCommandLine(
+  /// LoadFromCommandLine - Create an ASTUnit from a vector of command line
+  /// arguments, which must specify exactly one source file.
+  ///
+  /// \param ArgBegin - The beginning of the argument vector.
+  ///
+  /// \param ArgEnd - The end of the argument vector.
+  ///
+  /// \param PCHContainerOps - The PCHContainerOperations to use for loading and
+  /// creating modules.
+  ///
+  /// \param Diags - The diagnostics engine to use for reporting errors; its
+  /// lifetime is expected to extend past that of the returned ASTUnit.
+  ///
+  /// \param ResourceFilesPath - The path to the compiler resource files.
+  ///
+  /// \param StorePreamblesInMemory - Whether to store PCH in memory. If false,
+  /// PCH are stored in temporary files.
+  ///
+  /// \param PreambleStoragePath - The path to a directory, in which to create
+  /// temporary PCH files. If empty, the default system temporary directory is
+  /// used. This parameter is ignored if \p StorePreamblesInMemory is true.
+  ///
+  /// \param ModuleFormat - If provided, uses the specific module format.
+  ///
+  /// \param ErrAST - If non-null and parsing failed without any AST to return
+  /// (e.g. because the PCH could not be loaded), this accepts the ASTUnit
+  /// mainly to allow the caller to see the diagnostics.
+  ///
+  /// \param VFS - A llvm::vfs::FileSystem to be used for all file accesses.
+  /// Note that preamble is saved to a temporary directory on a RealFileSystem,
+  /// so in order for it to be loaded correctly, VFS should have access to
+  /// it (i.e., be an overlay over RealFileSystem). RealFileSystem will be used
+  /// if \p VFS is nullptr.
+  ///
+  // FIXME: Move OnlyLocalDecls, UseBumpAllocator to setters on the ASTUnit, we
+  // shouldn't need to specify them at construction time.
+  static std::unique_ptr<ASTUnit> LoadFromCommandLine(
       const char **ArgBegin, const char **ArgEnd,
       std::shared_ptr<PCHContainerOperations> PCHContainerOps,
       std::shared_ptr<DiagnosticOptions> DiagOpts,
       IntrusiveRefCntPtr<DiagnosticsEngine> Diags, StringRef ResourceFilesPath,
-      bool StorePreamblesInMemory, StringRef PreambleStoragePath,
-      bool OnlyLocalDecls, CaptureDiagsKind CaptureDiagnostics,
-      ArrayRef<ASTUnit::RemappedFile> RemappedFiles,
-      bool RemappedFilesKeepOriginalName,
-      unsigned PrecompilePreambleAfterNParses, TranslationUnitKind TUKind,
-      bool CacheCodeCompletionResults,
-      bool IncludeBriefCommentsInCodeCompletion,
-      bool AllowPCHWithCompilerErrors,
-      SkipFunctionBodiesScope SkipFunctionBodies, bool SingleFileParse,
-      bool UserFilesAreVolatile, bool ForSerialization,
-      bool RetainExcludedConditionalBlocks,
-      std::optional<StringRef> ModuleFormat, std::unique_ptr<ASTUnit> *ErrAST,
-      IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS);
+      bool StorePreamblesInMemory = false,
+      StringRef PreambleStoragePath = StringRef(), bool OnlyLocalDecls = false,
+      CaptureDiagsKind CaptureDiagnostics = CaptureDiagsKind::None,
+      ArrayRef<RemappedFile> RemappedFiles = {},
+      bool RemappedFilesKeepOriginalName = true,
+      unsigned PrecompilePreambleAfterNParses = 0,
+      TranslationUnitKind TUKind = TU_Complete,
+      bool CacheCodeCompletionResults = false,
+      bool IncludeBriefCommentsInCodeCompletion = false,
+      bool AllowPCHWithCompilerErrors = false,
+      SkipFunctionBodiesScope SkipFunctionBodies =
+          SkipFunctionBodiesScope::None,
+      bool SingleFileParse = false, bool UserFilesAreVolatile = false,
+      bool ForSerialization = false,
+      bool RetainExcludedConditionalBlocks = false,
+      std::optional<StringRef> ModuleFormat = std::nullopt,
+      std::unique_ptr<ASTUnit> *ErrAST = nullptr,
+      IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS = nullptr);
 
   /// Reparse the source files using the same command-line options that
   /// were originally used to produce this translation unit.
@@ -909,44 +963,6 @@ class ASTUnit {
   bool serialize(raw_ostream &OS);
 };
 
-/// Diagnostic consumer that saves each diagnostic it is given.
-class FilterAndStoreDiagnosticConsumer : public DiagnosticConsumer {
-  SmallVectorImpl<StoredDiagnostic> *StoredDiags;
-  SmallVectorImpl<StandaloneDiagnostic> *StandaloneDiags;
-  bool CaptureNonErrorsFromIncludes = true;
-  const LangOptions *LangOpts = nullptr;
-  SourceManager *SourceMgr = nullptr;
-
-public:
-  FilterAndStoreDiagnosticConsumer(
-      SmallVectorImpl<StoredDiagnostic> *StoredDiags,
-      SmallVectorImpl<StandaloneDiagnostic> *StandaloneDiags,
-      bool CaptureNonErrorsFromIncludes);
-
-  void BeginSourceFile(const LangOptions &LangOpts,
-                       const Preprocessor *PP = nullptr) override;
-
-  void HandleDiagnostic(DiagnosticsEngine::Level Level,
-                        const Diagnostic &Info) override;
-};
-
-/// RAII object that optionally captures and filters diagnostics, if
-/// there is no diagnostic client to capture them already.
-class CaptureDroppedDiagnostics {
-  DiagnosticsEngine &Diags;
-  FilterAndStoreDiagnosticConsumer Client;
-  DiagnosticConsumer *PreviousClient = nullptr;
-  std::unique_ptr<DiagnosticConsumer> OwningPreviousClient;
-
-public:
-  CaptureDroppedDiagnostics(
-      CaptureDiagsKind CaptureDiagnostics, DiagnosticsEngine &Diags,
-      SmallVectorImpl<StoredDiagnostic> *StoredDiags,
-      SmallVectorImpl<StandaloneDiagnostic> *StandaloneDiags);
-
-  ~CaptureDroppedDiagnostics();
-};
-
 } // namespace clang
 
 #endif // LLVM_CLANG_FRONTEND_ASTUNIT_H
diff --git a/clang/include/clang/Frontend/CompilerInvocation.h b/clang/include/clang/Frontend/CompilerInvocation.h
index 4977ddb307d21..b19a6e1a8acc3 100644
--- a/clang/include/clang/Frontend/CompilerInvocation.h
+++ b/clang/include/clang/Frontend/CompilerInvocation.h
@@ -299,6 +299,16 @@ class CompilerInvocation : public CompilerInvocationBase {
                              DiagnosticsEngine &Diags,
                              const char *Argv0 = nullptr);
 
+  /// Get the directory where the compiler headers
+  /// reside, relative to the compiler binary (found by the passed in
+  /// arguments).
+  ///
+  /// \param Argv0 - The program path (from argv[0]), for finding the builtin
+  /// compiler path.
+  /// \param MainAddr - The address of main (or some other function in the main
+  /// executable), for finding the builtin compiler path.
+  static std::string GetResourcesPath(const char *Argv0, void *MainAddr);
+
   /// Populate \p Opts with the default set of pointer authentication-related
   /// options given \p LangOpts and \p Triple.
   ///
diff --git a/clang/include/clang/Frontend/StandaloneDiagnostic.h b/clang/include/clang/Frontend/StandaloneDiagnostic.h
deleted file mode 100644
index c23d5f95e0c2f..0000000000000
--- a/clang/include/clang/Frontend/StandaloneDiagnostic.h
+++ /dev/null
@@ -1,82 +0,0 @@
-//===--- StandaloneDiagnostic.h - Serializable Diagnostic -------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// A serializable diagnostic representation to retain diagnostics after their
-// SourceManager has been destroyed.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CLANG_FRONTEND_STANDALONEDIAGNOSTICS_H
-#define LLVM_CLANG_FRONTEND_STANDALONEDIAGNOSTICS_H
-
-#include "clang/Basic/DiagnosticIDs.h"
-#include "clang/Basic/DiagnosticOptions.h"
-#include "clang/Basic/SourceLocation.h"
-#include "clang/Basic/SourceManager.h"
-#include "clang/Basic/Specifiers.h"
-#include "llvm/ADT/StringExtras.h"
-#include <cassert>
-#include <string>
-#include <vector>
-
-namespace clang {
-
-/// Represents a StoredDiagnostic in a form that can be retained until after its
-/// SourceManager has been destroyed.
-///
-/// Source locations are stored as a combination of filename and offsets into
-/// that file.
-/// To report the diagnostic, it must first be translated back into a
-/// StoredDiagnostic with a new associated SourceManager.
-struct StandaloneDiagnostic {
-  /// Represents a CharSourceRange within a StandaloneDiagnostic.
-  struct SourceOffsetRange {
-    SourceOffsetRange(CharSourceRange Range, const SourceManager &SrcMgr,
-                      const LangOptions &LangOpts);
-
-    unsigned Begin = 0;
-    unsigned End = 0;
-  };
-
-  /// Represents a FixItHint within a StandaloneDiagnostic.
-  struct StandaloneFixIt {
-    StandaloneFixIt(const SourceManager &SrcMgr, const LangOptions &LangOpts,
-                    const FixItHint &FixIt);
-
-    SourceOffsetRange RemoveRange;
-    SourceOffsetRange InsertFromRange;
-    std::string CodeToInsert;
-    bool BeforePreviousInsertions;
-  };
-
-  StandaloneDiagnostic(const LangOptions &LangOpts,
-                       const StoredDiagnostic &InDiag);
-
-  DiagnosticsEngine::Level Level;
-  SrcMgr::CharacteristicKind FileKind;
-  unsigned ID = 0;
-  unsigned FileOffset = 0;
-  std::string Message;
-  std::string Filename;
-  std::vector<SourceOffsetRange> Ranges;
-  std::vector<StandaloneFixIt> FixIts;
-};
-
-/// Translates \c StandaloneDiag into a StoredDiagnostic, associating it with
-/// the provided FileManager and SourceManager.
-///
-/// This allows the diagnostic to be emitted using the diagnostics engine, since
-/// StandaloneDiagnostics themselfs cannot be emitted directly.
-StoredDiagnostic
-translateStandaloneDiag(FileManager &FileMgr, SourceManager &SrcMgr,
-                        const StandaloneDiagnostic &StandaloneDiag,
-                        llvm::StringMap<SourceLocation> &SrcLocCache);
-
-} // namespace clang
-
-#endif // STANDALONEDIAGNOSTICS
diff --git a/clang/include/clang/Frontend/Utils.h b/clang/include/clang/Frontend/Utils.h
index 1c561b47b5c47..ed2703c76f18d 100644
--- a/clang/include/clang/Frontend/Utils.h
+++ b/clang/include/clang/Frontend/Utils.h
@@ -192,6 +192,51 @@ IntrusiveRefCntPtr<ExternalSemaSource>
 createChainedIncludesSource(CompilerInstance &CI,
                             IntrusiveRefCntPtr<ASTReader> &OutReader);
 
+/// Optional inputs to createInvocation.
+struct CreateInvocationOptions {
+  /// Receives diagnostics encountered while parsing command-line flags.
+  /// If not provided, these are printed to stderr.
+  IntrusiveRefCntPtr<DiagnosticsEngine> Diags = nullptr;
+  /// Used e.g. to probe for system headers locations.
+  /// If not provided, the real filesystem is used.
+  /// FIXME: the driver does perform some non-virtualized IO.
+  IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS = nullptr;
+  /// Whether to attempt to produce a non-null (possibly incorrect) invocation
+  /// if any errors were encountered.
+  /// By default, always return null on errors.
+  bool RecoverOnError = false;
+  /// Allow the driver to probe the filesystem for PCH files.
+  /// This is used to replace -include with -include-pch in the cc1 args.
+  /// FIXME: ProbePrecompiled=true is a poor, historical default.
+  /// It misbehaves if the PCH file is from GCC, has the wrong version, etc.
+  bool ProbePrecompiled = false;
+  /// If set, the target is populated with the cc1 args produced by the driver.
+  /// This may be populated even if createInvocation returns nullptr.
+  std::vector<std::string> *CC1Args = nullptr;
+};
+
+/// Interpret clang arguments in preparation to parse a file.
+///
+/// This simulates a number of steps Clang takes when its driver is invoked:
+/// - choosing actions (e.g. compile + link) to run
+/// - probing the system for settings like standard library locations
+/// - spawning a cc1 subprocess to compile code, with more explicit arguments
+/// - in the cc1 process, assembling those arguments into a CompilerInvocation
+///   which is used to configure the parser
+///
+/// This simulation is lossy, e.g. in some situations one driver run would
+/// result in multiple parses. (Multi-arch, CUDA, ...).
+/// This function tries to select a reasonable invocation that tools should use.
+///
+/// Args[0] should be the driver name, such as "clang" or "/usr/bin/g++".
+/// Absolute path is preferred - this affects searching for system headers.
+///
+/// May return nullptr if an invocation could not be determined.
+/// See CreateInvocationOptions::RecoverOnError to try harder!
+std::unique_ptr<CompilerInvocation>
+createInvocation(ArrayRef<const char *> Args,
+                 CreateInvocationOptions Opts = {});
+
 } // namespace clang
 
 #endif // LLVM_CLANG_FRONTEND_UTILS_H
diff --git a/clang/include/clang/Options/OptionUtils.h b/clang/include/clang/Options/OptionUtils.h
index 02c9c27554db1..83c48bd7d6843 100644
--- a/clang/include/clang/Options/OptionUtils.h
+++ b/clang/include/clang/Options/OptionUtils.h
@@ -28,7 +28,6 @@ class ArgList;
 } // namespace llvm
 
 namespace clang {
-
 /// Return the value of the last argument as an integer, or a default. If Diags
 /// is non-null, emits an error if the argument is given, but non-integral.
 int getLastArgIntValue(const llvm::opt::ArgList &Args,
@@ -54,29 +53,6 @@ inline uint64_t getLastArgUInt64Value(const llvm::opt::ArgList &Args,
   return getLastArgUInt64Value(Args, Id, Default, &Diags, Base);
 }
 
-// Parse -mprefer-vector-width=. Return the Value string if well-formed.
-// Otherwise, return an empty string and issue a diagnosic message if needed.
-StringRef parseMPreferVectorWidthOption(clang::DiagnosticsEngine &Diags,
-                                        const llvm::opt::ArgList &Args);
-
-// Parse -mrecip. Return the Value string if well-formed.
-// Otherwise, return an empty string and issue a diagnosic message if needed.
-StringRef parseMRecipOption(clang::DiagnosticsEngine &Diags,
-                            const llvm::opt::ArgList &Args);
-
-/// Get the directory where the compiler headers reside, relative to the
-/// compiler binary path \p BinaryPath.
-std::string GetResourcesPath(StringRef BinaryPath);
-
-/// Get the directory where the compiler headers reside, relative to the
-/// compiler binary path (found by the passed in arguments).
-///
-/// \param Argv0 The program path (from argv[0]), for finding the builtin
-/// compiler path.
-/// \param MainAddr The address of main (or some other function in the main
-/// executable), for finding the builtin compiler path.
-std::string GetResourcesPath(const char *Argv0, void *MainAddr);
-
 } // namespace clang
 
 #endif // LLVM_CLANG_OPTIONS_OPTIONUTILS_H
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
index 3fc3baed9636c..67d17674ac794 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -2672,6 +2672,15 @@ defm sanitize_minimal_runtime : BoolOption<"f", "sanitize-minimal-runtime",
   PosFlag<SetTrue>,
   NegFlag<SetFalse>>,
   Group<f_clang_Group>;
+defm sanitize_handler_preserve_all_regs
+    : BoolOption<
+          "f", "sanitize-handler-preserve-all-regs",
+          CodeGenOpts<"SanitizeHandlerPreserveAllRegs">, DefaultFalse,
+          PosFlag<SetTrue, [], [],
+                  "Enable handlers with preserve_all calling convention">,
+          NegFlag<SetFalse, [], [],
+                  "Disable handlers with preserve_all calling convention">>,
+      Group<f_clang_Group>;
 def fsanitize_link_runtime : Flag<["-"], "fsanitize-link-runtime">,
                            Group<f_clang_Group>;
 def fno_sanitize_link_runtime : Flag<["-"], "fno-sanitize-link-runtime">,
diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp
index 9a5db6e164f66..c0ed900ebd45c 100644
--- a/clang/lib/Basic/TargetInfo.cpp
+++ b/clang/lib/Basic/TargetInfo.cpp
@@ -640,6 +640,17 @@ bool TargetInfo::areDefaultedSMFStillPOD(const LangOptions &LangOpts) const {
   return LangOpts.getClangABICompat() > LangOptions::ClangABI::Ver15;
 }
 
+void TargetInfo::setDependentOpenCLOpts() {
+  auto &Opts = getSupportedOpenCLOpts();
+  if (!hasFeatureEnabled(Opts, "cl_khr_fp64") ||
+      !hasFeatureEnabled(Opts, "__opencl_c_fp64")) {
+    setFeatureEnabled(Opts, "__opencl_c_ext_fp64_global_atomic_add", false);
+    setFeatureEnabled(Opts, "__opencl_c_ext_fp64_local_atomic_add", false);
+    setFeatureEnabled(Opts, "__opencl_c_ext_fp64_global_atomic_min_max", false);
+    setFeatureEnabled(Opts, "__opencl_c_ext_fp64_local_atomic_min_max", false);
+  }
+}
+
 LangAS TargetInfo::getOpenCLTypeAddrSpace(OpenCLTypeKind TK) const {
   switch (TK) {
   case OCLTK_Image:
diff --git a/clang/lib/Basic/Targets.cpp b/clang/lib/Basic/Targets.cpp
index f39c698b5d734..38eb1edd4bfb7 100644
--- a/clang/lib/Basic/Targets.cpp
+++ b/clang/lib/Basic/Targets.cpp
@@ -862,6 +862,7 @@ TargetInfo *TargetInfo::CreateTargetInfo(DiagnosticsEngine &Diags,
 
   Target->setSupportedOpenCLOpts();
   Target->setCommandLineOpenCLOpts();
+  Target->setDependentOpenCLOpts();
   Target->setMaxAtomicWidth();
 
   if (!Opts->DarwinTargetVariantTriple.empty())
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 978fee7dbec9d..e7aa8a234efd9 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -68,6 +68,28 @@ static mlir::Value emitVectorFCmp(CIRGenBuilderTy &builder,
   return bitCast;
 }
 
+static mlir::Value getMaskVecValue(CIRGenFunction &cgf, const CallExpr *expr,
+                                   mlir::Value mask, unsigned numElems) {
+
+  CIRGenBuilderTy &builder = cgf.getBuilder();
+  auto maskTy = cir::VectorType::get(
+      builder.getUIntNTy(1), cast<cir::IntType>(mask.getType()).getWidth());
+  mlir::Value maskVec = builder.createBitcast(mask, maskTy);
+
+  // If we have less than 8 elements, then the starting mask was an i8 and
+  // we need to extract down to the right number of elements.
+  if (numElems < 8) {
+    SmallVector<mlir::Attribute, 4> indices;
+    mlir::Type i32Ty = builder.getSInt32Ty();
+    for (auto i : llvm::seq<unsigned>(0, numElems))
+      indices.push_back(cir::IntAttr::get(i32Ty, i));
+
+    maskVec = builder.createVecShuffle(cgf.getLoc(expr->getExprLoc()), maskVec,
+                                       maskVec, indices);
+  }
+  return maskVec;
+}
+
 mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
                                                const CallExpr *expr) {
   if (builtinID == Builtin::BI__builtin_cpu_is) {
@@ -575,14 +597,60 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID,
   case X86::BI__builtin_ia32_psrldqi128_byteshift:
   case X86::BI__builtin_ia32_psrldqi256_byteshift:
   case X86::BI__builtin_ia32_psrldqi512_byteshift:
+    cgm.errorNYI(expr->getSourceRange(),
+                 std::string("unimplemented X86 builtin call: ") +
+                     getContext().BuiltinInfo.getName(builtinID));
+    return {};
   case X86::BI__builtin_ia32_kshiftliqi:
   case X86::BI__builtin_ia32_kshiftlihi:
   case X86::BI__builtin_ia32_kshiftlisi:
-  case X86::BI__builtin_ia32_kshiftlidi:
+  case X86::BI__builtin_ia32_kshiftlidi: {
+    unsigned shiftVal =
+        ops[1].getDefiningOp<cir::ConstantOp>().getIntValue().getZExtValue() &
+        0xff;
+    unsigned numElems = cast<cir::IntType>(ops[0].getType()).getWidth();
+
+    if (shiftVal >= numElems)
+      return builder.getNullValue(ops[0].getType(), getLoc(expr->getExprLoc()));
+
+    mlir::Value in = getMaskVecValue(*this, expr, ops[0], numElems);
+
+    SmallVector<mlir::Attribute, 64> indices;
+    mlir::Type i32Ty = builder.getSInt32Ty();
+    for (auto i : llvm::seq<unsigned>(0, numElems))
+      indices.push_back(cir::IntAttr::get(i32Ty, numElems + i - shiftVal));
+
+    mlir::Value zero =
+        builder.getNullValue(in.getType(), getLoc(expr->getExprLoc()));
+    mlir::Value sv =
+        builder.createVecShuffle(getLoc(expr->getExprLoc()), zero, in, indices);
+    return builder.createBitcast(sv, ops[0].getType());
+  }
   case X86::BI__builtin_ia32_kshiftriqi:
   case X86::BI__builtin_ia32_kshiftrihi:
   case X86::BI__builtin_ia32_kshiftrisi:
-  case X86::BI__builtin_ia32_kshiftridi:
+  case X86::BI__builtin_ia32_kshiftridi: {
+    unsigned shiftVal =
+        ops[1].getDefiningOp<cir::ConstantOp>().getIntValue().getZExtValue() &
+        0xff;
+    unsigned numElems = cast<cir::IntType>(ops[0].getType()).getWidth();
+
+    if (shiftVal >= numElems)
+      return builder.getNullValue(ops[0].getType(), getLoc(expr->getExprLoc()));
+
+    mlir::Value in = getMaskVecValue(*this, expr, ops[0], numElems);
+
+    SmallVector<mlir::Attribute, 64> indices;
+    mlir::Type i32Ty = builder.getSInt32Ty();
+    for (auto i : llvm::seq<unsigned>(0, numElems))
+      indices.push_back(cir::IntAttr::get(i32Ty, i + shiftVal));
+
+    mlir::Value zero =
+        builder.getNullValue(in.getType(), getLoc(expr->getExprLoc()));
+    mlir::Value sv =
+        builder.createVecShuffle(getLoc(expr->getExprLoc()), in, zero, indices);
+    return builder.createBitcast(sv, ops[0].getType());
+  }
   case X86::BI__builtin_ia32_vprotbi:
   case X86::BI__builtin_ia32_vprotwi:
   case X86::BI__builtin_ia32_vprotdi:
diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index f8f354c2d1072..cedc2a73b9260 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -702,6 +702,7 @@ cir::FuncOp LoweringPreparePass::getOrCreateDtorFunc(CIRBaseBuilderTy &builder,
                                                      cir::GlobalOp op,
                                                      mlir::Region &dtorRegion,
                                                      cir::CallOp &dtorCall) {
+  mlir::OpBuilder::InsertionGuard guard(builder);
   assert(!cir::MissingFeatures::astVarDeclInterface());
   assert(!cir::MissingFeatures::opGlobalThreadLocal());
 
diff --git a/clang/lib/CrossTU/CMakeLists.txt b/clang/lib/CrossTU/CMakeLists.txt
index eef7a892701fb..3349fc283925d 100644
--- a/clang/lib/CrossTU/CMakeLists.txt
+++ b/clang/lib/CrossTU/CMakeLists.txt
@@ -9,7 +9,6 @@ add_clang_library(clangCrossTU
   LINK_LIBS
   clangAST
   clangBasic
-  clangDriver
   clangFrontend
   clangIndex
   )
diff --git a/clang/lib/CrossTU/CrossTranslationUnit.cpp b/clang/lib/CrossTU/CrossTranslationUnit.cpp
index a3fc2cf6bfb3c..0287845a741ed 100644
--- a/clang/lib/CrossTU/CrossTranslationUnit.cpp
+++ b/clang/lib/CrossTU/CrossTranslationUnit.cpp
@@ -16,7 +16,6 @@
 #include "clang/Basic/DiagnosticDriver.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/CrossTU/CrossTUDiagnostic.h"
-#include "clang/Driver/CreateASTUnitFromArgs.h"
 #include "clang/Frontend/ASTUnit.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/TextDiagnosticPrinter.h"
@@ -620,7 +619,7 @@ CrossTranslationUnitContext::ASTLoader::loadFromSource(
   auto Diags = llvm::makeIntrusiveRefCnt<DiagnosticsEngine>(DiagID, *DiagOpts,
                                                             DiagClient);
 
-  return CreateASTUnitFromCommandLine(
+  return ASTUnit::LoadFromCommandLine(
       CommandLineArgs.begin(), (CommandLineArgs.end()),
       CI.getPCHContainerOperations(), DiagOpts, Diags,
       CI.getHeaderSearchOpts().ResourceDir);
diff --git a/clang/lib/Driver/CMakeLists.txt b/clang/lib/Driver/CMakeLists.txt
index 7a74b444eb8df..b68e26f4d3847 100644
--- a/clang/lib/Driver/CMakeLists.txt
+++ b/clang/lib/Driver/CMakeLists.txt
@@ -18,8 +18,6 @@ endif()
 add_clang_library(clangDriver
   Action.cpp
   Compilation.cpp
-  CreateASTUnitFromArgs.cpp
-  CreateInvocationFromArgs.cpp
   Distro.cpp
   Driver.cpp
   Job.cpp
@@ -99,8 +97,6 @@ add_clang_library(clangDriver
 
   LINK_LIBS
   clangBasic
-  clangFrontend
-  clangSerialization
   clangLex
   clangOptions
   ${system_libs}
diff --git a/clang/lib/Driver/CreateASTUnitFromArgs.cpp b/clang/lib/Driver/CreateASTUnitFromArgs.cpp
deleted file mode 100644
index ea31a8ed07c5f..0000000000000
--- a/clang/lib/Driver/CreateASTUnitFromArgs.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-//===--- CreateASTUnitFromArgs.h - Create an ASTUnit from Args ------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Utility for creating an ASTUnit from a vector of command line arguments.
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang/Driver/CreateASTUnitFromArgs.h"
-#include "clang/Driver/CreateInvocationFromArgs.h"
-#include "clang/Frontend/CompilerInvocation.h"
-#include "clang/Lex/PreprocessorOptions.h"
-#include "clang/Serialization/ModuleCache.h"
-#include "llvm/Support/CrashRecoveryContext.h"
-
-using namespace clang;
-
-/// Create an ASTUnit from a vector of command line arguments, which must
-/// specify exactly one source file.
-///
-/// \param ArgBegin - The beginning of the argument vector.
-///
-/// \param ArgEnd - The end of the argument vector.
-///
-/// \param PCHContainerOps - The PCHContainerOperations to use for loading and
-/// creating modules.
-///
-/// \param Diags - The diagnostics engine to use for reporting errors; its
-/// lifetime is expected to extend past that of the returned ASTUnit.
-///
-/// \param ResourceFilesPath - The path to the compiler resource files.
-///
-/// \param StorePreamblesInMemory - Whether to store PCH in memory. If false,
-/// PCH are stored in temporary files.
-///
-/// \param PreambleStoragePath - The path to a directory, in which to create
-/// temporary PCH files. If empty, the default system temporary directory is
-/// used. This parameter is ignored if \p StorePreamblesInMemory is true.
-///
-/// \param ModuleFormat - If provided, uses the specific module format.
-///
-/// \param ErrAST - If non-null and parsing failed without any AST to return
-/// (e.g. because the PCH could not be loaded), this accepts the ASTUnit
-/// mainly to allow the caller to see the diagnostics.
-///
-/// \param VFS - A llvm::vfs::FileSystem to be used for all file accesses.
-/// Note that preamble is saved to a temporary directory on a RealFileSystem,
-/// so in order for it to be loaded correctly, VFS should have access to
-/// it(i.e., be an overlay over RealFileSystem). RealFileSystem will be used
-/// if \p VFS is nullptr.
-///
-// FIXME: Move OnlyLocalDecls, UseBumpAllocator to setters on the ASTUnit, we
-// shouldn't need to specify them at construction time.
-std::unique_ptr<ASTUnit> clang::CreateASTUnitFromCommandLine(
-    const char **ArgBegin, const char **ArgEnd,
-    std::shared_ptr<PCHContainerOperations> PCHContainerOps,
-    std::shared_ptr<DiagnosticOptions> DiagOpts,
-    IntrusiveRefCntPtr<DiagnosticsEngine> Diags, StringRef ResourceFilesPath,
-    bool StorePreamblesInMemory, StringRef PreambleStoragePath,
-    bool OnlyLocalDecls, CaptureDiagsKind CaptureDiagnostics,
-    ArrayRef<ASTUnit::RemappedFile> RemappedFiles,
-    bool RemappedFilesKeepOriginalName, unsigned PrecompilePreambleAfterNParses,
-    TranslationUnitKind TUKind, bool CacheCodeCompletionResults,
-    bool IncludeBriefCommentsInCodeCompletion, bool AllowPCHWithCompilerErrors,
-    SkipFunctionBodiesScope SkipFunctionBodies, bool SingleFileParse,
-    bool UserFilesAreVolatile, bool ForSerialization,
-    bool RetainExcludedConditionalBlocks, std::optional<StringRef> ModuleFormat,
-    std::unique_ptr<ASTUnit> *ErrAST,
-    IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS) {
-  assert(Diags.get() && "no DiagnosticsEngine was provided");
-
-  // If no VFS was provided, create one that tracks the physical file system.
-  // If '-working-directory' was passed as an argument, 'createInvocation' will
-  // set this as the current working directory of the VFS.
-  if (!VFS)
-    VFS = llvm::vfs::createPhysicalFileSystem();
-
-  SmallVector<StoredDiagnostic, 4> StoredDiagnostics;
-
-  std::shared_ptr<CompilerInvocation> CI;
-
-  {
-    CaptureDroppedDiagnostics Capture(CaptureDiagnostics, *Diags,
-                                      &StoredDiagnostics, nullptr);
-
-    CreateInvocationOptions CIOpts;
-    CIOpts.VFS = VFS;
-    CIOpts.Diags = Diags;
-    CIOpts.ProbePrecompiled = true; // FIXME: historical default. Needed?
-    CI = createInvocation(llvm::ArrayRef(ArgBegin, ArgEnd), std::move(CIOpts));
-    if (!CI)
-      return nullptr;
-  }
-
-  // Override any files that need remapping
-  for (const auto &RemappedFile : RemappedFiles) {
-    CI->getPreprocessorOpts().addRemappedFile(RemappedFile.first,
-                                              RemappedFile.second);
-  }
-  PreprocessorOptions &PPOpts = CI->getPreprocessorOpts();
-  PPOpts.RemappedFilesKeepOriginalName = RemappedFilesKeepOriginalName;
-  PPOpts.AllowPCHWithCompilerErrors = AllowPCHWithCompilerErrors;
-  PPOpts.SingleFileParseMode = SingleFileParse;
-  PPOpts.RetainExcludedConditionalBlocks = RetainExcludedConditionalBlocks;
-
-  // Override the resources path.
-  CI->getHeaderSearchOpts().ResourceDir = std::string(ResourceFilesPath);
-
-  CI->getFrontendOpts().SkipFunctionBodies =
-      SkipFunctionBodies == SkipFunctionBodiesScope::PreambleAndMainFile;
-
-  if (ModuleFormat)
-    CI->getHeaderSearchOpts().ModuleFormat = std::string(*ModuleFormat);
-
-  // Create the AST unit.
-  std::unique_ptr<ASTUnit> AST;
-  AST.reset(new ASTUnit(false));
-  AST->NumStoredDiagnosticsFromDriver = StoredDiagnostics.size();
-  AST->StoredDiagnostics.swap(StoredDiagnostics);
-  ASTUnit::ConfigureDiags(Diags, *AST, CaptureDiagnostics);
-  AST->DiagOpts = DiagOpts;
-  AST->Diagnostics = Diags;
-  AST->FileSystemOpts = CI->getFileSystemOpts();
-  AST->CodeGenOpts = std::make_unique<CodeGenOptions>(CI->getCodeGenOpts());
-  VFS = createVFSFromCompilerInvocation(*CI, *Diags, VFS);
-  AST->FileMgr =
-      llvm::makeIntrusiveRefCnt<FileManager>(AST->FileSystemOpts, VFS);
-  AST->StorePreamblesInMemory = StorePreamblesInMemory;
-  AST->PreambleStoragePath = PreambleStoragePath;
-  AST->ModCache = createCrossProcessModuleCache();
-  AST->OnlyLocalDecls = OnlyLocalDecls;
-  AST->CaptureDiagnostics = CaptureDiagnostics;
-  AST->TUKind = TUKind;
-  AST->ShouldCacheCodeCompletionResults = CacheCodeCompletionResults;
-  AST->IncludeBriefCommentsInCodeCompletion =
-      IncludeBriefCommentsInCodeCompletion;
-  AST->UserFilesAreVolatile = UserFilesAreVolatile;
-  AST->Invocation = CI;
-  AST->SkipFunctionBodies = SkipFunctionBodies;
-  if (ForSerialization)
-    AST->WriterData.reset(
-        new ASTUnit::ASTWriterData(*AST->ModCache, *AST->CodeGenOpts));
-  // Zero out now to ease cleanup during crash recovery.
-  CI = nullptr;
-  Diags = nullptr;
-
-  // Recover resources if we crash before exiting this method.
-  llvm::CrashRecoveryContextCleanupRegistrar<ASTUnit> ASTUnitCleanup(AST.get());
-
-  if (AST->LoadFromCompilerInvocation(std::move(PCHContainerOps),
-                                      PrecompilePreambleAfterNParses, VFS)) {
-    // Some error occurred, if caller wants to examine diagnostics, pass it the
-    // ASTUnit.
-    if (ErrAST) {
-      AST->StoredDiagnostics.swap(AST->FailedParseDiagnostics);
-      ErrAST->swap(AST);
-    }
-    return nullptr;
-  }
-
-  return AST;
-}
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index ee9c39c479eba..2a6648b13e4e5 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -67,8 +67,8 @@
 #include "clang/Driver/Types.h"
 #include "clang/Driver/Util.h"
 #include "clang/Lex/DependencyDirectivesScanner.h"
-#include "clang/Options/OptionUtils.h"
 #include "clang/Options/Options.h"
+#include "clang/Options/OptionUtils.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallSet.h"
@@ -127,6 +127,40 @@ template <typename F> static bool usesInput(const ArgList &Args, F &&Fn) {
   });
 }
 
+// static
+std::string Driver::GetResourcesPath(StringRef BinaryPath) {
+  // Since the resource directory is embedded in the module hash, it's important
+  // that all places that need it call this function, so that they get the
+  // exact same string ("a/../b/" and "b/" get different hashes, for example).
+
+  // Dir is bin/ or lib/, depending on where BinaryPath is.
+  StringRef Dir = llvm::sys::path::parent_path(BinaryPath);
+  SmallString<128> P(Dir);
+
+  StringRef ConfiguredResourceDir(CLANG_RESOURCE_DIR);
+  if (!ConfiguredResourceDir.empty()) {
+    // FIXME: We should fix the behavior of llvm::sys::path::append so we don't
+    // need to check for absolute paths here.
+    if (llvm::sys::path::is_absolute(ConfiguredResourceDir))
+      P = ConfiguredResourceDir;
+    else
+      llvm::sys::path::append(P, ConfiguredResourceDir);
+  } else {
+    // On Windows, libclang.dll is in bin/.
+    // On non-Windows, libclang.so/.dylib is in lib/.
+    // With a static-library build of libclang, LibClangPath will contain the
+    // path of the embedding binary, which for LLVM binaries will be in bin/.
+    // ../lib gets us to lib/ in both cases.
+    P = llvm::sys::path::parent_path(Dir);
+    // This search path is also created in the COFF driver of lld, so any
+    // changes here also need to happen in lld/COFF/Driver.cpp
+    llvm::sys::path::append(P, CLANG_INSTALL_LIBDIR_BASENAME, "clang",
+                            CLANG_VERSION_MAJOR_STRING);
+  }
+
+  return std::string(P);
+}
+
 CUIDOptions::CUIDOptions(llvm::opt::DerivedArgList &Args, const Driver &D)
     : UseCUID(Kind::Hash) {
   if (Arg *A = Args.getLastArg(options::OPT_fuse_cuid_EQ)) {
diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp
index 225153587664e..9902cbbf99436 100644
--- a/clang/lib/Driver/SanitizerArgs.cpp
+++ b/clang/lib/Driver/SanitizerArgs.cpp
@@ -423,6 +423,10 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC,
   MinimalRuntime =
       Args.hasFlag(options::OPT_fsanitize_minimal_runtime,
                    options::OPT_fno_sanitize_minimal_runtime, MinimalRuntime);
+  HandlerPreserveAllRegs =
+      Args.hasFlag(options::OPT_fsanitize_handler_preserve_all_regs,
+                   options::OPT_fno_sanitize_handler_preserve_all_regs,
+                   HandlerPreserveAllRegs);
 
   // The object size sanitizer should not be enabled at -O0.
   Arg *OptLevel = Args.getLastArg(options::OPT_O_Group);
@@ -1476,6 +1480,9 @@ void SanitizerArgs::addArgs(const ToolChain &TC, const llvm::opt::ArgList &Args,
   if (MinimalRuntime)
     CmdArgs.push_back("-fsanitize-minimal-runtime");
 
+  if (HandlerPreserveAllRegs)
+    CmdArgs.push_back("-fsanitize-handler-preserve-all-regs");
+
   if (AsanFieldPadding)
     CmdArgs.push_back(Args.MakeArgString("-fsanitize-address-field-padding=" +
                                          Twine(AsanFieldPadding)));
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index f09a48b61a303..952b4fce80e5b 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -33,7 +33,6 @@
 #include "clang/Driver/SanitizerArgs.h"
 #include "clang/Driver/Types.h"
 #include "clang/Driver/XRayArgs.h"
-#include "clang/Options/OptionUtils.h"
 #include "clang/Options/Options.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallSet.h"
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index ae373c60b47a1..d58735294aa62 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -3587,6 +3587,169 @@ void tools::handleInterchangeLoopsArgs(const ArgList &Args,
     CmdArgs.push_back("-floop-interchange");
 }
 
+// Parse -mprefer-vector-width=. Return the Value string if well-formed.
+// Otherwise (option absent, or invalid and diagnosed), return an empty string.
+StringRef tools::parseMPreferVectorWidthOption(clang::DiagnosticsEngine &Diags,
+                                               const llvm::opt::ArgList &Args) {
+  Arg *A = Args.getLastArg(options::OPT_mprefer_vector_width_EQ);
+  if (!A)
+    return "";
+
+  StringRef Value = A->getValue();
+  unsigned Width LLVM_ATTRIBUTE_UNINITIALIZED;
+
+  // Only "none" and base-10 unsigned integer values are accepted by
+  // -mprefer-vector-width=<value>.
+  if (Value != "none" && Value.getAsInteger(10, Width)) {
+    Diags.Report(clang::diag::err_drv_invalid_value)
+        << A->getOption().getName() << Value;
+    return "";
+  }
+
+  return Value;
+}
+
+// This is a helper function for validating the optional refinement step
+// parameter in reciprocal argument strings. Return false (after reporting a
+// diagnostic) if the refinement step is malformed; otherwise return true.
+// Position is set to the index of the ':' separator, or npos if there is none.
+static bool getRefinementStep(StringRef In, clang::DiagnosticsEngine &Diags,
+                              const Arg &A, size_t &Position) {
+  const char RefinementStepToken = ':';
+  Position = In.find(RefinementStepToken);
+  if (Position != StringRef::npos) {
+    StringRef Option = A.getOption().getName();
+    StringRef RefStep = In.substr(Position + 1);
+    // Allow exactly one numeric character for the additional refinement
+    // step parameter. This is reasonable for all currently-supported
+    // operations and architectures because we would expect that a larger value
+    // of refinement steps would cause the estimate "optimization" to
+    // under-perform the native operation. Also, if the estimate does not
+    // converge quickly, it probably will not ever converge, so further
+    // refinement steps will not produce a better answer.
+    if (RefStep.size() != 1) {
+      Diags.Report(diag::err_drv_invalid_value) << Option << RefStep;
+      return false;
+    }
+    char RefStepChar = RefStep[0];
+    if (RefStepChar < '0' || RefStepChar > '9') {
+      Diags.Report(diag::err_drv_invalid_value) << Option << RefStep;
+      return false;
+    }
+  }
+  return true;
+}
+
+// Parse -mrecip. Return the Value string if well-formed.
+// Otherwise, return an empty string and issue a diagnostic message if needed.
+StringRef tools::parseMRecipOption(clang::DiagnosticsEngine &Diags,
+                                   const ArgList &Args) {
+  StringRef DisabledPrefixIn = "!";
+  StringRef DisabledPrefixOut = "!";
+  StringRef EnabledPrefixOut = "";
+  StringRef Out = "";
+
+  Arg *A = Args.getLastArg(options::OPT_mrecip, options::OPT_mrecip_EQ);
+  if (!A)
+    return "";
+
+  unsigned NumOptions = A->getNumValues();
+  if (NumOptions == 0) {
+    // No option is the same as "all".
+    return "all";
+  }
+
+  // Pass through "all", "none", or "default" with an optional refinement step.
+  if (NumOptions == 1) {
+    StringRef Val = A->getValue(0);
+    size_t RefStepLoc;
+    if (!getRefinementStep(Val, Diags, *A, RefStepLoc))
+      return "";
+    StringRef ValBase = Val.slice(0, RefStepLoc);
+    if (ValBase == "all" || ValBase == "none" || ValBase == "default") {
+      return Val;
+    }
+  }
+
+  // Each reciprocal type may be enabled or disabled individually.
+  // Check each input value for validity, concatenate them all back together,
+  // and pass through.
+
+  llvm::StringMap<bool> OptionStrings;
+  OptionStrings.insert(std::make_pair("divd", false));
+  OptionStrings.insert(std::make_pair("divf", false));
+  OptionStrings.insert(std::make_pair("divh", false));
+  OptionStrings.insert(std::make_pair("vec-divd", false));
+  OptionStrings.insert(std::make_pair("vec-divf", false));
+  OptionStrings.insert(std::make_pair("vec-divh", false));
+  OptionStrings.insert(std::make_pair("sqrtd", false));
+  OptionStrings.insert(std::make_pair("sqrtf", false));
+  OptionStrings.insert(std::make_pair("sqrth", false));
+  OptionStrings.insert(std::make_pair("vec-sqrtd", false));
+  OptionStrings.insert(std::make_pair("vec-sqrtf", false));
+  OptionStrings.insert(std::make_pair("vec-sqrth", false));
+
+  for (unsigned i = 0; i != NumOptions; ++i) {
+    StringRef Val = A->getValue(i);
+
+    bool IsDisabled = Val.starts_with(DisabledPrefixIn);
+    // Ignore the disablement token for string matching.
+    if (IsDisabled)
+      Val = Val.substr(1);
+
+    size_t RefStep;
+    if (!getRefinementStep(Val, Diags, *A, RefStep))
+      return "";
+
+    StringRef ValBase = Val.slice(0, RefStep);
+    llvm::StringMap<bool>::iterator OptionIter = OptionStrings.find(ValBase);
+    if (OptionIter == OptionStrings.end()) {
+      // Try again specifying float suffix.
+      OptionIter = OptionStrings.find(ValBase.str() + 'f');
+      if (OptionIter == OptionStrings.end()) {
+        // The input name did not match any known option string.
+        Diags.Report(diag::err_drv_unknown_argument) << Val;
+        return "";
+      }
+      // The option was specified without a half or float or double suffix.
+      // Make sure that the double or half entry was not already specified.
+      // The float entry will be checked below.
+      if (OptionStrings[ValBase.str() + 'd'] ||
+          OptionStrings[ValBase.str() + 'h']) {
+        Diags.Report(diag::err_drv_invalid_value)
+            << A->getOption().getName() << Val;
+        return "";
+      }
+    }
+
+    if (OptionIter->second == true) {
+      // Duplicate option specified.
+      Diags.Report(diag::err_drv_invalid_value)
+          << A->getOption().getName() << Val;
+      return "";
+    }
+
+    // Mark the matched option as found. Do not allow duplicate specifiers.
+    OptionIter->second = true;
+
+    // If the precision was not specified, also mark the double and half entry
+    // as found.
+    if (ValBase.back() != 'f' && ValBase.back() != 'd' &&
+        ValBase.back() != 'h') {
+      OptionStrings[ValBase.str() + 'd'] = true;
+      OptionStrings[ValBase.str() + 'h'] = true;
+    }
+
+    // Build the output string.
+    StringRef Prefix = IsDisabled ? DisabledPrefixOut : EnabledPrefixOut;
+    Out = Args.MakeArgString(Out + Prefix + Val);
+    if (i != NumOptions - 1)
+      Out = Args.MakeArgString(Out + ",");
+  }
+
+  return Out;
+}
+
 std::string tools::complexRangeKindToStr(LangOptions::ComplexRangeKind Range) {
   switch (Range) {
   case LangOptions::ComplexRangeKind::CX_Full:
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index b7a35d6ab5195..413386e8288c9 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -11,7 +11,6 @@
 
 #include "clang/Basic/CodeGenOptions.h"
 #include "clang/Driver/CommonArgs.h"
-#include "clang/Options/OptionUtils.h"
 #include "clang/Options/Options.h"
 #include "llvm/Frontend/Debug/Options.h"
 #include "llvm/Support/Path.h"
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index 200ee13901f4b..79cfa73001e54 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -708,6 +708,11 @@ class AnnotatingParser {
         IsCpp && !IsCpp11AttributeSpecifier && !IsCSharpAttributeSpecifier &&
         Contexts.back().CanBeExpression && Left->isNot(TT_LambdaLSquare) &&
         CurrentToken->isNoneOf(tok::l_brace, tok::r_square) &&
+        // Do not consider '[' after a comma inside a braced initializer the
+        // start of an ObjC method expression. In braced initializer lists,
+        // commas are list separators and should not trigger ObjC parsing.
+        (!Parent || !Parent->is(tok::comma) ||
+         Contexts.back().ContextKind != tok::l_brace) &&
         (!Parent ||
          Parent->isOneOf(tok::colon, tok::l_square, tok::l_paren,
                          tok::kw_return, tok::kw_throw) ||
diff --git a/clang/lib/Frontend/ASTUnit.cpp b/clang/lib/Frontend/ASTUnit.cpp
index c7357bcd9e367..4b4d5785c21a0 100644
--- a/clang/lib/Frontend/ASTUnit.cpp
+++ b/clang/lib/Frontend/ASTUnit.cpp
@@ -44,7 +44,6 @@
 #include "clang/Frontend/FrontendOptions.h"
 #include "clang/Frontend/MultiplexConsumer.h"
 #include "clang/Frontend/PrecompiledPreamble.h"
-#include "clang/Frontend/StandaloneDiagnostic.h"
 #include "clang/Frontend/Utils.h"
 #include "clang/Lex/HeaderSearch.h"
 #include "clang/Lex/HeaderSearchOptions.h"
@@ -211,6 +210,15 @@ getBufferForFileHandlingRemapping(const CompilerInvocation &Invocation,
   return llvm::MemoryBuffer::getMemBufferCopy(Buffer->getBuffer(), FilePath);
 }
 
+struct ASTUnit::ASTWriterData {
+  SmallString<128> Buffer;
+  llvm::BitstreamWriter Stream;
+  ASTWriter Writer;
+
+  ASTWriterData(ModuleCache &ModCache, const CodeGenOptions &CGOpts)
+      : Stream(Buffer), Writer(Stream, Buffer, ModCache, CGOpts, {}) {}
+};
+
 void ASTUnit::clearFileLevelDecls() {
   FileDecls.clear();
 }
@@ -573,24 +581,73 @@ class ASTInfoCollector : public ASTReaderListener {
     Counter = NewCounter;
   }
 };
-} // anonymous namespace
 
-FilterAndStoreDiagnosticConsumer::FilterAndStoreDiagnosticConsumer(
-    SmallVectorImpl<StoredDiagnostic> *StoredDiags,
-    SmallVectorImpl<StandaloneDiagnostic> *StandaloneDiags,
-    bool CaptureNonErrorsFromIncludes)
-    : StoredDiags(StoredDiags), StandaloneDiags(StandaloneDiags),
-      CaptureNonErrorsFromIncludes(CaptureNonErrorsFromIncludes) {
-  assert((StoredDiags || StandaloneDiags) &&
-         "No output collections were passed to StoredDiagnosticConsumer.");
-}
+/// Diagnostic consumer that saves each diagnostic it is given.
+class FilterAndStoreDiagnosticConsumer : public DiagnosticConsumer {
+  SmallVectorImpl<StoredDiagnostic> *StoredDiags;
+  SmallVectorImpl<ASTUnit::StandaloneDiagnostic> *StandaloneDiags;
+  bool CaptureNonErrorsFromIncludes = true;
+  const LangOptions *LangOpts = nullptr;
+  SourceManager *SourceMgr = nullptr;
 
-void FilterAndStoreDiagnosticConsumer::BeginSourceFile(
-    const LangOptions &LangOpts, const Preprocessor *PP) {
-  this->LangOpts = &LangOpts;
-  if (PP)
-    SourceMgr = &PP->getSourceManager();
-}
+public:
+  FilterAndStoreDiagnosticConsumer(
+      SmallVectorImpl<StoredDiagnostic> *StoredDiags,
+      SmallVectorImpl<ASTUnit::StandaloneDiagnostic> *StandaloneDiags,
+      bool CaptureNonErrorsFromIncludes)
+      : StoredDiags(StoredDiags), StandaloneDiags(StandaloneDiags),
+        CaptureNonErrorsFromIncludes(CaptureNonErrorsFromIncludes) {
+    assert((StoredDiags || StandaloneDiags) &&
+           "No output collections were passed to StoredDiagnosticConsumer.");
+  }
+
+  void BeginSourceFile(const LangOptions &LangOpts,
+                       const Preprocessor *PP = nullptr) override {
+    this->LangOpts = &LangOpts;
+    if (PP)
+      SourceMgr = &PP->getSourceManager();
+  }
+
+  void HandleDiagnostic(DiagnosticsEngine::Level Level,
+                        const Diagnostic &Info) override;
+};
+
+/// RAII object that redirects diagnostics to a filtering, storing client when
+/// capture is requested or no client is installed; undone on destruction.
+class CaptureDroppedDiagnostics {
+  DiagnosticsEngine &Diags;
+  FilterAndStoreDiagnosticConsumer Client;
+  DiagnosticConsumer *PreviousClient = nullptr;
+  std::unique_ptr<DiagnosticConsumer> OwningPreviousClient;
+
+public:
+  CaptureDroppedDiagnostics(
+      CaptureDiagsKind CaptureDiagnostics, DiagnosticsEngine &Diags,
+      SmallVectorImpl<StoredDiagnostic> *StoredDiags,
+      SmallVectorImpl<ASTUnit::StandaloneDiagnostic> *StandaloneDiags)
+      : Diags(Diags),
+        Client(StoredDiags, StandaloneDiags,
+               CaptureDiagnostics !=
+                   CaptureDiagsKind::AllWithoutNonErrorsFromIncludes) {
+    if (CaptureDiagnostics != CaptureDiagsKind::None ||
+        Diags.getClient() == nullptr) {
+      OwningPreviousClient = Diags.takeClient();
+      PreviousClient = Diags.getClient();
+      Diags.setClient(&Client, false);
+    }
+  }
+
+  ~CaptureDroppedDiagnostics() {
+    if (Diags.getClient() == &Client)
+      Diags.setClient(PreviousClient, !!OwningPreviousClient.release());
+  }
+};
+
+} // namespace
+
+static ASTUnit::StandaloneDiagnostic
+makeStandaloneDiagnostic(const LangOptions &LangOpts,
+                         const StoredDiagnostic &InDiag);
 
 static bool isInMainFile(const clang::Diagnostic &D) {
   if (!D.hasSourceManager() || !D.getLocation().isValid())
@@ -626,32 +683,12 @@ void FilterAndStoreDiagnosticConsumer::HandleDiagnostic(
         StoredDiag.emplace(Level, Info);
         ResultDiag = &*StoredDiag;
       }
-      StandaloneDiags->emplace_back(*LangOpts, *ResultDiag);
+      StandaloneDiags->push_back(
+          makeStandaloneDiagnostic(*LangOpts, *ResultDiag));
     }
   }
 }
 
-CaptureDroppedDiagnostics::CaptureDroppedDiagnostics(
-    CaptureDiagsKind CaptureDiagnostics, DiagnosticsEngine &Diags,
-    SmallVectorImpl<StoredDiagnostic> *StoredDiags,
-    SmallVectorImpl<StandaloneDiagnostic> *StandaloneDiags)
-    : Diags(Diags),
-      Client(StoredDiags, StandaloneDiags,
-             CaptureDiagnostics !=
-                 CaptureDiagsKind::AllWithoutNonErrorsFromIncludes) {
-  if (CaptureDiagnostics != CaptureDiagsKind::None ||
-      Diags.getClient() == nullptr) {
-    OwningPreviousClient = Diags.takeClient();
-    PreviousClient = Diags.getClient();
-    Diags.setClient(&Client, false);
-  }
-}
-
-CaptureDroppedDiagnostics::~CaptureDroppedDiagnostics() {
-  if (Diags.getClient() == &Client)
-    Diags.setClient(PreviousClient, !!OwningPreviousClient.release());
-}
-
 IntrusiveRefCntPtr<ASTReader> ASTUnit::getASTReader() const {
   return Reader;
 }
@@ -1073,7 +1110,7 @@ class ASTUnitPreambleCallbacks : public PreambleCallbacks {
   unsigned Hash = 0;
   std::vector<Decl *> TopLevelDecls;
   std::vector<LocalDeclID> TopLevelDeclIDs;
-  llvm::SmallVector<StandaloneDiagnostic, 4> PreambleDiags;
+  llvm::SmallVector<ASTUnit::StandaloneDiagnostic, 4> PreambleDiags;
 };
 
 } // namespace
@@ -1222,17 +1259,10 @@ bool ASTUnit::Parse(std::shared_ptr<PCHContainerOperations> PCHContainerOps,
   if (!Act->BeginSourceFile(*Clang, Clang->getFrontendOpts().Inputs[0]))
     return true;
 
-  if (SavedMainFileBuffer) {
-    StoredDiagnostics.clear();
-    StoredDiagnostics.reserve(PreambleDiagnostics.size());
-    llvm::transform(std::move(PreambleDiagnostics),
-                    std::back_inserter(StoredDiagnostics),
-                    [&](auto &&StandaloneDiag) {
-                      return translateStandaloneDiag(
-                          getFileManager(), getSourceManager(),
-                          std::move(StandaloneDiag), PreambleSrcLocCache);
-                    });
-  } else
+  if (SavedMainFileBuffer)
+    TranslateStoredDiagnostics(getFileManager(), getSourceManager(),
+                               PreambleDiagnostics, StoredDiagnostics);
+  else
     PreambleSrcLocCache.clear();
 
   if (llvm::Error Err = Act->Execute()) {
@@ -1251,6 +1281,51 @@ bool ASTUnit::Parse(std::shared_ptr<PCHContainerOperations> PCHContainerOps,
   return false;
 }
 
+static std::pair<unsigned, unsigned>
+makeStandaloneRange(CharSourceRange Range, const SourceManager &SM,
+                    const LangOptions &LangOpts) {
+  CharSourceRange FileRange = Lexer::makeFileCharRange(Range, SM, LangOpts);
+  unsigned Offset = SM.getFileOffset(FileRange.getBegin());
+  unsigned EndOffset = SM.getFileOffset(FileRange.getEnd());
+  return std::make_pair(Offset, EndOffset);
+}
+
+static ASTUnit::StandaloneFixIt makeStandaloneFixIt(const SourceManager &SM,
+                                                    const LangOptions &LangOpts,
+                                                    const FixItHint &InFix) {
+  ASTUnit::StandaloneFixIt OutFix;
+  OutFix.RemoveRange = makeStandaloneRange(InFix.RemoveRange, SM, LangOpts);
+  OutFix.InsertFromRange =
+      makeStandaloneRange(InFix.InsertFromRange, SM, LangOpts);
+  OutFix.CodeToInsert = InFix.CodeToInsert;
+  OutFix.BeforePreviousInsertions = InFix.BeforePreviousInsertions;
+  return OutFix;
+}
+
+static ASTUnit::StandaloneDiagnostic
+makeStandaloneDiagnostic(const LangOptions &LangOpts,
+                         const StoredDiagnostic &InDiag) {
+  ASTUnit::StandaloneDiagnostic OutDiag;
+  OutDiag.ID = InDiag.getID();
+  OutDiag.Level = InDiag.getLevel();
+  OutDiag.Message = std::string(InDiag.getMessage());
+  OutDiag.LocOffset = 0;
+  if (InDiag.getLocation().isInvalid())
+    return OutDiag;
+  const SourceManager &SM = InDiag.getLocation().getManager();
+  SourceLocation FileLoc = SM.getFileLoc(InDiag.getLocation());
+  OutDiag.Filename = std::string(SM.getFilename(FileLoc));
+  if (OutDiag.Filename.empty())
+    return OutDiag;
+  OutDiag.LocOffset = SM.getFileOffset(FileLoc);
+  for (const auto &Range : InDiag.getRanges())
+    OutDiag.Ranges.push_back(makeStandaloneRange(Range, SM, LangOpts));
+  for (const auto &FixIt : InDiag.getFixIts())
+    OutDiag.FixIts.push_back(makeStandaloneFixIt(SM, LangOpts, FixIt));
+
+  return OutDiag;
+}
+
 /// Attempt to build or re-use a precompiled preamble when (re-)parsing
 /// the source file.
 ///
@@ -1705,6 +1780,114 @@ std::unique_ptr<ASTUnit> ASTUnit::LoadFromCompilerInvocation(
   return AST;
 }
 
+std::unique_ptr<ASTUnit> ASTUnit::LoadFromCommandLine(
+    const char **ArgBegin, const char **ArgEnd,
+    std::shared_ptr<PCHContainerOperations> PCHContainerOps,
+    std::shared_ptr<DiagnosticOptions> DiagOpts,
+    IntrusiveRefCntPtr<DiagnosticsEngine> Diags, StringRef ResourceFilesPath,
+    bool StorePreamblesInMemory, StringRef PreambleStoragePath,
+    bool OnlyLocalDecls, CaptureDiagsKind CaptureDiagnostics,
+    ArrayRef<RemappedFile> RemappedFiles, bool RemappedFilesKeepOriginalName,
+    unsigned PrecompilePreambleAfterNParses, TranslationUnitKind TUKind,
+    bool CacheCodeCompletionResults, bool IncludeBriefCommentsInCodeCompletion,
+    bool AllowPCHWithCompilerErrors, SkipFunctionBodiesScope SkipFunctionBodies,
+    bool SingleFileParse, bool UserFilesAreVolatile, bool ForSerialization,
+    bool RetainExcludedConditionalBlocks, std::optional<StringRef> ModuleFormat,
+    std::unique_ptr<ASTUnit> *ErrAST,
+    IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS) {
+  assert(Diags.get() && "no DiagnosticsEngine was provided");
+
+  // If no VFS was provided, create one that tracks the physical file system.
+  // If '-working-directory' was passed as an argument, 'createInvocation' will
+  // set this as the current working directory of the VFS.
+  if (!VFS)
+    VFS = llvm::vfs::createPhysicalFileSystem();
+
+  SmallVector<StoredDiagnostic, 4> StoredDiagnostics;
+
+  std::shared_ptr<CompilerInvocation> CI;
+
+  {
+    CaptureDroppedDiagnostics Capture(CaptureDiagnostics, *Diags,
+                                      &StoredDiagnostics, nullptr);
+
+    CreateInvocationOptions CIOpts;
+    CIOpts.VFS = VFS;
+    CIOpts.Diags = Diags;
+    CIOpts.ProbePrecompiled = true; // FIXME: historical default. Needed?
+    CI = createInvocation(llvm::ArrayRef(ArgBegin, ArgEnd), std::move(CIOpts));
+    if (!CI)
+      return nullptr;
+  }
+
+  // Override any files that need remapping
+  for (const auto &RemappedFile : RemappedFiles) {
+    CI->getPreprocessorOpts().addRemappedFile(RemappedFile.first,
+                                              RemappedFile.second);
+  }
+  PreprocessorOptions &PPOpts = CI->getPreprocessorOpts();
+  PPOpts.RemappedFilesKeepOriginalName = RemappedFilesKeepOriginalName;
+  PPOpts.AllowPCHWithCompilerErrors = AllowPCHWithCompilerErrors;
+  PPOpts.SingleFileParseMode = SingleFileParse;
+  PPOpts.RetainExcludedConditionalBlocks = RetainExcludedConditionalBlocks;
+
+  // Override the resources path.
+  CI->getHeaderSearchOpts().ResourceDir = std::string(ResourceFilesPath);
+
+  CI->getFrontendOpts().SkipFunctionBodies =
+      SkipFunctionBodies == SkipFunctionBodiesScope::PreambleAndMainFile;
+
+  if (ModuleFormat)
+    CI->getHeaderSearchOpts().ModuleFormat = std::string(*ModuleFormat);
+
+  // Create the AST unit.
+  std::unique_ptr<ASTUnit> AST;
+  AST.reset(new ASTUnit(false));
+  AST->NumStoredDiagnosticsFromDriver = StoredDiagnostics.size();
+  AST->StoredDiagnostics.swap(StoredDiagnostics);
+  ConfigureDiags(Diags, *AST, CaptureDiagnostics);
+  AST->DiagOpts = DiagOpts;
+  AST->Diagnostics = Diags;
+  AST->FileSystemOpts = CI->getFileSystemOpts();
+  AST->CodeGenOpts = std::make_unique<CodeGenOptions>(CI->getCodeGenOpts());
+  VFS = createVFSFromCompilerInvocation(*CI, *Diags, VFS);
+  AST->FileMgr =
+      llvm::makeIntrusiveRefCnt<FileManager>(AST->FileSystemOpts, VFS);
+  AST->StorePreamblesInMemory = StorePreamblesInMemory;
+  AST->PreambleStoragePath = PreambleStoragePath;
+  AST->ModCache = createCrossProcessModuleCache();
+  AST->OnlyLocalDecls = OnlyLocalDecls;
+  AST->CaptureDiagnostics = CaptureDiagnostics;
+  AST->TUKind = TUKind;
+  AST->ShouldCacheCodeCompletionResults = CacheCodeCompletionResults;
+  AST->IncludeBriefCommentsInCodeCompletion =
+      IncludeBriefCommentsInCodeCompletion;
+  AST->UserFilesAreVolatile = UserFilesAreVolatile;
+  AST->Invocation = CI;
+  AST->SkipFunctionBodies = SkipFunctionBodies;
+  if (ForSerialization)
+    AST->WriterData.reset(new ASTWriterData(*AST->ModCache, *AST->CodeGenOpts));
+  // Zero out now to ease cleanup during crash recovery.
+  CI = nullptr;
+  Diags = nullptr;
+
+  // Recover resources if we crash before exiting this method.
+  llvm::CrashRecoveryContextCleanupRegistrar<ASTUnit> ASTUnitCleanup(AST.get());
+
+  if (AST->LoadFromCompilerInvocation(std::move(PCHContainerOps),
+                                      PrecompilePreambleAfterNParses, VFS)) {
+    // Some error occurred; if the caller wants to examine diagnostics, pass it
+    // the ASTUnit.
+    if (ErrAST) {
+      AST->StoredDiagnostics.swap(AST->FailedParseDiagnostics);
+      ErrAST->swap(AST);
+    }
+    return nullptr;
+  }
+
+  return AST;
+}
+
 bool ASTUnit::Reparse(std::shared_ptr<PCHContainerOperations> PCHContainerOps,
                       ArrayRef<RemappedFile> RemappedFiles,
                       IntrusiveRefCntPtr<llvm::vfs::FileSystem> VFS) {
@@ -2223,6 +2406,64 @@ bool ASTUnit::serialize(raw_ostream &OS) {
   return serializeUnit(Writer, Buffer, getSema(), OS);
 }
 
+void ASTUnit::TranslateStoredDiagnostics(
+    FileManager &FileMgr, SourceManager &SrcMgr,
+    const SmallVectorImpl<StandaloneDiagnostic> &Diags,
+    SmallVectorImpl<StoredDiagnostic> &Out) {
+  // Map the standalone diagnostic into the new source manager. We also need to
+  // remap all the locations to the new view. This includes the diag location,
+  // any associated source ranges, and the source ranges of associated fix-its.
+  // FIXME: There should be a cleaner way to do this.
+  SmallVector<StoredDiagnostic, 4> Result;
+  Result.reserve(Diags.size());
+
+  for (const auto &SD : Diags) {
+    // Rebuild the StoredDiagnostic.
+    if (SD.Filename.empty())
+      continue;
+    auto FE = FileMgr.getOptionalFileRef(SD.Filename);
+    if (!FE)
+      continue;
+    SourceLocation FileLoc;
+    auto ItFileID = PreambleSrcLocCache.find(SD.Filename);
+    if (ItFileID == PreambleSrcLocCache.end()) {
+      FileID FID = SrcMgr.translateFile(*FE);
+      FileLoc = SrcMgr.getLocForStartOfFile(FID);
+      PreambleSrcLocCache[SD.Filename] = FileLoc;
+    } else {
+      FileLoc = ItFileID->getValue();
+    }
+
+    if (FileLoc.isInvalid())
+      continue;
+    SourceLocation L = FileLoc.getLocWithOffset(SD.LocOffset);
+    FullSourceLoc Loc(L, SrcMgr);
+
+    SmallVector<CharSourceRange, 4> Ranges;
+    Ranges.reserve(SD.Ranges.size());
+    for (const auto &Range : SD.Ranges) {
+      SourceLocation BL = FileLoc.getLocWithOffset(Range.first);
+      SourceLocation EL = FileLoc.getLocWithOffset(Range.second);
+      Ranges.push_back(CharSourceRange::getCharRange(BL, EL));
+    }
+
+    SmallVector<FixItHint, 2> FixIts;
+    FixIts.reserve(SD.FixIts.size());
+    for (const auto &FixIt : SD.FixIts) {
+      FixIts.push_back(FixItHint());
+      FixItHint &FH = FixIts.back();
+      FH.CodeToInsert = FixIt.CodeToInsert;
+      SourceLocation BL = FileLoc.getLocWithOffset(FixIt.RemoveRange.first);
+      SourceLocation EL = FileLoc.getLocWithOffset(FixIt.RemoveRange.second);
+      FH.RemoveRange = CharSourceRange::getCharRange(BL, EL);
+    }
+
+    Result.push_back(
+        StoredDiagnostic(SD.Level, SD.ID, SD.Message, Loc, Ranges, FixIts));
+  }
+  Result.swap(Out);
+}
+
 void ASTUnit::addFileLevelDecl(Decl *D) {
   assert(D);
 
diff --git a/clang/lib/Frontend/CMakeLists.txt b/clang/lib/Frontend/CMakeLists.txt
index 634f239933605..dac9e0d26f393 100644
--- a/clang/lib/Frontend/CMakeLists.txt
+++ b/clang/lib/Frontend/CMakeLists.txt
@@ -17,6 +17,7 @@ add_clang_library(clangFrontend
   ChainedIncludesSource.cpp
   CompilerInstance.cpp
   CompilerInvocation.cpp
+  CreateInvocationFromCommandLine.cpp
   DependencyFile.cpp
   DependencyGraph.cpp
   DiagnosticRenderer.cpp
@@ -35,7 +36,6 @@ add_clang_library(clangFrontend
   SARIFDiagnosticPrinter.cpp
   SerializedDiagnosticPrinter.cpp
   SerializedDiagnosticReader.cpp
-  StandaloneDiagnostic.cpp
   TestModuleFileExtension.cpp
   TextDiagnostic.cpp
   TextDiagnosticBuffer.cpp
@@ -51,6 +51,7 @@ add_clang_library(clangFrontend
   clangAPINotes
   clangAST
   clangBasic
+  clangDriver
   clangOptions
   clangEdit
   clangLex
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 9d7c851bead3e..fd9d78d9ae689 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -27,6 +27,7 @@
 #include "clang/Basic/Version.h"
 #include "clang/Basic/XRayInstr.h"
 #include "clang/Config/config.h"
+#include "clang/Driver/Driver.h"
 #include "clang/Frontend/CommandLineSourceLoc.h"
 #include "clang/Frontend/DependencyOutputOptions.h"
 #include "clang/Frontend/FrontendOptions.h"
@@ -3276,6 +3277,13 @@ static bool ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args,
   return Diags.getNumErrors() == NumErrorsBefore;
 }
 
+std::string CompilerInvocation::GetResourcesPath(const char *Argv0,
+                                                 void *MainAddr) {
+  std::string ClangExecutable =
+      llvm::sys::fs::getMainExecutable(Argv0, MainAddr);
+  return driver::Driver::GetResourcesPath(ClangExecutable);
+}
+
 static void GenerateHeaderSearchArgs(const HeaderSearchOptions &Opts,
                                      ArgumentConsumer Consumer) {
   const HeaderSearchOptions *HeaderSearchOpts = &Opts;
diff --git a/clang/lib/Driver/CreateInvocationFromArgs.cpp b/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp
similarity index 93%
rename from clang/lib/Driver/CreateInvocationFromArgs.cpp
rename to clang/lib/Frontend/CreateInvocationFromCommandLine.cpp
index 516d61f1a1159..e54e83151ad1e 100644
--- a/clang/lib/Driver/CreateInvocationFromArgs.cpp
+++ b/clang/lib/Frontend/CreateInvocationFromCommandLine.cpp
@@ -1,4 +1,4 @@
-//===--- CreateInvocationFromArgs.h - CompilerInvocation from Args --------===//
+//===--- CreateInvocationFromCommandLine.cpp - CompilerInvocation from Args ==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -10,9 +10,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Driver/CreateInvocationFromArgs.h"
 #include "clang/Basic/DiagnosticFrontend.h"
 #include "clang/Basic/DiagnosticOptions.h"
+#include "clang/Driver/Action.h"
 #include "clang/Driver/Compilation.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/Tool.h"
@@ -24,13 +24,12 @@
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/VirtualFileSystem.h"
 #include "llvm/TargetParser/Host.h"
-
+using namespace clang;
 using namespace llvm::opt;
 
-namespace clang {
-
 std::unique_ptr<CompilerInvocation>
-createInvocation(ArrayRef<const char *> ArgList, CreateInvocationOptions Opts) {
+clang::createInvocation(ArrayRef<const char *> ArgList,
+                        CreateInvocationOptions Opts) {
   assert(!ArgList.empty());
   std::optional<DiagnosticOptions> LocalDiagOpts;
   IntrusiveRefCntPtr<DiagnosticsEngine> Diags;
@@ -115,5 +114,3 @@ createInvocation(ArrayRef<const char *> ArgList, CreateInvocationOptions Opts) {
     return nullptr;
   return CI;
 }
-
-} // namespace clang
diff --git a/clang/lib/Frontend/StandaloneDiagnostic.cpp b/clang/lib/Frontend/StandaloneDiagnostic.cpp
deleted file mode 100644
index 4f19c91b7d266..0000000000000
--- a/clang/lib/Frontend/StandaloneDiagnostic.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-//===--- StandaloneDiagnostic.h - Serializable Diagnostic ------------- ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang/Frontend/StandaloneDiagnostic.h"
-#include "clang/Lex/Lexer.h"
-
-namespace clang {
-
-StandaloneDiagnostic::SourceOffsetRange::SourceOffsetRange(
-    CharSourceRange Range, const SourceManager &SrcMgr,
-    const LangOptions &LangOpts) {
-  const auto FileRange = Lexer::makeFileCharRange(Range, SrcMgr, LangOpts);
-  Begin = SrcMgr.getFileOffset(FileRange.getBegin());
-  End = SrcMgr.getFileOffset(FileRange.getEnd());
-}
-
-StandaloneDiagnostic::StandaloneFixIt::StandaloneFixIt(
-    const SourceManager &SrcMgr, const LangOptions &LangOpts,
-    const FixItHint &FixIt)
-    : RemoveRange(FixIt.RemoveRange, SrcMgr, LangOpts),
-      InsertFromRange(FixIt.InsertFromRange, SrcMgr, LangOpts),
-      CodeToInsert(FixIt.CodeToInsert),
-      BeforePreviousInsertions(FixIt.BeforePreviousInsertions) {}
-
-StandaloneDiagnostic::StandaloneDiagnostic(const LangOptions &LangOpts,
-                                           const StoredDiagnostic &InDiag)
-    : Level(InDiag.getLevel()), ID(InDiag.getID()),
-      Message(InDiag.getMessage()) {
-  const FullSourceLoc &FullLoc = InDiag.getLocation();
-  // This is not an invalid diagnostic; invalid SourceLocations are used to
-  // represent diagnostics without a specific SourceLocation.
-  if (FullLoc.isInvalid())
-    return;
-
-  const auto &SrcMgr = FullLoc.getManager();
-  FileKind = SrcMgr.getFileCharacteristic(static_cast<SourceLocation>(FullLoc));
-  const auto FileLoc = SrcMgr.getFileLoc(static_cast<SourceLocation>(FullLoc));
-  FileOffset = SrcMgr.getFileOffset(FileLoc);
-  Filename = SrcMgr.getFilename(FileLoc);
-  assert(!Filename.empty() && "diagnostic with location has no source file?");
-
-  Ranges.reserve(InDiag.getRanges().size());
-  for (const auto &Range : InDiag.getRanges())
-    Ranges.emplace_back(Range, SrcMgr, LangOpts);
-
-  FixIts.reserve(InDiag.getFixIts().size());
-  for (const auto &FixIt : InDiag.getFixIts())
-    FixIts.emplace_back(SrcMgr, LangOpts, FixIt);
-}
-
-StoredDiagnostic
-translateStandaloneDiag(FileManager &FileMgr, SourceManager &SrcMgr,
-                        const StandaloneDiagnostic &StandaloneDiag,
-                        llvm::StringMap<SourceLocation> &SrcLocCache) {
-  const auto FileRef = FileMgr.getOptionalFileRef(StandaloneDiag.Filename);
-  if (!FileRef)
-    return StoredDiagnostic(StandaloneDiag.Level, StandaloneDiag.ID,
-                            StandaloneDiag.Message);
-
-  // Try to get FileLoc from cache first
-  SourceLocation FileLoc;
-  auto It = SrcLocCache.find(StandaloneDiag.Filename);
-  if (It != SrcLocCache.end()) {
-    FileLoc = It->getValue();
-  }
-
-  // Cache miss - compute and cache the location
-  if (FileLoc.isInvalid()) {
-    const auto FileID =
-        SrcMgr.getOrCreateFileID(*FileRef, StandaloneDiag.FileKind);
-    FileLoc = SrcMgr.getLocForStartOfFile(FileID);
-
-    if (FileLoc.isInvalid())
-      return StoredDiagnostic(StandaloneDiag.Level, StandaloneDiag.ID,
-                              StandaloneDiag.Message);
-
-    SrcLocCache[StandaloneDiag.Filename] = FileLoc;
-  }
-
-  const auto DiagLoc = FileLoc.getLocWithOffset(StandaloneDiag.FileOffset);
-  const FullSourceLoc Loc(DiagLoc, SrcMgr);
-
-  auto ConvertOffsetRange =
-      [&](const StandaloneDiagnostic::SourceOffsetRange &Range) {
-        return CharSourceRange(
-            SourceRange(FileLoc.getLocWithOffset(Range.Begin),
-                        FileLoc.getLocWithOffset(Range.End)),
-            /*IsTokenRange*/ false);
-      };
-
-  SmallVector<CharSourceRange, 4> TranslatedRanges;
-  TranslatedRanges.reserve(StandaloneDiag.Ranges.size());
-  transform(StandaloneDiag.Ranges, std::back_inserter(TranslatedRanges),
-            ConvertOffsetRange);
-
-  SmallVector<FixItHint, 2> TranslatedFixIts;
-  TranslatedFixIts.reserve(StandaloneDiag.FixIts.size());
-  for (const auto &FixIt : StandaloneDiag.FixIts) {
-    FixItHint TranslatedFixIt;
-    TranslatedFixIt.CodeToInsert = FixIt.CodeToInsert;
-    TranslatedFixIt.RemoveRange = ConvertOffsetRange(FixIt.RemoveRange);
-    TranslatedFixIt.InsertFromRange = ConvertOffsetRange(FixIt.InsertFromRange);
-    TranslatedFixIt.BeforePreviousInsertions = FixIt.BeforePreviousInsertions;
-    TranslatedFixIts.push_back(std::move(TranslatedFixIt));
-  }
-
-  return StoredDiagnostic(StandaloneDiag.Level, StandaloneDiag.ID,
-                          StandaloneDiag.Message, Loc, TranslatedRanges,
-                          TranslatedFixIts);
-}
-
-} // namespace clang
diff --git a/clang/lib/Interpreter/CMakeLists.txt b/clang/lib/Interpreter/CMakeLists.txt
index 9a597146b2fc4..37faa0302caaa 100644
--- a/clang/lib/Interpreter/CMakeLists.txt
+++ b/clang/lib/Interpreter/CMakeLists.txt
@@ -46,7 +46,6 @@ add_clang_library(clangInterpreter
   clangFrontend
   clangFrontendTool
   clangLex
-  clangOptions
   clangParse
   clangSema
   clangSerialization
diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp
index 6cbc5e9910bcc..7764fa7dc92b9 100644
--- a/clang/lib/Interpreter/Interpreter.cpp
+++ b/clang/lib/Interpreter/Interpreter.cpp
@@ -42,7 +42,6 @@
 #include "clang/Interpreter/Interpreter.h"
 #include "clang/Interpreter/Value.h"
 #include "clang/Lex/PreprocessorOptions.h"
-#include "clang/Options/OptionUtils.h"
 #include "clang/Options/Options.h"
 #include "clang/Sema/Lookup.h"
 #include "clang/Serialization/ObjectFilePCHContainerReader.h"
@@ -106,7 +105,7 @@ CreateCI(const llvm::opt::ArgStringList &Argv) {
   if (Clang->getHeaderSearchOpts().UseBuiltinIncludes &&
       Clang->getHeaderSearchOpts().ResourceDir.empty())
     Clang->getHeaderSearchOpts().ResourceDir =
-        GetResourcesPath(Argv[0], nullptr);
+        CompilerInvocation::GetResourcesPath(Argv[0], nullptr);
 
   Clang->createVirtualFileSystem();
 
diff --git a/clang/lib/Options/OptionUtils.cpp b/clang/lib/Options/OptionUtils.cpp
index e5aefa012f679..fcafd3c83c6b3 100644
--- a/clang/lib/Options/OptionUtils.cpp
+++ b/clang/lib/Options/OptionUtils.cpp
@@ -9,12 +9,7 @@
 #include "clang/Options/OptionUtils.h"
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/DiagnosticDriver.h"
-#include "clang/Basic/Version.h"
-#include "clang/Config/config.h"
-#include "clang/Options/Options.h"
 #include "llvm/Option/ArgList.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/Path.h"
 
 using namespace clang;
 using namespace llvm::opt;
@@ -36,211 +31,17 @@ IntTy getLastArgIntValueImpl(const ArgList &Args, OptSpecifier Id,
 }
 } // namespace
 
-int clang::getLastArgIntValue(const ArgList &Args, OptSpecifier Id, int Default,
-                              DiagnosticsEngine *Diags, unsigned Base) {
+namespace clang {
+
+int getLastArgIntValue(const ArgList &Args, OptSpecifier Id, int Default,
+                       DiagnosticsEngine *Diags, unsigned Base) {
   return getLastArgIntValueImpl<int>(Args, Id, Default, Diags, Base);
 }
 
-uint64_t clang::getLastArgUInt64Value(const ArgList &Args, OptSpecifier Id,
-                                      uint64_t Default,
-                                      DiagnosticsEngine *Diags, unsigned Base) {
+uint64_t getLastArgUInt64Value(const ArgList &Args, OptSpecifier Id,
+                               uint64_t Default, DiagnosticsEngine *Diags,
+                               unsigned Base) {
   return getLastArgIntValueImpl<uint64_t>(Args, Id, Default, Diags, Base);
 }
 
-StringRef clang::parseMPreferVectorWidthOption(clang::DiagnosticsEngine &Diags,
-                                               const llvm::opt::ArgList &Args) {
-  const Arg *A = Args.getLastArg(options::OPT_mprefer_vector_width_EQ);
-  if (!A)
-    return "";
-
-  StringRef Value = A->getValue();
-  unsigned Width LLVM_ATTRIBUTE_UNINITIALIZED;
-
-  // Only "none" and Integer values are accepted by
-  // -mprefer-vector-width=<value>.
-  if (Value != "none" && Value.getAsInteger(10, Width)) {
-    Diags.Report(clang::diag::err_drv_invalid_value)
-        << A->getOption().getName() << Value;
-    return "";
-  }
-
-  return Value;
-}
-
-// This is a helper function for validating the optional refinement step
-// parameter in reciprocal argument strings. Return false if there is an error
-// parsing the refinement step. Otherwise, return true and set the Position
-// of the refinement step in the input string.
-static bool getRefinementStep(StringRef In, clang::DiagnosticsEngine &Diags,
-                              const Arg &A, size_t &Position) {
-  const char RefinementStepToken = ':';
-  Position = In.find(RefinementStepToken);
-  if (Position != StringRef::npos) {
-    StringRef Option = A.getOption().getName();
-    StringRef RefStep = In.substr(Position + 1);
-    // Allow exactly one numeric character for the additional refinement
-    // step parameter. This is reasonable for all currently-supported
-    // operations and architectures because we would expect that a larger value
-    // of refinement steps would cause the estimate "optimization" to
-    // under-perform the native operation. Also, if the estimate does not
-    // converge quickly, it probably will not ever converge, so further
-    // refinement steps will not produce a better answer.
-    if (RefStep.size() != 1) {
-      Diags.Report(diag::err_drv_invalid_value) << Option << RefStep;
-      return false;
-    }
-    char RefStepChar = RefStep[0];
-    if (RefStepChar < '0' || RefStepChar > '9') {
-      Diags.Report(diag::err_drv_invalid_value) << Option << RefStep;
-      return false;
-    }
-  }
-  return true;
-}
-
-StringRef clang::parseMRecipOption(clang::DiagnosticsEngine &Diags,
-                                   const ArgList &Args) {
-  StringRef DisabledPrefixIn = "!";
-  StringRef DisabledPrefixOut = "!";
-  StringRef EnabledPrefixOut = "";
-  StringRef Out = "";
-
-  const Arg *A = Args.getLastArg(options::OPT_mrecip, options::OPT_mrecip_EQ);
-  if (!A)
-    return "";
-
-  const unsigned NumOptions = A->getNumValues();
-  if (NumOptions == 0) {
-    // No option is the same as "all".
-    return "all";
-  }
-
-  // Pass through "all", "none", or "default" with an optional refinement step.
-  if (NumOptions == 1) {
-    StringRef Val = A->getValue(0);
-    size_t RefStepLoc;
-    if (!getRefinementStep(Val, Diags, *A, RefStepLoc))
-      return "";
-    StringRef ValBase = Val.slice(0, RefStepLoc);
-    if (ValBase == "all" || ValBase == "none" || ValBase == "default") {
-      return Val;
-    }
-  }
-
-  // Each reciprocal type may be enabled or disabled individually.
-  // Check each input value for validity, concatenate them all back together,
-  // and pass through.
-
-  llvm::StringMap<bool> OptionStrings;
-  OptionStrings.insert(std::make_pair("divd", false));
-  OptionStrings.insert(std::make_pair("divf", false));
-  OptionStrings.insert(std::make_pair("divh", false));
-  OptionStrings.insert(std::make_pair("vec-divd", false));
-  OptionStrings.insert(std::make_pair("vec-divf", false));
-  OptionStrings.insert(std::make_pair("vec-divh", false));
-  OptionStrings.insert(std::make_pair("sqrtd", false));
-  OptionStrings.insert(std::make_pair("sqrtf", false));
-  OptionStrings.insert(std::make_pair("sqrth", false));
-  OptionStrings.insert(std::make_pair("vec-sqrtd", false));
-  OptionStrings.insert(std::make_pair("vec-sqrtf", false));
-  OptionStrings.insert(std::make_pair("vec-sqrth", false));
-
-  for (unsigned i = 0; i != NumOptions; ++i) {
-    StringRef Val = A->getValue(i);
-
-    bool IsDisabled = Val.starts_with(DisabledPrefixIn);
-    // Ignore the disablement token for string matching.
-    if (IsDisabled)
-      Val = Val.substr(1);
-
-    size_t RefStep;
-    if (!getRefinementStep(Val, Diags, *A, RefStep))
-      return "";
-
-    StringRef ValBase = Val.slice(0, RefStep);
-    llvm::StringMap<bool>::iterator OptionIter = OptionStrings.find(ValBase);
-    if (OptionIter == OptionStrings.end()) {
-      // Try again specifying float suffix.
-      OptionIter = OptionStrings.find(ValBase.str() + 'f');
-      if (OptionIter == OptionStrings.end()) {
-        // The input name did not match any known option string.
-        Diags.Report(diag::err_drv_unknown_argument) << Val;
-        return "";
-      }
-      // The option was specified without a half or float or double suffix.
-      // Make sure that the double or half entry was not already specified.
-      // The float entry will be checked below.
-      if (OptionStrings[ValBase.str() + 'd'] ||
-          OptionStrings[ValBase.str() + 'h']) {
-        Diags.Report(diag::err_drv_invalid_value)
-            << A->getOption().getName() << Val;
-        return "";
-      }
-    }
-
-    if (OptionIter->second == true) {
-      // Duplicate option specified.
-      Diags.Report(diag::err_drv_invalid_value)
-          << A->getOption().getName() << Val;
-      return "";
-    }
-
-    // Mark the matched option as found. Do not allow duplicate specifiers.
-    OptionIter->second = true;
-
-    // If the precision was not specified, also mark the double and half entry
-    // as found.
-    if (ValBase.back() != 'f' && ValBase.back() != 'd' &&
-        ValBase.back() != 'h') {
-      OptionStrings[ValBase.str() + 'd'] = true;
-      OptionStrings[ValBase.str() + 'h'] = true;
-    }
-
-    // Build the output string.
-    StringRef Prefix = IsDisabled ? DisabledPrefixOut : EnabledPrefixOut;
-    Out = Args.MakeArgString(Out + Prefix + Val);
-    if (i != NumOptions - 1)
-      Out = Args.MakeArgString(Out + ",");
-  }
-
-  return Out;
-}
-
-std::string clang::GetResourcesPath(StringRef BinaryPath) {
-  // Since the resource directory is embedded in the module hash, it's important
-  // that all places that need it call this function, so that they get the
-  // exact same string ("a/../b/" and "b/" get different hashes, for example).
-
-  // Dir is bin/ or lib/, depending on where BinaryPath is.
-  StringRef Dir = llvm::sys::path::parent_path(BinaryPath);
-  SmallString<128> P(Dir);
-
-  StringRef ConfiguredResourceDir(CLANG_RESOURCE_DIR);
-  if (!ConfiguredResourceDir.empty()) {
-    // FIXME: We should fix the behavior of llvm::sys::path::append so we don't
-    // need to check for absolute paths here.
-    if (llvm::sys::path::is_absolute(ConfiguredResourceDir))
-      P = ConfiguredResourceDir;
-    else
-      llvm::sys::path::append(P, ConfiguredResourceDir);
-  } else {
-    // On Windows, libclang.dll is in bin/.
-    // On non-Windows, libclang.so/.dylib is in lib/.
-    // With a static-library build of libclang, LibClangPath will contain the
-    // path of the embedding binary, which for LLVM binaries will be in bin/.
-    // ../lib gets us to lib/ in both cases.
-    P = llvm::sys::path::parent_path(Dir);
-    // This search path is also created in the COFF driver of lld, so any
-    // changes here also needs to happen in lld/COFF/Driver.cpp
-    llvm::sys::path::append(P, CLANG_INSTALL_LIBDIR_BASENAME, "clang",
-                            CLANG_VERSION_MAJOR_STRING);
-  }
-
-  return std::string(P);
-}
-
-std::string clang::GetResourcesPath(const char *Argv0, void *MainAddr) {
-  const std::string ClangExecutable =
-      llvm::sys::fs::getMainExecutable(Argv0, MainAddr);
-  return GetResourcesPath(ClangExecutable);
-}
+} // namespace clang
diff --git a/clang/lib/Tooling/Tooling.cpp b/clang/lib/Tooling/Tooling.cpp
index 1d55f615de8a9..9bae12454d2dc 100644
--- a/clang/lib/Tooling/Tooling.cpp
+++ b/clang/lib/Tooling/Tooling.cpp
@@ -31,7 +31,6 @@
 #include "clang/Frontend/TextDiagnosticPrinter.h"
 #include "clang/Lex/HeaderSearchOptions.h"
 #include "clang/Lex/PreprocessorOptions.h"
-#include "clang/Options/OptionUtils.h"
 #include "clang/Options/Options.h"
 #include "clang/Tooling/ArgumentsAdjusters.h"
 #include "clang/Tooling/CompilationDatabase.h"
@@ -511,7 +510,8 @@ static void injectResourceDir(CommandLineArguments &Args, const char *Argv0,
 
   // If there's no override in place add our resource dir.
   Args = getInsertArgumentAdjuster(
-      ("-resource-dir=" + GetResourcesPath(Argv0, MainAddr)).c_str())(Args, "");
+      ("-resource-dir=" + CompilerInvocation::GetResourcesPath(Argv0, MainAddr))
+          .c_str())(Args, "");
 }
 
 int ClangTool::run(ToolAction *Action) {
diff --git a/clang/lib/Tooling/Transformer/RangeSelector.cpp b/clang/lib/Tooling/Transformer/RangeSelector.cpp
index b4bdec1fcdd69..54a1590d3106d 100644
--- a/clang/lib/Tooling/Transformer/RangeSelector.cpp
+++ b/clang/lib/Tooling/Transformer/RangeSelector.cpp
@@ -139,7 +139,8 @@ RangeSelector transformer::node(std::string ID) {
             (Node->get<Stmt>() != nullptr && Node->get<Expr>() == nullptr))
                ? tooling::getExtendedRange(*Node, tok::TokenKind::semi,
                                            *Result.Context)
-               : CharSourceRange::getTokenRange(Node->getSourceRange());
+               : CharSourceRange::getTokenRange(
+                     Node->getSourceRange(/*IncludeQualifier=*/true));
   };
 }
 
diff --git a/clang/test/CIR/CodeGen/X86/avx512bw-builtins.c b/clang/test/CIR/CodeGen/X86/avx512bw-builtins.c
new file mode 100644
index 0000000000000..3522e2c7e50bf
--- /dev/null
+++ b/clang/test/CIR/CodeGen/X86/avx512bw-builtins.c
@@ -0,0 +1,117 @@
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -fclangir -emit-cir -o %t.cir -Wall -Werror
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -fclangir -emit-llvm -o %t.ll -Wall -Werror
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -fno-signed-char -fclangir -emit-cir -o %t.cir -Wall -Werror
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -fno-signed-char -fclangir -emit-llvm -o %t.ll -Wall -Werror
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
+
+// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512bw -emit-llvm -o - -Wall -Werror | FileCheck %s -check-prefix=OGCG
+
+// This test mimics clang/test/CodeGen/X86/avx512bw-builtins.c, which CIR
+// should eventually be able to support fully.
+
+#include <immintrin.h>
+
+__mmask32 test_kshiftli_mask32(__mmask32 A) {
+  // CIR-LABEL: test_kshiftli_mask32
+  // CIR: [[VAL:%.*]] = cir.cast bitcast %{{.*}} : !u32i -> !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: [[SHIFT:%.*]] = cir.const #cir.zero : !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: %{{.*}} = cir.vec.shuffle([[SHIFT]], [[VAL]] : !cir.vector<32 x !cir.int<u, 1>>) [#cir.int<1> : !s32i, #cir.int<2> : !s32i, #cir.int<3> : !s32i, #cir.int<4> : !s32i, #cir.int<5> : !s32i, #cir.int<6> : !s32i, #cir.int<7> : !s32i, #cir.int<8> : !s32i, #cir.int<9> : !s32i, #cir.int<10> : !s32i, #cir.int<11> : !s32i, #cir.int<12> : !s32i, #cir.int<13> : !s32i, #cir.int<14> : !s32i, #cir.int<15> : !s32i, #cir.int<16> : !s32i, #cir.int<17> : !s32i, #cir.int<18> : !s32i, #cir.int<19> : !s32i, #cir.int<20> : !s32i, #cir.int<21> : !s32i, #cir.int<22> : !s32i, #cir.int<23> : !s32i, #cir.int<24> : !s32i, #cir.int<25> : !s32i, #cir.int<26> : !s32i, #cir.int<27> : !s32i, #cir.int<28> : !s32i, #cir.int<29> : !s32i, #cir.int<30> : !s32i, #cir.int<31> : !s32i, #cir.int<32> : !s32i] : !cir.vector<32 x !cir.int<u, 1>>
+
+  // LLVM-LABEL: test_kshiftli_mask32
+  // LLVM: [[VAL:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+  // LLVM: [[RES:%.*]] = shufflevector <32 x i1> zeroinitializer, <32 x i1> [[VAL]], <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32>
+
+  // OGCG-LABEL: test_kshiftli_mask32
+  // OGCG: [[VAL:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+  // OGCG: [[RES:%.*]] = shufflevector <32 x i1> zeroinitializer, <32 x i1> [[VAL]], <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32>
+  return _kshiftli_mask32(A, 31);
+}
+
+__mmask32 test_kshiftri_mask32(__mmask32 A) {
+  // CIR-LABEL: test_kshiftri_mask32
+  // CIR: [[VAL:%.*]] = cir.cast bitcast %{{.*}} : !u32i -> !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: [[SHIFT:%.*]] = cir.const #cir.zero : !cir.vector<32 x !cir.int<u, 1>>
+  // CIR: %{{.*}} = cir.vec.shuffle([[VAL]], [[SHIFT]] : !cir.vector<32 x !cir.int<u, 1>>) [#cir.int<31> : !s32i, #cir.int<32> : !s32i, #cir.int<33> : !s32i, #cir.int<34> : !s32i, #cir.int<35> : !s32i, #cir.int<36> : !s32i, #cir.int<37> : !s32i, #cir.int<38> : !s32i, #cir.int<39> : !s32i, #cir.int<40> : !s32i, #cir.int<41> : !s32i, #cir.int<42> : !s32i, #cir.int<43> : !s32i, #cir.int<44> : !s32i, #cir.int<45> : !s32i, #cir.int<46> : !s32i, #cir.int<47> : !s32i, #cir.int<48> : !s32i, #cir.int<49> : !s32i, #cir.int<50> : !s32i, #cir.int<51> : !s32i, #cir.int<52> : !s32i, #cir.int<53> : !s32i, #cir.int<54> : !s32i, #cir.int<55> : !s32i, #cir.int<56> : !s32i, #cir.int<57> : !s32i, #cir.int<58> : !s32i, #cir.int<59> : !s32i, #cir.int<60> : !s32i, #cir.int<61> : !s32i, #cir.int<62> : !s32i] : !cir.vector<32 x !cir.int<u, 1>>
+
+  // LLVM-LABEL: test_kshiftri_mask32
+  // LLVM: [[VAL:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+  // LLVM: [[RES:%.*]] = shufflevector <32 x i1> [[VAL]], <32 x i1> zeroinitializer, <32 x i32> <i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62>
+
+  // OGCG-LABEL: test_kshiftri_mask32
+  // OGCG: [[VAL:%.*]] = bitcast i32 %{{.*}} to <32 x i1>
+  // OGCG: [[RES:%.*]] = shufflevector <32 x i1> [[VAL]], <32 x i1> zeroinitializer, <32 x i32> <i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62>
+  return _kshiftri_mask32(A, 31);
+}
+
+__mmask64 test_kshiftli_mask64(__mmask64 A) {
+  // CIR-LABEL: test_kshiftli_mask64
+  // CIR: [[VAL:%.*]] = cir.cast bitcast %{{.*}} : !u64i -> !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: [[SHIFT:%.*]] = cir.const #cir.zero : !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: %{{.*}} = cir.vec.shuffle([[SHIFT]], [[VAL]] : !cir.vector<64 x !cir.int<u, 1>>) [#cir.int<32> : !s32i, #cir.int<33> : !s32i, #cir.int<34> : !s32i, #cir.int<35> : !s32i, #cir.int<36> : !s32i, #cir.int<37> : !s32i, #cir.int<38> : !s32i, #cir.int<39> : !s32i, #cir.int<40> : !s32i, #cir.int<41> : !s32i, #cir.int<42> : !s32i, #cir.int<43> : !s32i, #cir.int<44> : !s32i, #cir.int<45> : !s32i, #cir.int<46> : !s32i, #cir.int<47> : !s32i, #cir.int<48> : !s32i, #cir.int<49> : !s32i, #cir.int<50> : !s32i, #cir.int<51> : !s32i, #cir.int<52> : !s32i, #cir.int<53> : !s32i, #cir.int<54> : !s32i, #cir.int<55> : !s32i, #cir.int<56> : !s32i, #cir.int<57> : !s32i, #cir.int<58> : !s32i, #cir.int<59> : !s32i, #cir.int<60> : !s32i, #cir.int<61> : !s32i, #cir.int<62> : !s32i, #cir.int<63> : !s32i, #cir.int<64> : !s32i, #cir.int<65> : !s32i, #cir.int<66> : !s32i, #cir.int<67> : !s32i, #cir.int<68> : !s32i, #cir.int<69> : !s32i, #cir.int<70> : !s32i, #cir.int<71> : !s32i, #cir.int<72> : !s32i, #cir.int<73> : !s32i, #cir.int<74> : !s32i, #cir.int<75> : !s32i, #cir.int<76> : !s32i, #cir.int<77> : !s32i, #cir.int<78> : !s32i, #cir.int<79> : !s32i, #cir.int<80> : !s32i, #cir.int<81> : !s32i, #cir.int<82> : !s32i, #cir.int<83> : !s32i, #cir.int<84> : !s32i, #cir.int<85> : !s32i, #cir.int<86> : !s32i, #cir.int<87> : !s32i, #cir.int<88> : !s32i, #cir.int<89> : !s32i, #cir.int<90> : !s32i, #cir.int<91> : !s32i, #cir.int<92> : !s32i, #cir.int<93> : !s32i, #cir.int<94> : !s32i, #cir.int<95> : !s32i] : !cir.vector<64 x !cir.int<u, 1>>
+
+  // LLVM-LABEL: test_kshiftli_mask64
+  // LLVM: [[VAL:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+  // LLVM: [[RES:%.*]] = shufflevector <64 x i1> zeroinitializer, <64 x i1> [[VAL]], <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
+
+  // OGCG-LABEL: test_kshiftli_mask64
+  // OGCG: [[VAL:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+  // OGCG: [[RES:%.*]] = shufflevector <64 x i1> zeroinitializer, <64 x i1> [[VAL]], <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
+  return _kshiftli_mask64(A, 32);
+}
+
+__mmask64 test_kshiftri_mask64(__mmask64 A) {
+  // CIR-LABEL: test_kshiftri_mask64
+  // CIR: [[VAL:%.*]] = cir.cast bitcast %{{.*}} : !u64i -> !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: [[SHIFT:%.*]] = cir.const #cir.zero : !cir.vector<64 x !cir.int<u, 1>>
+  // CIR: %{{.*}} = cir.vec.shuffle([[VAL]], [[SHIFT]] : !cir.vector<64 x !cir.int<u, 1>>) [#cir.int<32> : !s32i, #cir.int<33> : !s32i, #cir.int<34> : !s32i, #cir.int<35> : !s32i, #cir.int<36> : !s32i, #cir.int<37> : !s32i, #cir.int<38> : !s32i, #cir.int<39> : !s32i, #cir.int<40> : !s32i, #cir.int<41> : !s32i, #cir.int<42> : !s32i, #cir.int<43> : !s32i, #cir.int<44> : !s32i, #cir.int<45> : !s32i, #cir.int<46> : !s32i, #cir.int<47> : !s32i, #cir.int<48> : !s32i, #cir.int<49> : !s32i, #cir.int<50> : !s32i, #cir.int<51> : !s32i, #cir.int<52> : !s32i, #cir.int<53> : !s32i, #cir.int<54> : !s32i, #cir.int<55> : !s32i, #cir.int<56> : !s32i, #cir.int<57> : !s32i, #cir.int<58> : !s32i, #cir.int<59> : !s32i, #cir.int<60> : !s32i, #cir.int<61> : !s32i, #cir.int<62> : !s32i, #cir.int<63> : !s32i, #cir.int<64> : !s32i, #cir.int<65> : !s32i, #cir.int<66> : !s32i, #cir.int<67> : !s32i, #cir.int<68> : !s32i, #cir.int<69> : !s32i, #cir.int<70> : !s32i, #cir.int<71> : !s32i, #cir.int<72> : !s32i, #cir.int<73> : !s32i, #cir.int<74> : !s32i, #cir.int<75> : !s32i, #cir.int<76> : !s32i, #cir.int<77> : !s32i, #cir.int<78> : !s32i, #cir.int<79> : !s32i, #cir.int<80> : !s32i, #cir.int<81> : !s32i, #cir.int<82> : !s32i, #cir.int<83> : !s32i, #cir.int<84> : !s32i, #cir.int<85> : !s32i, #cir.int<86> : !s32i, #cir.int<87> : !s32i, #cir.int<88> : !s32i, #cir.int<89> : !s32i, #cir.int<90> : !s32i, #cir.int<91> : !s32i, #cir.int<92> : !s32i, #cir.int<93> : !s32i, #cir.int<94> : !s32i, #cir.int<95> : !s32i] : !cir.vector<64 x !cir.int<u, 1>>
+
+  // LLVM-LABEL: test_kshiftri_mask64
+  // LLVM: [[VAL:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+  // LLVM: [[RES:%.*]] = shufflevector <64 x i1> [[VAL]], <64 x i1> zeroinitializer, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
+
+  // OGCG-LABEL: test_kshiftri_mask64
+  // OGCG: [[VAL:%.*]] = bitcast i64 %{{.*}} to <64 x i1>
+  // OGCG: [[RES:%.*]] = shufflevector <64 x i1> [[VAL]], <64 x i1> zeroinitializer, <64 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95>
+  return _kshiftri_mask64(A, 32);
+}
+
+__mmask32 test_kshiftli_mask32_out_of_range(__mmask32 A) {
+  // CIR-LABEL: test_kshiftli_mask32_out_of_range
+  // CIR: [[VAL:%.*]] = cir.const #cir.int<0> : !u32i
+  // CIR: cir.store [[VAL]], {{%.*}} : !u32i, !cir.ptr<!u32i>
+  // CIR: [[RES:%.*]] = cir.load {{%.*}} : !cir.ptr<!u32i>, !u32i
+  // CIR: cir.return [[RES]] : !u32i
+
+  // LLVM-LABEL: test_kshiftli_mask32_out_of_range
+  // LLVM: store i32 0, ptr [[VAL:%.*]], align 4
+  // LLVM: [[RES:%.*]] = load i32, ptr [[VAL]], align 4
+  // LLVM: ret i32 [[RES]]
+
+  // OGCG-LABEL: test_kshiftli_mask32_out_of_range
+  // OGCG: ret i32 0
+
+  return _kshiftli_mask32(A, 33);
+}
+
+__mmask32 test_kshiftri_mask32_out_of_range(__mmask32 A) {
+  // CIR-LABEL: test_kshiftri_mask32_out_of_range
+  // CIR: [[VAL:%.*]] = cir.const #cir.int<0> : !u32i
+  // CIR: cir.store [[VAL]], {{%.*}} : !u32i, !cir.ptr<!u32i>
+  // CIR: [[RES:%.*]] = cir.load {{%.*}} : !cir.ptr<!u32i>, !u32i
+  // CIR: cir.return [[RES]] : !u32i
+
+  // LLVM-LABEL: test_kshiftri_mask32_out_of_range
+  // LLVM: store i32 0, ptr [[VAL:%.*]], align 4
+  // LLVM: [[RES:%.*]] = load i32, ptr [[VAL]], align 4
+  // LLVM: ret i32 [[RES]]
+
+  // OGCG-LABEL: test_kshiftri_mask32_out_of_range
+  // OGCG: ret i32 0
+
+  return _kshiftri_mask32(A, 33);
+}
diff --git a/clang/test/CIR/CodeGen/global-array-dtor.cpp b/clang/test/CIR/CodeGen/global-array-dtor.cpp
new file mode 100644
index 0000000000000..01277a3f34015
--- /dev/null
+++ b/clang/test/CIR/CodeGen/global-array-dtor.cpp
@@ -0,0 +1,113 @@
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir -mmlir --mlir-print-ir-before=cir-lowering-prepare %s -o %t.cir 2> %t-before.cir
+// RUN: FileCheck --input-file=%t-before.cir %s --check-prefix=CIR-BEFORE-LPP
+// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM
+// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG
+
+// This duplicates a test case in global-init.cpp, but having it by itself
+// forces the __cxa_atexit function to be emitted for this case, which was
+// broken in the original implementation.
+
+struct ArrayDtor {
+  ~ArrayDtor();
+};
+
+ArrayDtor arrDtor[16];
+
+// CIR-BEFORE-LPP:      cir.global external @arrDtor = #cir.zero : !cir.array<!rec_ArrayDtor x 16>
+// CIR-BEFORE-LPP-SAME:   dtor {
+// CIR-BEFORE-LPP:          %[[THIS:.*]] = cir.get_global @arrDtor : !cir.ptr<!cir.array<!rec_ArrayDtor x 16>>
+// CIR-BEFORE-LPP:          cir.array.dtor %[[THIS]] : !cir.ptr<!cir.array<!rec_ArrayDtor x 16>> {
+// CIR-BEFORE-LPP:          ^bb0(%[[ELEM:.*]]: !cir.ptr<!rec_ArrayDtor>):
+// CIR-BEFORE-LPP:            cir.call @_ZN9ArrayDtorD1Ev(%[[ELEM]]) nothrow : (!cir.ptr<!rec_ArrayDtor>) -> ()
+// CIR-BEFORE-LPP:            cir.yield
+// CIR-BEFORE-LPP:          }
+// CIR-BEFORE-LPP:        }
+
+// CIR: cir.global external @arrDtor = #cir.zero : !cir.array<!rec_ArrayDtor x 16> {alignment = 16 : i64}
+// CIR: cir.func internal private @__cxx_global_array_dtor(%[[ARR_ARG:.*]]: !cir.ptr<!void> {{.*}}) {
+// CIR:   %[[CONST15:.*]] = cir.const #cir.int<15> : !u64i
+// CIR:   %[[BEGIN:.*]] = cir.cast array_to_ptrdecay %[[ARR_ARG]] : !cir.ptr<!void> -> !cir.ptr<!rec_ArrayDtor>
+// CIR:   %[[END:.*]] = cir.ptr_stride %[[BEGIN]], %[[CONST15]] : (!cir.ptr<!rec_ArrayDtor>, !u64i) -> !cir.ptr<!rec_ArrayDtor>
+// CIR:   %[[CUR_ADDR:.*]] = cir.alloca !cir.ptr<!rec_ArrayDtor>, !cir.ptr<!cir.ptr<!rec_ArrayDtor>>, ["__array_idx"]
+// CIR:   cir.store %[[END]], %[[CUR_ADDR]] : !cir.ptr<!rec_ArrayDtor>, !cir.ptr<!cir.ptr<!rec_ArrayDtor>>
+// CIR:   cir.do {
+// CIR:     %[[CUR:.*]] = cir.load %[[CUR_ADDR]] : !cir.ptr<!cir.ptr<!rec_ArrayDtor>>, !cir.ptr<!rec_ArrayDtor>
+// CIR:     cir.call @_ZN9ArrayDtorD1Ev(%[[CUR]]) nothrow : (!cir.ptr<!rec_ArrayDtor>) -> ()
+// CIR:     %[[NEG_ONE:.*]] = cir.const #cir.int<-1> : !s64i
+// CIR:     %[[NEXT:.*]] = cir.ptr_stride %[[CUR]], %[[NEG_ONE]] : (!cir.ptr<!rec_ArrayDtor>, !s64i) -> !cir.ptr<!rec_ArrayDtor>
+// CIR:     cir.store %[[NEXT]], %[[CUR_ADDR]] : !cir.ptr<!rec_ArrayDtor>, !cir.ptr<!cir.ptr<!rec_ArrayDtor>>
+// CIR:     cir.yield
+// CIR:   } while {
+// CIR:     %[[CUR:.*]] = cir.load %[[CUR_ADDR]] : !cir.ptr<!cir.ptr<!rec_ArrayDtor>>, !cir.ptr<!rec_ArrayDtor>
+// CIR:     %[[CMP:.*]] = cir.cmp(ne, %[[CUR]], %[[BEGIN]]) : !cir.ptr<!rec_ArrayDtor>, !cir.bool
+// CIR:     cir.condition(%[[CMP]])
+// CIR:   }
+// CIR:   cir.return
+// CIR: }
+//
+// CIR: cir.func internal private @__cxx_global_var_init() {
+// CIR:   %[[ARR:.*]] = cir.get_global @arrDtor : !cir.ptr<!cir.array<!rec_ArrayDtor x 16>>
+// CIR:   %[[DTOR:.*]] = cir.get_global @__cxx_global_array_dtor : !cir.ptr<!cir.func<(!cir.ptr<!void>)>>
+// CIR:   %[[DTOR_CAST:.*]] = cir.cast bitcast %[[DTOR]] : !cir.ptr<!cir.func<(!cir.ptr<!void>)>> -> !cir.ptr<!cir.func<(!cir.ptr<!void>)>>
+// CIR:   %[[ARR_CAST:.*]] = cir.cast bitcast %[[ARR]] : !cir.ptr<!cir.array<!rec_ArrayDtor x 16>> -> !cir.ptr<!void>
+// CIR:   %[[HANDLE:.*]] = cir.get_global @__dso_handle : !cir.ptr<i8>
+// CIR:   cir.call @__cxa_atexit(%[[DTOR_CAST]], %[[ARR_CAST]], %[[HANDLE]]) : (!cir.ptr<!cir.func<(!cir.ptr<!void>)>>, !cir.ptr<!void>, !cir.ptr<i8>) -> ()
+
+// LLVM: define internal void @__cxx_global_array_dtor(ptr %[[ARR_ARG:.*]]) {
+// LLVM:   %[[BEGIN:.*]] = getelementptr %struct.ArrayDtor, ptr %[[ARR_ARG]], i32 0
+// LLVM:   %[[END:.*]] = getelementptr %struct.ArrayDtor, ptr %[[BEGIN]], i64 15
+// LLVM:   %[[CUR_ADDR:.*]] = alloca ptr
+// LLVM:   store ptr %[[END]], ptr %[[CUR_ADDR]]
+// LLVM:   br label %[[LOOP_BODY:.*]]
+// LLVM: [[LOOP_COND:.*]]:
+// LLVM:   %[[CUR:.*]] = load ptr, ptr %[[CUR_ADDR]]
+// LLVM:   %[[CMP:.*]] = icmp ne ptr %[[CUR]], %[[BEGIN]]
+// LLVM:   br i1 %[[CMP]], label %[[LOOP_BODY]], label %[[LOOP_END:.*]]
+// LLVM: [[LOOP_BODY]]:
+// LLVM:   %[[CUR:.*]] = load ptr, ptr %[[CUR_ADDR]]
+// LLVM:   call void @_ZN9ArrayDtorD1Ev(ptr %[[CUR]]) #0
+// LLVM:   %[[PREV:.*]] = getelementptr %struct.ArrayDtor, ptr %[[CUR]], i64 -1
+// LLVM:   store ptr %[[PREV]], ptr %[[CUR_ADDR]]
+// LLVM:   br label %[[LOOP_COND]]
+// LLVM: [[LOOP_END]]:
+// LLVM:   ret void
+// LLVM: }
+//
+// LLVM: define internal void @__cxx_global_var_init() {
+// LLVM:   call void @__cxa_atexit(ptr @__cxx_global_array_dtor, ptr @arrDtor, ptr @__dso_handle)
+
+// Note: OGCG defines these functions in reverse order of CIR->LLVM.
+// Note also: OGCG doesn't pass the address of the array to the destructor function.
+//            Instead, it uses the global directly in the helper function.
+
+// OGCG: define internal void @__cxx_global_var_init() {{.*}} section ".text.startup" {
+// OGCG:   call i32 @__cxa_atexit(ptr @__cxx_global_array_dtor, ptr null, ptr @__dso_handle)
+
+// OGCG: define internal void @__cxx_global_array_dtor(ptr noundef %[[ARG:.*]]) {{.*}} section ".text.startup" {
+// OGCG: entry:
+// OGCG:   %[[UNUSED_ADDR:.*]] = alloca ptr
+// OGCG:   store ptr %[[ARG]], ptr %[[UNUSED_ADDR]]
+// OGCG:   br label %[[LOOP_BODY:.*]]
+// OGCG: [[LOOP_BODY]]:
+// OGCG:   %[[PREV:.*]] = phi ptr [ getelementptr inbounds (%struct.ArrayDtor, ptr @arrDtor, i64 16), %entry ], [ %[[CUR:.*]], %[[LOOP_BODY]] ]
+// OGCG:   %[[CUR]] = getelementptr inbounds %struct.ArrayDtor, ptr %[[PREV]], i64 -1
+// OGCG:   call void @_ZN9ArrayDtorD1Ev(ptr noundef nonnull align 1 dereferenceable(1) %[[CUR]])
+// OGCG:   %[[DONE:.*]] = icmp eq ptr %[[CUR]], @arrDtor
+// OGCG:   br i1 %[[DONE]], label %[[LOOP_END:.*]], label %[[LOOP_BODY]]
+// OGCG: [[LOOP_END]]:
+// OGCG:   ret void
+// OGCG: }
+
+// Common init function for all globals with default priority
+
+// CIR: cir.func private @_GLOBAL__sub_I_[[FILENAME:.*]]() {
+// CIR:   cir.call @__cxx_global_var_init() : () -> ()
+
+// LLVM: define void @_GLOBAL__sub_I_[[FILENAME:.*]]()
+// LLVM:   call void @__cxx_global_var_init()
+
+// OGCG: define internal void @_GLOBAL__sub_I_[[FILENAME:.*]]() {{.*}} section ".text.startup" {
+// OGCG:   call void @__cxx_global_var_init()
diff --git a/clang/test/Driver/fsanitize.c b/clang/test/Driver/fsanitize.c
index 263301ad4466a..f2a4d8c50ec23 100644
--- a/clang/test/Driver/fsanitize.c
+++ b/clang/test/Driver/fsanitize.c
@@ -984,6 +984,11 @@
 // CHECK-UBSAN-MINIMAL: "-fsanitize={{((signed-integer-overflow|integer-divide-by-zero|shift-base|shift-exponent|unreachable|return|vla-bound|alignment|null|pointer-overflow|float-cast-overflow|array-bounds|enum|bool|builtin|returns-nonnull-attribute|nonnull-attribute|function),?){18}"}}
 // CHECK-UBSAN-MINIMAL: "-fsanitize-minimal-runtime"
 
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=undefined -fsanitize-minimal-runtime -fsanitize-handler-preserve-all-regs %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-UBSAN-MINIMAL-PRESERVE
+// CHECK-UBSAN-MINIMAL-PRESERVE: "-fsanitize={{((signed-integer-overflow|integer-divide-by-zero|shift-base|shift-exponent|unreachable|return|vla-bound|alignment|null|pointer-overflow|float-cast-overflow|array-bounds|enum|bool|builtin|returns-nonnull-attribute|nonnull-attribute|function),?){18}"}}
+// CHECK-UBSAN-MINIMAL-PRESERVE: "-fsanitize-minimal-runtime"
+// CHECK-UBSAN-MINIMAL-PRESERVE: "-fsanitize-handler-preserve-all-regs
+
 // RUN: %clang --target=x86_64-linux-gnu -fsanitize=integer -fsanitize-trap=integer %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-INTSAN-TRAP
 // CHECK-INTSAN-TRAP: "-fsanitize-trap=integer-divide-by-zero,shift-base,shift-exponent,signed-integer-overflow,unsigned-integer-overflow,unsigned-shift-base,implicit-unsigned-integer-truncation,implicit-signed-integer-truncation,implicit-integer-sign-change"
 
diff --git a/clang/test/Misc/opencl-c-3.0.incorrect_define.cl b/clang/test/Misc/opencl-c-3.0.incorrect_define.cl
new file mode 100644
index 0000000000000..7857175e46209
--- /dev/null
+++ b/clang/test/Misc/opencl-c-3.0.incorrect_define.cl
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -verify -triple spir-unknown-unknown -cl-std=CL3.0 -cl-ext=-__opencl_c_fp64,-cl_khr_fp64 %s
+// RUN: %clang_cc1 -verify -triple spir-unknown-unknown -cl-std=clc++2021 -cl-ext=-__opencl_c_fp64,-cl_khr_fp64 %s
+
+#if __opencl_c_ext_fp64_global_atomic_add != 0
+#error "Incorrectly defined __opencl_c_ext_fp64_global_atomic_add"
+#endif
+#if __opencl_c_ext_fp64_local_atomic_add != 0
+#error "Incorrectly defined __opencl_c_ext_fp64_local_atomic_add"
+#endif
+#if __opencl_c_ext_fp64_global_atomic_min_max != 0
+#error "Incorrectly defined __opencl_c_ext_fp64_global_atomic_min_max"
+#endif
+#if __opencl_c_ext_fp64_local_atomic_min_max != 0
+#error "Incorrectly defined __opencl_c_ext_fp64_local_atomic_min_max"
+#endif
+
+// expected-no-diagnostics
diff --git a/clang/tools/c-index-test/CMakeLists.txt b/clang/tools/c-index-test/CMakeLists.txt
index 41e80e66ffa7a..24e7c9692ca56 100644
--- a/clang/tools/c-index-test/CMakeLists.txt
+++ b/clang/tools/c-index-test/CMakeLists.txt
@@ -27,7 +27,6 @@ else()
     libclang
     clangAST
     clangBasic
-    clangDriver
     clangFrontend
     clangIndex
     clangSerialization
diff --git a/clang/tools/c-index-test/core_main.cpp b/clang/tools/c-index-test/core_main.cpp
index c67479fd130ca..5a3086a7fc08f 100644
--- a/clang/tools/c-index-test/core_main.cpp
+++ b/clang/tools/c-index-test/core_main.cpp
@@ -8,7 +8,6 @@
 
 #include "clang/AST/Mangle.h"
 #include "clang/Basic/LangOptions.h"
-#include "clang/Driver/CreateInvocationFromArgs.h"
 #include "clang/Frontend/ASTUnit.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/CompilerInvocation.h"
diff --git a/clang/tools/diagtool/CMakeLists.txt b/clang/tools/diagtool/CMakeLists.txt
index 09b2a81790f87..b49619c075c73 100644
--- a/clang/tools/diagtool/CMakeLists.txt
+++ b/clang/tools/diagtool/CMakeLists.txt
@@ -15,6 +15,5 @@ add_clang_tool(diagtool
 clang_target_link_libraries(diagtool
   PRIVATE
   clangBasic
-  clangDriver
   clangFrontend
   )
diff --git a/clang/tools/diagtool/ShowEnabledWarnings.cpp b/clang/tools/diagtool/ShowEnabledWarnings.cpp
index 5b25e656dafa4..bea0288c09358 100644
--- a/clang/tools/diagtool/ShowEnabledWarnings.cpp
+++ b/clang/tools/diagtool/ShowEnabledWarnings.cpp
@@ -9,7 +9,6 @@
 #include "DiagTool.h"
 #include "DiagnosticNames.h"
 #include "clang/Basic/LLVM.h"
-#include "clang/Driver/CreateInvocationFromArgs.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/TextDiagnosticBuffer.h"
 #include "clang/Frontend/TextDiagnosticPrinter.h"
diff --git a/clang/tools/driver/cc1_main.cpp b/clang/tools/driver/cc1_main.cpp
index cc757039cafd0..300d59df1bf7b 100644
--- a/clang/tools/driver/cc1_main.cpp
+++ b/clang/tools/driver/cc1_main.cpp
@@ -17,7 +17,6 @@
 #include "clang/Basic/TargetOptions.h"
 #include "clang/CodeGen/ObjectFilePCHContainerWriter.h"
 #include "clang/Config/config.h"
-#include "clang/Driver/Driver.h"
 #include "clang/Driver/DriverDiagnostic.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/CompilerInvocation.h"
@@ -270,7 +269,7 @@ int cc1_main(ArrayRef<const char *> Argv, const char *Argv0, void *MainAddr) {
   if (Clang->getHeaderSearchOpts().UseBuiltinIncludes &&
       Clang->getHeaderSearchOpts().ResourceDir.empty())
     Clang->getHeaderSearchOpts().ResourceDir =
-        GetResourcesPath(Argv0, MainAddr);
+        CompilerInvocation::GetResourcesPath(Argv0, MainAddr);
 
   /// Create the actual file system.
   Clang->createVirtualFileSystem(llvm::vfs::getRealFileSystem(), DiagsBuffer);
diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index 32e84248c1b27..f4d6fa72a1dfe 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -38,7 +38,6 @@
 #include "clang/Basic/Stack.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Basic/Version.h"
-#include "clang/Driver/CreateASTUnitFromArgs.h"
 #include "clang/Frontend/ASTUnit.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Index/CommentToXML.h"
@@ -4362,7 +4361,7 @@ clang_parseTranslationUnit_Impl(CXIndex CIdx, const char *source_filename,
   LibclangInvocationReporter InvocationReporter(
       *CXXIdx, LibclangInvocationReporter::OperationKind::ParseOperation,
       options, llvm::ArrayRef(*Args), /*InvocationArgs=*/{}, unsaved_files);
-  std::unique_ptr<ASTUnit> Unit = CreateASTUnitFromCommandLine(
+  std::unique_ptr<ASTUnit> Unit = ASTUnit::LoadFromCommandLine(
       Args->data(), Args->data() + Args->size(),
       CXXIdx->getPCHContainerOperations(), DiagOpts, Diags,
       CXXIdx->getClangResourcesPath(), CXXIdx->getStorePreamblesInMemory(),
diff --git a/clang/tools/libclang/CIndexer.cpp b/clang/tools/libclang/CIndexer.cpp
index 853a936b43e37..11d9312b64849 100644
--- a/clang/tools/libclang/CIndexer.cpp
+++ b/clang/tools/libclang/CIndexer.cpp
@@ -16,7 +16,6 @@
 #include "clang/Basic/Version.h"
 #include "clang/Config/config.h"
 #include "clang/Driver/Driver.h"
-#include "clang/Options/OptionUtils.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Support/FileSystem.h"
@@ -138,7 +137,7 @@ const std::string &CIndexer::getClangResourcesPath() {
 #endif
 
   // Cache our result.
-  ResourcesPath = GetResourcesPath(LibClangPath);
+  ResourcesPath = driver::Driver::GetResourcesPath(LibClangPath);
   return ResourcesPath;
 }
 
diff --git a/clang/tools/libclang/CMakeLists.txt b/clang/tools/libclang/CMakeLists.txt
index b0105f5a5f79f..e0ff7605b68b8 100644
--- a/clang/tools/libclang/CMakeLists.txt
+++ b/clang/tools/libclang/CMakeLists.txt
@@ -65,7 +65,6 @@ set(LIBS
   clangFrontend
   clangIndex
   clangLex
-  clangOptions
   clangRewrite
   clangSema
   clangSerialization
diff --git a/clang/tools/libclang/Indexing.cpp b/clang/tools/libclang/Indexing.cpp
index 75323d70afcfe..c142f142d5071 100644
--- a/clang/tools/libclang/Indexing.cpp
+++ b/clang/tools/libclang/Indexing.cpp
@@ -15,7 +15,6 @@
 #include "CXString.h"
 #include "CXTranslationUnit.h"
 #include "clang/AST/ASTConsumer.h"
-#include "clang/Driver/CreateInvocationFromArgs.h"
 #include "clang/Frontend/ASTUnit.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/CompilerInvocation.h"
diff --git a/clang/unittests/Driver/DXCModeTest.cpp b/clang/unittests/Driver/DXCModeTest.cpp
index 130da620b40b5..e0454f190b35a 100644
--- a/clang/unittests/Driver/DXCModeTest.cpp
+++ b/clang/unittests/Driver/DXCModeTest.cpp
@@ -15,7 +15,6 @@
 #include "clang/Basic/LLVM.h"
 #include "clang/Basic/TargetOptions.h"
 #include "clang/Driver/Compilation.h"
-#include "clang/Driver/CreateInvocationFromArgs.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/ToolChain.h"
 #include "clang/Frontend/CompilerInstance.h"
diff --git a/clang/unittests/Driver/ToolChainTest.cpp b/clang/unittests/Driver/ToolChainTest.cpp
index 8f533790ec501..afa17ff219be2 100644
--- a/clang/unittests/Driver/ToolChainTest.cpp
+++ b/clang/unittests/Driver/ToolChainTest.cpp
@@ -17,7 +17,6 @@
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Basic/TargetOptions.h"
 #include "clang/Driver/Compilation.h"
-#include "clang/Driver/CreateInvocationFromArgs.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "llvm/ADT/ArrayRef.h"
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index 6d769396589ee..008adff1cee2d 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -3370,6 +3370,11 @@ TEST_F(TokenAnnotatorTest, UnderstandDesignatedInitializers) {
   ASSERT_EQ(Tokens.size(), 14u) << Tokens;
   EXPECT_TOKEN(Tokens[6], tok::l_square, TT_DesignatedInitializerLSquare);
   EXPECT_BRACE_KIND(Tokens[9], BK_BracedInit);
+
+  Tokens = annotate("Foo foo[] = {[0] = 1, [1] = 2};");
+  ASSERT_EQ(Tokens.size(), 20u) << Tokens;
+  EXPECT_TOKEN(Tokens[6], tok::l_square, TT_DesignatedInitializerLSquare);
+  EXPECT_TOKEN(Tokens[12], tok::l_square, TT_DesignatedInitializerLSquare);
 }
 
 TEST_F(TokenAnnotatorTest, UnderstandsJavaScript) {
diff --git a/clang/unittests/Frontend/ASTUnitTest.cpp b/clang/unittests/Frontend/ASTUnitTest.cpp
index bf9e4e184b5db..dfdbe90e72f1f 100644
--- a/clang/unittests/Frontend/ASTUnitTest.cpp
+++ b/clang/unittests/Frontend/ASTUnitTest.cpp
@@ -9,8 +9,6 @@
 #include <fstream>
 
 #include "clang/Basic/FileManager.h"
-#include "clang/Driver/CreateASTUnitFromArgs.h"
-#include "clang/Driver/CreateInvocationFromArgs.h"
 #include "clang/Frontend/ASTUnit.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/CompilerInvocation.h"
@@ -175,7 +173,7 @@ TEST_F(ASTUnitTest, LoadFromCommandLineEarlyError) {
   auto PCHContainerOps = std::make_shared<PCHContainerOperations>();
   std::unique_ptr<clang::ASTUnit> ErrUnit;
 
-  std::unique_ptr<ASTUnit> AST = CreateASTUnitFromCommandLine(
+  std::unique_ptr<ASTUnit> AST = ASTUnit::LoadFromCommandLine(
       &Args[0], &Args[4], PCHContainerOps, DiagOpts, Diags, "", false, "",
       false, CaptureDiagsKind::All, {}, true, 0, TU_Complete, false, false,
       false, SkipFunctionBodiesScope::None, false, true, false, false,
@@ -203,7 +201,7 @@ TEST_F(ASTUnitTest, LoadFromCommandLineWorkingDirectory) {
   auto PCHContainerOps = std::make_shared<PCHContainerOperations>();
   std::unique_ptr<clang::ASTUnit> ErrUnit;
 
-  std::unique_ptr<ASTUnit> AST = CreateASTUnitFromCommandLine(
+  std::unique_ptr<ASTUnit> AST = ASTUnit::LoadFromCommandLine(
       &Args[0], &Args[4], PCHContainerOps, DiagOpts, Diags, "", false, "",
       false, CaptureDiagsKind::All, {}, true, 0, TU_Complete, false, false,
       false, SkipFunctionBodiesScope::None, false, true, false, false,
diff --git a/clang/unittests/Frontend/CompilerInstanceTest.cpp b/clang/unittests/Frontend/CompilerInstanceTest.cpp
index 39d35b48f394a..cd3fefa1ea994 100644
--- a/clang/unittests/Frontend/CompilerInstanceTest.cpp
+++ b/clang/unittests/Frontend/CompilerInstanceTest.cpp
@@ -8,7 +8,6 @@
 
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Basic/FileManager.h"
-#include "clang/Driver/CreateInvocationFromArgs.h"
 #include "clang/Frontend/CompilerInvocation.h"
 #include "clang/Frontend/FrontendActions.h"
 #include "clang/Frontend/TextDiagnosticPrinter.h"
diff --git a/clang/unittests/Frontend/UtilsTest.cpp b/clang/unittests/Frontend/UtilsTest.cpp
index a82733d57714a..fc411e4af705f 100644
--- a/clang/unittests/Frontend/UtilsTest.cpp
+++ b/clang/unittests/Frontend/UtilsTest.cpp
@@ -9,7 +9,6 @@
 #include "clang/Frontend/Utils.h"
 #include "clang/Basic/Diagnostic.h"
 #include "clang/Basic/TargetOptions.h"
-#include "clang/Driver/CreateInvocationFromArgs.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/CompilerInvocation.h"
 #include "clang/Lex/PreprocessorOptions.h"
diff --git a/clang/unittests/Sema/CMakeLists.txt b/clang/unittests/Sema/CMakeLists.txt
index 188f6135a60ac..b61ed8c457635 100644
--- a/clang/unittests/Sema/CMakeLists.txt
+++ b/clang/unittests/Sema/CMakeLists.txt
@@ -13,7 +13,6 @@ add_distinct_clang_unittest(SemaTests
   clangAST
   clangASTMatchers
   clangBasic
-  clangDriver
   clangFrontend
   clangParse
   clangSema
diff --git a/clang/unittests/Sema/SemaNoloadLookupTest.cpp b/clang/unittests/Sema/SemaNoloadLookupTest.cpp
index 3944269eff502..e565372698e5e 100644
--- a/clang/unittests/Sema/SemaNoloadLookupTest.cpp
+++ b/clang/unittests/Sema/SemaNoloadLookupTest.cpp
@@ -10,7 +10,6 @@
 #include "clang/AST/DeclarationName.h"
 #include "clang/ASTMatchers/ASTMatchFinder.h"
 #include "clang/ASTMatchers/ASTMatchers.h"
-#include "clang/Driver/CreateInvocationFromArgs.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/FrontendAction.h"
 #include "clang/Frontend/FrontendActions.h"
diff --git a/clang/unittests/Serialization/ForceCheckFileInputTest.cpp b/clang/unittests/Serialization/ForceCheckFileInputTest.cpp
index b76dcfec96063..edf33ae04230b 100644
--- a/clang/unittests/Serialization/ForceCheckFileInputTest.cpp
+++ b/clang/unittests/Serialization/ForceCheckFileInputTest.cpp
@@ -9,7 +9,6 @@
 #include "clang/ASTMatchers/ASTMatchFinder.h"
 #include "clang/ASTMatchers/ASTMatchers.h"
 #include "clang/Basic/FileManager.h"
-#include "clang/Driver/CreateInvocationFromArgs.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/CompilerInvocation.h"
 #include "clang/Frontend/FrontendActions.h"
diff --git a/clang/unittests/Serialization/LoadSpecLazilyTest.cpp b/clang/unittests/Serialization/LoadSpecLazilyTest.cpp
index f55925aeae1f2..d7b55491fddac 100644
--- a/clang/unittests/Serialization/LoadSpecLazilyTest.cpp
+++ b/clang/unittests/Serialization/LoadSpecLazilyTest.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Driver/CreateInvocationFromArgs.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/FrontendAction.h"
 #include "clang/Frontend/FrontendActions.h"
diff --git a/clang/unittests/Serialization/ModuleCacheTest.cpp b/clang/unittests/Serialization/ModuleCacheTest.cpp
index df26e54588b9e..e9b8da3dba6af 100644
--- a/clang/unittests/Serialization/ModuleCacheTest.cpp
+++ b/clang/unittests/Serialization/ModuleCacheTest.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Basic/FileManager.h"
-#include "clang/Driver/CreateInvocationFromArgs.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/CompilerInvocation.h"
 #include "clang/Frontend/FrontendActions.h"
diff --git a/clang/unittests/Serialization/NoCommentsTest.cpp b/clang/unittests/Serialization/NoCommentsTest.cpp
index 444a082bba907..01bb6999a7c90 100644
--- a/clang/unittests/Serialization/NoCommentsTest.cpp
+++ b/clang/unittests/Serialization/NoCommentsTest.cpp
@@ -9,7 +9,6 @@
 #include "clang/ASTMatchers/ASTMatchFinder.h"
 #include "clang/ASTMatchers/ASTMatchers.h"
 #include "clang/Basic/FileManager.h"
-#include "clang/Driver/CreateInvocationFromArgs.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/CompilerInvocation.h"
 #include "clang/Frontend/FrontendActions.h"
diff --git a/clang/unittests/Serialization/PreambleInNamedModulesTest.cpp b/clang/unittests/Serialization/PreambleInNamedModulesTest.cpp
index b826f20ce4d70..55ee72875ead2 100644
--- a/clang/unittests/Serialization/PreambleInNamedModulesTest.cpp
+++ b/clang/unittests/Serialization/PreambleInNamedModulesTest.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clang/Driver/CreateInvocationFromArgs.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/CompilerInvocation.h"
 #include "clang/Frontend/FrontendActions.h"
diff --git a/clang/unittests/Serialization/VarDeclConstantInitTest.cpp b/clang/unittests/Serialization/VarDeclConstantInitTest.cpp
index 2be01def49809..743f851fc5fe1 100644
--- a/clang/unittests/Serialization/VarDeclConstantInitTest.cpp
+++ b/clang/unittests/Serialization/VarDeclConstantInitTest.cpp
@@ -9,7 +9,6 @@
 #include "clang/ASTMatchers/ASTMatchFinder.h"
 #include "clang/ASTMatchers/ASTMatchers.h"
 #include "clang/Basic/FileManager.h"
-#include "clang/Driver/CreateInvocationFromArgs.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/CompilerInvocation.h"
 #include "clang/Frontend/FrontendActions.h"
diff --git a/clang/unittests/Tooling/RangeSelectorTest.cpp b/clang/unittests/Tooling/RangeSelectorTest.cpp
index a1fcbb023832f..d441da165b09b 100644
--- a/clang/unittests/Tooling/RangeSelectorTest.cpp
+++ b/clang/unittests/Tooling/RangeSelectorTest.cpp
@@ -339,6 +339,13 @@ TEST(RangeSelectorTest, NodeOpExpression) {
   EXPECT_THAT_EXPECTED(select(node("id"), Match), HasValue("3"));
 }
 
+TEST(RangeSelectorTest, NodeOpTypeLoc) {
+  StringRef Code = "namespace ns {struct Foo{};} ns::Foo a;";
+  TestMatch Match =
+      matchCode(Code, varDecl(hasTypeLoc(typeLoc().bind("typeloc"))));
+  EXPECT_THAT_EXPECTED(select(node("typeloc"), Match), HasValue("ns::Foo"));
+}
+
 TEST(RangeSelectorTest, StatementOp) {
   StringRef Code = "int f() { return 3; }";
   TestMatch Match = matchCode(Code, expr().bind("id"));
diff --git a/clang/unittests/Tooling/Syntax/TokensTest.cpp b/clang/unittests/Tooling/Syntax/TokensTest.cpp
index 468ca5ddd2c75..47184cbf5d768 100644
--- a/clang/unittests/Tooling/Syntax/TokensTest.cpp
+++ b/clang/unittests/Tooling/Syntax/TokensTest.cpp
@@ -20,7 +20,6 @@
 #include "clang/Basic/SourceManager.h"
 #include "clang/Basic/TokenKinds.def"
 #include "clang/Basic/TokenKinds.h"
-#include "clang/Driver/CreateInvocationFromArgs.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/FrontendAction.h"
 #include "clang/Frontend/Utils.h"
diff --git a/clang/unittests/Tooling/Syntax/TreeTestBase.cpp b/clang/unittests/Tooling/Syntax/TreeTestBase.cpp
index dad75854240ef..b2be64fc08f3d 100644
--- a/clang/unittests/Tooling/Syntax/TreeTestBase.cpp
+++ b/clang/unittests/Tooling/Syntax/TreeTestBase.cpp
@@ -13,7 +13,6 @@
 #include "TreeTestBase.h"
 #include "clang/AST/ASTConsumer.h"
 #include "clang/Basic/LLVM.h"
-#include "clang/Driver/CreateInvocationFromArgs.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/CompilerInvocation.h"
 #include "clang/Frontend/FrontendAction.h"
diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
index 4837ac96b9b26..1d4208b6a2aa0 100644
--- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
@@ -326,8 +326,10 @@ void ScudoCombinedTest<Config>::BasicTest(scudo::uptr SizeLog) {
     }
   }
 
-  Allocator->printStats();
-  Allocator->printFragmentationInfo();
+  if (TEST_HAS_FAILURE) {
+    Allocator->printStats();
+    Allocator->printFragmentationInfo();
+  }
 }
 
 #define SCUDO_MAKE_BASIC_TEST(SizeLog)                                         \
diff --git a/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp b/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp
index 1f5df28fd7771..3a087c497b1a9 100644
--- a/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp
@@ -230,9 +230,11 @@ SCUDO_TYPED_TEST(ScudoPrimaryTest, BasicPrimary) {
   }
   SizeClassAllocator.destroy(nullptr);
   Allocator->releaseToOS(scudo::ReleaseToOS::Force);
-  scudo::ScopedString Str;
-  Allocator->getStats(&Str);
-  Str.output();
+  if (TEST_HAS_FAILURE) {
+    scudo::ScopedString Str;
+    Allocator->getStats(&Str);
+    Str.output();
+  }
 }
 
 struct SmallRegionsConfig {
@@ -289,10 +291,12 @@ TEST(ScudoPrimaryTest, Primary64OOM) {
 
   SizeClassAllocator.destroy(nullptr);
   Allocator.releaseToOS(scudo::ReleaseToOS::Force);
-  scudo::ScopedString Str;
-  Allocator.getStats(&Str);
-  Str.output();
   EXPECT_EQ(AllocationFailed, true);
+  if (TEST_HAS_FAILURE) {
+    scudo::ScopedString Str;
+    Allocator.getStats(&Str);
+    Str.output();
+  }
   Allocator.unmapTestOnly();
 }
 
@@ -328,9 +332,11 @@ SCUDO_TYPED_TEST(ScudoPrimaryTest, PrimaryIterate) {
   }
   SizeClassAllocator.destroy(nullptr);
   Allocator->releaseToOS(scudo::ReleaseToOS::Force);
-  scudo::ScopedString Str;
-  Allocator->getStats(&Str);
-  Str.output();
+  if (TEST_HAS_FAILURE) {
+    scudo::ScopedString Str;
+    Allocator->getStats(&Str);
+    Str.output();
+  }
 }
 
 SCUDO_TYPED_TEST(ScudoPrimaryTest, PrimaryThreaded) {
@@ -385,11 +391,13 @@ SCUDO_TYPED_TEST(ScudoPrimaryTest, PrimaryThreaded) {
   for (auto &T : Threads)
     T.join();
   Allocator->releaseToOS(scudo::ReleaseToOS::Force);
-  scudo::ScopedString Str;
-  Allocator->getStats(&Str);
-  Allocator->getFragmentationInfo(&Str);
-  Allocator->getMemoryGroupFragmentationInfo(&Str);
-  Str.output();
+  if (TEST_HAS_FAILURE) {
+    scudo::ScopedString Str;
+    Allocator->getStats(&Str);
+    Allocator->getFragmentationInfo(&Str);
+    Allocator->getMemoryGroupFragmentationInfo(&Str);
+    Str.output();
+  }
 }
 
 // Through a simple allocation that spans two pages, verify that releaseToOS
diff --git a/compiler-rt/lib/scudo/standalone/tests/quarantine_test.cpp b/compiler-rt/lib/scudo/standalone/tests/quarantine_test.cpp
index 54d42edc374e5..e3e983be54574 100644
--- a/compiler-rt/lib/scudo/standalone/tests/quarantine_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/quarantine_test.cpp
@@ -216,9 +216,11 @@ TEST(ScudoQuarantineTest, GlobalQuarantine) {
   Quarantine.drainAndRecycle(&Cache, Cb);
   EXPECT_EQ(Cache.getSize(), 0UL);
 
-  scudo::ScopedString Str;
-  Quarantine.getStats(&Str);
-  Str.output();
+  if (TEST_HAS_FAILURE) {
+    scudo::ScopedString Str;
+    Quarantine.getStats(&Str);
+    Str.output();
+  }
 }
 
 struct PopulateQuarantineThread {
@@ -248,9 +250,11 @@ TEST(ScudoQuarantineTest, ThreadedGlobalQuarantine) {
   for (scudo::uptr I = 0; I < NumberOfThreads; I++)
     pthread_join(T[I].Thread, 0);
 
-  scudo::ScopedString Str;
-  Quarantine.getStats(&Str);
-  Str.output();
+  if (TEST_HAS_FAILURE) {
+    scudo::ScopedString Str;
+    Quarantine.getStats(&Str);
+    Str.output();
+  }
 
   for (scudo::uptr I = 0; I < NumberOfThreads; I++)
     Quarantine.drainAndRecycle(&T[I].Cache, Cb);
diff --git a/compiler-rt/lib/scudo/standalone/tests/size_class_map_test.cpp b/compiler-rt/lib/scudo/standalone/tests/size_class_map_test.cpp
index 05b5835ff0bb6..73b2823e4c9d1 100644
--- a/compiler-rt/lib/scudo/standalone/tests/size_class_map_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/size_class_map_test.cpp
@@ -12,8 +12,10 @@
 
 template <class SizeClassMap> void testSizeClassMap() {
   typedef SizeClassMap SCMap;
-  scudo::printMap<SCMap>();
   scudo::validateMap<SCMap>();
+  if (TEST_HAS_FAILURE) {
+    scudo::printMap<SCMap>();
+  }
 }
 
 TEST(ScudoSizeClassMapTest, DefaultSizeClassMap) {
diff --git a/compiler-rt/lib/ubsan_minimal/ubsan_minimal_handlers.cpp b/compiler-rt/lib/ubsan_minimal/ubsan_minimal_handlers.cpp
index ebc36a8583e05..480c5917877a3 100644
--- a/compiler-rt/lib/ubsan_minimal/ubsan_minimal_handlers.cpp
+++ b/compiler-rt/lib/ubsan_minimal/ubsan_minimal_handlers.cpp
@@ -12,6 +12,22 @@ static void message(const char *msg) { ubsan_message(msg); }
 static void message(const char *msg) { (void)write(2, msg, strlen(msg)); }
 #endif
 
+// If for some reason we cannot build the runtime with preserve_all, don't
+// emit any symbol. Programs that need them will fail to link, but that is
+// better than randomly corrupted registers.
+// Some architectures don't support preserve_all (but clang still has the
+// attribute). For now, only support x86-64 and aarch64.
+#if defined(__clang__) && defined(__has_cpp_attribute) &&                      \
+    (defined(__x86_64__) || defined(__aarch64__))
+#if __has_cpp_attribute(clang::preserve_all)
+#define PRESERVE_HANDLERS true
+#else
+#define PRESERVE_HANDLERS false
+#endif
+#else
+#define PRESERVE_HANDLERS false
+#endif
+
 static const int kMaxCallerPcs = 20;
 static __sanitizer::atomic_uintptr_t caller_pcs[kMaxCallerPcs];
 // Number of elements in caller_pcs. A special value of kMaxCallerPcs + 1 means
@@ -85,6 +101,18 @@ SANITIZER_INTERFACE_WEAK_DEF(void, __ubsan_report_error, const char *kind,
   }
 }
 
+#if PRESERVE_HANDLERS
+SANITIZER_INTERFACE_WEAK_DEF(void, __ubsan_report_error_preserve,
+                             const char *kind, uintptr_t caller)
+[[clang::preserve_all]] {
+  // Additional indirection so the user can override this with their own
+  // preserve_all function. This would allow, e.g., a function that reports the
+  // first error only, so for all subsequent calls we can skip the register save
+  // / restore.
+  __ubsan_report_error(kind, caller);
+}
+#endif
+
 SANITIZER_INTERFACE_WEAK_DEF(void, __ubsan_report_error_fatal, const char *kind,
                              uintptr_t caller) {
   // Use another handlers, in case it's already overriden.
@@ -119,6 +147,16 @@ void NORETURN CheckFailed(const char *file, int, const char *cond, u64, u64) {
 
 #define INTERFACE extern "C" __attribute__((visibility("default")))
 
+#if PRESERVE_HANDLERS
+#define HANDLER_PRESERVE(name, kind)                                           \
+  INTERFACE void __ubsan_handle_##name##_minimal_preserve()                    \
+      [[clang::preserve_all]] {                                                \
+    __ubsan_report_error_preserve(kind, GET_CALLER_PC());                      \
+  }
+#else
+#define HANDLER_PRESERVE(name, kind)
+#endif
+
 #define HANDLER_RECOVER(name, kind)                                            \
   INTERFACE void __ubsan_handle_##name##_minimal() {                           \
     __ubsan_report_error(kind, GET_CALLER_PC());                               \
@@ -133,7 +171,8 @@ void NORETURN CheckFailed(const char *file, int, const char *cond, u64, u64) {
 
 #define HANDLER(name, kind)                                                    \
   HANDLER_RECOVER(name, kind)                                                  \
-  HANDLER_NORECOVER(name, kind)
+  HANDLER_NORECOVER(name, kind)                                                \
+  HANDLER_PRESERVE(name, kind)
 
 HANDLER(type_mismatch, "type-mismatch")
 HANDLER(alignment_assumption, "alignment-assumption")
diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py
index ea22fb0babc46..dce01cc9743b3 100644
--- a/compiler-rt/test/lit.common.cfg.py
+++ b/compiler-rt/test/lit.common.cfg.py
@@ -971,7 +971,9 @@ def target_page_size():
             stdin=subprocess.PIPE,
             stdout=subprocess.PIPE,
         )
-        out, err = proc.communicate(b'import os; print(os.sysconf("SC_PAGESIZE"))')
+        out, err = proc.communicate(
+            b'import os; print(os.sysconf("SC_PAGESIZE") if hasattr(os, "sysconf") else "")'
+        )
         return int(out)
     except:
         return 4096
diff --git a/compiler-rt/test/ubsan_minimal/TestCases/test-darwin-interface.c b/compiler-rt/test/ubsan_minimal/TestCases/test-darwin-interface.c
index 849401ef78741..f7702c99ee709 100644
--- a/compiler-rt/test/ubsan_minimal/TestCases/test-darwin-interface.c
+++ b/compiler-rt/test/ubsan_minimal/TestCases/test-darwin-interface.c
@@ -8,6 +8,7 @@
 // RUN: sed -e 's/.*"\(.*libclang_rt.ubsan_minimal_osx_dynamic.dylib\)".*/\1/' | \
 // RUN: tr -d '\n' > %t.dylib_path1
 // RUN: nm -jgU %{readfile:%t.dylib_path1} | grep "^___ubsan_handle" \
+// RUN:  | grep -vE "_minimal_preserve" \
 // RUN:  | sed 's/_minimal//g' \
 // RUN:  > %t.minimal.symlist
 //
diff --git a/flang/include/flang/Lower/Support/ReductionProcessor.h b/flang/include/flang/Lower/Support/ReductionProcessor.h
index 905784d25fdb2..d171101c6768e 100644
--- a/flang/include/flang/Lower/Support/ReductionProcessor.h
+++ b/flang/include/flang/Lower/Support/ReductionProcessor.h
@@ -39,6 +39,13 @@ namespace omp {
 
 class ReductionProcessor {
 public:
+  using GenInitValueCBTy =
+      std::function<mlir::Value(fir::FirOpBuilder &builder, mlir::Location loc,
+                                mlir::Type type, mlir::Value ompOrig)>;
+  using GenCombinerCBTy = std::function<void(
+      fir::FirOpBuilder &builder, mlir::Location loc, mlir::Type type,
+      mlir::Value op1, mlir::Value op2, bool isByRef)>;
+
   // TODO: Move this enumeration to the OpenMP dialect
   enum ReductionIdentifier {
     ID,
@@ -57,6 +64,9 @@ class ReductionProcessor {
     IEOR
   };
 
+  static bool doReductionByRef(mlir::Type reductionType);
+  static bool doReductionByRef(mlir::Value reductionVar);
+
   static ReductionIdentifier
   getReductionType(const omp::clause::ProcedureDesignator &pd);
 
@@ -108,6 +118,14 @@ class ReductionProcessor {
                                           ReductionIdentifier redId,
                                           mlir::Type type, mlir::Value op1,
                                           mlir::Value op2);
+  /// Creates an OpenMP reduction declaration and inserts it into the provided
+  /// symbol table. The init and combiner regions are generated by the callback
+  /// functions genCombinerCB and genInitValueCB.
+  template <typename DeclareRedType>
+  static DeclareRedType createDeclareReductionHelper(
+      AbstractConverter &converter, llvm::StringRef reductionOpName,
+      mlir::Type type, mlir::Location loc, bool isByRef,
+      GenCombinerCBTy genCombinerCB, GenInitValueCBTy genInitValueCB);
 
   /// Creates an OpenMP reduction declaration and inserts it into the provided
   /// symbol table. The declaration has a constant initializer with the neutral
diff --git a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
index ae7d566920656..977bc0f4ee58c 100644
--- a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h
@@ -47,6 +47,8 @@ struct CUDAIntrinsicLibrary : IntrinsicLibrary {
   void genBarrierInit(llvm::ArrayRef<fir::ExtendedValue>);
   mlir::Value genBarrierTryWait(mlir::Type, llvm::ArrayRef<mlir::Value>);
   mlir::Value genBarrierTryWaitSleep(mlir::Type, llvm::ArrayRef<mlir::Value>);
+  mlir::Value genClusterBlockIndex(mlir::Type, llvm::ArrayRef<mlir::Value>);
+  mlir::Value genClusterDimBlocks(mlir::Type, llvm::ArrayRef<mlir::Value>);
   void genFenceProxyAsync(llvm::ArrayRef<fir::ExtendedValue>);
   template <const char *fctName, int extent>
   fir::ExtendedValue genLDXXFunc(mlir::Type,
@@ -60,6 +62,7 @@ struct CUDAIntrinsicLibrary : IntrinsicLibrary {
   mlir::Value genSyncThreadsCount(mlir::Type, llvm::ArrayRef<mlir::Value>);
   mlir::Value genSyncThreadsOr(mlir::Type, llvm::ArrayRef<mlir::Value>);
   void genSyncWarp(llvm::ArrayRef<fir::ExtendedValue>);
+  mlir::Value genThisCluster(mlir::Type, llvm::ArrayRef<mlir::Value>);
   mlir::Value genThisGrid(mlir::Type, llvm::ArrayRef<mlir::Value>);
   mlir::Value genThisThreadBlock(mlir::Type, llvm::ArrayRef<mlir::Value>);
   mlir::Value genThisWarp(mlir::Type, llvm::ArrayRef<mlir::Value>);
diff --git a/flang/lib/Frontend/CMakeLists.txt b/flang/lib/Frontend/CMakeLists.txt
index fb74b3dcb280e..bb0b4a39cec9b 100644
--- a/flang/lib/Frontend/CMakeLists.txt
+++ b/flang/lib/Frontend/CMakeLists.txt
@@ -75,6 +75,7 @@ add_flang_library(flangFrontend
 
   CLANG_LIBS
   clangBasic
+  clangDriver
   clangOptions
 )
 
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index 76f7329d2d126..7cac9acefe702 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -325,9 +325,10 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
   for (auto *a : args.filtered(clang::options::OPT_fpass_plugin_EQ))
     opts.LLVMPassPlugins.push_back(a->getValue());
 
-  opts.Reciprocals = clang::parseMRecipOption(diags, args);
+  opts.Reciprocals = clang::driver::tools::parseMRecipOption(diags, args);
 
-  opts.PreferVectorWidth = clang::parseMPreferVectorWidthOption(diags, args);
+  opts.PreferVectorWidth =
+      clang::driver::tools::parseMPreferVectorWidthOption(diags, args);
 
   // -fembed-offload-object option
   for (auto *a : args.filtered(clang::options::OPT_fembed_offload_object_EQ))
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index 915c8b4a5c6ce..3c72c41c4d4cb 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -13,6 +13,7 @@
 #include "ClauseProcessor.h"
 #include "flang/Lower/OpenMP/Utils.h"
 
+#include "flang/Lower/ConvertCall.h"
 #include "flang/Lower/ConvertExprToHLFIR.h"
 #include "flang/Lower/OpenMP/Clauses.h"
 #include "flang/Lower/PFTBuilder.h"
@@ -402,6 +403,65 @@ bool ClauseProcessor::processInclusive(
   return false;
 }
 
+bool ClauseProcessor::processInitializer(
+    lower::SymMap &symMap, const parser::OmpClause::Initializer &inp,
+    ReductionProcessor::GenInitValueCBTy &genInitValueCB) const {
+  if (auto *clause = findUniqueClause<omp::clause::Initializer>()) {
+    genInitValueCB = [&, clause](fir::FirOpBuilder &builder, mlir::Location loc,
+                                 mlir::Type type, mlir::Value ompOrig) {
+      lower::SymMapScope scope(symMap);
+      const parser::OmpInitializerExpression &iexpr = inp.v.v;
+      const parser::OmpStylizedInstance &styleInstance = iexpr.v.front();
+      const std::list<parser::OmpStylizedDeclaration> &declList =
+          std::get<std::list<parser::OmpStylizedDeclaration>>(styleInstance.t);
+      mlir::Value ompPrivVar;
+      for (const parser::OmpStylizedDeclaration &decl : declList) {
+        auto &name = std::get<parser::ObjectName>(decl.var.t);
+        assert(name.symbol && "Name does not have a symbol");
+        mlir::Value addr = builder.createTemporary(loc, ompOrig.getType());
+        fir::StoreOp::create(builder, loc, ompOrig, addr);
+        fir::FortranVariableFlagsEnum extraFlags = {};
+        fir::FortranVariableFlagsAttr attributes =
+            Fortran::lower::translateSymbolAttributes(builder.getContext(),
+                                                      *name.symbol, extraFlags);
+        auto declareOp = hlfir::DeclareOp::create(
+            builder, loc, addr, name.ToString(), nullptr, {}, nullptr, nullptr,
+            0, attributes);
+        if (name.ToString() == "omp_priv")
+          ompPrivVar = declareOp.getResult(0);
+        symMap.addVariableDefinition(*name.symbol, declareOp);
+      }
+      // Lower the expression/function call
+      lower::StatementContext stmtCtx;
+      mlir::Value result = common::visit(
+          common::visitors{
+              [&](const evaluate::ProcedureRef &procRef) -> mlir::Value {
+                convertCallToHLFIR(loc, converter, procRef, std::nullopt,
+                                   symMap, stmtCtx);
+                auto privVal = fir::LoadOp::create(builder, loc, ompPrivVar);
+                return privVal;
+              },
+              [&](const auto &expr) -> mlir::Value {
+                mlir::Value exprResult = fir::getBase(convertExprToValue(
+                    loc, converter, clause->v, symMap, stmtCtx));
+                // Conversion can either give a value or a reference to a value.
+                // We need to return the reduction type, so an optional load may
+                // be generated.
+                if (auto refType = llvm::dyn_cast<fir::ReferenceType>(
+                        exprResult.getType()))
+                  if (ompPrivVar.getType() == refType)
+                    exprResult = fir::LoadOp::create(builder, loc, exprResult);
+                return exprResult;
+              }},
+          clause->v.u);
+      stmtCtx.finalizeAndPop();
+      return result;
+    };
+    return true;
+  }
+  return false;
+}
+
 bool ClauseProcessor::processMergeable(
     mlir::omp::MergeableClauseOps &result) const {
   return markClauseOccurrence<omp::clause::Mergeable>(result.mergeable);
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h
index 00cf9bbe4c48b..c05e8d8d3f47e 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.h
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h
@@ -18,6 +18,7 @@
 #include "flang/Lower/Bridge.h"
 #include "flang/Lower/DirectivesCommon.h"
 #include "flang/Lower/OpenMP/Clauses.h"
+#include "flang/Lower/Support/ReductionProcessor.h"
 #include "flang/Optimizer/Builder/Todo.h"
 #include "flang/Parser/dump-parse-tree.h"
 #include "flang/Parser/parse-tree.h"
@@ -88,6 +89,9 @@ class ClauseProcessor {
   bool processHint(mlir::omp::HintClauseOps &result) const;
   bool processInclusive(mlir::Location currentLocation,
                         mlir::omp::InclusiveClauseOps &result) const;
+  bool processInitializer(
+      lower::SymMap &symMap, const parser::OmpClause::Initializer &inp,
+      ReductionProcessor::GenInitValueCBTy &genInitValueCB) const;
   bool processMergeable(mlir::omp::MergeableClauseOps &result) const;
   bool processNogroup(mlir::omp::NogroupClauseOps &result) const;
   bool processNowait(mlir::omp::NowaitClauseOps &result) const;
diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp
index b1a3c3d3c5439..dc49a8118b0a5 100644
--- a/flang/lib/Lower/OpenMP/Clauses.cpp
+++ b/flang/lib/Lower/OpenMP/Clauses.cpp
@@ -981,7 +981,22 @@ Init make(const parser::OmpClause::Init &inp,
 
 Initializer make(const parser::OmpClause::Initializer &inp,
                  semantics::SemanticsContext &semaCtx) {
-  llvm_unreachable("Empty: initializer");
+  const parser::OmpInitializerExpression &iexpr = inp.v.v;
+  const parser::OmpStylizedInstance &styleInstance = iexpr.v.front();
+  const parser::OmpStylizedInstance::Instance &instance =
+      std::get<parser::OmpStylizedInstance::Instance>(styleInstance.t);
+  if (const auto *as = std::get_if<parser::AssignmentStmt>(&instance.u)) {
+    auto &expr = std::get<parser::Expr>(as->t);
+    return Initializer{makeExpr(expr, semaCtx)};
+  } else if (const auto *call = std::get_if<parser::CallStmt>(&instance.u)) {
+    if (call->typedCall) {
+      const auto &procRef = *call->typedCall;
+      semantics::SomeExpr evalProcRef{procRef};
+      return Initializer{evalProcRef};
+    }
+  }
+
+  llvm_unreachable("Unexpected initializer");
 }
 
 InReduction make(const parser::OmpClause::InReduction &inp,
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 921caf6f9087e..98a4e7851bf17 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -17,13 +17,16 @@
 #include "DataSharingProcessor.h"
 #include "Decomposer.h"
 #include "flang/Common/idioms.h"
+#include "flang/Evaluate/type.h"
 #include "flang/Lower/Bridge.h"
 #include "flang/Lower/ConvertExpr.h"
+#include "flang/Lower/ConvertExprToHLFIR.h"
 #include "flang/Lower/ConvertVariable.h"
 #include "flang/Lower/DirectivesCommon.h"
 #include "flang/Lower/OpenMP/Clauses.h"
 #include "flang/Lower/OpenMP/Utils.h"
 #include "flang/Lower/StatementContext.h"
+#include "flang/Lower/Support/ReductionProcessor.h"
 #include "flang/Lower/SymbolMap.h"
 #include "flang/Optimizer/Builder/BoxValue.h"
 #include "flang/Optimizer/Builder/FIRBuilder.h"
@@ -2859,7 +2862,6 @@ genTeamsOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
   // TODO: Add private syms and vars.
   args.reduction.syms = reductionSyms;
   args.reduction.vars = clauseOps.reductionVars;
-
   return genOpWithBody<mlir::omp::TeamsOp>(
       OpWithBodyGenInfo(converter, symTable, semaCtx, loc, eval,
                         llvm::omp::Directive::OMPD_teams)
@@ -3596,12 +3598,156 @@ genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
     TODO(converter.getCurrentLocation(), "OmpDeclareVariantDirective");
 }
 
+static ReductionProcessor::GenCombinerCBTy
+processReductionCombiner(lower::AbstractConverter &converter,
+                         lower::SymMap &symTable,
+                         semantics::SemanticsContext &semaCtx,
+                         const parser::OmpReductionSpecifier &specifier) {
+  ReductionProcessor::GenCombinerCBTy genCombinerCB;
+  const auto &combinerExpression =
+      std::get<std::optional<parser::OmpCombinerExpression>>(specifier.t)
+          .value();
+  const parser::OmpStylizedInstance &combinerInstance =
+      combinerExpression.v.front();
+  const parser::OmpStylizedInstance::Instance &instance =
+      std::get<parser::OmpStylizedInstance::Instance>(combinerInstance.t);
+
+  const auto *as = std::get_if<parser::AssignmentStmt>(&instance.u);
+  if (!as) {
+    TODO(converter.getCurrentLocation(),
+         "A combiner that is a subroutine call is not yet supported");
+  }
+  auto &expr = std::get<parser::Expr>(as->t);
+  genCombinerCB = [&](fir::FirOpBuilder &builder, mlir::Location loc,
+                      mlir::Type type, mlir::Value lhs, mlir::Value rhs,
+                      bool isByRef) {
+    const auto &evalExpr = makeExpr(expr, semaCtx);
+    lower::SymMapScope scope(symTable);
+    const std::list<parser::OmpStylizedDeclaration> &declList =
+        std::get<std::list<parser::OmpStylizedDeclaration>>(combinerInstance.t);
+    for (const parser::OmpStylizedDeclaration &decl : declList) {
+      auto &name = std::get<parser::ObjectName>(decl.var.t);
+      mlir::Value addr = lhs;
+      mlir::Type type = lhs.getType();
+      bool isRhs = name.ToString() == std::string("omp_in");
+      if (isRhs) {
+        addr = rhs;
+        type = rhs.getType();
+      }
+
+      assert(name.symbol && "Reduction object name does not have a symbol");
+      if (!fir::conformsWithPassByRef(type)) {
+        addr = builder.createTemporary(loc, type);
+        fir::StoreOp::create(builder, loc, isRhs ? rhs : lhs, addr);
+      }
+      fir::FortranVariableFlagsEnum extraFlags = {};
+      fir::FortranVariableFlagsAttr attributes =
+          Fortran::lower::translateSymbolAttributes(builder.getContext(),
+                                                    *name.symbol, extraFlags);
+      auto declareOp =
+          hlfir::DeclareOp::create(builder, loc, addr, name.ToString(), nullptr,
+                                   {}, nullptr, nullptr, 0, attributes);
+      symTable.addVariableDefinition(*name.symbol, declareOp);
+    }
+
+    lower::StatementContext stmtCtx;
+    mlir::Value result = fir::getBase(
+        convertExprToValue(loc, converter, evalExpr, symTable, stmtCtx));
+    if (auto refType = llvm::dyn_cast<fir::ReferenceType>(result.getType()))
+      if (lhs.getType() == refType.getElementType())
+        result = fir::LoadOp::create(builder, loc, result);
+    stmtCtx.finalizeAndPop();
+    if (isByRef) {
+      fir::StoreOp::create(builder, loc, result, lhs);
+      mlir::omp::YieldOp::create(builder, loc, lhs);
+    } else {
+      mlir::omp::YieldOp::create(builder, loc, result);
+    }
+  };
+  return genCombinerCB;
+}
+
+// Returns true only if the reduction type is a trivial type or a derived type
+// whose components are all trivial; every other type is rejected.
+static bool isSimpleReductionType(mlir::Type reductionType) {
+  if (fir::isa_trivial(reductionType))
+    return true;
+  auto recordTy = mlir::dyn_cast<fir::RecordType>(reductionType);
+  if (!recordTy)
+    return false;
+  for (auto [_, fieldType] : recordTy.getTypeList())
+    if (!fir::isa_trivial(fieldType))
+      return false;
+  return true;
+}
+
+// Getting the type from a symbol compared to a DeclSpec is simpler since we do
+// not need to consider derived vs intrinsic types. Semantics is guaranteed to
+// generate these symbols.
+static mlir::Type
+getReductionType(lower::AbstractConverter &converter,
+                 const parser::OmpReductionSpecifier &specifier) {
+  const auto &combinerExpression =
+      std::get<std::optional<parser::OmpCombinerExpression>>(specifier.t)
+          .value();
+  const parser::OmpStylizedInstance &combinerInstance =
+      combinerExpression.v.front();
+  const std::list<parser::OmpStylizedDeclaration> &declList =
+      std::get<std::list<parser::OmpStylizedDeclaration>>(combinerInstance.t);
+  const parser::OmpStylizedDeclaration &decl = declList.front();
+  const auto &name = std::get<parser::ObjectName>(decl.var.t);
+  const auto &symbol = semantics::SymbolRef(*name.symbol);
+  mlir::Type reductionType = converter.genType(symbol);
+
+  if (!isSimpleReductionType(reductionType))
+    TODO(converter.getCurrentLocation(),
+         "declare reduction currently only supports trival types or derived "
+         "types containing trivial types");
+  return reductionType;
+}
+
 static void genOMP(
     lower::AbstractConverter &converter, lower::SymMap &symTable,
     semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
     const parser::OpenMPDeclareReductionConstruct &declareReductionConstruct) {
-  if (!semaCtx.langOptions().OpenMPSimd)
-    TODO(converter.getCurrentLocation(), "OpenMPDeclareReductionConstruct");
+  if (semaCtx.langOptions().OpenMPSimd)
+    return;
+
+  const parser::OmpArgumentList &args{declareReductionConstruct.v.Arguments()};
+  const parser::OmpArgument &arg{args.v.front()};
+  const auto &specifier = std::get<parser::OmpReductionSpecifier>(arg.u);
+
+  if (std::get<parser::OmpTypeNameList>(specifier.t).v.size() > 1)
+    TODO(converter.getCurrentLocation(),
+         "multiple types in declare reduction is not yet supported");
+
+  mlir::Type reductionType = getReductionType(converter, specifier);
+  ReductionProcessor::GenCombinerCBTy genCombinerCB =
+      processReductionCombiner(converter, symTable, semaCtx, specifier);
+  const parser::OmpClauseList &initializer =
+      declareReductionConstruct.v.Clauses();
+  if (initializer.v.size() > 0) {
+    List<Clause> clauses = makeClauses(initializer, semaCtx);
+    ReductionProcessor::GenInitValueCBTy genInitValueCB;
+    ClauseProcessor cp(converter, semaCtx, clauses);
+    const parser::OmpClause::Initializer &iclause{
+        std::get<parser::OmpClause::Initializer>(initializer.v.front().u)};
+    cp.processInitializer(symTable, iclause, genInitValueCB);
+    const auto &identifier =
+        std::get<parser::OmpReductionIdentifier>(specifier.t);
+    const auto &designator =
+        std::get<parser::ProcedureDesignator>(identifier.u);
+    const auto &reductionName = std::get<parser::Name>(designator.u);
+    bool isByRef = ReductionProcessor::doReductionByRef(reductionType);
+    ReductionProcessor::createDeclareReductionHelper<
+        mlir::omp::DeclareReductionOp>(
+        converter, reductionName.ToString(), reductionType,
+        converter.getCurrentLocation(), isByRef, genCombinerCB, genInitValueCB);
+  } else {
+    TODO(converter.getCurrentLocation(),
+         "declare reduction without an initializer clause is not yet "
+         "supported");
+  }
 }
 
 static void
diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp
index 55b3a36261233..1d57edfaa8b16 100644
--- a/flang/lib/Lower/OpenMP/Utils.cpp
+++ b/flang/lib/Lower/OpenMP/Utils.cpp
@@ -797,6 +797,28 @@ static void processTileSizesFromOpenMPConstruct(
   }
 }
 
+static pft::Evaluation *getNestedDoConstruct(pft::Evaluation &eval) {
+  for (pft::Evaluation &nested : eval.getNestedEvaluations()) {
+    // In an OpenMPConstruct there can be compiler directives:
+    // 1 <<OpenMPConstruct>>
+    //     2 CompilerDirective: !unroll
+    //     <<DoConstruct>> -> 8
+    if (nested.getIf<parser::CompilerDirective>())
+      continue;
+    // Within a DoConstruct, there can be compiler directives, plus
+    // there is a DoStmt before the body:
+    // <<DoConstruct>> -> 8
+    //     3 NonLabelDoStmt -> 7: do i = 1, n
+    //     <<DoConstruct>> -> 7
+    if (nested.getIf<parser::NonLabelDoStmt>())
+      continue;
+    assert(nested.getIf<parser::DoConstruct>() &&
+           "Unexpected construct in the nested evaluations");
+    return &nested;
+  }
+  llvm_unreachable("Expected do loop to be in the nested evaluations");
+}
+
 /// Populates the sizes vector with values if the given OpenMPConstruct
 /// contains a loop construct with an inner tiling construct.
 void collectTileSizesFromOpenMPConstruct(
@@ -819,7 +841,7 @@ int64_t collectLoopRelatedInfo(
   int64_t numCollapse = 1;
 
   // Collect the loops to collapse.
-  lower::pft::Evaluation *doConstructEval = &eval.getFirstNestedEvaluation();
+  lower::pft::Evaluation *doConstructEval = getNestedDoConstruct(eval);
   if (doConstructEval->getIf<parser::DoConstruct>()->IsDoConcurrent()) {
     TODO(currentLocation, "Do Concurrent in Worksharing loop construct");
   }
@@ -845,7 +867,7 @@ void collectLoopRelatedInfo(
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
 
   // Collect the loops to collapse.
-  lower::pft::Evaluation *doConstructEval = &eval.getFirstNestedEvaluation();
+  lower::pft::Evaluation *doConstructEval = getNestedDoConstruct(eval);
   if (doConstructEval->getIf<parser::DoConstruct>()->IsDoConcurrent()) {
     TODO(currentLocation, "Do Concurrent in Worksharing loop construct");
   }
@@ -886,9 +908,8 @@ void collectLoopRelatedInfo(
     iv.push_back(bounds->name.thing.symbol);
     loopVarTypeSize = std::max(loopVarTypeSize,
                                bounds->name.thing.symbol->GetUltimate().size());
-    collapseValue--;
-    doConstructEval =
-        &*std::next(doConstructEval->getNestedEvaluations().begin());
+    if (--collapseValue)
+      doConstructEval = getNestedDoConstruct(*doConstructEval);
   } while (collapseValue > 0);
 
   convertLoopBounds(converter, currentLocation, result, loopVarTypeSize);
diff --git a/flang/lib/Lower/Support/ReductionProcessor.cpp b/flang/lib/Lower/Support/ReductionProcessor.cpp
index 605a5b6b20b94..721cb45cd7d24 100644
--- a/flang/lib/Lower/Support/ReductionProcessor.cpp
+++ b/flang/lib/Lower/Support/ReductionProcessor.cpp
@@ -501,7 +501,7 @@ static mlir::Type unwrapSeqOrBoxedType(mlir::Type ty) {
 template <typename OpType>
 static void createReductionAllocAndInitRegions(
     AbstractConverter &converter, mlir::Location loc, OpType &reductionDecl,
-    const ReductionProcessor::ReductionIdentifier redId, mlir::Type type,
+    ReductionProcessor::GenInitValueCBTy genInitValueCB, mlir::Type type,
     bool isByRef) {
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
   auto yield = [&](mlir::Value ret) { genYield<OpType>(builder, loc, ret); };
@@ -523,9 +523,8 @@ static void createReductionAllocAndInitRegions(
 
   mlir::Type ty = fir::unwrapRefType(type);
   builder.setInsertionPointToEnd(initBlock);
-  mlir::Value initValue = ReductionProcessor::getReductionInitValue(
-      loc, unwrapSeqOrBoxedType(ty), redId, builder);
-
+  mlir::Value initValue =
+      genInitValueCB(builder, loc, ty, initBlock->getArgument(0));
   if (isByRef) {
     populateByRefInitAndCleanupRegions(
         converter, loc, type, initValue, initBlock,
@@ -536,7 +535,7 @@ static void createReductionAllocAndInitRegions(
         /*isDoConcurrent*/ std::is_same_v<OpType, fir::DeclareReductionOp>);
   }
 
-  if (fir::isa_trivial(ty)) {
+  if (fir::isa_trivial(ty) || fir::isa_derived(ty)) {
     if (isByRef) {
       // alloc region
       builder.setInsertionPointToEnd(allocBlock);
@@ -556,18 +555,18 @@ static void createReductionAllocAndInitRegions(
   yield(boxAlloca);
 }
 
-template <typename OpType>
-OpType ReductionProcessor::createDeclareReduction(
+template <typename DeclareRedType>
+DeclareRedType ReductionProcessor::createDeclareReductionHelper(
     AbstractConverter &converter, llvm::StringRef reductionOpName,
-    const ReductionIdentifier redId, mlir::Type type, mlir::Location loc,
-    bool isByRef) {
+    mlir::Type type, mlir::Location loc, bool isByRef,
+    GenCombinerCBTy genCombinerCB, GenInitValueCBTy genInitValueCB) {
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
   mlir::OpBuilder::InsertionGuard guard(builder);
   mlir::ModuleOp module = builder.getModule();
 
   assert(!reductionOpName.empty());
 
-  auto decl = module.lookupSymbol<OpType>(reductionOpName);
+  auto decl = module.lookupSymbol<DeclareRedType>(reductionOpName);
   if (decl)
     return decl;
 
@@ -576,23 +575,54 @@ OpType ReductionProcessor::createDeclareReduction(
   if (!isByRef)
     type = valTy;
 
-  decl = OpType::create(modBuilder, loc, reductionOpName, type);
-  createReductionAllocAndInitRegions(converter, loc, decl, redId, type,
+  decl = DeclareRedType::create(modBuilder, loc, reductionOpName, type);
+  createReductionAllocAndInitRegions(converter, loc, decl, genInitValueCB, type,
                                      isByRef);
-
   builder.createBlock(&decl.getReductionRegion(),
                       decl.getReductionRegion().end(), {type, type},
                       {loc, loc});
-
   builder.setInsertionPointToEnd(&decl.getReductionRegion().back());
   mlir::Value op1 = decl.getReductionRegion().front().getArgument(0);
   mlir::Value op2 = decl.getReductionRegion().front().getArgument(1);
-  genCombiner<OpType>(builder, loc, redId, type, op1, op2, isByRef);
-
+  genCombinerCB(builder, loc, type, op1, op2, isByRef);
   return decl;
 }
 
-static bool doReductionByRef(mlir::Value reductionVar) {
+template <typename OpType>
+OpType ReductionProcessor::createDeclareReduction(
+    AbstractConverter &converter, llvm::StringRef reductionOpName,
+    const ReductionIdentifier redId, mlir::Type type, mlir::Location loc,
+    bool isByRef) {
+  auto genInitValueCB = [&](fir::FirOpBuilder &builder, mlir::Location loc,
+                            mlir::Type type, mlir::Value val) {
+    mlir::Type ty = fir::unwrapRefType(type);
+    mlir::Value initValue = ReductionProcessor::getReductionInitValue(
+        loc, unwrapSeqOrBoxedType(ty), redId, builder);
+    return initValue;
+  };
+  auto genCombinerCB = [&](fir::FirOpBuilder &builder, mlir::Location loc,
+                           mlir::Type type, mlir::Value op1, mlir::Value op2,
+                           bool isByRef) {
+    genCombiner<OpType>(builder, loc, redId, type, op1, op2, isByRef);
+  };
+
+  return createDeclareReductionHelper<OpType>(converter, reductionOpName, type,
+                                              loc, isByRef, genCombinerCB,
+                                              genInitValueCB);
+}
+
+// Decides whether a reduction over the given type is performed by reference:
+// always when forceByrefReduction is set, otherwise only when the unwrapped
+// type is neither trivial nor derived.
+bool ReductionProcessor::doReductionByRef(mlir::Type reductionType) {
+  if (forceByrefReduction)
+    return true;
+
+  mlir::Type valueTy = fir::unwrapRefType(reductionType);
+  return !fir::isa_trivial(valueTy) && !fir::isa_derived(valueTy);
+}
+
+bool ReductionProcessor::doReductionByRef(mlir::Value reductionVar) {
   if (forceByrefReduction)
     return true;
 
@@ -600,10 +630,7 @@ static bool doReductionByRef(mlir::Value reductionVar) {
           mlir::dyn_cast<hlfir::DeclareOp>(reductionVar.getDefiningOp()))
     reductionVar = declare.getMemref();
 
-  if (!fir::isa_trivial(fir::unwrapRefType(reductionVar.getType())))
-    return true;
-
-  return false;
+  return doReductionByRef(reductionVar.getType());
 }
 
 template <typename OpType, typename RedOperatorListTy>
@@ -614,6 +641,8 @@ bool ReductionProcessor::processReductionArguments(
     llvm::SmallVectorImpl<bool> &reduceVarByRef,
     llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
     const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols) {
+  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
+
   if constexpr (std::is_same_v<RedOperatorListTy,
                                omp::clause::ReductionOperatorList>) {
     // For OpenMP reduction clauses, check if the reduction operator is
@@ -627,7 +656,13 @@ bool ReductionProcessor::processReductionArguments(
               std::get_if<omp::clause::ProcedureDesignator>(&redOperator.u)) {
         if (!ReductionProcessor::supportedIntrinsicProcReduction(
                 *reductionIntrinsic)) {
-          return false;
+          // If it is not an intrinsic, it has to be a custom reduction op and
+          // should be available in the module.
+          semantics::Symbol *sym = reductionIntrinsic->v.sym();
+          mlir::ModuleOp module = builder.getModule();
+          auto decl = module.lookupSymbol<OpType>(getRealName(sym).ToString());
+          if (!decl)
+            return false;
         }
       } else {
         return false;
@@ -637,7 +672,6 @@ bool ReductionProcessor::processReductionArguments(
 
   // Reduction variable processing common to both intrinsic operators and
   // procedure designators
-  fir::FirOpBuilder &builder = converter.getFirOpBuilder();
   mlir::OpBuilder::InsertPoint dcIP;
   constexpr bool isDoConcurrent =
       std::is_same_v<OpType, fir::DeclareReductionOp>;
@@ -741,7 +775,13 @@ bool ReductionProcessor::processReductionArguments(
                          &redOperator.u)) {
         if (!ReductionProcessor::supportedIntrinsicProcReduction(
                 *reductionIntrinsic)) {
-          TODO(currentLocation, "Unsupported intrinsic proc reduction");
+          // Custom reductions we can just add to the symbols without
+          // generating the declare reduction op.
+          semantics::Symbol *sym = reductionIntrinsic->v.sym();
+          reductionDeclSymbols.push_back(mlir::SymbolRefAttr::get(
+              builder.getContext(), sym->name().ToString()));
+          ++idx;
+          continue;
         }
         redId = getReductionType(*reductionIntrinsic);
         reductionName =
diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
index f67129dfa6730..a0d9678683e44 100644
--- a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp
@@ -368,6 +368,16 @@ static constexpr IntrinsicHandler cudaHandlers[]{
          &CI::genNVVMTime<mlir::NVVM::Clock64Op>),
      {},
      /*isElemental=*/false},
+    {"cluster_block_index",
+     static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+         &CI::genClusterBlockIndex),
+     {},
+     /*isElemental=*/false},
+    {"cluster_dim_blocks",
+     static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(
+         &CI::genClusterDimBlocks),
+     {},
+     /*isElemental=*/false},
     {"fence_proxy_async",
      static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(
          &CI::genFenceProxyAsync),
@@ -457,6 +467,10 @@ static constexpr IntrinsicHandler cudaHandlers[]{
      static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genSyncWarp),
      {},
      /*isElemental=*/false},
+    {"this_cluster",
+     static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genThisCluster),
+     {},
+     /*isElemental=*/false},
     {"this_grid",
      static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genThisGrid),
      {},
@@ -981,6 +995,60 @@ CUDAIntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType,
       .getResult(0);
 }
 
+static void insertValueAtPos(fir::FirOpBuilder &builder, mlir::Location loc,
+                             fir::RecordType recTy, mlir::Value base,
+                             mlir::Value dim, unsigned fieldPos) {
+  auto fieldName = recTy.getTypeList()[fieldPos].first;
+  mlir::Type fieldTy = recTy.getTypeList()[fieldPos].second;
+  mlir::Type fieldIndexType = fir::FieldType::get(base.getContext());
+  mlir::Value fieldIndex =
+      fir::FieldIndexOp::create(builder, loc, fieldIndexType, fieldName, recTy,
+                                /*typeParams=*/mlir::ValueRange{});
+  mlir::Value coord = fir::CoordinateOp::create(
+      builder, loc, builder.getRefType(fieldTy), base, fieldIndex);
+  fir::StoreOp::create(builder, loc, dim, coord);
+}
+
+// CLUSTER_BLOCK_INDEX
+mlir::Value
+CUDAIntrinsicLibrary::genClusterBlockIndex(mlir::Type resultType,
+                                           llvm::ArrayRef<mlir::Value> args) {
+  assert(args.size() == 0);
+  auto recTy = mlir::cast<fir::RecordType>(resultType);
+  assert(recTy && "RecordType expepected");
+  mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);
+  mlir::Type i32Ty = builder.getI32Type();
+  mlir::Value x = mlir::NVVM::BlockInClusterIdXOp::create(builder, loc, i32Ty);
+  mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1);
+  x = mlir::arith::AddIOp::create(builder, loc, x, one);
+  insertValueAtPos(builder, loc, recTy, res, x, 0);
+  mlir::Value y = mlir::NVVM::BlockInClusterIdYOp::create(builder, loc, i32Ty);
+  y = mlir::arith::AddIOp::create(builder, loc, y, one);
+  insertValueAtPos(builder, loc, recTy, res, y, 1);
+  mlir::Value z = mlir::NVVM::BlockInClusterIdZOp::create(builder, loc, i32Ty);
+  z = mlir::arith::AddIOp::create(builder, loc, z, one);
+  insertValueAtPos(builder, loc, recTy, res, z, 2);
+  return res;
+}
+
+// CLUSTER_DIM_BLOCKS
+mlir::Value
+CUDAIntrinsicLibrary::genClusterDimBlocks(mlir::Type resultType,
+                                          llvm::ArrayRef<mlir::Value> args) {
+  assert(args.size() == 0);
+  auto recTy = mlir::cast<fir::RecordType>(resultType);
+  assert(recTy && "RecordType expepected");
+  mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);
+  mlir::Type i32Ty = builder.getI32Type();
+  mlir::Value x = mlir::NVVM::ClusterDimBlocksXOp::create(builder, loc, i32Ty);
+  insertValueAtPos(builder, loc, recTy, res, x, 0);
+  mlir::Value y = mlir::NVVM::ClusterDimBlocksYOp::create(builder, loc, i32Ty);
+  insertValueAtPos(builder, loc, recTy, res, y, 1);
+  mlir::Value z = mlir::NVVM::ClusterDimBlocksZOp::create(builder, loc, i32Ty);
+  insertValueAtPos(builder, loc, recTy, res, z, 2);
+  return res;
+}
+
 // FENCE_PROXY_ASYNC
 void CUDAIntrinsicLibrary::genFenceProxyAsync(
     llvm::ArrayRef<fir::ExtendedValue> args) {
@@ -1122,6 +1190,44 @@ void CUDAIntrinsicLibrary::genSyncWarp(
   mlir::NVVM::SyncWarpOp::create(builder, loc, fir::getBase(args[0]));
 }
 
+// THIS_CLUSTER
+//
+// Builds the derived-type result of THIS_CLUSTER. Takes no arguments and
+// requires resultType to be a fir::RecordType. A temporary of that type is
+// allocated and two of its components are filled in by position:
+//   - component 1 (SIZE) receives the value of NVVM::ClusterDim;
+//   - component 2 (RANK) receives the value of NVVM::ClusterId plus one.
+// The component names and types are read from the record type itself.
+// Component 0 is not written here. Returns the address of the temporary.
+mlir::Value
+CUDAIntrinsicLibrary::genThisCluster(mlir::Type resultType,
+                                     llvm::ArrayRef<mlir::Value> args) {
+  assert(args.size() == 0);
+  auto recTy = mlir::cast<fir::RecordType>(resultType);
+  assert(recTy && "RecordType expepected");
+  // Temporary that holds the result; its address is what gets returned.
+  mlir::Value res = fir::AllocaOp::create(builder, loc, resultType);
+  // Both stored values are created as i32.
+  mlir::Type i32Ty = builder.getI32Type();
+
+  // SIZE
+  // Stored through insertValueAtPos, the helper genClusterBlockIndex and
+  // genClusterDimBlocks already use, rather than repeating the field index,
+  // coordinate and store sequence inline.
+  mlir::Value size = mlir::NVVM::ClusterDim::create(builder, loc, i32Ty);
+  insertValueAtPos(builder, loc, recTy, res, size, 1);
+
+  // RANK
+  // NOTE(review): the +1 presumably turns a zero-based cluster id into a
+  // one-based rank, as genClusterBlockIndex does for its indices -- confirm.
+  mlir::Value rank = mlir::NVVM::ClusterId::create(builder, loc, i32Ty);
+  mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1);
+  rank = mlir::arith::AddIOp::create(builder, loc, rank, one);
+  insertValueAtPos(builder, loc, recTy, res, rank, 2);
+
+  return res;
+}
+
 // THIS_GRID
 mlir::Value
 CUDAIntrinsicLibrary::genThisGrid(mlir::Type resultType,
diff --git a/flang/lib/Optimizer/OpenMP/MarkDeclareTarget.cpp b/flang/lib/Optimizer/OpenMP/MarkDeclareTarget.cpp
index 0b0e6bd9ecf34..5fa77fb2080df 100644
--- a/flang/lib/Optimizer/OpenMP/MarkDeclareTarget.cpp
+++ b/flang/lib/Optimizer/OpenMP/MarkDeclareTarget.cpp
@@ -21,6 +21,7 @@
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/LLVM.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/TypeSwitch.h"
 
 namespace flangomp {
 #define GEN_PASS_DEF_MARKDECLARETARGETPASS
@@ -31,9 +32,93 @@ namespace {
 class MarkDeclareTargetPass
     : public flangomp::impl::MarkDeclareTargetPassBase<MarkDeclareTargetPass> {
 
-  void markNestedFuncs(mlir::omp::DeclareTargetDeviceType parentDevTy,
-                       mlir::omp::DeclareTargetCaptureClause parentCapClause,
-                       bool parentAutomap, mlir::Operation *currOp,
+  struct ParentInfo {
+    mlir::omp::DeclareTargetDeviceType devTy;
+    mlir::omp::DeclareTargetCaptureClause capClause;
+    bool automap;
+  };
+
+  void processSymbolRef(mlir::SymbolRefAttr symRef, ParentInfo parentInfo,
+                        llvm::SmallPtrSet<mlir::Operation *, 16> visited) {
+    if (auto currFOp =
+            getOperation().lookupSymbol<mlir::func::FuncOp>(symRef)) {
+      auto current = llvm::dyn_cast<mlir::omp::DeclareTargetInterface>(
+          currFOp.getOperation());
+
+      if (current.isDeclareTarget()) {
+        auto currentDt = current.getDeclareTargetDeviceType();
+
+        // Found the same function twice, with different device_types,
+        // mark as Any as it belongs to both
+        if (currentDt != parentInfo.devTy &&
+            currentDt != mlir::omp::DeclareTargetDeviceType::any) {
+          current.setDeclareTarget(mlir::omp::DeclareTargetDeviceType::any,
+                                   current.getDeclareTargetCaptureClause(),
+                                   current.getDeclareTargetAutomap());
+        }
+      } else {
+        current.setDeclareTarget(parentInfo.devTy, parentInfo.capClause,
+                                 parentInfo.automap);
+      }
+
+      markNestedFuncs(parentInfo, currFOp, visited);
+    }
+  }
+
+  void processReductionRefs(std::optional<mlir::ArrayAttr> symRefs,
+                            ParentInfo parentInfo,
+                            llvm::SmallPtrSet<mlir::Operation *, 16> visited) {
+    if (!symRefs)
+      return;
+
+    for (auto symRef : symRefs->getAsRange<mlir::SymbolRefAttr>()) {
+      if (auto declareReductionOp =
+              getOperation().lookupSymbol<mlir::omp::DeclareReductionOp>(
+                  symRef)) {
+        markNestedFuncs(parentInfo, declareReductionOp, visited);
+      }
+    }
+  }
+
+  void
+  processReductionClauses(mlir::Operation *op, ParentInfo parentInfo,
+                          llvm::SmallPtrSet<mlir::Operation *, 16> visited) {
+    llvm::TypeSwitch<mlir::Operation &>(*op)
+        .Case([&](mlir::omp::LoopOp op) {
+          processReductionRefs(op.getReductionSyms(), parentInfo, visited);
+        })
+        .Case([&](mlir::omp::ParallelOp op) {
+          processReductionRefs(op.getReductionSyms(), parentInfo, visited);
+        })
+        .Case([&](mlir::omp::SectionsOp op) {
+          processReductionRefs(op.getReductionSyms(), parentInfo, visited);
+        })
+        .Case([&](mlir::omp::SimdOp op) {
+          processReductionRefs(op.getReductionSyms(), parentInfo, visited);
+        })
+        .Case([&](mlir::omp::TargetOp op) {
+          processReductionRefs(op.getInReductionSyms(), parentInfo, visited);
+        })
+        .Case([&](mlir::omp::TaskgroupOp op) {
+          processReductionRefs(op.getTaskReductionSyms(), parentInfo, visited);
+        })
+        .Case([&](mlir::omp::TaskloopOp op) {
+          processReductionRefs(op.getReductionSyms(), parentInfo, visited);
+          processReductionRefs(op.getInReductionSyms(), parentInfo, visited);
+        })
+        .Case([&](mlir::omp::TaskOp op) {
+          processReductionRefs(op.getInReductionSyms(), parentInfo, visited);
+        })
+        .Case([&](mlir::omp::TeamsOp op) {
+          processReductionRefs(op.getReductionSyms(), parentInfo, visited);
+        })
+        .Case([&](mlir::omp::WsloopOp op) {
+          processReductionRefs(op.getReductionSyms(), parentInfo, visited);
+        })
+        .Default([](mlir::Operation &) {});
+  }
+
+  void markNestedFuncs(ParentInfo parentInfo, mlir::Operation *currOp,
                        llvm::SmallPtrSet<mlir::Operation *, 16> visited) {
     if (visited.contains(currOp))
       return;
@@ -43,33 +128,10 @@ class MarkDeclareTargetPass
       if (auto callOp = llvm::dyn_cast<mlir::CallOpInterface>(op)) {
         if (auto symRef = llvm::dyn_cast_if_present<mlir::SymbolRefAttr>(
                 callOp.getCallableForCallee())) {
-          if (auto currFOp =
-                  getOperation().lookupSymbol<mlir::func::FuncOp>(symRef)) {
-            auto current = llvm::dyn_cast<mlir::omp::DeclareTargetInterface>(
-                currFOp.getOperation());
-
-            if (current.isDeclareTarget()) {
-              auto currentDt = current.getDeclareTargetDeviceType();
-
-              // Found the same function twice, with different device_types,
-              // mark as Any as it belongs to both
-              if (currentDt != parentDevTy &&
-                  currentDt != mlir::omp::DeclareTargetDeviceType::any) {
-                current.setDeclareTarget(
-                    mlir::omp::DeclareTargetDeviceType::any,
-                    current.getDeclareTargetCaptureClause(),
-                    current.getDeclareTargetAutomap());
-              }
-            } else {
-              current.setDeclareTarget(parentDevTy, parentCapClause,
-                                       parentAutomap);
-            }
-
-            markNestedFuncs(parentDevTy, parentCapClause, parentAutomap,
-                            currFOp, visited);
-          }
+          processSymbolRef(symRef, parentInfo, visited);
         }
       }
+      processReductionClauses(op, parentInfo, visited);
     });
   }
 
@@ -82,10 +144,10 @@ class MarkDeclareTargetPass
           functionOp.getOperation());
       if (declareTargetOp.isDeclareTarget()) {
         llvm::SmallPtrSet<mlir::Operation *, 16> visited;
-        markNestedFuncs(declareTargetOp.getDeclareTargetDeviceType(),
-                        declareTargetOp.getDeclareTargetCaptureClause(),
-                        declareTargetOp.getDeclareTargetAutomap(), functionOp,
-                        visited);
+        ParentInfo parentInfo{declareTargetOp.getDeclareTargetDeviceType(),
+                              declareTargetOp.getDeclareTargetCaptureClause(),
+                              declareTargetOp.getDeclareTargetAutomap()};
+        markNestedFuncs(parentInfo, functionOp, visited);
       }
     }
 
@@ -96,12 +158,13 @@ class MarkDeclareTargetPass
     // the contents of the device clause
     getOperation()->walk([&](mlir::omp::TargetOp tarOp) {
       llvm::SmallPtrSet<mlir::Operation *, 16> visited;
-      markNestedFuncs(
-          /*parentDevTy=*/mlir::omp::DeclareTargetDeviceType::nohost,
-          /*parentCapClause=*/mlir::omp::DeclareTargetCaptureClause::to,
-          /*parentAutomap=*/false, tarOp, visited);
+      ParentInfo parentInfo = {
+          /*devTy=*/mlir::omp::DeclareTargetDeviceType::nohost,
+          /*capClause=*/mlir::omp::DeclareTargetCaptureClause::to,
+          /*automap=*/false,
+      };
+      markNestedFuncs(parentInfo, tarOp, visited);
     });
   }
 };
-
 } // namespace
diff --git a/flang/lib/Semantics/check-omp-loop.cpp b/flang/lib/Semantics/check-omp-loop.cpp
index 6d83ee62c3b73..ef237a01d0f7a 100644
--- a/flang/lib/Semantics/check-omp-loop.cpp
+++ b/flang/lib/Semantics/check-omp-loop.cpp
@@ -267,7 +267,7 @@ void OmpStructureChecker::CheckNestedBlock(const parser::OpenMPLoopConstruct &x,
   for (auto &stmt : body) {
     if (auto *dir{parser::Unwrap<parser::CompilerDirective>(stmt)}) {
       context_.Say(dir->source,
-          "Compiler directives are not allowed inside OpenMP loop constructs"_err_en_US);
+          "Compiler directives are not allowed inside OpenMP loop constructs"_warn_en_US);
     } else if (parser::Unwrap<parser::DoConstruct>(stmt)) {
       ++nestedCount;
     } else if (auto *omp{parser::Unwrap<parser::OpenMPLoopConstruct>(stmt)}) {
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index ebc9df25fef24..8d08356dc6f14 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -2419,13 +2419,6 @@ void OmpAttributeVisitor::PrivatizeAssociatedLoopIndexAndCheckLoopLevel(
         }
       }
       CheckAssocLoopLevel(level, GetAssociatedClause());
-    } else {
-      unsigned version{context_.langOptions().OpenMPVersion};
-      context_.Say(GetContext().directiveSource,
-          "A DO loop must follow the %s directive"_err_en_US,
-          parser::ToUpperCaseLetters(
-              llvm::omp::getOpenMPDirectiveName(GetContext().directive, version)
-                  .str()));
     }
   }
 }
diff --git a/flang/module/cooperative_groups.f90 b/flang/module/cooperative_groups.f90
index b8875f72f8079..8bb4af3afa791 100644
--- a/flang/module/cooperative_groups.f90
+++ b/flang/module/cooperative_groups.f90
@@ -14,6 +14,12 @@ module cooperative_groups
 
 implicit none
 
+type :: cluster_group
+  type(c_devptr), private :: handle
+  integer(4) :: size
+  integer(4) :: rank
+end type cluster_group
+
 type :: grid_group
   type(c_devptr), private :: handle
   integer(4) :: size
@@ -32,6 +38,27 @@ module cooperative_groups
   integer(4) :: rank
 end type thread_group
 
+interface
+  attributes(device) function cluster_block_index()
+    import
+    type(dim3) :: cluster_block_index
+  end function
+end interface
+
+interface
+  attributes(device) function cluster_dim_blocks()
+    import
+    type(dim3) :: cluster_dim_blocks
+  end function
+end interface
+
+interface
+  attributes(device) function this_cluster()
+    import
+    type(cluster_group) :: this_cluster
+  end function
+end interface
+
 interface
   attributes(device) function this_grid()
     import
diff --git a/flang/test/Lower/CUDA/cuda-cluster.cuf b/flang/test/Lower/CUDA/cuda-cluster.cuf
new file mode 100644
index 0000000000000..78cca15b11dab
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-cluster.cuf
@@ -0,0 +1,55 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+attributes(global) subroutine test_this_cluster()
+  use cooperative_groups
+  type(cluster_group) :: cluster
+
+  cluster = this_cluster()
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_this_cluster() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
+! CHECK: %{{.*}} = fir.alloca !fir.type<_QMcooperative_groupsTcluster_group
+! CHECK: %[[RES:.*]] = fir.alloca !fir.type<_QMcooperative_groupsTcluster_group{_QMcooperative_groupsTcluster_group.handle:!fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>,size:i32,rank:i32}>
+! CHECK: %[[RANK:.*]] = nvvm.read.ptx.sreg.cluster.ctarank : i32
+! CHECK: %[[RANK_1:.*]] = arith.addi %[[RANK]], %c1{{.*}} : i32 
+! CHECK: %[[RANK_COORD:.*]] = fir.coordinate_of %[[RES]], rank : (!fir.ref<!fir.type<_QMcooperative_groupsTcluster_group{_QMcooperative_groupsTcluster_group.handle:!fir.type<_QM__fortran_builtinsT__builtin_c_devptr{cptr:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>,size:i32,rank:i32}>>) -> !fir.ref<i32>
+! CHECK: fir.store %[[RANK_1]] to %[[RANK_COORD]] : !fir.ref<i32>
+  
+attributes(global) subroutine test_cluster_dim_blocks()
+  use cooperative_groups
+  type(dim3) :: clusterDim
+
+  clusterDim = cluster_dim_blocks()
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_cluster_dim_blocks() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
+! CHECK: %[[X:.*]] = nvvm.read.ptx.sreg.cluster.nctaid.x : i32
+! CHECK: %[[COORD_X:.*]] = fir.coordinate_of %{{.*}}, x : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
+! CHECK: fir.store %[[X]] to %[[COORD_X]] : !fir.ref<i32>
+! CHECK: %[[Y:.*]] = nvvm.read.ptx.sreg.cluster.nctaid.y : i32
+! CHECK: %[[COORD_Y:.*]] = fir.coordinate_of %{{.*}}, y : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
+! CHECK: fir.store %[[Y]] to %[[COORD_Y]] : !fir.ref<i32>
+! CHECK: %[[Z:.*]] = nvvm.read.ptx.sreg.cluster.nctaid.z : i32
+! CHECK: %[[COORD_Z:.*]] = fir.coordinate_of %{{.*}}, z : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
+! CHECK: fir.store %[[Z]] to %[[COORD_Z]] : !fir.ref<i32>
+
+attributes(global) subroutine test_cluster_block_index()
+  use cooperative_groups
+  type(dim3) :: blockIndex
+
+  blockIndex = cluster_block_index()
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_cluster_block_index() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
+! CHECK: %[[X:.*]] = nvvm.read.ptx.sreg.cluster.ctaid.x : i32
+! CHECK: %[[X1:.*]] = arith.addi %[[X]], %c1{{.*}} : i32 
+! CHECK: %[[COORD_X:.*]] = fir.coordinate_of %{{.*}}, x : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
+! CHECK: fir.store %[[X1]] to %[[COORD_X]] : !fir.ref<i32>
+! CHECK: %[[Y:.*]] = nvvm.read.ptx.sreg.cluster.ctaid.y : i32
+! CHECK: %[[Y1:.*]] = arith.addi %[[Y]], %c1{{.*}} : i32 
+! CHECK: %[[COORD_Y:.*]] = fir.coordinate_of %{{.*}}, y : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
+! CHECK: fir.store %[[Y1]] to %[[COORD_Y]] : !fir.ref<i32>
+! CHECK: %[[Z:.*]] = nvvm.read.ptx.sreg.cluster.ctaid.z : i32
+! CHECK: %[[Z1:.*]] = arith.addi %[[Z]], %c1{{.*}} : i32 
+! CHECK: %[[COORD_Z:.*]] = fir.coordinate_of %{{.*}}, z : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
+! CHECK: fir.store %[[Z1]] to %[[COORD_Z]] : !fir.ref<i32>
diff --git a/flang/test/Lower/OpenMP/Todo/omp-declare-reduction-advanced-types.f90 b/flang/test/Lower/OpenMP/Todo/omp-declare-reduction-advanced-types.f90
new file mode 100644
index 0000000000000..e40e3d9285adc
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/omp-declare-reduction-advanced-types.f90
@@ -0,0 +1,19 @@
+! This test checks lowering of OpenMP declare reduction with non-trivial types
+
+! RUN: not %flang_fc1 -emit-fir -fopenmp %s 2>&1 | FileCheck %s
+
+module mymod
+  type advancedtype
+     integer(4)::myarray(10)
+     integer(4)::val
+     integer(4)::otherval
+  end type advancedtype
+  !CHECK: not yet implemented: declare reduction currently only supports trival types or derived types containing trivial types
+  !$omp declare reduction(myreduction: advancedtype: omp_out = omp_in) initializer(omp_priv = omp_orig)
+end module mymod
+
+program mymaxtest
+  use mymod
+
+end program
+
diff --git a/flang/test/Lower/OpenMP/Todo/omp-declare-reduction-initsub.f90 b/flang/test/Lower/OpenMP/Todo/omp-declare-reduction-initsub.f90
deleted file mode 100644
index f2c744599fdc5..0000000000000
--- a/flang/test/Lower/OpenMP/Todo/omp-declare-reduction-initsub.f90
+++ /dev/null
@@ -1,28 +0,0 @@
-! This test checks lowering of OpenMP declare reduction Directive, with initialization
-! via a subroutine. This functionality is currently not implemented.
-
-! RUN: not %flang_fc1 -emit-fir -fopenmp %s 2>&1 | FileCheck %s
-
-!CHECK: not yet implemented: OpenMPDeclareReductionConstruct
-subroutine initme(x,n)
-  integer x,n
-  x=n
-end subroutine initme
-
-function func(x, n, init)
-  integer func
-  integer x(n)
-  integer res
-  interface
-     subroutine initme(x,n)
-       integer x,n
-     end subroutine initme
-  end interface
-!$omp declare reduction(red_add:integer(4):omp_out=omp_out+omp_in) initializer(initme(omp_priv,0))
-  res=init
-!$omp simd reduction(red_add:res)
-  do i=1,n
-     res=res+x(i)
-  enddo
-  func=res
-end function func
diff --git a/flang/test/Lower/OpenMP/Todo/omp-declare-reduction.f90 b/flang/test/Lower/OpenMP/Todo/omp-declare-reduction.f90
deleted file mode 100644
index 11e83cc710f05..0000000000000
--- a/flang/test/Lower/OpenMP/Todo/omp-declare-reduction.f90
+++ /dev/null
@@ -1,10 +0,0 @@
-! This test checks lowering of OpenMP declare reduction Directive.
-
-! RUN: not %flang_fc1 -emit-fir -fopenmp %s 2>&1 | FileCheck %s
-
-subroutine declare_red()
-  integer :: my_var
-  !CHECK: not yet implemented: OpenMPDeclareReductionConstruct
-  !$omp declare reduction (my_red : integer : omp_out = omp_in) initializer (omp_priv = 0)
-  my_var = 0
-end subroutine declare_red
diff --git a/flang/test/Lower/OpenMP/declare-target-deferred-marking-reductions.f90 b/flang/test/Lower/OpenMP/declare-target-deferred-marking-reductions.f90
new file mode 100644
index 0000000000000..66697ef6bbe70
--- /dev/null
+++ b/flang/test/Lower/OpenMP/declare-target-deferred-marking-reductions.f90
@@ -0,0 +1,37 @@
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=52 %s -o - | FileCheck %s
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=52 -fopenmp-is-device %s -o - | FileCheck %s
+
+program main
+    use, intrinsic ::  iso_c_binding
+    implicit none
+    interface
+    subroutine myinit(priv, orig) bind(c,name="myinit")
+        use, intrinsic :: iso_c_binding
+        implicit none
+        integer::priv, orig
+    end subroutine myinit
+
+    function mycombine(lhs, rhs) bind(c,name="mycombine")
+        use, intrinsic :: iso_c_binding
+        implicit none
+        integer::lhs, rhs, mycombine
+    end function mycombine
+ end interface
+     !$omp declare reduction(myreduction:integer:omp_out = mycombine(omp_out, omp_in)) initializer(myinit(omp_priv, omp_orig))
+
+    integer :: i, s, a(10)
+    !$omp target
+    s = 0
+    !$omp do reduction(myreduction:s)
+    do i = 1, 10
+       s = mycombine(s, a(i))
+    enddo
+    !$omp end do
+    !$omp end target
+ end program main
+
+!CHECK: func.func {{.*}} @myinit(!fir.ref<i32>, !fir.ref<i32>)
+!CHECK-SAME: {{.*}}, omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to), automap = false>{{.*}}
+!CHECK-LABEL: func.func {{.*}} @mycombine(!fir.ref<i32>, !fir.ref<i32>)
+!CHECK-SAME: {{.*}}, omp.declare_target = #omp.declaretarget<device_type = (nohost), capture_clause = (to), automap = false>{{.*}}
+
diff --git a/flang/test/Lower/OpenMP/omp-declare-reduction-derivedtype.f90 b/flang/test/Lower/OpenMP/omp-declare-reduction-derivedtype.f90
new file mode 100644
index 0000000000000..36bb131e677a3
--- /dev/null
+++ b/flang/test/Lower/OpenMP/omp-declare-reduction-derivedtype.f90
@@ -0,0 +1,112 @@
+! This test checks lowering of the OpenMP declare reduction directive for a derived
+! type, with initialization via a subroutine and combination via a function.
+
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=52 %s -o - | FileCheck %s
+module maxtype_mod
+  implicit none
+
+  type maxtype
+     integer::sumval
+     integer::maxval
+  end type maxtype
+
+contains
+
+  subroutine initme(x,n)
+    type(maxtype) :: x,n
+    x%sumval=0
+    x%maxval=0
+  end subroutine initme
+
+  function mycombine(lhs, rhs)
+    type(maxtype) :: lhs, rhs
+    type(maxtype) :: mycombine
+    mycombine%sumval = lhs%sumval + rhs%sumval
+    mycombine%maxval = max(lhs%maxval, rhs%maxval)
+  end function mycombine
+
+  function func(x, n, init)
+    type(maxtype) :: func
+    integer :: n, i
+    type(maxtype) :: x(n)
+    type(maxtype) :: init
+    type(maxtype) :: res
+!$omp declare reduction(red_add_max:maxtype:omp_out=mycombine(omp_out,omp_in)) initializer(initme(omp_priv,omp_orig))
+    res=init
+!$omp simd reduction(red_add_max:res)
+    do i=1,n
+       res=mycombine(res,x(i))
+    enddo
+    func=res
+  end function func
+
+end module maxtype_mod
+!CHECK:  omp.declare_reduction @red_add_max : [[MAXTYPE:.*]] init {
+!CHECK:  ^bb0(%[[OMP_ORIG_ARG_I:.*]]: [[MAXTYPE]]):
+!CHECK:    %[[OMP_PRIV:.*]] = fir.alloca [[MAXTYPE]]
+!CHECK:    %[[OMP_ORIG:.*]] = fir.alloca [[MAXTYPE]]
+!CHECK:    fir.store %[[OMP_ORIG_ARG_I]] to %[[OMP_ORIG]] : !fir.ref<[[MAXTYPE]]>
+!CHECK:    %[[OMP_ORIG_DECL:.*]]:2 = hlfir.declare %[[OMP_ORIG]] {uniq_name = "omp_orig"} : (!fir.ref<[[MAXTYPE]]>) -> (!fir.ref<[[MAXTYPE]]>, !fir.ref<[[MAXTYPE]]>)
+!CHECK:    fir.store %[[OMP_ORIG_ARG_I]] to %[[OMP_PRIV]] : !fir.ref<[[MAXTYPE]]>
+!CHECK:    %[[OMP_PRIV_DECL:.*]]:2 = hlfir.declare %[[OMP_PRIV]] {uniq_name = "omp_priv"} : (!fir.ref<[[MAXTYPE]]>) -> (!fir.ref<[[MAXTYPE]]>, !fir.ref<[[MAXTYPE]]>)
+!CHECK:    fir.call @_QMmaxtype_modPinitme(%[[OMP_PRIV_DECL]]#0, %[[OMP_ORIG_DECL]]#0) fastmath<contract> : (!fir.ref<[[MAXTYPE]]>, !fir.ref<[[MAXTYPE]]>) -> ()
+!CHECK:    %[[OMP_PRIV_VAL:.*]] = fir.load %[[OMP_PRIV_DECL]]#0 : !fir.ref<[[MAXTYPE]]>
+!CHECK:    omp.yield(%[[OMP_PRIV_VAL]] : [[MAXTYPE]])
+!CHECK:  } combiner {
+!CHECK:  ^bb0(%[[LHS_ARG:.*]]: [[MAXTYPE]], %[[RHS_ARG:.*]]: [[MAXTYPE]]):
+!CHECK:    %[[RESULT:.*]] = fir.alloca [[MAXTYPE]] {bindc_name = ".result"}
+!CHECK:    %[[OMP_OUT:.*]] = fir.alloca [[MAXTYPE]]
+!CHECK:    %[[OMP_IN:.*]] = fir.alloca [[MAXTYPE]]
+!CHECK:    fir.store %[[RHS_ARG]] to %[[OMP_IN]] : !fir.ref<[[MAXTYPE]]>
+!CHECK:    %[[OMP_IN_DECL:.*]]:2 = hlfir.declare %[[OMP_IN]] {uniq_name = "omp_in"} : (!fir.ref<[[MAXTYPE]]>) -> (!fir.ref<[[MAXTYPE]]>, !fir.ref<[[MAXTYPE]]>)
+!CHECK:    fir.store %[[LHS_ARG]] to %[[OMP_OUT]] : !fir.ref<[[MAXTYPE]]>
+!CHECK:    %[[OMP_OUT_DECL:.*]]:2 = hlfir.declare %[[OMP_OUT]] {uniq_name = "omp_out"} : (!fir.ref<[[MAXTYPE]]>) -> (!fir.ref<[[MAXTYPE]]>, !fir.ref<[[MAXTYPE]]>)
+!CHECK:    %[[COMBINE_RESULT:.*]] = fir.call @_QMmaxtype_modPmycombine(%[[OMP_OUT_DECL]]#0, %[[OMP_IN_DECL]]#0) fastmath<contract> : (!fir.ref<[[MAXTYPE]]>, !fir.ref<[[MAXTYPE]]>) -> [[MAXTYPE]]
+!CHECK:    fir.save_result %[[COMBINE_RESULT]] to %[[RESULT]] : [[MAXTYPE]], !fir.ref<[[MAXTYPE]]>
+!CHECK:    %[[TMPRESULT:.*]]:2 = hlfir.declare %[[RESULT]] {uniq_name = ".tmp.func_result"} : (!fir.ref<[[MAXTYPE]]>) -> (!fir.ref<[[MAXTYPE]]>, !fir.ref<[[MAXTYPE]]>)
+!CHECK:    %false = arith.constant false
+!CHECK:    %[[EXPRRESULT:.*]] = hlfir.as_expr %[[TMPRESULT]]#0 move %false : (!fir.ref<[[MAXTYPE]]>, i1) -> !hlfir.expr<[[MAXTYPE]]>
+!CHECK:    %[[ASSOCIATE:.*]]:3 = hlfir.associate %[[EXPRRESULT]] {adapt.valuebyref} : (!hlfir.expr<[[MAXTYPE]]>) -> (!fir.ref<[[MAXTYPE]]>, !fir.ref<[[MAXTYPE]]>, i1)
+!CHECK:    %[[RESULT_VAL:.*]] = fir.load %[[ASSOCIATE]]#0 : !fir.ref<[[MAXTYPE]]>
+!CHECK:    hlfir.end_associate %[[ASSOCIATE]]#1, %[[ASSOCIATE]]#2 : !fir.ref<[[MAXTYPE]]>, i1
+!CHECK:    omp.yield(%[[RESULT_VAL]] : [[MAXTYPE]])
+!CHECK:  }
+
+!CHECK:  func.func @_QMmaxtype_modPinitme(%[[X_ARG:.*]]: !fir.ref<[[MAXTYPE]]> {fir.bindc_name = "x"}, %[[N_ARG:.*]]: !fir.ref<[[MAXTYPE]]> {fir.bindc_name = "n"}) {
+!CHECK:    %[[SCOPE:.*]] = fir.dummy_scope : !fir.dscope
+!CHECK:    %[[N_DECL:.*]]:2 = hlfir.declare %[[N_ARG]] dummy_scope %[[SCOPE]] arg 2 {uniq_name = "_QMmaxtype_modFinitmeEn"} : (!fir.ref<[[MAXTYPE]]>, !fir.dscope) -> (!fir.ref<[[MAXTYPE]]>, !fir.ref<[[MAXTYPE]]>)
+!CHECK:    %[[X_DECL:.*]]:2 = hlfir.declare %[[X_ARG]] dummy_scope %[[SCOPE]] arg 1 {uniq_name = "_QMmaxtype_modFinitmeEx"} : (!fir.ref<[[MAXTYPE]]>, !fir.dscope) -> (!fir.ref<[[MAXTYPE]]>, !fir.ref<[[MAXTYPE]]>)
+!CHECK:    %[[ZERO_0:.*]] = arith.constant 0 : i32
+!CHECK:    %[[X_DESIGNATE_SUMVAL:.*]] = hlfir.designate %[[X_DECL]]#0{"sumval"}   : (!fir.ref<[[MAXTYPE]]>) -> !fir.ref<i32>
+!CHECK:    hlfir.assign %[[ZERO_0]] to %[[X_DESIGNATE_SUMVAL]] : i32, !fir.ref<i32>
+!CHECK:    %[[ZERO_1:.*]] = arith.constant 0 : i32
+!CHECK:    %[[X_DESIGNATE_MAXVAL:.*]] = hlfir.designate %[[X_DECL]]#0{"maxval"}   : (!fir.ref<[[MAXTYPE]]>) -> !fir.ref<i32>
+!CHECK:    hlfir.assign %[[ZERO_1]] to %[[X_DESIGNATE_MAXVAL]] : i32, !fir.ref<i32>
+!CHECK:    return
+!CHECK:  }
+
+
+!CHECK:  func.func @_QMmaxtype_modPmycombine(%[[LHS:.*]]: !fir.ref<[[MAXTYPE]]> {fir.bindc_name = "lhs"}, %[[RHS:.*]]: !fir.ref<[[MAXTYPE]]> {fir.bindc_name = "rhs"}) -> [[MAXTYPE]] {
+!CHECK:    %[[SCOPE:.*]] = fir.dummy_scope : !fir.dscope
+!CHECK:    %[[LHS_DECL:.*]]:2 = hlfir.declare %[[LHS]] dummy_scope %[[SCOPE]] arg 1 {uniq_name = "_QMmaxtype_modFmycombineElhs"} : (!fir.ref<[[MAXTYPE]]>, !fir.dscope) -> (!fir.ref<[[MAXTYPE]]>, !fir.ref<[[MAXTYPE]]>)
+!CHECK:    %[[RESULT_ALLOC:.*]] = fir.alloca [[MAXTYPE]] {bindc_name = "mycombine", uniq_name = "_QMmaxtype_modFmycombineEmycombine"}
+!CHECK:    %[[RESULT_DECL:.*]]:2 = hlfir.declare %[[RESULT_ALLOC]] {uniq_name = "_QMmaxtype_modFmycombineEmycombine"} : (!fir.ref<[[MAXTYPE]]>) -> (!fir.ref<[[MAXTYPE]]>, !fir.ref<[[MAXTYPE]]>)
+!CHECK:    %[[RHS_DECL:.*]]:2 = hlfir.declare %[[RHS]] dummy_scope %[[SCOPE]] arg 2 {uniq_name = "_QMmaxtype_modFmycombineErhs"} : (!fir.ref<[[MAXTYPE]]>, !fir.dscope) -> (!fir.ref<[[MAXTYPE]]>, !fir.ref<[[MAXTYPE]]>)
+!CHECK:    %[[LHS_DESIGNATE_SUMVAL:.*]] = hlfir.designate %[[LHS_DECL]]#0{"sumval"}   : (!fir.ref<[[MAXTYPE]]>) -> !fir.ref<i32>
+!CHECK:    %[[LHS_SUMVAL:.*]] = fir.load %[[LHS_DESIGNATE_SUMVAL]] : !fir.ref<i32>
+!CHECK:    %[[RHS_DESIGNATE_SUMVAL:.*]] = hlfir.designate %[[RHS_DECL]]#0{"sumval"}   : (!fir.ref<[[MAXTYPE]]>) -> !fir.ref<i32>
+!CHECK:    %[[RHS_SUMVAL:.*]] = fir.load %[[RHS_DESIGNATE_SUMVAL]] : !fir.ref<i32>
+!CHECK:    %[[SUM:.*]] = arith.addi %[[LHS_SUMVAL]], %[[RHS_SUMVAL]] : i32
+!CHECK:    %[[RESULT_DESIGNATE_SUMVAL:.*]] = hlfir.designate %[[RESULT_DECL]]#0{"sumval"}   : (!fir.ref<[[MAXTYPE]]>) -> !fir.ref<i32>
+!CHECK:    hlfir.assign %[[SUM]] to %[[RESULT_DESIGNATE_SUMVAL]] : i32, !fir.ref<i32>
+!CHECK:    %[[LHS_DESIGNATE_MAXVAL:.*]] = hlfir.designate %[[LHS_DECL]]#0{"maxval"}   : (!fir.ref<[[MAXTYPE]]>) -> !fir.ref<i32>
+!CHECK:    %[[LHS_MAXVAL:.*]] = fir.load %[[LHS_DESIGNATE_MAXVAL]] : !fir.ref<i32>
+!CHECK:    %[[RHS_DESIGNATE_MAXVAL:.*]] = hlfir.designate %[[RHS_DECL]]#0{"maxval"}   : (!fir.ref<[[MAXTYPE]]>) -> !fir.ref<i32>
+!CHECK:    %[[RHS_MAXVAL:.*]] = fir.load %[[RHS_DESIGNATE_MAXVAL]] : !fir.ref<i32>
+!CHECK:    %[[CMP:.*]] = arith.cmpi sgt, %[[LHS_MAXVAL]], %[[RHS_MAXVAL]] : i32
+!CHECK:    %[[MAX_VAL:.*]] = arith.select %[[CMP]], %[[LHS_MAXVAL]], %[[RHS_MAXVAL]] : i32
+!CHECK:    %[[RESULT_DESIGNAGE_MAXVAL:.*]] = hlfir.designate %[[RESULT_DECL]]#0{"maxval"}   : (!fir.ref<[[MAXTYPE]]>) -> !fir.ref<i32>
+!CHECK:    hlfir.assign %[[MAX_VAL]] to %[[RESULT_DESIGNAGE_MAXVAL]] : i32, !fir.ref<i32>
+!CHECK:    %[[RESULT:.*]] = fir.load %[[RESULT_DECL]]#0 : !fir.ref<[[MAXTYPE]]>
+!CHECK:    return %[[RESULT]] : [[MAXTYPE]]
+!CHECK:  }
diff --git a/flang/test/Lower/OpenMP/omp-declare-reduction-initsub.f90 b/flang/test/Lower/OpenMP/omp-declare-reduction-initsub.f90
new file mode 100644
index 0000000000000..4aacc7cb2efba
--- /dev/null
+++ b/flang/test/Lower/OpenMP/omp-declare-reduction-initsub.f90
@@ -0,0 +1,59 @@
+! This test checks lowering of the OpenMP declare reduction directive, with
+! initialization of the private copy via a subroutine call.
+
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=52 %s -o - | FileCheck %s
+
+subroutine initme(x,n)
+  integer x,n
+  x=0
+end subroutine initme
+
+function func(x, n, init)
+  integer func
+  integer x(n)
+  integer res
+  interface
+     subroutine initme(x,n)
+       integer x,n
+     end subroutine initme
+  end interface
+!CHECK:  omp.declare_reduction @red_add : i32 init {
+!CHECK: ^bb0(%[[OMP_ORIG_ARG_I:.*]]: i32):
+!CHECK:    %[[OMP_PRIV:.*]] = fir.alloca i32
+!CHECK:    %[[OMP_ORIG:.*]] = fir.alloca i32
+!CHECK:    fir.store %[[OMP_ORIG_ARG_I]] to %[[OMP_ORIG]] : !fir.ref<i32>
+!CHECK:    %[[OMP_ORIG_DECL:.*]]:2 = hlfir.declare %[[OMP_ORIG]] {uniq_name = "omp_orig"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    fir.store %[[OMP_ORIG_ARG_I]] to %[[OMP_PRIV]] : !fir.ref<i32>
+!CHECK:    %[[OMP_PRIV_DECL:.*]]:2 = hlfir.declare %[[OMP_PRIV]] {uniq_name = "omp_priv"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    fir.call @_QPinitme(%[[OMP_PRIV_DECL]]#0, %[[OMP_ORIG_DECL]]#0) fastmath<contract> : (!fir.ref<i32>, !fir.ref<i32>) -> ()
+!CHECK:    %[[OMP_PRIV_VAL:.*]] = fir.load %[[OMP_PRIV_DECL]]#0 : !fir.ref<i32>
+!CHECK:    omp.yield(%[[OMP_PRIV_VAL]] : i32)
+!CHECK:  } combiner {
+!CHECK:  ^bb0(%[[LHS_ARG:.*]]: i32, %[[RHS_ARG:.*]]: i32):
+!CHECK:    %[[OMP_OUT:.*]] = fir.alloca i32
+!CHECK:    %[[OMP_IN:.*]] = fir.alloca i32
+!CHECK:    fir.store %[[RHS_ARG]] to %[[OMP_IN]] : !fir.ref<i32>
+!CHECK:    %[[OMP_IN_DECL:.*]]:2 = hlfir.declare %[[OMP_IN]] {uniq_name = "omp_in"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    fir.store %[[LHS_ARG]] to %[[OMP_OUT]] : !fir.ref<i32>
+!CHECK:    %[[OMP_OUT_DECL:.*]]:2 = hlfir.declare %[[OMP_OUT]] {uniq_name = "omp_out"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[OMP_OUT_VAL:.*]] = fir.load %[[OMP_OUT_DECL]]#0 : !fir.ref<i32>
+!CHECK:    %[[OMP_IN_VAL:.*]] = fir.load %[[OMP_IN_DECL]]#0 : !fir.ref<i32>
+!CHECK:    %[[SUM:.*]] = arith.addi %[[OMP_OUT_VAL]], %[[OMP_IN_VAL]] : i32
+!CHECK:    omp.yield(%[[SUM]] : i32)
+!CHECK:  }
+!CHECK:  func.func @_QPinitme(%[[X:.*]]: !fir.ref<i32> {fir.bindc_name = "x"}, %[[N:.*]]: !fir.ref<i32> {fir.bindc_name = "n"}) {
+!CHECK:    %[[SCOPE:.*]] = fir.dummy_scope : !fir.dscope
+!CHECK:    %[[N_DECL:.*]]:2 = hlfir.declare %[[N]] dummy_scope %[[SCOPE]] arg 2 {uniq_name = "_QFinitmeEn"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] dummy_scope %[[SCOPE]] arg 1 {uniq_name = "_QFinitmeEx"} : (!fir.ref<i32>, !fir.dscope) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[CONST_0:.*]] = arith.constant 0 : i32
+!CHECK:    hlfir.assign %[[CONST_0]] to %[[X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:    return
+!CHECK:  }
+!$omp declare reduction(red_add:integer(4):omp_out=omp_out+omp_in) initializer(initme(omp_priv,omp_orig))
+  res=init
+!$omp simd reduction(red_add:res)
+  do i=1,n
+     res=res+x(i)
+  enddo
+  func=res
+end function func
diff --git a/flang/test/Lower/OpenMP/omp-declare-reduction.f90 b/flang/test/Lower/OpenMP/omp-declare-reduction.f90
new file mode 100644
index 0000000000000..a41f6b214b9d8
--- /dev/null
+++ b/flang/test/Lower/OpenMP/omp-declare-reduction.f90
@@ -0,0 +1,33 @@
+! This test checks lowering of OpenMP declare reduction Directive.
+
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=52 %s -o - | FileCheck %s
+
+subroutine declare_red()
+  integer :: my_var
+!CHECK: omp.declare_reduction @my_red : i32 init {
+!CHECK: ^bb0(%[[OMP_ORIG_ARG_I:.*]]: i32):
+!CHECK:    %[[OMP_PRIV:.*]] = fir.alloca i32
+!CHECK:    %[[OMP_ORIG:.*]] = fir.alloca i32
+!CHECK:    fir.store %[[OMP_ORIG_ARG_I]] to %[[OMP_ORIG]] : !fir.ref<i32>
+!CHECK:    %[[OMP_ORIG_DECL:.*]]:2 = hlfir.declare %[[OMP_ORIG]] {uniq_name = "omp_orig"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    fir.store %[[OMP_ORIG_ARG_I]] to %[[OMP_PRIV]] : !fir.ref<i32>
+!CHECK:    %[[OMP_PRIV_DECL:.*]]:2 = hlfir.declare %[[OMP_PRIV]] {uniq_name = "omp_priv"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[CONST_0:.*]] = arith.constant 0 : i32
+!CHECK:    omp.yield(%[[CONST_0]] : i32)
+!CHECK: } combiner {
+!CHECK:  ^bb0(%[[LHS_ARG:.*]]: i32, %[[RHS_ARG:.*]]: i32):
+!CHECK:    %[[OMP_OUT:.*]] = fir.alloca i32
+!CHECK:    %[[OMP_IN:.*]] = fir.alloca i32
+!CHECK:    fir.store %[[RHS_ARG]] to %[[OMP_IN]] : !fir.ref<i32>
+!CHECK:    %[[OMP_IN_DECL:.*]]:2 = hlfir.declare %[[OMP_IN]] {uniq_name = "omp_in"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    fir.store %[[LHS_ARG]] to %[[OMP_OUT]] : !fir.ref<i32>
+!CHECK:    %[[OMP_OUT_DECL:.*]]:2 = hlfir.declare %[[OMP_OUT]] {uniq_name = "omp_out"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[OMP_OUT_VAL:.*]] = fir.load %[[OMP_OUT_DECL]]#0 : !fir.ref<i32>
+!CHECK:    %[[OMP_IN_VAL:.*]] = fir.load %[[OMP_IN_DECL]]#0 : !fir.ref<i32>
+!CHECK:    %[[SUM:.*]] = arith.addi %[[OMP_OUT_VAL]], %[[OMP_IN_VAL]] : i32
+!CHECK:    omp.yield(%[[SUM]] : i32)
+!CHECK: }
+
+  !$omp declare reduction (my_red : integer : omp_out = omp_out + omp_in) initializer (omp_priv = 0)
+  my_var = 0
+end subroutine declare_red
diff --git a/flang/test/Lower/identical-block-merge-disable.f90 b/flang/test/Lower/identical-block-merge-disable.f90
index cc3120a3b6f67..ff3ff67b7b213 100644
--- a/flang/test/Lower/identical-block-merge-disable.f90
+++ b/flang/test/Lower/identical-block-merge-disable.f90
@@ -6,7 +6,7 @@ MODULE DMUMPS_SOL_LR
 IMPLICIT NONE
 
 TYPE BLR_STRUC_T
-  INTEGER, DIMENSION(:), POINTER  :: PANELS_L 
+  INTEGER, DIMENSION(:), POINTER  :: PANELS_L
   INTEGER, DIMENSION(:), POINTER  :: PANELS_U
   INTEGER, DIMENSION(:), POINTER :: BEGS_BLR_STATIC
 END TYPE BLR_STRUC_T
@@ -32,7 +32,7 @@ SUBROUTINE DMUMPS_SOL_FWD_LR_SU( IWHDLR, MTYPE )
     ENDIF
   ENDIF
 
-END SUBROUTINE DMUMPS_SOL_FWD_LR_SU 
+END SUBROUTINE DMUMPS_SOL_FWD_LR_SU
 
 END MODULE DMUMPS_SOL_LR
 
diff --git a/flang/test/Lower/implicit-interface.f90 b/flang/test/Lower/implicit-interface.f90
index f924a3f5d3e9e..0be50203b2c01 100644
--- a/flang/test/Lower/implicit-interface.f90
+++ b/flang/test/Lower/implicit-interface.f90
@@ -22,7 +22,7 @@ subroutine test_passing_char_array
   ! CHECK-DAG: %[[c3:.*]] = arith.constant 3 : index
   ! CHECK-DAG: %[[xbuff:.*]] = fir.convert %[[xarray]] : (!fir.ref<!fir.array<4x!fir.char<1,3>>>) -> !fir.ref<!fir.char<1,?>>
   ! CHECK: %[[boxchar:.*]] = fir.emboxchar %[[xbuff]], %[[c3]] : (!fir.ref<!fir.char<1,?>>, index) -> !fir.boxchar<1>
-  ! CHECK: fir.call @_QPsub_taking_a_char_array(%[[boxchar]]) {{.*}}: (!fir.boxchar<1>) -> () 
+  ! CHECK: fir.call @_QPsub_taking_a_char_array(%[[boxchar]]) {{.*}}: (!fir.boxchar<1>) -> ()
 end subroutine
 
 ! TODO more implicit interface cases with/without explicit interface
diff --git a/flang/test/Lower/inline_directive.f90 b/flang/test/Lower/inline_directive.f90
index 347df85f05dda..5748690d5914d 100644
--- a/flang/test/Lower/inline_directive.f90
+++ b/flang/test/Lower/inline_directive.f90
@@ -11,7 +11,7 @@ subroutine test_inline()
   y = g(x)
   !CHECK:  %[[VAL_4:.*]] = fir.call @_QFtest_inlinePg(%[[VAL_1]]) fastmath<contract> {inline_attr = #fir.inline_attrs<always_inline>} : (!fir.ref<i32>) -> i32
   !CHECK:  fir.store %[[VAL_4]] to %[[VAL_3]] : !fir.ref<i32>
-  
+
   !dir$ forceinline
   call f(x, y)
   !CHECK:  fir.call @_QFtest_inlinePf(%[[VAL_1]], %[[VAL_3]]) fastmath<contract> {inline_attr = #fir.inline_attrs<always_inline>} : (!fir.ref<i32>, !fir.ref<i32>) -> ()
diff --git a/flang/test/Lower/io-statement-1.f90 b/flang/test/Lower/io-statement-1.f90
index ac7874594d2fc..ecf2e2d287774 100644
--- a/flang/test/Lower/io-statement-1.f90
+++ b/flang/test/Lower/io-statement-1.f90
@@ -19,7 +19,7 @@
   ! CHECK: call {{.*}}BeginFlush
   ! CHECK: call {{.*}}EndIoStatement
   flush(8)
-  
+
   ! CHECK: call {{.*}}BeginRewind
   ! CHECK: call {{.*}}EndIoStatement
   rewind(8)
diff --git a/flang/test/Lower/io-write.f90 b/flang/test/Lower/io-write.f90
index 234fcdabeaac8..3159dcf771bb7 100644
--- a/flang/test/Lower/io-write.f90
+++ b/flang/test/Lower/io-write.f90
@@ -1,6 +1,6 @@
 ! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
 
-! Test that IO item calls stackrestore in the right place 
+! Test that IO item calls stackrestore in the right place
 
 ! CHECK-LABEL: func.func @_QQmain() {
   character(3) string
diff --git a/flang/test/Lower/location.f90 b/flang/test/Lower/location.f90
index 95bf2260fc107..cdde1cc4cb40a 100644
--- a/flang/test/Lower/location.f90
+++ b/flang/test/Lower/location.f90
@@ -3,7 +3,7 @@
 program test
 include 'location0.inc'
 
-end 
+end
 
 ! CHECK-LABEL: func.func @_QQmain() attributes {fir.bindc_name = "TEST"} {
 ! CHECK: fir.call @_FortranAioOutputAscii(%{{.*}}, %{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<i8>, !fir.ref<i8>, i64) -> i1 loc(fused<#fir<loc_kind_array[ base,  inclusion,  inclusion]>>["{{.*}}location1.inc":1:10, "{{.*}}location0.inc":1:1, "{{.*}}location.f90":4:1])
diff --git a/flang/test/Lower/module_definition.f90 b/flang/test/Lower/module_definition.f90
index 0a05364ca473c..a96bc919c6730 100644
--- a/flang/test/Lower/module_definition.f90
+++ b/flang/test/Lower/module_definition.f90
@@ -36,7 +36,7 @@ module m1
 ! file.
 module modEq1
   ! Equivalence, no initialization
-  real :: x1(10), x2(10), x3(10) 
+  real :: x1(10), x2(10), x3(10)
   ! Equivalence with initialization
   real :: y1 = 42.
   real :: y2(10)
diff --git a/flang/test/Lower/module_use.f90 b/flang/test/Lower/module_use.f90
index 92acbfbee0b9e..e7f56f57b2c7e 100644
--- a/flang/test/Lower/module_use.f90
+++ b/flang/test/Lower/module_use.f90
@@ -36,7 +36,7 @@ real function modCommon1Use()
   ! CHECK-DAG: fir.address_of(@named2_) : !fir.ref<!fir.array<4xi8>>
   ! CHECK-DAG: fir.address_of(@__BLNK__) : !fir.ref<!fir.array<4xi8>>
   ! CHECK-DAG: fir.address_of(@named1_) : !fir.ref<!fir.array<4xi8>>
-  modCommon1Use = x_blank + x_named1 + i_named2 
+  modCommon1Use = x_blank + x_named1 + i_named2
 end function
 
 
diff --git a/flang/test/Lower/module_use_in_same_file.f90 b/flang/test/Lower/module_use_in_same_file.f90
index 9e51bee14fd7a..7ef64f0bc160e 100644
--- a/flang/test/Lower/module_use_in_same_file.f90
+++ b/flang/test/Lower/module_use_in_same_file.f90
@@ -35,7 +35,7 @@ real function m2use_rename()
 ! Module modEq2 defines data that is equivalenced
 module modEq2
   ! Equivalence, no initialization
-  real :: x1(10), x2(10), x3(10) 
+  real :: x1(10), x2(10), x3(10)
   ! Equivalence with initialization
   real :: y1 = 42.
   real :: y2(10)
@@ -109,7 +109,7 @@ real function test_no_equiv_conflicts()
   use modEq2
   ! Same equivalences as in modEq2. Test that lowering does not mixes
   ! up the equivalence based on the similar offset inside the scope.
-  real :: x1l(10), x2l(10), x3l(10) 
+  real :: x1l(10), x2l(10), x3l(10)
   real :: y1l = 42.
   real :: y2l(10)
   save :: x1l, x2l, x3l, y1l, y2l
diff --git a/flang/test/Lower/namelist-common-block.f90 b/flang/test/Lower/namelist-common-block.f90
index d16f886dadb5b..54d4b9e552510 100644
--- a/flang/test/Lower/namelist-common-block.f90
+++ b/flang/test/Lower/namelist-common-block.f90
@@ -8,7 +8,7 @@ program nml_common
   real, pointer :: p(:)
   namelist /t/i,p
   common /c/i,p
-  
+
   allocate(p(2))
   call print_t()
 contains
diff --git a/flang/test/Lower/nested-where.f90 b/flang/test/Lower/nested-where.f90
index 28aced2325813..1e379667b7b03 100644
--- a/flang/test/Lower/nested-where.f90
+++ b/flang/test/Lower/nested-where.f90
@@ -313,7 +313,7 @@ program nested_where
   ! CHECK:  fir.call @_FortranARaggedArrayDeallocate(%[[VAL_278]]) {{.*}}: (!fir.llvm_ptr<i8>) -> ()
   ! CHECK:  %[[VAL_280:.*]] = fir.convert %[[VAL_4]] : (!fir.ref<tuple<i64, !fir.heap<!fir.array<?xi8>>, !fir.heap<!fir.array<?xi64>>>>) -> !fir.llvm_ptr<i8>
   ! CHECK:  fir.call @_FortranARaggedArrayDeallocate(%[[VAL_280]]) {{.*}}: (!fir.llvm_ptr<i8>) -> ()
-  
+
   integer :: a(3) = 0
   logical :: mask1(3) = (/ .true.,.false.,.true. /)
   logical :: mask2(3) = (/ .true.,.true.,.false. /)
diff --git a/flang/test/Lower/nullify-polymorphic.f90 b/flang/test/Lower/nullify-polymorphic.f90
index 58eaa11f29f64..9a6a2795d3057 100644
--- a/flang/test/Lower/nullify-polymorphic.f90
+++ b/flang/test/Lower/nullify-polymorphic.f90
@@ -13,7 +13,7 @@ module poly
   contains
     procedure, nopass :: proc1 => proc1_p2
   end type
-  
+
 contains
 
   subroutine proc1_p1()
@@ -23,7 +23,7 @@ subroutine proc1_p1()
   subroutine proc1_p2()
     print*, 'call proc1_p2'
   end subroutine
-  
+
   subroutine test_nullify()
     class(p1), pointer :: c
 
@@ -31,7 +31,7 @@ subroutine test_nullify()
     call c%proc1()
 
     nullify(c) ! c dynamic type must be reset to p1
-  
+
     call c%proc1()
   end subroutine
 end module
@@ -45,7 +45,7 @@ program test
 ! CHECK: %[[C_DESC:.*]] = fir.alloca !fir.class<!fir.ptr<!fir.type<_QMpolyTp1{a:i32,b:i32}>>> {bindc_name = "c", uniq_name = "_QMpolyFtest_nullifyEc"}
 ! CHECK: %[[C_DESC_DECL:.*]]:2 = hlfir.declare %[[C_DESC]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QMpolyFtest_nullifyEc"} : (!fir.ref<!fir.class<!fir.ptr<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> (!fir.ref<!fir.class<!fir.ptr<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>)
 ! CHECK: %{{.*}} = fir.call @_FortranAPointerAllocate(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, i1, !fir.box<none>, !fir.ref<i8>, i32) -> i32
-! CHECK: %[[DECLARED_TYPE_DESC:.*]] = fir.type_desc !fir.type<_QMpolyTp1{a:i32,b:i32}> 
+! CHECK: %[[DECLARED_TYPE_DESC:.*]] = fir.type_desc !fir.type<_QMpolyTp1{a:i32,b:i32}>
 ! CHECK: %[[C_DESC_CAST:.*]] = fir.convert %[[C_DESC_DECL]]#0 : (!fir.ref<!fir.class<!fir.ptr<!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>) -> !fir.ref<!fir.box<none>>
 ! CHECK: %[[TYPE_DESC_CAST:.*]] = fir.convert %[[DECLARED_TYPE_DESC]] : (!fir.tdesc<!fir.type<_QMpolyTp1{a:i32,b:i32}>>) -> !fir.ref<none>
 ! CHECK: %[[RANK:.*]] = arith.constant 0 : i32
diff --git a/flang/test/Lower/pointer-association-polymorphic.f90 b/flang/test/Lower/pointer-association-polymorphic.f90
index 7d166e1423cfa..a82eebb1c5958 100644
--- a/flang/test/Lower/pointer-association-polymorphic.f90
+++ b/flang/test/Lower/pointer-association-polymorphic.f90
@@ -143,9 +143,9 @@ subroutine test_pointer()
 
 ! CHECK: %[[C4_LOAD:.*]] = fir.load %[[C4_DESC]] : !fir.ref<!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>
 ! CHECK: %[[C4_REBOX:.*]] = fir.rebox %[[C4_LOAD]](%{{.*}}) : (!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>, !fir.shift<1>) -> !fir.class<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>
-! CHECK: %[[PA_CONV:.*]] = fir.convert %[[PA_DESC]] : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>) -> !fir.ref<!fir.box<none>> 
-! CHECK: %[[C4_REBOX_CONV:.*]] = fir.convert %[[C4_REBOX]] : (!fir.class<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>) -> !fir.box<none> 
-! CHECK: fir.call @_FortranAPointerAssociate(%[[PA_CONV]], %[[C4_REBOX_CONV]]) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.box<none>) -> () 
+! CHECK: %[[PA_CONV:.*]] = fir.convert %[[PA_DESC]] : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>) -> !fir.ref<!fir.box<none>>
+! CHECK: %[[C4_REBOX_CONV:.*]] = fir.convert %[[C4_REBOX]] : (!fir.class<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>) -> !fir.box<none>
+! CHECK: fir.call @_FortranAPointerAssociate(%[[PA_CONV]], %[[C4_REBOX_CONV]]) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.box<none>) -> ()
 ! CHECK-LABEL: fir.do_loop
 ! CHECK: %[[PA_LOAD:.*]] = fir.load %[[PA_DESC]] : !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>
 ! CHECK: %[[PA_COORD:.*]] = fir.coordinate_of %[[PA_LOAD]], %{{.*}} : (!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>, i64) -> !fir.ref<!fir.type<_QMpolyTp1{a:i32,b:i32}>>
@@ -165,8 +165,8 @@ subroutine test_pointer()
 ! CHECK: %[[SLICE:.*]] = fir.slice %[[C2_INDEX]], %[[C4_INDEX]], %[[C1_INDEX]] : (index, index, index) -> !fir.slice<1>
 ! CHECK: %[[SLICE_REBOX:.*]] = fir.rebox %[[C4_LOAD]](%[[SHIFT]]) [%[[SLICE]]] : (!fir.class<!fir.heap<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>, !fir.shift<1>, !fir.slice<1>) -> !fir.class<!fir.array<3x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>
 ! CHECK: %[[PA_CONV:.*]] = fir.convert %[[PA_DESC]] : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK: %[[SLICE_REBOX_CONV:.*]] = fir.convert %[[SLICE_REBOX]] : (!fir.class<!fir.array<3x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>) -> !fir.box<none> 
-! CHECK: fir.call @_FortranAPointerAssociate(%[[PA_CONV]], %[[SLICE_REBOX_CONV]]) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.box<none>) -> () 
+! CHECK: %[[SLICE_REBOX_CONV:.*]] = fir.convert %[[SLICE_REBOX]] : (!fir.class<!fir.array<3x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>) -> !fir.box<none>
+! CHECK: fir.call @_FortranAPointerAssociate(%[[PA_CONV]], %[[SLICE_REBOX_CONV]]) {{.*}} : (!fir.ref<!fir.box<none>>, !fir.box<none>) -> ()
 ! CHECK-LABEL: fir.do_loop
 ! CHECK: %[[PA_LOAD:.*]] = fir.load %[[PA_DESC]] : !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>>
 ! CHECK: %[[PA_COORD:.*]] = fir.coordinate_of %[[PA_LOAD]], %{{.*}} : (!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMpolyTp1{a:i32,b:i32}>>>>, i64) -> !fir.ref<!fir.type<_QMpolyTp1{a:i32,b:i32}>>
diff --git a/flang/test/Lower/pointer-disassociate.f90 b/flang/test/Lower/pointer-disassociate.f90
index fb70fd7795b2e..6dfabddb83291 100644
--- a/flang/test/Lower/pointer-disassociate.f90
+++ b/flang/test/Lower/pointer-disassociate.f90
@@ -113,9 +113,9 @@ subroutine test_polymorphic_null(p)
 end subroutine
 ! CHECK-LABEL:   func.func @_QPtest_polymorphic_null(
 ! CHECK-SAME:  %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFtest_polymorphic_nullTt>>>>>
-! CHECK:  %[[VAL_1:.*]] = fir.type_desc !fir.type<_QFtest_polymorphic_nullTt> 
+! CHECK:  %[[VAL_1:.*]] = fir.type_desc !fir.type<_QFtest_polymorphic_nullTt>
 ! CHECK:  %[[VAL_2:.*]] = fir.convert %[[VAL_0]] : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QFtest_polymorphic_nullTt>>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK:  %[[VAL_3:.*]] = fir.convert %[[VAL_1]] : (!fir.tdesc<!fir.type<_QFtest_polymorphic_nullTt>>) -> !fir.ref<none> 
+! CHECK:  %[[VAL_3:.*]] = fir.convert %[[VAL_1]] : (!fir.tdesc<!fir.type<_QFtest_polymorphic_nullTt>>) -> !fir.ref<none>
 ! CHECK:  %[[VAL_4:.*]] = arith.constant 1 : i32
 ! CHECK:  %[[VAL_5:.*]] = arith.constant 0 : i32
 ! CHECK:  fir.call @_FortranAPointerNullifyDerived(%[[VAL_2]], %[[VAL_3]], %[[VAL_4]], %[[VAL_5]]) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.ref<none>, i32, i32) -> ()
diff --git a/flang/test/Lower/polymorphic-temp.f90 b/flang/test/Lower/polymorphic-temp.f90
index ac3cbdba6646d..391ec2b48a1be 100644
--- a/flang/test/Lower/polymorphic-temp.f90
+++ b/flang/test/Lower/polymorphic-temp.f90
@@ -24,7 +24,7 @@ subroutine test_temp_from_intrinsic_spread()
 
     call pass_unlimited_poly_1d(spread(p, dim=1, ncopies=2))
     call pass_unlimited_poly_1d(spread(pa(1), dim=1, ncopies=2))
-    
+
   end subroutine
 
 ! CHECK-LABEL: func.func @_QMpoly_tmpPtest_temp_from_intrinsic_spread() {
@@ -220,7 +220,7 @@ subroutine test_merge_intrinsic2(a, b, i)
 ! CHECK: %[[LOAD_I:.*]] = fir.load %[[I]] : !fir.ref<i32>
 ! CHECK: %[[C1:.*]] = arith.constant 1 : i32
 ! CHECK: %[[CMPI:.*]] = arith.cmpi eq, %[[LOAD_I]], %[[C1]] : i32
-! CHECK: %[[A_REBOX:.*]] = fir.rebox %[[LOAD_A]] : (!fir.class<!fir.heap<!fir.type<_QMpoly_tmpTp1{a:i32}>>>) -> !fir.box<!fir.heap<!fir.type<_QMpoly_tmpTp1{a:i32}>>> 
+! CHECK: %[[A_REBOX:.*]] = fir.rebox %[[LOAD_A]] : (!fir.class<!fir.heap<!fir.type<_QMpoly_tmpTp1{a:i32}>>>) -> !fir.box<!fir.heap<!fir.type<_QMpoly_tmpTp1{a:i32}>>>
 ! CHECK: %{{.*}} = arith.select %[[CMPI]], %[[A_REBOX]], %[[LOAD_B]] : !fir.box<!fir.heap<!fir.type<_QMpoly_tmpTp1{a:i32}>>>
 
   subroutine check_unlimited_poly(a)
diff --git a/flang/test/Lower/polymorphic-types.f90 b/flang/test/Lower/polymorphic-types.f90
index a06e0a29b6ae8..93d08a4aee380 100644
--- a/flang/test/Lower/polymorphic-types.f90
+++ b/flang/test/Lower/polymorphic-types.f90
@@ -1,6 +1,6 @@
 ! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
 
-! Tests the different possible type involving polymorphic entities. 
+! Tests the different possible type involving polymorphic entities.
 
 module polymorphic_types
   type p1
diff --git a/flang/test/Lower/polymorphic.f90 b/flang/test/Lower/polymorphic.f90
index bc4eed54282df..689e2233dcf10 100644
--- a/flang/test/Lower/polymorphic.f90
+++ b/flang/test/Lower/polymorphic.f90
@@ -126,7 +126,7 @@ subroutine check()
 ! CHECK: %[[CLASS1:.*]] = fir.embox %[[DT1]] : (!fir.ref<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>) -> !fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>
 ! CHECK: fir.call @_QMpolymorphic_testPprint(%[[CLASS1]]) {{.*}}: (!fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>) -> ()
 ! CHECK: %[[BOX2:.*]] = fir.embox %[[DT2]] : (!fir.ref<!fir.type<_QMpolymorphic_testTp2{a:i32,b:i32,c:f32}>>) -> !fir.class<!fir.type<_QMpolymorphic_testTp2{a:i32,b:i32,c:f32}>>
-! CHECK: %[[CLASS2:.*]] = fir.convert %[[BOX2]] : (!fir.class<!fir.type<_QMpolymorphic_testTp2{a:i32,b:i32,c:f32}>>) -> !fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>> 
+! CHECK: %[[CLASS2:.*]] = fir.convert %[[BOX2]] : (!fir.class<!fir.type<_QMpolymorphic_testTp2{a:i32,b:i32,c:f32}>>) -> !fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>
 ! CHECK: fir.call @_QMpolymorphic_testPprint(%[[CLASS2]]) {{.*}}: (!fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>) -> ()
 
   subroutine test_allocate_unlimited_polymorphic_non_derived()
@@ -316,7 +316,7 @@ subroutine nullify_pointer_array(a)
 ! CHECK-LABEL: func.func @_QMpolymorphic_testPnullify_pointer_array(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.type<_QMpolymorphic_testTp3{p:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMpolymorphic_testTp3>>>>}>> {fir.bindc_name = "a"}) {
 ! CHECK: %[[COORD_P:.*]] = fir.coordinate_of %[[ARG0]], p : (!fir.ref<!fir.type<_QMpolymorphic_testTp3{p:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMpolymorphic_testTp3>>>>}>>) -> !fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMpolymorphic_testTp3{p:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMpolymorphic_testTp3>>>>}>>>>>
-! CHECK: %[[TYPE_DESC:.*]] = fir.type_desc !fir.type<_QMpolymorphic_testTp3{p:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMpolymorphic_testTp3>>>>}> 
+! CHECK: %[[TYPE_DESC:.*]] = fir.type_desc !fir.type<_QMpolymorphic_testTp3{p:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMpolymorphic_testTp3>>>>}>
 ! CHECK: %[[CONV_P:.*]] = fir.convert %[[COORD_P]] : (!fir.ref<!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMpolymorphic_testTp3{p:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMpolymorphic_testTp3>>>>}>>>>>) -> !fir.ref<!fir.box<none>>
 ! CHECK: %[[CONV_TDESC:.*]] = fir.convert %[[TYPE_DESC]] : (!fir.tdesc<!fir.type<_QMpolymorphic_testTp3{p:!fir.class<!fir.ptr<!fir.array<?x!fir.type<_QMpolymorphic_testTp3>>>>}>>) -> !fir.ref<none>
 ! CHECK: %[[C1:.*]] = arith.constant 1 : i32
@@ -517,7 +517,7 @@ subroutine test_elemental_assign()
 
   subroutine host_assoc(this)
     class(p1) :: this
-    
+
     call internal
   contains
     subroutine internal
@@ -779,7 +779,7 @@ function unlimited_polymorphic_alloc_array_ret()
   subroutine test_unlimited_polymorphic_alloc_array_ret()
     select type (a => unlimited_polymorphic_alloc_array_ret())
       type is (real)
-        print*, 'type is real' 
+        print*, 'type is real'
     end select
   end subroutine
 
@@ -795,7 +795,7 @@ subroutine test_unlimited_polymorphic_intentout(a)
 ! CHECK-LABEL: func.func @_QMpolymorphic_testPtest_unlimited_polymorphic_intentout(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.class<none> {fir.bindc_name = "a"}) {
 ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[ARG0]] : (!fir.class<none>) -> !fir.box<none>
-! CHECK: fir.call @_FortranADestroy(%[[BOX_NONE]]) {{.*}} : (!fir.box<none>) -> () 
+! CHECK: fir.call @_FortranADestroy(%[[BOX_NONE]]) {{.*}} : (!fir.box<none>) -> ()
 ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[ARG0]] : (!fir.class<none>) -> !fir.box<none>
 ! CHECK: fir.call @_FortranAInitialize(%[[BOX_NONE]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.box<none>, !fir.ref<i8>, i32) -> ()
 
@@ -806,7 +806,7 @@ subroutine test_polymorphic_intentout(a)
 ! CHECK-LABEL: func.func @_QMpolymorphic_testPtest_polymorphic_intentout(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>> {fir.bindc_name = "a"}) {
 ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[ARG0]] : (!fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>) -> !fir.box<none>
-! CHECK: fir.call @_FortranADestroy(%[[BOX_NONE]]) {{.*}} : (!fir.box<none>) -> () 
+! CHECK: fir.call @_FortranADestroy(%[[BOX_NONE]]) {{.*}} : (!fir.box<none>) -> ()
 ! CHECK: %[[BOX_NONE:.*]] = fir.convert %[[ARG0]] : (!fir.class<!fir.type<_QMpolymorphic_testTp1{a:i32,b:i32}>>) -> !fir.box<none>
 ! CHECK: fir.call @_FortranAInitialize(%[[BOX_NONE]], %{{.*}}, %{{.*}}) {{.*}} : (!fir.box<none>, !fir.ref<i8>, i32) -> ()
 
diff --git a/flang/test/Lower/pre-fir-tree02.f90 b/flang/test/Lower/pre-fir-tree02.f90
index 65c33e9b364fe..d61dc801eea60 100644
--- a/flang/test/Lower/pre-fir-tree02.f90
+++ b/flang/test/Lower/pre-fir-tree02.f90
@@ -148,7 +148,7 @@ subroutine incr(i)
 module test
   !! When derived type processing is implemented, remove all instances of:
   !!  - !![disable]
-  !!  -  COM: 
+  !!  -  COM:
   !![disable]type :: a_type
   !![disable]  integer :: x
   !![disable]end type
diff --git a/flang/test/Lower/procedure-declarations.f90 b/flang/test/Lower/procedure-declarations.f90
index b0dee600f563d..95c8607a97e59 100644
--- a/flang/test/Lower/procedure-declarations.f90
+++ b/flang/test/Lower/procedure-declarations.f90
@@ -4,7 +4,7 @@
 ! (passing a procedure and calling it), with and without definitions.
 ! Check that the definition type prevail if available and that casts are inserted to
 ! accommodate for the signature mismatch in the different location due to implicit
-! typing rules and Fortran loose interface compatibility rule history. 
+! typing rules and Fortran loose interface compatibility rule history.
 
 
 ! Note: all the cases where their is a definition are exactly the same,
@@ -25,7 +25,7 @@ subroutine call_foo(i)
   ! %[[argconvert:*]] = fir.convert %arg0 :
   ! fir.call @_QPfoo(%[[argconvert]]) {{.*}}: (!fir.ref<!fir.array<2x5xi32>>) -> ()
   call foo(i)
-end subroutine 
+end subroutine
 ! CHECK-LABEL: func @_QPfoo(
 ! CHECK-SAME: %{{.*}}: !fir.ref<!fir.array<2x5xi32>>{{.*}}) {
 subroutine foo(i)
@@ -41,7 +41,7 @@ subroutine call_foo2(i)
   ! %[[argconvert:*]] = fir.convert %arg0 :
   ! fir.call @_QPfoo2(%[[argconvert]]) {{.*}}: (!fir.ref<!fir.array<2x5xi32>>) -> ()
   call foo2(i)
-end subroutine 
+end subroutine
 ! CHECK-LABEL: func @_QPpass_foo2() {
 subroutine pass_foo2()
   external :: foo2
@@ -64,7 +64,7 @@ subroutine call_foo3(i)
   ! %[[argconvert:*]] = fir.convert %arg0 :
   ! fir.call @_QPfoo3(%[[argconvert]]) {{.*}}: (!fir.ref<!fir.array<2x5xi32>>) -> ()
   call foo3(i)
-end subroutine 
+end subroutine
 ! CHECK-LABEL: func @_QPfoo3(
 ! CHECK-SAME: %{{.*}}: !fir.ref<!fir.array<2x5xi32>>{{.*}}) {
 subroutine foo3(i)
@@ -93,7 +93,7 @@ subroutine call_foo4(i)
   ! %[[argconvert:*]] = fir.convert %arg0 :
   ! fir.call @_QPfoo4(%[[argconvert]]) {{.*}}: (!fir.ref<!fir.array<2x5xi32>>) -> ()
   call foo4(i)
-end subroutine 
+end subroutine
 ! CHECK-LABEL: func @_QPpass_foo4() {
 subroutine pass_foo4()
   external :: foo4
@@ -123,7 +123,7 @@ subroutine call_foo5(i)
   ! %[[argconvert:*]] = fir.convert %arg0 :
   ! fir.call @_QPfoo5(%[[argconvert]]) {{.*}}: (!fir.ref<!fir.array<2x5xi32>>) -> ()
   call foo5(i)
-end subroutine 
+end subroutine
 
 
 ! Test when there is no definition (declaration at the end of the mlir module)
@@ -136,7 +136,7 @@ subroutine call_foo6(i)
   integer :: i(10)
   ! CHECK-NOT: convert
   call foo6(i)
-end subroutine 
+end subroutine
 ! CHECK-LABEL: func @_QPpass_foo6() {
 subroutine pass_foo6()
   external :: foo6
@@ -160,7 +160,7 @@ function call_foo7(i)
   ! CHECK: %[[funccast:.*]] = fir.convert %[[f]] : (() -> ()) -> ((!fir.ref<!fir.array<10xi32>>) -> f32)
   ! CHECK: fir.call %[[funccast]](%arg0) {{.*}}: (!fir.ref<!fir.array<10xi32>>) -> f32
   call_foo7 =  foo7(i)
-end function 
+end function
 
 
 ! call, call with different type
@@ -170,14 +170,14 @@ subroutine call_foo8(i)
   integer :: i(10)
   ! CHECK-NOT: convert
   call foo8(i)
-end subroutine 
+end subroutine
 ! CHECK-LABEL: func @_QPcall_foo8_2(
 ! CHECK-SAME: %{{.*}}: !fir.ref<!fir.array<2x5xi32>>{{.*}}) {
 subroutine call_foo8_2(i)
   integer :: i(2, 5)
   ! %[[argconvert:*]] = fir.convert %arg0 :
   call foo8(i)
-end subroutine 
+end subroutine
 
 ! Test that target attribute is lowered in declaration of functions that are
 ! not defined in this file.
diff --git a/flang/test/Lower/read-write-buffer.f90 b/flang/test/Lower/read-write-buffer.f90
index cfa25c8a0ad6e..ff4fabca2869c 100644
--- a/flang/test/Lower/read-write-buffer.f90
+++ b/flang/test/Lower/read-write-buffer.f90
@@ -15,8 +15,8 @@ subroutine test_array_format
   ! CHECK: %[[fmtArg:.*]] = fir.zero_bits !fir.ref<i8>
   ! CHECK: %[[fmtLenArg:.*]] = fir.zero_bits i64
   ! CHECK: %[[fmtDesc:.*]] = fir.convert %[[fmtBox]] : (!fir.box<!fir.array<2x!fir.char<1,10>>>) -> !fir.box<none>
-  ! CHECK: fir.call @_FortranAioBeginExternalFormattedOutput(%[[fmtArg]], %[[fmtLenArg]], %[[fmtDesc]], {{.*}}) 
-  write(*, array) 
+  ! CHECK: fir.call @_FortranAioBeginExternalFormattedOutput(%[[fmtArg]], %[[fmtLenArg]], %[[fmtDesc]], {{.*}})
+  write(*, array)
 end subroutine
 
 ! A test to check the buffer and it's length.
diff --git a/flang/test/Lower/select-type.f90 b/flang/test/Lower/select-type.f90
index e2ca87ad447c9..246b653390b59 100644
--- a/flang/test/Lower/select-type.f90
+++ b/flang/test/Lower/select-type.f90
@@ -38,7 +38,7 @@ function negate(this)
     allocate(negate, source=this)
     negate%a = -this%a
   end function
-  
+
   subroutine select_type1(a)
     class(p1), intent(in) :: a
 
@@ -275,7 +275,7 @@ subroutine select_type5(a)
 ! CHECK-LABEL: func.func @_QMselect_type_lower_testPselect_type5(
 ! CHECK-SAME: %[[ARG0:.*]]: !fir.class<none> {fir.bindc_name = "a"})
 ! CHECK: fir.select_type %[[ARG0]] : !fir.class<none>
-! CHECK-SAME: [#fir.type_is<i8>, ^[[I8_BLK:.*]], #fir.type_is<i32>, ^[[I32_BLK:.*]], #fir.type_is<f32>, ^[[F32_BLK:.*]], #fir.type_is<!fir.logical<4>>, ^[[LOG_BLK:.*]], #fir.type_is<!fir.char<1,?>>, ^[[CHAR_BLK:.*]], unit, ^[[DEFAULT:.*]]] 
+! CHECK-SAME: [#fir.type_is<i8>, ^[[I8_BLK:.*]], #fir.type_is<i32>, ^[[I32_BLK:.*]], #fir.type_is<f32>, ^[[F32_BLK:.*]], #fir.type_is<!fir.logical<4>>, ^[[LOG_BLK:.*]], #fir.type_is<!fir.char<1,?>>, ^[[CHAR_BLK:.*]], unit, ^[[DEFAULT:.*]]]
 ! CHECK: ^[[I8_BLK]]
 ! CHECK: ^[[I32_BLK]]
 ! CHECK: ^[[F32_BLK]]
@@ -467,7 +467,7 @@ subroutine select_type8(a)
 ! CHECK: %[[SELECTOR:.*]] = fir.rebox %[[ARG0]] : (!fir.class<!fir.array<?xnone>>) -> !fir.class<!fir.array<?xnone>>
 ! CHECK: fir.select_type %[[SELECTOR]] : !fir.class<!fir.array<?xnone>> [#fir.type_is<i32>, ^{{.*}}, #fir.type_is<f32>, ^{{.*}}, #fir.type_is<!fir.char<1,?>>, ^bb{{.*}}, unit, ^{{.*}}]
 ! CHECK: ^bb{{.*}}:
-! CHECK:  %[[BOX:.*]] = fir.convert %[[SELECTOR]] : (!fir.class<!fir.array<?xnone>>) -> !fir.box<!fir.array<?xi32>> 
+! CHECK:  %[[BOX:.*]] = fir.convert %[[SELECTOR]] : (!fir.class<!fir.array<?xnone>>) -> !fir.box<!fir.array<?xi32>>
 ! CHECK:  %[[C0:.*]] = arith.constant 0 : index
 ! CHECK:  %[[SELECTOR_DIMS:.*]]:3 = fir.box_dims %[[BOX]], %[[C0]] : (!fir.box<!fir.array<?xi32>>, index) -> (index, index, index)
 ! CHECK:  %[[ARRAY_LOAD:.*]] = fir.array_load %[[BOX]] : (!fir.box<!fir.array<?xi32>>) -> !fir.array<?xi32>
@@ -482,7 +482,7 @@ subroutine select_type8(a)
 ! CHECK:  fir.array_merge_store %[[ARRAY_LOAD]], %[[LOOP_RES]] to %[[BOX]] : !fir.array<?xi32>, !fir.array<?xi32>, !fir.box<!fir.array<?xi32>>
 ! CHECK:  cf.br ^{{.*}}
 ! CHECK: ^bb{{.*}}:
-! CHECK:  %[[BOX:.*]] = fir.convert %[[SELECTOR]] : (!fir.class<!fir.array<?xnone>>) -> !fir.box<!fir.array<?xf32>> 
+! CHECK:  %[[BOX:.*]] = fir.convert %[[SELECTOR]] : (!fir.class<!fir.array<?xnone>>) -> !fir.box<!fir.array<?xf32>>
 ! CHECK:  %[[C0:.*]] = arith.constant 0 : index
 ! CHECK:  %[[SELECTOR_DIMS:.*]]:3 = fir.box_dims %[[BOX]], %[[C0]] : (!fir.box<!fir.array<?xf32>>, index) -> (index, index, index)
 ! CHECK:  %[[ARRAY_LOAD:.*]] = fir.array_load %[[BOX]] : (!fir.box<!fir.array<?xf32>>) -> !fir.array<?xf32>
@@ -497,7 +497,7 @@ subroutine select_type8(a)
 ! CHECK:  fir.array_merge_store %[[ARRAY_LOAD]], %[[LOOP_RES]] to %[[BOX]] : !fir.array<?xf32>, !fir.array<?xf32>, !fir.box<!fir.array<?xf32>>
 ! CHECK:  cf.br ^{{.*}}
 ! CHECK: ^bb{{.*}}:
-! CHECK:  %[[BOX:.*]] = fir.convert %{{[0-9]+}} : (!fir.class<!fir.array<?xnone>>) -> !fir.box<!fir.array<?x!fir.char<1,?>>> 
+! CHECK:  %[[BOX:.*]] = fir.convert %{{[0-9]+}} : (!fir.class<!fir.array<?xnone>>) -> !fir.box<!fir.array<?x!fir.char<1,?>>>
 ! CHECK:  cf.br ^bb{{.*}}
 ! CHECK: ^bb{{.*}}:
 ! CHECK:  %[[EXACT_BOX:.*]] = fir.convert %[[SELECTOR]] : (!fir.class<!fir.array<?xnone>>) -> !fir.box<!fir.array<?x!fir.type<_QMselect_type_lower_testTp1{a:i32,b:i32}>>>
@@ -517,7 +517,7 @@ subroutine select_type8(a)
 ! CHECK:  %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[EXACT_BOX]], %[[C0]] : (!fir.box<!fir.array<?x!fir.type<_QMselect_type_lower_testTp1{a:i32,b:i32}>>>, index) -> (index, index, index)
 ! CHECK:  %[[C1:.*]] = arith.constant 1 : index
 ! CHECK:  %[[SLICE:.*]] = fir.slice %[[C1]], %[[BOX_DIMS]]#1, %[[C1]] path %[[FIELD_B]] : (index, index, index, !fir.field) -> !fir.slice<1>
-! CHECK:  %[[ARRAY_LOAD:.*]] = fir.array_load %[[EXACT_BOX]] [%[[SLICE]]] : (!fir.box<!fir.array<?x!fir.type<_QMselect_type_lower_testTp1{a:i32,b:i32}>>>, !fir.slice<1>) -> !fir.array<?xi32> 
+! CHECK:  %[[ARRAY_LOAD:.*]] = fir.array_load %[[EXACT_BOX]] [%[[SLICE]]] : (!fir.box<!fir.array<?x!fir.type<_QMselect_type_lower_testTp1{a:i32,b:i32}>>>, !fir.slice<1>) -> !fir.array<?xi32>
 ! CHECK:  %[[DO_RES:.*]] = fir.do_loop %[[IND:.*]] = %{{.*}} to %{{.*}} step %c{{.*}} unordered iter_args(%[[ARG:.*]] = %[[ARRAY_LOAD]]) -> (!fir.array<?xi32>) {
 ! CHECK:    %[[ARR_UP:.*]] = fir.array_update %[[ARG]], %{{.*}}, %[[IND]] : (!fir.array<?xi32>, i32, index) -> !fir.array<?xi32>
 ! CHECK:    fir.result %[[ARR_UP]] : !fir.array<?xi32>
@@ -599,7 +599,7 @@ subroutine select_type9(a)
 ! CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[EXACT_BOX]], %[[C0]] : (!fir.box<!fir.array<?x!fir.type<_QMselect_type_lower_testTp1{a:i32,b:i32}>>>, index) -> (index, index, index)
 ! CHECK: %[[C1:.*]] = arith.constant 1 : index
 ! CHECK: %[[SLICE:.*]] = fir.slice %[[C1]], %[[BOX_DIMS]]#1, %[[C1]] path %[[FIELD_B]] : (index, index, index, !fir.field) -> !fir.slice<1>
-! CHECK: %[[ARRAY_LOAD:.*]] = fir.array_load %[[EXACT_BOX]] [%[[SLICE]]] : (!fir.box<!fir.array<?x!fir.type<_QMselect_type_lower_testTp1{a:i32,b:i32}>>>, !fir.slice<1>) -> !fir.array<?xi32> 
+! CHECK: %[[ARRAY_LOAD:.*]] = fir.array_load %[[EXACT_BOX]] [%[[SLICE]]] : (!fir.box<!fir.array<?x!fir.type<_QMselect_type_lower_testTp1{a:i32,b:i32}>>>, !fir.slice<1>) -> !fir.array<?xi32>
 ! CHECK: %[[DO_RES:.*]] = fir.do_loop %[[IND:.*]] = %{{.*}} to %{{.*}} step %c{{.*}} unordered iter_args(%[[ARG:.*]] = %[[ARRAY_LOAD]]) -> (!fir.array<?xi32>) {
 ! CHECK:   %[[ARR_UP:.*]] = fir.array_update %[[ARG]], %{{.*}}, %[[IND]] : (!fir.array<?xi32>, i32, index) -> !fir.array<?xi32>
 ! CHECK:   fir.result %[[ARR_UP]] : !fir.array<?xi32>
@@ -607,7 +607,7 @@ subroutine select_type9(a)
 ! CHECK: fir.array_merge_store %[[ARRAY_LOAD]], %[[DO_RES]] to %[[EXACT_BOX]][%[[SLICE]]] : !fir.array<?xi32>, !fir.array<?xi32>, !fir.box<!fir.array<?x!fir.type<_QMselect_type_lower_testTp1{a:i32,b:i32}>>>, !fir.slice<1>
 ! CHECK: cf.br ^bb{{.*}}
 ! CHECK: ^bb{{.*}}:
-! CHECK: %[[EXACT_BOX:.*]] = fir.convert %[[SELECTOR]] : (!fir.class<!fir.array<?x!fir.type<_QMselect_type_lower_testTp1{a:i32,b:i32}>>>) -> !fir.box<!fir.array<?x!fir.type<_QMselect_type_lower_testTp2{a:i32,b:i32,c:i32}>>> 
+! CHECK: %[[EXACT_BOX:.*]] = fir.convert %[[SELECTOR]] : (!fir.class<!fir.array<?x!fir.type<_QMselect_type_lower_testTp1{a:i32,b:i32}>>>) -> !fir.box<!fir.array<?x!fir.type<_QMselect_type_lower_testTp2{a:i32,b:i32,c:i32}>>>
 ! CHECK: %[[FIELD_A:.*]] = fir.field_index a, !fir.type<_QMselect_type_lower_testTp2{a:i32,b:i32,c:i32}>
 ! CHECK: %[[C0:.*]] = arith.constant 0 : index
 ! CHECK: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[EXACT_BOX]], %[[C0]] : (!fir.box<!fir.array<?x!fir.type<_QMselect_type_lower_testTp2{a:i32,b:i32,c:i32}>>>, index) -> (index, index, index)
diff --git a/flang/test/Lower/statement-function.f90 b/flang/test/Lower/statement-function.f90
index fe07649e669af..9dd26d4f58273 100644
--- a/flang/test/Lower/statement-function.f90
+++ b/flang/test/Lower/statement-function.f90
@@ -21,7 +21,7 @@ real function test_stmt_0(x)
 
 ! Check this is not lowered as a simple macro: e.g. argument is only
 ! evaluated once even if it appears in several placed inside the
-! statement function expression 
+! statement function expression
 ! CHECK-LABEL: func @_QPtest_stmt_only_eval_arg_once() -> f32
 real(4) function test_stmt_only_eval_arg_once()
   real(4) :: only_once, x1
diff --git a/flang/test/Lower/variable.f90 b/flang/test/Lower/variable.f90
index 76d4a26838bf7..52cdb96d89c63 100644
--- a/flang/test/Lower/variable.f90
+++ b/flang/test/Lower/variable.f90
@@ -4,7 +4,7 @@
 subroutine s
   ! CHECK-DAG: fir.alloca !fir.box<!fir.heap<i32>> {{{.*}}uniq_name = "{{.*}}Eally"}
   integer, allocatable :: ally
-  ! CHECK-DAG: fir.alloca !fir.box<!fir.ptr<i32>> {{{.*}}uniq_name = "{{.*}}Epointy"} 
+  ! CHECK-DAG: fir.alloca !fir.box<!fir.ptr<i32>> {{{.*}}uniq_name = "{{.*}}Epointy"}
   integer, pointer :: pointy
   ! CHECK-DAG: fir.alloca i32 {{{.*}}fir.target{{.*}}uniq_name = "{{.*}}Ebullseye"}
   integer, target :: bullseye
diff --git a/flang/test/Lower/volatile-allocatable.f90 b/flang/test/Lower/volatile-allocatable.f90
index c33d368c5858f..3147698114115 100644
--- a/flang/test/Lower/volatile-allocatable.f90
+++ b/flang/test/Lower/volatile-allocatable.f90
@@ -42,7 +42,7 @@ subroutine test_scalar_volatile()
   ! Deferred-length characters
   allocate(character(20) :: c1)
   c1 = "volatile character"
-  
+
   ! Allocation with components
   allocate(v3)
   deallocate(v1, v2, v3, c1)
@@ -53,24 +53,24 @@ subroutine test_volatile_asynchronous()
   use derived_types
   class(base_type), allocatable, volatile, asynchronous :: v1(:)
   integer, allocatable, volatile, asynchronous :: i1(:)
-  
+
   allocate(v1(4))
   allocate(i1(4), source=[1, 2, 3, 4])
-  
+
   deallocate(v1, i1)
 end subroutine
 
 subroutine test_select_base_type_volatile()
   use derived_types
   class(base_type), allocatable, volatile :: v(:)
-  
+
   allocate(v(2))
-  
+
   select type(v)
   class is (base_type)
     v(1)%i = 100
   end select
-  
+
   deallocate(v)
 end subroutine
 
@@ -79,12 +79,12 @@ subroutine test_mold_allocation()
   use derived_types
   type(comp_type) :: template
   type(comp_type), allocatable, volatile :: v(:)
-  
+
   template%str = "mold test"
   template%arr = [5, 6]
-  
+
   allocate(v(3), mold=template)
-  
+
   deallocate(v)
 end subroutine
 
@@ -93,28 +93,28 @@ subroutine test_unlimited_polymorphic()
   use derived_types
   class(*), allocatable, volatile :: up
   class(*), allocatable, volatile :: upa(:)
-  
+
   ! Scalar allocation
   allocate(integer :: up)
   select type(up)
     type is (integer)
       up = 123
   end select
-  
+
   ! Array allocation with source
   allocate(character(10) :: up)
   select type(up)
     type is (character(*))
       up = "class(*)"
   end select
-  
+
   ! Array allocation
   allocate(real :: upa(3))
   select type(upa)
     type is (real)
       upa = [1.1, 2.2, 3.3]
   end select
-  
+
   deallocate(up, upa)
 end subroutine
 
diff --git a/flang/test/Lower/volatile-openmp1.f90 b/flang/test/Lower/volatile-openmp1.f90
index 07d81a1aeb240..74ab3858c8958 100644
--- a/flang/test/Lower/volatile-openmp1.f90
+++ b/flang/test/Lower/volatile-openmp1.f90
@@ -5,7 +5,7 @@ program main
 integer::n,i
 a=0
 n=1000
-!$omp parallel 
+!$omp parallel
 !$omp do reduction(+:a)
   do i=1,n
     a=a+1
diff --git a/flang/test/Semantics/OpenMP/compiler-directives-loop.f90 b/flang/test/Semantics/OpenMP/compiler-directives-loop.f90
new file mode 100644
index 0000000000000..48b9529c95dc2
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/compiler-directives-loop.f90
@@ -0,0 +1,21 @@
+!RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=60 %s -o - | FileCheck %s
+
+! Check that this compiles successfully, but do not rely on any specific output.
+
+!CHECK: omp.parallel
+
+program omp_cdir_crash
+  implicit none
+  integer, parameter :: n = 10
+  real :: a(n)
+  integer :: i
+
+!$omp parallel do
+!dir$ unroll
+  do i = 1, n  ! loop follows a compiler directive, not the OpenMP directive
+    a(i) = real(i)
+  end do
+!$omp end parallel do
+
+  print *, 'a(1)=', a(1), ' a(n)=', a(n)  ! reads a after the parallel loop
+end program omp_cdir_crash
diff --git a/flang/test/Semantics/OpenMP/loop-association.f90 b/flang/test/Semantics/OpenMP/loop-association.f90
index 9c79a91429fdf..4e63cafb3fda1 100644
--- a/flang/test/Semantics/OpenMP/loop-association.f90
+++ b/flang/test/Semantics/OpenMP/loop-association.f90
@@ -33,10 +33,9 @@
   END DO outer
 
   ! Accept directives between parallel do and actual loop.
-  !ERROR: A DO loop must follow the PARALLEL DO directive
   !$OMP PARALLEL DO
   !WARNING: Unrecognized compiler directive was ignored [-Wignored-directive]
-  !ERROR: Compiler directives are not allowed inside OpenMP loop constructs
+  !WARNING: Compiler directives are not allowed inside OpenMP loop constructs
   !DIR$ VECTOR ALIGNED
   DO 20 i=1,N
      a = a + 0.5
diff --git a/libclc/clc/include/clc/atomic/clc_atomic_flag_clear.h b/libclc/clc/include/clc/atomic/clc_atomic_flag_clear.h
new file mode 100644
index 0000000000000..fee7c0506abc1
--- /dev/null
+++ b/libclc/clc/include/clc/atomic/clc_atomic_flag_clear.h
@@ -0,0 +1,24 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_ATOMIC_CLC_ATOMIC_FLAG_CLEAR_H__
+#define __CLC_ATOMIC_CLC_ATOMIC_FLAG_CLEAR_H__
+
+#include <clc/internal/clc.h>
+
+#define __CLC_DECLARE_ATOMIC_FLAG_CLEAR(ADDRSPACE)                             \
+  _CLC_OVERLOAD _CLC_DECL void __clc_atomic_flag_clear(                        \
+      ADDRSPACE int *Ptr, int MemoryOrder, int MemoryScope);
+
+__CLC_DECLARE_ATOMIC_FLAG_CLEAR(global) // one overload per address space
+__CLC_DECLARE_ATOMIC_FLAG_CLEAR(local)
+#if _CLC_GENERIC_AS_SUPPORTED
+__CLC_DECLARE_ATOMIC_FLAG_CLEAR() // no qualifier: generic address space
+#endif // _CLC_GENERIC_AS_SUPPORTED
+
+#endif // __CLC_ATOMIC_CLC_ATOMIC_FLAG_CLEAR_H__
diff --git a/libclc/clc/include/clc/atomic/clc_atomic_flag_test_and_set.h b/libclc/clc/include/clc/atomic/clc_atomic_flag_test_and_set.h
new file mode 100644
index 0000000000000..afc373204ad70
--- /dev/null
+++ b/libclc/clc/include/clc/atomic/clc_atomic_flag_test_and_set.h
@@ -0,0 +1,24 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_ATOMIC_CLC_ATOMIC_FLAG_TEST_AND_SET_H__
+#define __CLC_ATOMIC_CLC_ATOMIC_FLAG_TEST_AND_SET_H__
+
+#include <clc/internal/clc.h>
+
+#define __CLC_DECLARE_ATOMIC_FLAG_TEST_AND_SET(ADDRSPACE)                      \
+  _CLC_OVERLOAD _CLC_DECL bool __clc_atomic_flag_test_and_set(                 \
+      ADDRSPACE int *Ptr, int MemoryOrder, int MemoryScope);
+
+__CLC_DECLARE_ATOMIC_FLAG_TEST_AND_SET(global) // one overload per address space
+__CLC_DECLARE_ATOMIC_FLAG_TEST_AND_SET(local)
+#if _CLC_GENERIC_AS_SUPPORTED
+__CLC_DECLARE_ATOMIC_FLAG_TEST_AND_SET() // no qualifier: generic address space
+#endif // _CLC_GENERIC_AS_SUPPORTED
+
+#endif // __CLC_ATOMIC_CLC_ATOMIC_FLAG_TEST_AND_SET_H__
diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES
index ee4f771799e8e..64fc6b4827a1a 100644
--- a/libclc/clc/lib/generic/SOURCES
+++ b/libclc/clc/lib/generic/SOURCES
@@ -9,6 +9,8 @@ atomic/clc_atomic_fetch_min.cl
 atomic/clc_atomic_fetch_or.cl
 atomic/clc_atomic_fetch_sub.cl
 atomic/clc_atomic_fetch_xor.cl
+atomic/clc_atomic_flag_clear.cl
+atomic/clc_atomic_flag_test_and_set.cl
 atomic/clc_atomic_inc.cl
 atomic/clc_atomic_load.cl
 atomic/clc_atomic_store.cl
diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_flag_clear.cl b/libclc/clc/lib/generic/atomic/clc_atomic_flag_clear.cl
new file mode 100644
index 0000000000000..e03e63bd82d5a
--- /dev/null
+++ b/libclc/clc/lib/generic/atomic/clc_atomic_flag_clear.cl
@@ -0,0 +1,25 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <clc/atomic/clc_atomic_flag_clear.h>
+#include <clc/atomic/clc_atomic_store.h>
+
+#define __CLC_ATOMIC_FLAG_FALSE 0 // value stored by __clc_atomic_flag_clear
+
+#define __CLC_DEFINE_ATOMIC_FLAG_CLEAR(ADDRSPACE)                              \
+  _CLC_OVERLOAD _CLC_DEF void __clc_atomic_flag_clear(                         \
+      ADDRSPACE int *Ptr, int MemoryOrder, int MemoryScope) {                  \
+    __clc_atomic_store(Ptr, __CLC_ATOMIC_FLAG_FALSE, MemoryOrder,              \
+                       MemoryScope);                                           \
+  } // clear == atomic store of the false value, order/scope forwarded as-is
+
+__CLC_DEFINE_ATOMIC_FLAG_CLEAR(global) // one definition per address space
+__CLC_DEFINE_ATOMIC_FLAG_CLEAR(local)
+#if _CLC_GENERIC_AS_SUPPORTED
+__CLC_DEFINE_ATOMIC_FLAG_CLEAR() // no qualifier: generic address space
+#endif // _CLC_GENERIC_AS_SUPPORTED
diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_flag_test_and_set.cl b/libclc/clc/lib/generic/atomic/clc_atomic_flag_test_and_set.cl
new file mode 100644
index 0000000000000..4a033e3532af9
--- /dev/null
+++ b/libclc/clc/lib/generic/atomic/clc_atomic_flag_test_and_set.cl
@@ -0,0 +1,25 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <clc/atomic/clc_atomic_exchange.h>
+#include <clc/atomic/clc_atomic_flag_test_and_set.h>
+
+#define __CLC_ATOMIC_FLAG_TRUE 1 // value exchanged in; old value is returned
+
+#define __CLC_DEFINE_ATOMIC_FLAG_TEST_AND_SET(ADDRSPACE)                       \
+  _CLC_OVERLOAD _CLC_DEF bool __clc_atomic_flag_test_and_set(                  \
+      ADDRSPACE int *Ptr, int MemoryOrder, int MemoryScope) {                  \
+    return (bool)__clc_atomic_exchange(Ptr, __CLC_ATOMIC_FLAG_TRUE,            \
+                                       MemoryOrder, MemoryScope);              \
+  } // result of the exchange, converted to bool
+
+__CLC_DEFINE_ATOMIC_FLAG_TEST_AND_SET(global) // one definition per addrspace
+__CLC_DEFINE_ATOMIC_FLAG_TEST_AND_SET(local)
+#if _CLC_GENERIC_AS_SUPPORTED
+__CLC_DEFINE_ATOMIC_FLAG_TEST_AND_SET() // no qualifier: generic address space
+#endif // _CLC_GENERIC_AS_SUPPORTED
diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_flag_clear.h b/libclc/opencl/include/clc/opencl/atomic/atomic_flag_clear.h
new file mode 100644
index 0000000000000..2fcd3eef43a65
--- /dev/null
+++ b/libclc/opencl/include/clc/opencl/atomic/atomic_flag_clear.h
@@ -0,0 +1,46 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FLAG_CLEAR_H__
+#define __CLC_OPENCL_ATOMIC_ATOMIC_FLAG_CLEAR_H__
+
+#include <clc/opencl/opencl-base.h>
+#include <clc/opencl/types.h>
+
+#if defined(__opencl_c_atomic_order_seq_cst) &&                                \
+    defined(__opencl_c_atomic_scope_device)
+_CLC_OVERLOAD _CLC_DECL void atomic_flag_clear(volatile __global atomic_flag *);
+_CLC_OVERLOAD _CLC_DECL void atomic_flag_clear(volatile __local atomic_flag *);
+#if defined(__opencl_c_generic_address_space)
+_CLC_OVERLOAD _CLC_DECL void atomic_flag_clear(volatile atomic_flag *);
+#endif // defined(__opencl_c_generic_address_space)
+#endif // __opencl_c_atomic_order_seq_cst && __opencl_c_atomic_scope_device
+
+#if defined(__opencl_c_atomic_scope_device)
+_CLC_OVERLOAD _CLC_DECL void
+atomic_flag_clear_explicit(volatile __global atomic_flag *, memory_order);
+_CLC_OVERLOAD _CLC_DECL void
+atomic_flag_clear_explicit(volatile __local atomic_flag *, memory_order);
+#if defined(__opencl_c_generic_address_space)
+_CLC_OVERLOAD _CLC_DECL void atomic_flag_clear_explicit(volatile atomic_flag *,
+                                                        memory_order);
+#endif // defined(__opencl_c_generic_address_space)
+#endif // defined(__opencl_c_atomic_scope_device)
+
+_CLC_OVERLOAD _CLC_DECL void // order + scope form: no feature-macro guard
+atomic_flag_clear_explicit(volatile __global atomic_flag *, memory_order,
+                           memory_scope);
+_CLC_OVERLOAD _CLC_DECL void
+atomic_flag_clear_explicit(volatile __local atomic_flag *, memory_order,
+                           memory_scope);
+#if defined(__opencl_c_generic_address_space)
+_CLC_OVERLOAD _CLC_DECL void
+atomic_flag_clear_explicit(volatile atomic_flag *, memory_order, memory_scope);
+#endif // defined(__opencl_c_generic_address_space)
+
+#endif // __CLC_OPENCL_ATOMIC_ATOMIC_FLAG_CLEAR_H__
diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_flag_test_and_set.h b/libclc/opencl/include/clc/opencl/atomic/atomic_flag_test_and_set.h
new file mode 100644
index 0000000000000..6e3a8e403d5da
--- /dev/null
+++ b/libclc/opencl/include/clc/opencl/atomic/atomic_flag_test_and_set.h
@@ -0,0 +1,50 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FLAG_TEST_AND_SET_H__
+#define __CLC_OPENCL_ATOMIC_ATOMIC_FLAG_TEST_AND_SET_H__
+
+#include <clc/opencl/opencl-base.h>
+#include <clc/opencl/types.h>
+
+#if defined(__opencl_c_atomic_order_seq_cst) &&                                \
+    defined(__opencl_c_atomic_scope_device)
+_CLC_OVERLOAD _CLC_DECL bool
+atomic_flag_test_and_set(volatile __global atomic_flag *);
+_CLC_OVERLOAD _CLC_DECL bool
+atomic_flag_test_and_set(volatile __local atomic_flag *);
+#if defined(__opencl_c_generic_address_space)
+_CLC_OVERLOAD _CLC_DECL bool atomic_flag_test_and_set(volatile atomic_flag *);
+#endif // defined(__opencl_c_generic_address_space)
+#endif // __opencl_c_atomic_order_seq_cst && __opencl_c_atomic_scope_device
+
+#if defined(__opencl_c_atomic_scope_device)
+_CLC_OVERLOAD _CLC_DECL bool
+atomic_flag_test_and_set_explicit(volatile __global atomic_flag *,
+                                  memory_order);
+_CLC_OVERLOAD _CLC_DECL bool
+atomic_flag_test_and_set_explicit(volatile __local atomic_flag *, memory_order);
+#if defined(__opencl_c_generic_address_space)
+_CLC_OVERLOAD _CLC_DECL bool
+atomic_flag_test_and_set_explicit(volatile atomic_flag *, memory_order);
+#endif // defined(__opencl_c_generic_address_space)
+#endif // defined(__opencl_c_atomic_scope_device)
+
+_CLC_OVERLOAD _CLC_DECL bool // order + scope form: no feature-macro guard
+atomic_flag_test_and_set_explicit(volatile __global atomic_flag *, memory_order,
+                                  memory_scope);
+_CLC_OVERLOAD _CLC_DECL bool
+atomic_flag_test_and_set_explicit(volatile __local atomic_flag *, memory_order,
+                                  memory_scope);
+#if defined(__opencl_c_generic_address_space)
+_CLC_OVERLOAD _CLC_DECL bool
+atomic_flag_test_and_set_explicit(volatile atomic_flag *, memory_order,
+                                  memory_scope);
+#endif // defined(__opencl_c_generic_address_space)
+
+#endif // __CLC_OPENCL_ATOMIC_ATOMIC_FLAG_TEST_AND_SET_H__
diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_init.h b/libclc/opencl/include/clc/opencl/atomic/atomic_init.h
new file mode 100644
index 0000000000000..6a2b938fdd52f
--- /dev/null
+++ b/libclc/opencl/include/clc/opencl/atomic/atomic_init.h
@@ -0,0 +1,24 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_INIT_H__
+#define __CLC_OPENCL_ATOMIC_ATOMIC_INIT_H__
+
+#include <clc/opencl/opencl-base.h>
+
+#define __CLC_ATOMIC_GENTYPE __CLC_XCONCAT(atomic_, __CLC_GENTYPE)
+
+#define __CLC_BODY <clc/opencl/atomic/atomic_init.inc>
+#include <clc/integer/gentype.inc> // presumably includes __CLC_BODY per type
+
+#define __CLC_BODY <clc/opencl/atomic/atomic_init.inc>
+#include <clc/math/gentype.inc> // likewise for the math gentypes (TODO confirm)
+
+#undef __CLC_ATOMIC_GENTYPE // helper is local to this header
+
+#endif // __CLC_OPENCL_ATOMIC_ATOMIC_INIT_H__
diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_init.inc b/libclc/opencl/include/clc/opencl/atomic/atomic_init.inc
new file mode 100644
index 0000000000000..80135fae4b39d
--- /dev/null
+++ b/libclc/opencl/include/clc/opencl/atomic/atomic_init.inc
@@ -0,0 +1,44 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef __CLC_SCALAR // atomic_init is declared for scalar gentypes only
+
+#if defined(__opencl_c_fp64) && (defined(cl_khr_int64_base_atomics) &&         \
+                                 defined(cl_khr_int64_extended_atomics))
+#define __CLC_HAVE_64_ATOMIC // fp64 plus both int64 atomics extensions
+#endif
+#if defined(__CLC_FPSIZE) &&                                                   \
+    (__CLC_FPSIZE < 64 || defined(__CLC_HAVE_64_ATOMIC))
+#define __CLC_HAVE_FP_ATOMIC // FP type below 64 bits, or 64-bit atomics exist
+#endif
+#if defined(__CLC_GENSIZE) &&                                                  \
+    ((__CLC_GENSIZE == 32) ||                                                  \
+     (__CLC_GENSIZE == 64 && defined(__CLC_HAVE_64_ATOMIC)))
+#define __CLC_HAVE_INT_ATOMIC // 32-bit integer, or 64-bit with 64-bit atomics
+#endif
+#if defined(__CLC_HAVE_FP_ATOMIC) || defined(__CLC_HAVE_INT_ATOMIC)
+
+#define __CLC_DECL_ATOMIC(ADDRSPACE)                                           \
+  _CLC_OVERLOAD _CLC_DECL void atomic_init(                                    \
+      volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr, __CLC_GENTYPE Value);
+
+__CLC_DECL_ATOMIC(global) // one overload per address space
+__CLC_DECL_ATOMIC(local)
+#if _CLC_GENERIC_AS_SUPPORTED
+__CLC_DECL_ATOMIC() // no qualifier: generic address space
+#endif // _CLC_GENERIC_AS_SUPPORTED
+
+#undef __CLC_DECL_ATOMIC
+
+#endif // __CLC_HAVE_FP_ATOMIC || __CLC_HAVE_INT_ATOMIC
+
+#undef __CLC_HAVE_INT_ATOMIC // reset so the next gentype is evaluated afresh
+#undef __CLC_HAVE_FP_ATOMIC
+#undef __CLC_HAVE_64_ATOMIC
+
+#endif // __CLC_SCALAR
diff --git a/libclc/opencl/include/clc/opencl/types.h b/libclc/opencl/include/clc/opencl/types.h
new file mode 100644
index 0000000000000..b1be88f21bdaa
--- /dev/null
+++ b/libclc/opencl/include/clc/opencl/types.h
@@ -0,0 +1,48 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_OPENCL_TYPES_H__
+#define __CLC_OPENCL_TYPES_H__
+
+// Copied from clang/lib/Headers/opencl-c-base.h
+
+typedef enum memory_scope { // enumerators take predefined macro values
+  memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
+  memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
+  memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
+#if defined(__opencl_c_atomic_scope_all_devices)
+  memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
+#if (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >= 202100)
+  memory_scope_all_devices = memory_scope_all_svm_devices, // alias
+#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_3_0 || __OPENCL_CPP_VERSION__ >=
+       // 202100)
+#endif // defined(__opencl_c_atomic_scope_all_devices)
+/**
+ * Subgroups have different requirements on forward progress, so just test
+ * all the relevant macros.
+ * CL 3.0 sub-groups "they are not guaranteed to make independent forward
+ * progress" KHR subgroups "Subgroups within a workgroup are independent, make
+ * forward progress with respect to each other"
+ */
+#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) ||                \
+    defined(__opencl_c_subgroups)
+  memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
+#endif // any of the three sub-group macros above
+} memory_scope;
+
+typedef enum memory_order { // enumerators take the __ATOMIC_* macro values
+  memory_order_relaxed = __ATOMIC_RELAXED,
+  memory_order_acquire = __ATOMIC_ACQUIRE,
+  memory_order_release = __ATOMIC_RELEASE,
+  memory_order_acq_rel = __ATOMIC_ACQ_REL,
+#if defined(__opencl_c_atomic_order_seq_cst)
+  memory_order_seq_cst = __ATOMIC_SEQ_CST
+#endif // defined(__opencl_c_atomic_order_seq_cst)
+} memory_order;
+
+#endif // __CLC_OPENCL_TYPES_H__
diff --git a/libclc/opencl/include/clc/opencl/utils.h b/libclc/opencl/include/clc/opencl/utils.h
new file mode 100644
index 0000000000000..42b948b8d30d2
--- /dev/null
+++ b/libclc/opencl/include/clc/opencl/utils.h
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_OPENCL_UTILS_H__
+#define __CLC_OPENCL_UTILS_H__
+
+#include <clc/internal/clc.h>
+#include <clc/opencl/types.h>
+
+// INTEL_FEATURE_PISA
+static _CLC_INLINE int __opencl_get_clang_memory_scope(memory_scope scope) {
+  switch (scope) { // OpenCL memory_scope -> __MEMORY_SCOPE_* value
+  case __OPENCL_MEMORY_SCOPE_WORK_ITEM:
+    return __MEMORY_SCOPE_SINGLE;
+#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) ||                \
+    defined(__opencl_c_subgroups)
+  case __OPENCL_MEMORY_SCOPE_SUB_GROUP:
+    return __MEMORY_SCOPE_WVFRNT;
+#endif // any of the three sub-group macros above
+  case __OPENCL_MEMORY_SCOPE_WORK_GROUP:
+    return __MEMORY_SCOPE_WRKGRP;
+  case __OPENCL_MEMORY_SCOPE_DEVICE:
+    return __MEMORY_SCOPE_DEVICE;
+  default: // every other scope value falls back to the system scope
+    return __MEMORY_SCOPE_SYSTEM;
+  }
+}
+// end INTEL_FEATURE_PISA
+
+#endif // __CLC_OPENCL_UTILS_H__
diff --git a/libclc/opencl/lib/generic/SOURCES b/libclc/opencl/lib/generic/SOURCES
index 61757efbcaad7..94a333e765b18 100644
--- a/libclc/opencl/lib/generic/SOURCES
+++ b/libclc/opencl/lib/generic/SOURCES
@@ -22,7 +22,10 @@ atomic/atomic_fetch_min.cl
 atomic/atomic_fetch_or.cl
 atomic/atomic_fetch_sub.cl
 atomic/atomic_fetch_xor.cl
+atomic/atomic_flag_clear.cl
+atomic/atomic_flag_test_and_set.cl
 atomic/atomic_inc.cl
+atomic/atomic_init.cl
 atomic/atomic_load.cl
 atomic/atomic_max.cl
 atomic/atomic_min.cl
diff --git a/libclc/opencl/lib/generic/atomic/atomic_flag_clear.cl b/libclc/opencl/lib/generic/atomic/atomic_flag_clear.cl
new file mode 100644
index 0000000000000..c9f944903f831
--- /dev/null
+++ b/libclc/opencl/lib/generic/atomic/atomic_flag_clear.cl
@@ -0,0 +1,61 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <clc/atomic/clc_atomic_flag_clear.h>
+#include <clc/opencl/atomic/atomic_flag_clear.h>
+#include <clc/opencl/utils.h>
+
+#if defined(__opencl_c_atomic_order_seq_cst) &&                                \
+    defined(__opencl_c_atomic_scope_device)
+
+#define __CLC_DEFINE_ATOMIC_FLAG_CLEAR(ADDRSPACE)                              \
+  _CLC_OVERLOAD _CLC_DEF void atomic_flag_clear(                               \
+      volatile ADDRSPACE atomic_flag *object) {                                \
+    __clc_atomic_flag_clear((ADDRSPACE int *)object, __ATOMIC_SEQ_CST,         \
+                            __MEMORY_SCOPE_DEVICE);                            \
+  } // implicit form: seq_cst ordering at device scope
+
+__CLC_DEFINE_ATOMIC_FLAG_CLEAR(global)
+__CLC_DEFINE_ATOMIC_FLAG_CLEAR(local)
+#if defined(__opencl_c_generic_address_space)
+__CLC_DEFINE_ATOMIC_FLAG_CLEAR() // no qualifier: generic address space
+#endif // defined(__opencl_c_generic_address_space)
+
+#endif // defined(__opencl_c_atomic_order_seq_cst) &&
+       // defined(__opencl_c_atomic_scope_device)
+
+#if defined(__opencl_c_atomic_scope_device)
+
+#define __CLC_DEFINE_ATOMIC_FLAG_CLEAR_ORDER(ADDRSPACE)                        \
+  _CLC_OVERLOAD _CLC_DEF void atomic_flag_clear_explicit(                      \
+      volatile ADDRSPACE atomic_flag *object, memory_order order) {            \
+    __clc_atomic_flag_clear((ADDRSPACE int *)object, order,                    \
+                            __MEMORY_SCOPE_DEVICE);                            \
+  } // caller-supplied order, scope fixed to device
+
+__CLC_DEFINE_ATOMIC_FLAG_CLEAR_ORDER(global)
+__CLC_DEFINE_ATOMIC_FLAG_CLEAR_ORDER(local)
+#if defined(__opencl_c_generic_address_space)
+__CLC_DEFINE_ATOMIC_FLAG_CLEAR_ORDER() // no qualifier: generic address space
+#endif // defined(__opencl_c_generic_address_space)
+
+#endif // defined(__opencl_c_atomic_scope_device)
+
+#define __CLC_DEFINE_ATOMIC_FLAG_CLEAR_ORDER_SCOPE(ADDRSPACE)                  \
+  _CLC_OVERLOAD _CLC_DEF void atomic_flag_clear_explicit(                      \
+      volatile ADDRSPACE atomic_flag *object, memory_order order,              \
+      memory_scope scope) {                                                    \
+    __clc_atomic_flag_clear((ADDRSPACE int *)object, order,                    \
+                            __opencl_get_clang_memory_scope(scope));           \
+  } // caller-supplied order and scope; scope is translated before the call
+
+__CLC_DEFINE_ATOMIC_FLAG_CLEAR_ORDER_SCOPE(global)
+__CLC_DEFINE_ATOMIC_FLAG_CLEAR_ORDER_SCOPE(local)
+#if defined(__opencl_c_generic_address_space)
+__CLC_DEFINE_ATOMIC_FLAG_CLEAR_ORDER_SCOPE() // generic address space
+#endif // defined(__opencl_c_generic_address_space)
diff --git a/libclc/opencl/lib/generic/atomic/atomic_flag_test_and_set.cl b/libclc/opencl/lib/generic/atomic/atomic_flag_test_and_set.cl
new file mode 100644
index 0000000000000..e58079ee226a1
--- /dev/null
+++ b/libclc/opencl/lib/generic/atomic/atomic_flag_test_and_set.cl
@@ -0,0 +1,66 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <clc/atomic/clc_atomic_flag_test_and_set.h>
+#include <clc/opencl/atomic/atomic_flag_test_and_set.h>
+#include <clc/opencl/utils.h>
+
+#if defined(__opencl_c_atomic_order_seq_cst) &&                                \
+    defined(__opencl_c_atomic_scope_device)
+
+#define __CLC_DEFINE_ATOMIC_FLAG_TEST_AND_SET(ADDRSPACE)                       \
+  _CLC_OVERLOAD _CLC_DEF bool atomic_flag_test_and_set(                        \
+      volatile ADDRSPACE atomic_flag *object) {                                \
+    return __clc_atomic_flag_test_and_set(                                     \
+        (ADDRSPACE int *)object, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE);     \
+  } // implicit form: seq_cst ordering at device scope
+
+__CLC_DEFINE_ATOMIC_FLAG_TEST_AND_SET(global)
+__CLC_DEFINE_ATOMIC_FLAG_TEST_AND_SET(local)
+#if defined(__opencl_c_generic_address_space)
+__CLC_DEFINE_ATOMIC_FLAG_TEST_AND_SET() // no qualifier: generic address space
+#endif // defined(__opencl_c_generic_address_space)
+
+#undef __CLC_DEFINE_ATOMIC_FLAG_TEST_AND_SET // name is redefined below
+
+#endif // defined(__opencl_c_atomic_order_seq_cst) &&
+       // defined(__opencl_c_atomic_scope_device)
+
+#if defined(__opencl_c_atomic_scope_device)
+
+#define __CLC_DEFINE_ATOMIC_FLAG_TEST_AND_SET(ADDRSPACE)                       \
+  _CLC_OVERLOAD _CLC_DEF bool atomic_flag_test_and_set_explicit(               \
+      volatile ADDRSPACE atomic_flag *object, memory_order order) {            \
+    return __clc_atomic_flag_test_and_set((ADDRSPACE int *)object, order,      \
+                                          __MEMORY_SCOPE_DEVICE);              \
+  } // caller-supplied order, scope fixed to device
+
+__CLC_DEFINE_ATOMIC_FLAG_TEST_AND_SET(global)
+__CLC_DEFINE_ATOMIC_FLAG_TEST_AND_SET(local)
+#if defined(__opencl_c_generic_address_space)
+__CLC_DEFINE_ATOMIC_FLAG_TEST_AND_SET() // no qualifier: generic address space
+#endif // defined(__opencl_c_generic_address_space)
+
+#undef __CLC_DEFINE_ATOMIC_FLAG_TEST_AND_SET // name is redefined below
+
+#endif // defined(__opencl_c_atomic_scope_device)
+
+#define __CLC_DEFINE_ATOMIC_FLAG_TEST_AND_SET(ADDRSPACE)                       \
+  _CLC_OVERLOAD _CLC_DEF bool atomic_flag_test_and_set_explicit(               \
+      volatile ADDRSPACE atomic_flag *object, memory_order order,              \
+      memory_scope scope) {                                                    \
+    return __clc_atomic_flag_test_and_set(                                     \
+        (ADDRSPACE int *)object, order,                                        \
+        __opencl_get_clang_memory_scope(scope));                               \
+  } // caller-supplied order and scope; scope is translated before the call
+
+__CLC_DEFINE_ATOMIC_FLAG_TEST_AND_SET(global)
+__CLC_DEFINE_ATOMIC_FLAG_TEST_AND_SET(local)
+#if defined(__opencl_c_generic_address_space)
+__CLC_DEFINE_ATOMIC_FLAG_TEST_AND_SET() // no qualifier: generic address space
+#endif // defined(__opencl_c_generic_address_space)
diff --git a/libclc/opencl/lib/generic/atomic/atomic_init.cl b/libclc/opencl/lib/generic/atomic/atomic_init.cl
new file mode 100644
index 0000000000000..b688d9b04da74
--- /dev/null
+++ b/libclc/opencl/lib/generic/atomic/atomic_init.cl
@@ -0,0 +1,18 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <clc/opencl/atomic/atomic_init.h>
+#include <clc/opencl/utils.h>
+
+#define __CLC_ATOMIC_GENTYPE __CLC_XCONCAT(atomic_, __CLC_GENTYPE)
+
+#define __CLC_BODY <atomic_init.inc>
+#include <clc/integer/gentype.inc> // presumably includes __CLC_BODY per type
+
+#define __CLC_BODY <atomic_init.inc>
+#include <clc/math/gentype.inc> // likewise for the math gentypes (TODO confirm)
diff --git a/libclc/opencl/lib/generic/atomic/atomic_init.inc b/libclc/opencl/lib/generic/atomic/atomic_init.inc
new file mode 100644
index 0000000000000..2e23df76a8c15
--- /dev/null
+++ b/libclc/opencl/lib/generic/atomic/atomic_init.inc
@@ -0,0 +1,46 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef __CLC_SCALAR // atomic_init is defined for scalar gentypes only
+
+#if defined(__opencl_c_fp64) && (defined(cl_khr_int64_base_atomics) &&         \
+                                 defined(cl_khr_int64_extended_atomics))
+#define __CLC_HAVE_64_ATOMIC // fp64 plus both int64 atomics extensions
+#endif
+#if defined(__CLC_FPSIZE) &&                                                   \
+    (__CLC_FPSIZE < 64 || defined(__CLC_HAVE_64_ATOMIC))
+#define __CLC_HAVE_FP_ATOMIC // FP type below 64 bits, or 64-bit atomics exist
+#endif
+#if defined(__CLC_GENSIZE) &&                                                  \
+    ((__CLC_GENSIZE == 32) ||                                                  \
+     (__CLC_GENSIZE == 64 && defined(__CLC_HAVE_64_ATOMIC)))
+#define __CLC_HAVE_INT_ATOMIC // 32-bit integer, or 64-bit with 64-bit atomics
+#endif
+#if defined(__CLC_HAVE_FP_ATOMIC) || defined(__CLC_HAVE_INT_ATOMIC)
+
+#define __CLC_DEFINE_ATOMIC(ADDRSPACE)                                         \
+  _CLC_OVERLOAD _CLC_DEF void atomic_init(                                     \
+      volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr, __CLC_GENTYPE Value) {     \
+    *(ADDRSPACE __CLC_GENTYPE *)Ptr = Value;                                   \
+  } // plain assignment through a pointer cast to the underlying gentype
+
+__CLC_DEFINE_ATOMIC(global) // one definition per address space
+__CLC_DEFINE_ATOMIC(local)
+#if _CLC_GENERIC_AS_SUPPORTED
+__CLC_DEFINE_ATOMIC() // no qualifier: generic address space
+#endif // _CLC_GENERIC_AS_SUPPORTED
+
+#undef __CLC_DEFINE_ATOMIC
+
+#endif // __CLC_HAVE_FP_ATOMIC || __CLC_HAVE_INT_ATOMIC
+
+#undef __CLC_HAVE_INT_ATOMIC // reset so the next gentype is evaluated afresh
+#undef __CLC_HAVE_FP_ATOMIC
+#undef __CLC_HAVE_64_ATOMIC
+
+#endif // __CLC_SCALAR
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index f289666ec12ab..a6bd3eacc095c 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -569,7 +569,6 @@ set(files
   __mdspan/mdspan.h
   __memory/addressof.h
   __memory/align.h
-  __memory/aligned_alloc.h
   __memory/allocate_at_least.h
   __memory/allocation_guard.h
   __memory/allocator.h
diff --git a/libcxx/include/__config b/libcxx/include/__config
index 1b27f28f9ddef..ba10d95b92827 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -495,27 +495,6 @@ typedef __char32_t char32_t;
 #    define _LIBCPP_HAS_ALIGNED_ALLOCATION 1
 #  endif
 
-// It is not yet possible to use aligned_alloc() on all Apple platforms since
-// 10.15 was the first version to ship an implementation of aligned_alloc().
-#  if defined(__APPLE__)
-#    if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) &&                                                     \
-         __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 101500) ||                                                    \
-        (defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) &&                                                    \
-         __ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__ < 130000) ||                                                   \
-        (defined(__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__) &&                                                     \
-         __ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__ < 60000) ||                                                     \
-        (defined(__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__ < 130000)
-#      define _LIBCPP_HAS_C11_ALIGNED_ALLOC 0
-#    else
-#      define _LIBCPP_HAS_C11_ALIGNED_ALLOC 1
-#    endif
-#  elif defined(__ANDROID__) && __ANDROID_API__ < 28
-// Android only provides aligned_alloc when targeting API 28 or higher.
-#    define _LIBCPP_HAS_C11_ALIGNED_ALLOC 0
-#  else
-#    define _LIBCPP_HAS_C11_ALIGNED_ALLOC 1
-#  endif
-
 #  if defined(__APPLE__) || defined(__FreeBSD__)
 #    define _LIBCPP_WCTYPE_IS_MASK
 #  endif
diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in
index 492708792cbbf..1ee418b0b4337 100644
--- a/libcxx/include/module.modulemap.in
+++ b/libcxx/include/module.modulemap.in
@@ -1628,7 +1628,6 @@ module std [system] {
   module memory {
     module addressof                          { header "__memory/addressof.h" }
     module align                              { header "__memory/align.h" }
-    module aligned_alloc                      { header "__memory/aligned_alloc.h" }
     module allocate_at_least                  { header "__memory/allocate_at_least.h" }
     module allocation_guard                   { header "__memory/allocation_guard.h" }
     module allocator {
diff --git a/libcxx/include/string b/libcxx/include/string
index 6b42cb2c7586d..2b3ba6d2d9b62 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -3851,46 +3851,52 @@ swap(basic_string<_CharT, _Traits, _Allocator>& __lhs, basic_string<_CharT, _Tra
   __lhs.swap(__rhs);
 }
 
-_LIBCPP_EXPORTED_FROM_ABI int stoi(const string& __str, size_t* __idx = nullptr, int __base = 10);
-_LIBCPP_EXPORTED_FROM_ABI long stol(const string& __str, size_t* __idx = nullptr, int __base = 10);
-_LIBCPP_EXPORTED_FROM_ABI unsigned long stoul(const string& __str, size_t* __idx = nullptr, int __base = 10);
-_LIBCPP_EXPORTED_FROM_ABI long long stoll(const string& __str, size_t* __idx = nullptr, int __base = 10);
-_LIBCPP_EXPORTED_FROM_ABI unsigned long long stoull(const string& __str, size_t* __idx = nullptr, int __base = 10);
-
-_LIBCPP_EXPORTED_FROM_ABI float stof(const string& __str, size_t* __idx = nullptr);
-_LIBCPP_EXPORTED_FROM_ABI double stod(const string& __str, size_t* __idx = nullptr);
-_LIBCPP_EXPORTED_FROM_ABI long double stold(const string& __str, size_t* __idx = nullptr);
-
-_LIBCPP_EXPORTED_FROM_ABI string to_string(int __val);
-_LIBCPP_EXPORTED_FROM_ABI string to_string(unsigned __val);
-_LIBCPP_EXPORTED_FROM_ABI string to_string(long __val);
-_LIBCPP_EXPORTED_FROM_ABI string to_string(unsigned long __val);
-_LIBCPP_EXPORTED_FROM_ABI string to_string(long long __val);
-_LIBCPP_EXPORTED_FROM_ABI string to_string(unsigned long long __val);
-_LIBCPP_EXPORTED_FROM_ABI string to_string(float __val);
-_LIBCPP_EXPORTED_FROM_ABI string to_string(double __val);
-_LIBCPP_EXPORTED_FROM_ABI string to_string(long double __val);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI int stoi(const string& __str, size_t* __idx = nullptr, int __base = 10);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI unsigned long
+stoul(const string& __str, size_t* __idx = nullptr, int __base = 10);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI long stol(const string& __str, size_t* __idx = nullptr, int __base = 10);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI long long
+stoll(const string& __str, size_t* __idx = nullptr, int __base = 10);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI unsigned long long
+stoull(const string& __str, size_t* __idx = nullptr, int __base = 10);
+
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI float stof(const string& __str, size_t* __idx = nullptr);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI double stod(const string& __str, size_t* __idx = nullptr);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI long double stold(const string& __str, size_t* __idx = nullptr);
+
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI string to_string(int __val);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI string to_string(unsigned __val);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI string to_string(long __val);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI string to_string(unsigned long __val);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI string to_string(long long __val);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI string to_string(unsigned long long __val);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI string to_string(float __val);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI string to_string(double __val);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI string to_string(long double __val);
 
 #  if _LIBCPP_HAS_WIDE_CHARACTERS
-_LIBCPP_EXPORTED_FROM_ABI int stoi(const wstring& __str, size_t* __idx = nullptr, int __base = 10);
-_LIBCPP_EXPORTED_FROM_ABI long stol(const wstring& __str, size_t* __idx = nullptr, int __base = 10);
-_LIBCPP_EXPORTED_FROM_ABI unsigned long stoul(const wstring& __str, size_t* __idx = nullptr, int __base = 10);
-_LIBCPP_EXPORTED_FROM_ABI long long stoll(const wstring& __str, size_t* __idx = nullptr, int __base = 10);
-_LIBCPP_EXPORTED_FROM_ABI unsigned long long stoull(const wstring& __str, size_t* __idx = nullptr, int __base = 10);
-
-_LIBCPP_EXPORTED_FROM_ABI float stof(const wstring& __str, size_t* __idx = nullptr);
-_LIBCPP_EXPORTED_FROM_ABI double stod(const wstring& __str, size_t* __idx = nullptr);
-_LIBCPP_EXPORTED_FROM_ABI long double stold(const wstring& __str, size_t* __idx = nullptr);
-
-_LIBCPP_EXPORTED_FROM_ABI wstring to_wstring(int __val);
-_LIBCPP_EXPORTED_FROM_ABI wstring to_wstring(unsigned __val);
-_LIBCPP_EXPORTED_FROM_ABI wstring to_wstring(long __val);
-_LIBCPP_EXPORTED_FROM_ABI wstring to_wstring(unsigned long __val);
-_LIBCPP_EXPORTED_FROM_ABI wstring to_wstring(long long __val);
-_LIBCPP_EXPORTED_FROM_ABI wstring to_wstring(unsigned long long __val);
-_LIBCPP_EXPORTED_FROM_ABI wstring to_wstring(float __val);
-_LIBCPP_EXPORTED_FROM_ABI wstring to_wstring(double __val);
-_LIBCPP_EXPORTED_FROM_ABI wstring to_wstring(long double __val);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI int stoi(const wstring& __str, size_t* __idx = nullptr, int __base = 10);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI long stol(const wstring& __str, size_t* __idx = nullptr, int __base = 10);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI unsigned long
+stoul(const wstring& __str, size_t* __idx = nullptr, int __base = 10);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI long long
+stoll(const wstring& __str, size_t* __idx = nullptr, int __base = 10);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI unsigned long long
+stoull(const wstring& __str, size_t* __idx = nullptr, int __base = 10);
+
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI float stof(const wstring& __str, size_t* __idx = nullptr);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI double stod(const wstring& __str, size_t* __idx = nullptr);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI long double stold(const wstring& __str, size_t* __idx = nullptr);
+
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI wstring to_wstring(int __val);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI wstring to_wstring(unsigned __val);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI wstring to_wstring(long __val);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI wstring to_wstring(unsigned long __val);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI wstring to_wstring(long long __val);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI wstring to_wstring(unsigned long long __val);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI wstring to_wstring(float __val);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI wstring to_wstring(double __val);
+[[__nodiscard__]] _LIBCPP_EXPORTED_FROM_ABI wstring to_wstring(long double __val);
 #  endif // _LIBCPP_HAS_WIDE_CHARACTERS
 
 template <class _CharT, class _Traits, class _Allocator>
@@ -3899,7 +3905,7 @@ _LIBCPP_TEMPLATE_DATA_VIS const typename basic_string<_CharT, _Traits, _Allocato
 
 template <class _CharT, class _Allocator>
 struct __string_hash : public __unary_function<basic_string<_CharT, char_traits<_CharT>, _Allocator>, size_t> {
-  _LIBCPP_HIDE_FROM_ABI size_t
+  [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI size_t
   operator()(const basic_string<_CharT, char_traits<_CharT>, _Allocator>& __val) const _NOEXCEPT {
     return std::__do_string_hash(__val.data(), __val.data() + __val.size());
   }
@@ -3970,30 +3976,31 @@ erase_if(basic_string<_CharT, _Traits, _Allocator>& __str, _Predicate __pred) {
 // Literal suffixes for basic_string [basic.string.literals]
 inline namespace literals {
 inline namespace string_literals {
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<char>
+[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<char>
 operator""s(const char* __str, size_t __len) {
   return basic_string<char>(__str, __len);
 }
 
 #    if _LIBCPP_HAS_WIDE_CHARACTERS
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<wchar_t>
-operator""s(const wchar_t* __str, size_t __len) {
+[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI
+_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<wchar_t> operator""s(const wchar_t* __str, size_t __len) {
   return basic_string<wchar_t>(__str, __len);
 }
 #    endif
 
 #    if _LIBCPP_HAS_CHAR8_T
-inline _LIBCPP_HIDE_FROM_ABI constexpr basic_string<char8_t> operator""s(const char8_t* __str, size_t __len) {
+[[__nodiscard__]] inline
+    _LIBCPP_HIDE_FROM_ABI constexpr basic_string<char8_t> operator""s(const char8_t* __str, size_t __len) {
   return basic_string<char8_t>(__str, __len);
 }
 #    endif
 
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<char16_t>
-operator""s(const char16_t* __str, size_t __len) {
+[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI
+_LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<char16_t> operator""s(const char16_t* __str, size_t __len) {
   return basic_string<char16_t>(__str, __len);
 }
 
-inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<char32_t>
+[[__nodiscard__]] inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<char32_t>
 operator""s(const char32_t* __str, size_t __len) {
   return basic_string<char32_t>(__str, __len);
 }
diff --git a/libcxx/include/__memory/aligned_alloc.h b/libcxx/src/include/aligned_alloc.h
similarity index 90%
rename from libcxx/include/__memory/aligned_alloc.h
rename to libcxx/src/include/aligned_alloc.h
index fb36983d9c3dc..24ca26ce04525 100644
--- a/libcxx/include/__memory/aligned_alloc.h
+++ b/libcxx/src/include/aligned_alloc.h
@@ -6,8 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef _LIBCPP___MEMORY_ALIGNED_ALLOC_H
-#define _LIBCPP___MEMORY_ALIGNED_ALLOC_H
+#ifndef _LIBCPP_SRC_ALIGNED_ALLOC_H
+#define _LIBCPP_SRC_ALIGNED_ALLOC_H
 
 #include <__config>
 #include <cstdlib>
@@ -29,7 +29,9 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 inline _LIBCPP_HIDE_FROM_ABI void* __libcpp_aligned_alloc(std::size_t __alignment, std::size_t __size) {
 #  if defined(_LIBCPP_MSVCRT_LIKE)
   return ::_aligned_malloc(__size, __alignment);
-#  elif _LIBCPP_STD_VER >= 17 && _LIBCPP_HAS_C11_ALIGNED_ALLOC
+
+// Android only provides aligned_alloc when targeting API 28 or higher.
+#  elif !defined(__ANDROID__) || __ANDROID_API__ >= 28
   // aligned_alloc() requires that __size is a multiple of __alignment,
   // but for C++ [new.delete.general], only states "if the value of an
   // alignment argument passed to any of these functions is not a valid
@@ -60,4 +62,4 @@ inline _LIBCPP_HIDE_FROM_ABI void __libcpp_aligned_free(void* __ptr) {
 
 _LIBCPP_END_NAMESPACE_STD
 
-#endif // _LIBCPP___MEMORY_ALIGNED_ALLOC_H
+#endif // _LIBCPP_SRC_ALIGNED_ALLOC_H
diff --git a/libcxx/src/new.cpp b/libcxx/src/new.cpp
index ce6b63775ce9c..70cdab683a861 100644
--- a/libcxx/src/new.cpp
+++ b/libcxx/src/new.cpp
@@ -6,9 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/aligned_alloc.h"
 #include "include/overridable_function.h"
 #include <__assert>
-#include <__memory/aligned_alloc.h>
 #include <cstddef>
 #include <cstdlib>
 #include <new>
diff --git a/libcxx/test/libcxx/diagnostics/string.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/string.nodiscard.verify.cpp
index f020516a2495a..0ff92cac3a3b2 100644
--- a/libcxx/test/libcxx/diagnostics/string.nodiscard.verify.cpp
+++ b/libcxx/test/libcxx/diagnostics/string.nodiscard.verify.cpp
@@ -130,3 +130,82 @@ void test() {
   str.subview(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
 #endif
 }
+
+void test_nonmembers() {
+  // Numeric conversions
+
+  std::string str;
+
+  std::stoi(str);   // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::stol(str);   // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::stoul(str);  // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::stoll(str);  // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::stoull(str); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+
+  std::stof(str);  // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::stod(str);  // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::stold(str); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+
+  std::to_string(94);    // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::to_string(82U);   // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::to_string(94L);   // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::to_string(82UL);  // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::to_string(94LL);  // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::to_string(82ULL); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::to_string(94.0F); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::to_string(82.0);  // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::to_string(94.0L); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+
+#if !defined(TEST_HAS_NO_WIDE_CHARACTERS)
+
+  std::wstring wstr;
+
+  std::stoi(wstr);   // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::stol(wstr);   // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::stoul(wstr);  // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::stoll(wstr);  // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::stoull(wstr); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+
+  std::stof(wstr);  // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::stod(wstr);  // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::stold(wstr); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+
+  std::to_wstring(94);    // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::to_wstring(82U);   // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::to_wstring(94L);   // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::to_wstring(82UL);  // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::to_wstring(94LL);  // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::to_wstring(82ULL); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::to_wstring(94.0F); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::to_wstring(82.0);  // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+  std::to_wstring(94.0L); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+
+#endif
+
+  // std::hash<>
+
+  std::hash<std::string> hash;
+
+  hash(str); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
+
+#if TEST_STD_VER >= 14
+  // string literals
+
+  using namespace std::string_literals;
+
+  // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}}
+  ""s; // const char*
+#  if !defined(TEST_HAS_NO_WIDE_CHARACTERS)
+  // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}}
+  L""s; // const wchar_t*
+#  endif
+#  if !defined(TEST_HAS_NO_CHAR8_T)
+  // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}}
+  u8""s; // const char8_t*
+#  endif
+  // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}}
+  u""s; // const char16_t*
+  // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}}
+  U""s; // const char32_t*
+#endif
+}
diff --git a/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp b/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp
index 4bb42cb532078..282d49d727c8c 100644
--- a/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp
+++ b/libcxx/test/libcxx/language.support/support.dynamic/libcpp_deallocate.sh.cpp
@@ -21,6 +21,8 @@
 // GCC doesn't support the aligned-allocation flags.
 // XFAIL: gcc
 
+// ADDITIONAL_COMPILE_FLAGS: -I %{libcxx-dir}/src -Wno-macro-redefined
+
 // RUN: %{build} -faligned-allocation -fsized-deallocation
 // RUN: %{run}
 // RUN: %{build} -faligned-allocation -fno-sized-deallocation -DNO_SIZE
@@ -36,10 +38,7 @@
 
 #include "test_macros.h"
 
-TEST_DIAGNOSTIC_PUSH
-TEST_CLANG_DIAGNOSTIC_IGNORED("-Wprivate-header")
-#include <__memory/aligned_alloc.h>
-TEST_DIAGNOSTIC_POP
+#include "include/aligned_alloc.h"
 
 struct alloc_stats {
   alloc_stats() { reset(); }
diff --git a/libcxx/test/std/input.output/file.streams/c.files/gets-removed.verify.cpp b/libcxx/test/std/input.output/file.streams/c.files/gets-removed.verify.cpp
index 281ef37e92d27..fb49375a21baa 100644
--- a/libcxx/test/std/input.output/file.streams/c.files/gets-removed.verify.cpp
+++ b/libcxx/test/std/input.output/file.streams/c.files/gets-removed.verify.cpp
@@ -12,6 +12,6 @@
 
 #include <cstdio>
 
-void f(char const* str) {
+void f(char* str) {
   (void)std::gets(str); // expected-error {{no member named 'gets' in namespace 'std'}}
 }
diff --git a/libcxxabi/src/fallback_malloc.cpp b/libcxxabi/src/fallback_malloc.cpp
index 75788fe9be8d9..6a261e6f009fe 100644
--- a/libcxxabi/src/fallback_malloc.cpp
+++ b/libcxxabi/src/fallback_malloc.cpp
@@ -16,7 +16,7 @@
 #endif
 #endif
 
-#include <__memory/aligned_alloc.h>
+#include "include/aligned_alloc.h" // from libc++
 #include <__assert>
 #include <stdlib.h> // for malloc, calloc, free
 #include <string.h> // for memset
diff --git a/libcxxabi/src/stdlib_new_delete.cpp b/libcxxabi/src/stdlib_new_delete.cpp
index b5ed59958d17e..dbb75b128a2a4 100644
--- a/libcxxabi/src/stdlib_new_delete.cpp
+++ b/libcxxabi/src/stdlib_new_delete.cpp
@@ -8,8 +8,8 @@
 
 #include "__cxxabi_config.h"
 #include "abort_message.h"
+#include "include/aligned_alloc.h"        // from libc++
 #include "include/overridable_function.h" // from libc++
-#include <__memory/aligned_alloc.h>
 #include <cstddef>
 #include <cstdlib>
 #include <new>
diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp
index 7f880d223d6c3..30bca639060e6 100644
--- a/lldb/source/Commands/CommandObjectTarget.cpp
+++ b/lldb/source/Commands/CommandObjectTarget.cpp
@@ -60,7 +60,6 @@
 #include "lldb/lldb-forward.h"
 #include "lldb/lldb-private-enumerations.h"
 
-#include "clang/Driver/CreateInvocationFromArgs.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/CompilerInvocation.h"
 #include "clang/Frontend/FrontendActions.h"
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/CMakeLists.txt b/lldb/source/Plugins/ExpressionParser/Clang/CMakeLists.txt
index 759a7c4dd14fb..01d588ff6a78b 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/CMakeLists.txt
+++ b/lldb/source/Plugins/ExpressionParser/Clang/CMakeLists.txt
@@ -51,10 +51,10 @@ add_lldb_library(lldbPluginExpressionParserClang
   CLANG_LIBS
     clangAST
     clangCodeGen
+    clangDriver
     clangEdit
     clangFrontend
     clangLex
-    clangOptions
     clangParse
     clangRewrite
     clangRewriteFrontend
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangHost.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangHost.cpp
index 660a21e3c6a8d..6de851081598f 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangHost.cpp
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangHost.cpp
@@ -10,7 +10,7 @@
 
 #include "clang/Basic/Version.h"
 #include "clang/Config/config.h"
-#include "clang/Options/OptionUtils.h"
+#include "clang/Driver/Driver.h"
 
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
@@ -53,7 +53,7 @@ static bool DefaultComputeClangResourceDirectory(FileSpec &lldb_shlib_spec,
   std::string raw_path = lldb_shlib_spec.GetPath();
   llvm::StringRef parent_dir = llvm::sys::path::parent_path(raw_path);
   static const std::string clang_resource_path =
-      clang::GetResourcesPath("bin/lldb");
+      clang::driver::Driver::GetResourcesPath("bin/lldb");
 
   static const llvm::StringRef kResourceDirSuffixes[] = {
       // LLVM.org's build of LLDB uses the clang resource directory placed
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp
index ce8dc50b84a31..e37c84efefdc9 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangModulesDeclVendor.cpp
@@ -10,7 +10,6 @@
 #include "clang/Basic/DiagnosticFrontend.h"
 #include "clang/Basic/IdentifierTable.h"
 #include "clang/Basic/TargetInfo.h"
-#include "clang/Driver/CreateInvocationFromArgs.h"
 #include "clang/Frontend/CompilerInstance.h"
 #include "clang/Frontend/FrontendActions.h"
 #include "clang/Frontend/TextDiagnosticPrinter.h"
diff --git a/lldb/unittests/Expression/ClangParserTest.cpp b/lldb/unittests/Expression/ClangParserTest.cpp
index c949026e87cd8..fab4487c73719 100644
--- a/lldb/unittests/Expression/ClangParserTest.cpp
+++ b/lldb/unittests/Expression/ClangParserTest.cpp
@@ -8,7 +8,7 @@
 
 #include "clang/Basic/Version.h"
 #include "clang/Config/config.h"
-#include "clang/Options/OptionUtils.h"
+#include "clang/Driver/Driver.h"
 
 #include "Plugins/ExpressionParser/Clang/ClangHost.h"
 #include "TestingSupport/SubsystemRAII.h"
@@ -43,7 +43,7 @@ TEST_F(ClangHostTest, ComputeClangResourceDirectory) {
   std::string path_to_liblldb = "C:\\foo\\bar\\lib\\";
 #endif
   std::string path_to_clang_dir =
-      clang::GetResourcesPath(path_to_liblldb + "liblldb");
+      clang::driver::Driver::GetResourcesPath(path_to_liblldb + "liblldb");
   llvm::SmallString<256> path_to_clang_lib_dir_real;
   llvm::sys::fs::real_path(path_to_clang_dir, path_to_clang_lib_dir_real);
 
diff --git a/lldb/unittests/Expression/DWARFExpressionTest.cpp b/lldb/unittests/Expression/DWARFExpressionTest.cpp
index a95456119956e..e0c2193d27c36 100644
--- a/lldb/unittests/Expression/DWARFExpressionTest.cpp
+++ b/lldb/unittests/Expression/DWARFExpressionTest.cpp
@@ -40,6 +40,51 @@ using namespace lldb_private;
 using namespace llvm::dwarf;
 
 namespace {
+/// A mock implementation of DWARFExpression::Delegate for testing.
+/// This class provides default implementations of all delegate methods,
+/// with the DWARF version being configurable via the constructor.
+class MockDwarfDelegate : public DWARFExpression::Delegate {
+public:
+  static constexpr uint16_t DEFAULT_DWARF_VERSION = 5;
+  static MockDwarfDelegate Dwarf5() { return MockDwarfDelegate(5); }
+  static MockDwarfDelegate Dwarf2() { return MockDwarfDelegate(2); }
+
+  MockDwarfDelegate() : MockDwarfDelegate(DEFAULT_DWARF_VERSION) {}
+  explicit MockDwarfDelegate(uint16_t version) : m_dwarf_version(version) {}
+
+  uint16_t GetVersion() const override { return m_dwarf_version; }
+
+  dw_addr_t GetBaseAddress() const override { return 0; }
+
+  uint8_t GetAddressByteSize() const override { return 4; }
+
+  llvm::Expected<std::pair<uint64_t, bool>>
+  GetDIEBitSizeAndSign(uint64_t relative_die_offset) const override {
+    return llvm::createStringError(llvm::inconvertibleErrorCode(),
+                                   "GetDIEBitSizeAndSign not implemented");
+  }
+
+  dw_addr_t ReadAddressFromDebugAddrSection(uint32_t index) const override {
+    return 0;
+  }
+
+  lldb::offset_t GetVendorDWARFOpcodeSize(const DataExtractor &data,
+                                          const lldb::offset_t data_offset,
+                                          const uint8_t op) const override {
+    return LLDB_INVALID_OFFSET;
+  }
+
+  bool ParseVendorDWARFOpcode(uint8_t op, const DataExtractor &opcodes,
+                              lldb::offset_t &offset, RegisterContext *reg_ctx,
+                              lldb::RegisterKind reg_kind,
+                              DWARFExpression::Stack &stack) const override {
+    return false;
+  }
+
+private:
+  uint16_t m_dwarf_version;
+};
+
 /// Mock memory implementation for testing.
 /// Stores predefined memory contents indexed by {address, size} pairs.
 class MockMemory {
@@ -189,7 +234,7 @@ class MockRegisterContext : public RegisterContext {
 
 static llvm::Expected<Value> Evaluate(llvm::ArrayRef<uint8_t> expr,
                                       lldb::ModuleSP module_sp = {},
-                                      DWARFUnit *unit = nullptr,
+                                      DWARFExpression::Delegate *unit = nullptr,
                                       ExecutionContext *exe_ctx = nullptr,
                                       RegisterContext *reg_ctx = nullptr) {
   DataExtractor extractor(expr.data(), expr.size(), lldb::eByteOrderLittle,
@@ -534,6 +579,23 @@ TEST(DWARFExpression, DW_OP_stack_value) {
   EXPECT_THAT_EXPECTED(Evaluate({DW_OP_stack_value}), llvm::Failed());
 }
 
+// This test shows that the DWARF version is used by the expression evaluation.
+// Note that the different behavior tested here is not meant to imply that this
+// is the correct interpretation of DWARF2 vs. DWARF5, but rather it was picked
+// as an easy example that evaluates differently based on the DWARF version.
+TEST(DWARFExpression, dwarf_version) {
+  MockDwarfDelegate dwarf2 = MockDwarfDelegate::Dwarf2();
+  MockDwarfDelegate dwarf5 = MockDwarfDelegate::Dwarf5();
+
+  // In dwarf2 the constant on top of the stack is treated as a value.
+  EXPECT_THAT_EXPECTED(Evaluate({DW_OP_lit1}, {}, &dwarf2), ExpectScalar(1));
+
+  // In dwarf5 the constant on top of the stack is implicitly converted to an
+  // address.
+  EXPECT_THAT_EXPECTED(Evaluate({DW_OP_lit1}, {}, &dwarf5),
+                       ExpectLoadAddress(1));
+}
+
 TEST(DWARFExpression, DW_OP_piece) {
   EXPECT_THAT_EXPECTED(Evaluate({DW_OP_const2u, 0x11, 0x22, DW_OP_piece, 2,
                                  DW_OP_const2u, 0x33, 0x44, DW_OP_piece, 2}),
diff --git a/llvm/include/llvm/ADT/StringTable.h b/llvm/include/llvm/ADT/StringTable.h
index 9422a6da1ce8e..3a08e56e8f501 100644
--- a/llvm/include/llvm/ADT/StringTable.h
+++ b/llvm/include/llvm/ADT/StringTable.h
@@ -11,6 +11,7 @@
 
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/iterator.h"
+#include <cassert>
 #include <iterator>
 #include <limits>
 
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/WaitingOnGraph.h b/llvm/include/llvm/ExecutionEngine/Orc/WaitingOnGraph.h
index 9f14c8b2efd5f..a7ba79164c471 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/WaitingOnGraph.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/WaitingOnGraph.h
@@ -155,22 +155,20 @@ template <typename ContainerIdT, typename ElementIdT> class WaitingOnGraph {
     }
 
     template <typename Pred> void remove(Pred &&Remove) {
+      std::vector<hash_code> HashesToErase;
       for (auto &[Hash, SNs] : CanonicalSNs) {
-        bool Found = false;
-        for (size_t I = 0; I != SNs.size(); ++I) {
+        for (size_t I = 0; I != SNs.size();) {
           if (Remove(SNs[I])) {
             std::swap(SNs[I], SNs.back());
             SNs.pop_back();
-            Found = true;
-            break;
-          }
-        }
-        if (Found) {
-          if (SNs.empty())
-            CanonicalSNs.erase(Hash);
-          break;
+          } else
+            ++I;
         }
+        if (SNs.empty())
+          HashesToErase.push_back(Hash);
       }
+      for (auto Hash : HashesToErase)
+        CanonicalSNs.erase(Hash);
     }
 
   private:
@@ -396,9 +394,14 @@ template <typename ContainerIdT, typename ElementIdT> class WaitingOnGraph {
         ++I;
     }
 
+    CoalesceToPendingSNs.remove([&](SuperNode *SN) {
+      for (auto &E : FailedSNs)
+        if (E.get() == SN)
+          return true;
+      return false;
+    });
+
     for (auto &SN : FailedSNs) {
-      CoalesceToPendingSNs.remove(
-          [&](SuperNode *SNC) { return SNC == SN.get(); });
       for (auto &[Container, Elems] : SN->Defs) {
         assert(ElemToPendingSN.count(Container));
         auto &CElems = ElemToPendingSN[Container];
diff --git a/llvm/include/llvm/Frontend/Offloading/OffloadWrapper.h b/llvm/include/llvm/Frontend/Offloading/OffloadWrapper.h
index 24017492e30b2..dc99e29964afc 100644
--- a/llvm/include/llvm/Frontend/Offloading/OffloadWrapper.h
+++ b/llvm/include/llvm/Frontend/Offloading/OffloadWrapper.h
@@ -56,18 +56,18 @@ LLVM_ABI llvm::Error wrapHIPBinary(llvm::Module &M, llvm::ArrayRef<char> Images,
                                    bool EmitSurfacesAndTextures = true);
 
 struct SYCLJITOptions {
-  // Target/compiler specific options that are suggested to use to "compile"
-  // program at runtime.
+  // Target/compiler specific options that are passed to the device compiler at
+  // runtime.
   std::string CompileOptions;
-  // Target/compiler specific options that are suggested to use to "link"
-  // program at runtime.
+  // Target/compiler specific options that are passed to the device linker at
+  // runtime.
   std::string LinkOptions;
 };
 
 /// Wraps OffloadBinaries in the given \p Buffers into the module \p M
 /// as global symbols and registers the images with the SYCL Runtime.
 /// \param Options Compiler and linker options to be encoded for the later
-///  use by a runtime for JIT compilation.
+///  use by a runtime for JIT compilation. Not used for AOT.
 LLVM_ABI llvm::Error
 wrapSYCLBinaries(llvm::Module &M, llvm::ArrayRef<char> Buffer,
                  SYCLJITOptions Options = SYCLJITOptions());
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td
index ce7e836f66446..b40b7f199f9e5 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.td
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.td
@@ -182,10 +182,63 @@ foreach FPTy = ["F32", "F64", "F80", "F128", "PPCF128"] in {
   def MODF_#FPTy : RuntimeLibcall;
 }
 
-foreach VecTy = ["V4F32", "V2F64", "NXV4F32", "NXV2F64"] in {
-  def MODF_#VecTy : RuntimeLibcall;
-  def SINCOS_#VecTy : RuntimeLibcall;
-  def SINCOSPI_#VecTy : RuntimeLibcall;
+defvar F32VectorSuffixes = ["V2F32", "V4F32", "V8F32", "V16F32", "NXV4F32"];
+defvar F64VectorSuffixes = ["V2F64", "V4F64", "V8F64", "NXV2F64"];
+
+foreach S = !listconcat(F32VectorSuffixes, F64VectorSuffixes) in {
+  def ACOS_#S : RuntimeLibcall;
+  def ACOSH_#S : RuntimeLibcall;
+  def ASIN_#S : RuntimeLibcall;
+  def ASINH_#S : RuntimeLibcall;
+  def ATAN_#S : RuntimeLibcall;
+  def ATAN2_#S : RuntimeLibcall;
+  def ATANH_#S : RuntimeLibcall;
+  def CBRT_#S : RuntimeLibcall;
+  def CEIL_#S : RuntimeLibcall;
+  def COPYSIGN_#S : RuntimeLibcall;
+  def COS_#S : RuntimeLibcall;
+  def COSH_#S : RuntimeLibcall;
+  def COSPI_#S : RuntimeLibcall;
+  def ERFC_#S : RuntimeLibcall;
+  def ERF_#S : RuntimeLibcall;
+  def EXP_#S : RuntimeLibcall;
+  def EXP_FINITE_#S : RuntimeLibcall;
+  def EXP10_#S : RuntimeLibcall;
+  def EXP2_#S : RuntimeLibcall;
+  def EXPM1_#S : RuntimeLibcall;
+  def FABS_#S : RuntimeLibcall;
+  def FDIM_#S : RuntimeLibcall;
+  def FLOOR_#S : RuntimeLibcall;
+  def FMA_#S : RuntimeLibcall;
+  def FMAX_#S : RuntimeLibcall;
+  def FMIN_#S : RuntimeLibcall;
+  def FMOD_#S : RuntimeLibcall;
+  def HYPOT_#S : RuntimeLibcall;
+  def ILOGB_#S : RuntimeLibcall;
+  def LDEXP_#S : RuntimeLibcall;
+  def LGAMMA_#S : RuntimeLibcall;
+  def LOG_#S : RuntimeLibcall;
+  def LOG10_#S : RuntimeLibcall;
+  def LOG1P_#S : RuntimeLibcall;
+  def LOG2_#S : RuntimeLibcall;
+  def LOGB_#S : RuntimeLibcall;
+  def MODF_#S : RuntimeLibcall;
+  def NEXTAFTER_#S : RuntimeLibcall;
+  def POW_#S : RuntimeLibcall;
+  def SINCOS_#S : RuntimeLibcall;
+  def SINCOSPI_#S : RuntimeLibcall;
+  def SIN_#S : RuntimeLibcall;
+  def SINH_#S : RuntimeLibcall;
+  def SINPI_#S : RuntimeLibcall;
+  def SQRT_#S : RuntimeLibcall;
+  def TAN_#S : RuntimeLibcall;
+  def TANH_#S : RuntimeLibcall;
+  def TGAMMA_#S : RuntimeLibcall;
+}
+
+foreach S = F64VectorSuffixes in {
+  def LOG_FINITE_#S : RuntimeLibcall;
+  def POW_FINITE_#S : RuntimeLibcall;
 }
 
 def FEGETENV : RuntimeLibcall;
@@ -1089,50 +1142,6 @@ def __security_check_cookie : RuntimeLibcallImpl<SECURITY_CHECK_COOKIE>;
 def __security_check_cookie_arm64ec : RuntimeLibcallImpl<SECURITY_CHECK_COOKIE,
   "#__security_check_cookie_arm64ec">;
 
-//===----------------------------------------------------------------------===//
-// sleef calls
-//===----------------------------------------------------------------------===//
-
-defset list<RuntimeLibcallImpl> SleefLibcalls = {
-  def _ZGVnN2vl8_modf : RuntimeLibcallImpl<MODF_V2F64>;
-  def _ZGVnN4vl4_modff : RuntimeLibcallImpl<MODF_V4F32>;
-  def _ZGVsNxvl8_modf : RuntimeLibcallImpl<MODF_NXV2F64>;
-  def _ZGVsNxvl4_modff : RuntimeLibcallImpl<MODF_NXV4F32>;
-
-  def _ZGVnN2vl8l8_sincos : RuntimeLibcallImpl<SINCOS_V2F64>;
-  def _ZGVnN4vl4l4_sincosf : RuntimeLibcallImpl<SINCOS_V4F32>;
-  def _ZGVsNxvl8l8_sincos : RuntimeLibcallImpl<SINCOS_NXV2F64>;
-  def _ZGVsNxvl4l4_sincosf : RuntimeLibcallImpl<SINCOS_NXV4F32>;
-
-  def _ZGVnN4vl4l4_sincospif : RuntimeLibcallImpl<SINCOSPI_V4F32>;
-  def _ZGVnN2vl8l8_sincospi : RuntimeLibcallImpl<SINCOSPI_V2F64>;
-  def _ZGVsNxvl4l4_sincospif : RuntimeLibcallImpl<SINCOSPI_NXV4F32>;
-  def _ZGVsNxvl8l8_sincospi : RuntimeLibcallImpl<SINCOSPI_NXV2F64>;
-}
-
-//===----------------------------------------------------------------------===//
-// ARMPL calls
-//===----------------------------------------------------------------------===//
-
-defset list<RuntimeLibcallImpl> ARMPLLibcalls = {
-  def armpl_vmodfq_f64 : RuntimeLibcallImpl<MODF_V2F64>; // CallingConv::AArch64_VectorCall
-  def armpl_vmodfq_f32 : RuntimeLibcallImpl<MODF_V4F32>; // CallingConv::AArch64_VectorCall
-  def armpl_svmodf_f64_x : RuntimeLibcallImpl<MODF_NXV2F64>;
-  def armpl_svmodf_f32_x : RuntimeLibcallImpl<MODF_NXV4F32>;
-
-  def armpl_vsincosq_f64
-      : RuntimeLibcallImpl<SINCOS_V2F64>; // CallingConv::AArch64_VectorCall
-  def armpl_vsincosq_f32
-      : RuntimeLibcallImpl<SINCOS_V4F32>; // CallingConv::AArch64_VectorCall
-  def armpl_svsincos_f64_x : RuntimeLibcallImpl<SINCOS_NXV2F64>;
-  def armpl_svsincos_f32_x : RuntimeLibcallImpl<SINCOS_NXV4F32>;
-
-  def armpl_vsincospiq_f32 : RuntimeLibcallImpl<SINCOSPI_V4F32>;
-  def armpl_vsincospiq_f64 : RuntimeLibcallImpl<SINCOSPI_V2F64>;
-  def armpl_svsincospi_f32_x : RuntimeLibcallImpl<SINCOSPI_NXV4F32>;
-  def armpl_svsincospi_f64_x : RuntimeLibcallImpl<SINCOSPI_NXV2F64>;
-}
-
 //===----------------------------------------------------------------------===//
 // F128 libm Runtime Libcalls
 //===----------------------------------------------------------------------===//
@@ -2769,3 +2778,926 @@ def LegacyDefaultSystemLibrary
          LibcallImpls<(add Int128RTLibcalls), isArch64Bit>,
          DefaultStackProtector
 )>;
+
+//===----------------------------------------------------------------------===//
+// Vector math libraries
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Accelerate framework functions
+//===----------------------------------------------------------------------===//
+
+defset list<RuntimeLibcallImpl> ACCELERATE_VECFUNCS = {
+  def vacosf : RuntimeLibcallImpl<ACOS_V4F32>;
+  def vacoshf : RuntimeLibcallImpl<ACOSH_V4F32>;
+  def vasinf : RuntimeLibcallImpl<ASIN_V4F32>;
+  def vasinhf : RuntimeLibcallImpl<ASINH_V4F32>;
+  def vatan2f : RuntimeLibcallImpl<ATAN2_V4F32>;
+  def vatanf : RuntimeLibcallImpl<ATAN_V4F32>;
+  def vatanhf : RuntimeLibcallImpl<ATANH_V4F32>;
+  def vceilf : RuntimeLibcallImpl<CEIL_V4F32>;
+  def vcosf : RuntimeLibcallImpl<COS_V4F32>;
+  def vcoshf : RuntimeLibcallImpl<COSH_V4F32>;
+  def vexpf : RuntimeLibcallImpl<EXP_V4F32>;
+  def vexpm1f : RuntimeLibcallImpl<EXPM1_V4F32>;
+  def vfabsf : RuntimeLibcallImpl<FABS_V4F32>;
+  def vfloorf : RuntimeLibcallImpl<FLOOR_V4F32>;
+  def vlog10f : RuntimeLibcallImpl<LOG10_V4F32>;
+  def vlog1pf : RuntimeLibcallImpl<LOG1P_V4F32>;
+  def vlogbf : RuntimeLibcallImpl<LOGB_V4F32>;
+  def vlogf : RuntimeLibcallImpl<LOG_V4F32>;
+  def vsinf : RuntimeLibcallImpl<SIN_V4F32>;
+  def vsinhf : RuntimeLibcallImpl<SINH_V4F32>;
+  def vsqrtf : RuntimeLibcallImpl<SQRT_V4F32>;
+  def vtanf : RuntimeLibcallImpl<TAN_V4F32>;
+  def vtanhf : RuntimeLibcallImpl<TANH_V4F32>;
+}
+
+//===----------------------------------------------------------------------===//
+// Darwin_libsystem_m vector functions
+//===----------------------------------------------------------------------===//
+
+defset list<RuntimeLibcallImpl> DARWIN_LIBSYSTEM_M_VECFUNCS = {
+  def _simd_acos_d2 : RuntimeLibcallImpl<ACOS_V2F64>;
+  def _simd_acos_f4 : RuntimeLibcallImpl<ACOS_V4F32>;
+  def _simd_acosh_d2 : RuntimeLibcallImpl<ACOSH_V2F64>;
+  def _simd_acosh_f4 : RuntimeLibcallImpl<ACOSH_V4F32>;
+  def _simd_asin_d2 : RuntimeLibcallImpl<ASIN_V2F64>;
+  def _simd_asin_f4 : RuntimeLibcallImpl<ASIN_V4F32>;
+  def _simd_asinh_d2 : RuntimeLibcallImpl<ASINH_V2F64>;
+  def _simd_asinh_f4 : RuntimeLibcallImpl<ASINH_V4F32>;
+  def _simd_atan2_d2 : RuntimeLibcallImpl<ATAN2_V2F64>;
+  def _simd_atan2_f4 : RuntimeLibcallImpl<ATAN2_V4F32>;
+  def _simd_atan_d2 : RuntimeLibcallImpl<ATAN_V2F64>;
+  def _simd_atan_f4 : RuntimeLibcallImpl<ATAN_V4F32>;
+  def _simd_atanh_d2 : RuntimeLibcallImpl<ATANH_V2F64>;
+  def _simd_atanh_f4 : RuntimeLibcallImpl<ATANH_V4F32>;
+  def _simd_cbrt_d2 : RuntimeLibcallImpl<CBRT_V2F64>;
+  def _simd_cbrt_f4 : RuntimeLibcallImpl<CBRT_V4F32>;
+  def _simd_cos_d2 : RuntimeLibcallImpl<COS_V2F64>;
+  def _simd_cos_f4 : RuntimeLibcallImpl<COS_V4F32>;
+  def _simd_cosh_d2 : RuntimeLibcallImpl<COSH_V2F64>;
+  def _simd_cosh_f4 : RuntimeLibcallImpl<COSH_V4F32>;
+  def _simd_erf_d2 : RuntimeLibcallImpl<ERF_V2F64>;
+  def _simd_erf_f4 : RuntimeLibcallImpl<ERF_V4F32>;
+  def _simd_exp_d2 : RuntimeLibcallImpl<EXP_V2F64>;
+  def _simd_exp_f4 : RuntimeLibcallImpl<EXP_V4F32>;
+  def _simd_pow_d2 : RuntimeLibcallImpl<POW_V2F64>;
+  def _simd_pow_f4 : RuntimeLibcallImpl<POW_V4F32>;
+  def _simd_sin_d2 : RuntimeLibcallImpl<SIN_V2F64>;
+  def _simd_sin_f4 : RuntimeLibcallImpl<SIN_V4F32>;
+  def _simd_sinh_d2 : RuntimeLibcallImpl<SINH_V2F64>;
+  def _simd_sinh_f4 : RuntimeLibcallImpl<SINH_V4F32>;
+  def _simd_tan_d2 : RuntimeLibcallImpl<TAN_V2F64>;
+  def _simd_tan_f4 : RuntimeLibcallImpl<TAN_V4F32>;
+  def _simd_tanh_d2 : RuntimeLibcallImpl<TANH_V2F64>;
+  def _simd_tanh_f4 : RuntimeLibcallImpl<TANH_V4F32>;
+}
+
+//===----------------------------------------------------------------------===//
+// GLIBC Vector Math library LIBMVEC functions
+//===----------------------------------------------------------------------===//
+
+defvar LIBMVECPrefix = "LIBMVEC_";
+// Passes the def NAME minus LIBMVECPrefix as RuntimeLibcallImpl's 2nd argument.
+class LibmvecLibcall<RuntimeLibcall P>
+    : RuntimeLibcallImpl<P, !substr(NAME, !size(LIBMVECPrefix))>;
+
+defset list<RuntimeLibcallImpl> LIBMVEC_X86_VECFUNCS = {
+  def LIBMVEC__ZGVbN2v___exp_finite : LibmvecLibcall<EXP_FINITE_V2F64>;
+  def LIBMVEC__ZGVbN2v___log_finite : LibmvecLibcall<LOG_FINITE_V2F64>;
+  def LIBMVEC__ZGVbN2v_cos : LibmvecLibcall<COS_V2F64>;
+  def LIBMVEC__ZGVbN2v_exp : LibmvecLibcall<EXP_V2F64>;
+  def LIBMVEC__ZGVbN2v_log : LibmvecLibcall<LOG_V2F64>;
+  def LIBMVEC__ZGVbN2v_sin : LibmvecLibcall<SIN_V2F64>;
+  def LIBMVEC__ZGVbN2v_tan : LibmvecLibcall<TAN_V2F64>;
+  def LIBMVEC__ZGVbN2vv___pow_finite : LibmvecLibcall<POW_FINITE_V2F64>;
+  def LIBMVEC__ZGVbN2vv_pow : LibmvecLibcall<POW_V2F64>;
+  def LIBMVEC__ZGVbN4v___expf_finite : LibmvecLibcall<EXP_FINITE_V4F32>;
+  def LIBMVEC__ZGVbN4v___logf_finite : LibmvecLibcall<LOG_FINITE_V4F64>; // FIXME: f32 impl bound to an F64 libcall
+  def LIBMVEC__ZGVbN4v_cosf : LibmvecLibcall<COS_V4F32>;
+  def LIBMVEC__ZGVbN4v_expf : LibmvecLibcall<EXP_V4F32>;
+  def LIBMVEC__ZGVbN4v_logf : LibmvecLibcall<LOG_V4F32>;
+  def LIBMVEC__ZGVbN4v_sinf : LibmvecLibcall<SIN_V4F32>;
+  def LIBMVEC__ZGVbN4v_tanf : LibmvecLibcall<TAN_V4F32>;
+  def LIBMVEC__ZGVbN4vv___powf_finite : LibmvecLibcall<POW_FINITE_V4F64>; // FIXME: f32 impl bound to an F64 libcall
+  def LIBMVEC__ZGVbN4vv_powf : LibmvecLibcall<POW_V4F32>;
+  def LIBMVEC__ZGVdN4v___exp_finite : LibmvecLibcall<EXP_FINITE_V4F64>;
+  def LIBMVEC__ZGVdN4v___log_finite : LibmvecLibcall<LOG_FINITE_V4F64>;
+  def LIBMVEC__ZGVdN4v_cos : LibmvecLibcall<COS_V4F64>;
+  def LIBMVEC__ZGVdN4v_exp : LibmvecLibcall<EXP_V4F64>;
+  def LIBMVEC__ZGVdN4v_log : LibmvecLibcall<LOG_V4F64>;
+  def LIBMVEC__ZGVdN4v_sin : LibmvecLibcall<SIN_V4F64>;
+  def LIBMVEC__ZGVdN4v_tan : LibmvecLibcall<TAN_V4F64>;
+  def LIBMVEC__ZGVdN4vv___pow_finite : LibmvecLibcall<POW_FINITE_V4F64>;
+  def LIBMVEC__ZGVdN4vv_pow : LibmvecLibcall<POW_V4F64>;
+  def LIBMVEC__ZGVdN8v___expf_finite : LibmvecLibcall<EXP_FINITE_V8F32>;
+  def LIBMVEC__ZGVdN8v___logf_finite : LibmvecLibcall<LOG_FINITE_V8F64>; // FIXME: f32 impl bound to an F64 libcall
+  def LIBMVEC__ZGVdN8v_cosf : LibmvecLibcall<COS_V8F32>;
+  def LIBMVEC__ZGVdN8v_expf : LibmvecLibcall<EXP_V8F32>;
+  def LIBMVEC__ZGVdN8v_logf : LibmvecLibcall<LOG_V8F32>;
+  def LIBMVEC__ZGVdN8v_sinf : LibmvecLibcall<SIN_V8F32>;
+  def LIBMVEC__ZGVdN8v_tanf : LibmvecLibcall<TAN_V8F32>;
+  def LIBMVEC__ZGVdN8vv___powf_finite : LibmvecLibcall<POW_FINITE_V8F64>; // FIXME: f32 impl bound to an F64 libcall
+  def LIBMVEC__ZGVdN8vv_powf : LibmvecLibcall<POW_V8F32>;
+}
+
+defset list<RuntimeLibcallImpl> LIBMVEC_AARCH64_VECFUNCS = {
+  def LIBMVEC__ZGVnN2v_acos : LibmvecLibcall<ACOS_V2F64>;
+  def LIBMVEC__ZGVnN2v_acosf : LibmvecLibcall<ACOS_V2F32>;
+  def LIBMVEC__ZGVnN2v_acosh : LibmvecLibcall<ACOSH_V2F64>;
+  def LIBMVEC__ZGVnN2v_acoshf : LibmvecLibcall<ACOSH_V2F32>;
+  def LIBMVEC__ZGVnN2v_asin : LibmvecLibcall<ASIN_V2F64>;
+  def LIBMVEC__ZGVnN2v_asinf : LibmvecLibcall<ASIN_V2F32>;
+  def LIBMVEC__ZGVnN2v_asinh : LibmvecLibcall<ASINH_V2F64>;
+  def LIBMVEC__ZGVnN2v_asinhf : LibmvecLibcall<ASINH_V2F32>;
+  def LIBMVEC__ZGVnN2v_atan : LibmvecLibcall<ATAN_V2F64>;
+  def LIBMVEC__ZGVnN2v_atanf : LibmvecLibcall<ATAN_V2F32>;
+  def LIBMVEC__ZGVnN2v_atanh : LibmvecLibcall<ATANH_V2F64>;
+  def LIBMVEC__ZGVnN2v_atanhf : LibmvecLibcall<ATANH_V2F32>;
+  def LIBMVEC__ZGVnN2v_cbrt : LibmvecLibcall<CBRT_V2F64>;
+  def LIBMVEC__ZGVnN2v_cbrtf : LibmvecLibcall<CBRT_V2F32>;
+  def LIBMVEC__ZGVnN2v_cos : LibmvecLibcall<COS_V2F64>;
+  def LIBMVEC__ZGVnN2v_cosf : LibmvecLibcall<COS_V2F32>;
+  def LIBMVEC__ZGVnN2v_cosh : LibmvecLibcall<COSH_V2F64>;
+  def LIBMVEC__ZGVnN2v_coshf : LibmvecLibcall<COSH_V2F32>;
+  def LIBMVEC__ZGVnN2v_erf : LibmvecLibcall<ERF_V2F64>;
+  def LIBMVEC__ZGVnN2v_erfc : LibmvecLibcall<ERFC_V2F64>;
+  def LIBMVEC__ZGVnN2v_erfcf : LibmvecLibcall<ERFC_V2F32>;
+  def LIBMVEC__ZGVnN2v_erff : LibmvecLibcall<ERF_V2F32>;
+  def LIBMVEC__ZGVnN2v_exp : LibmvecLibcall<EXP_V2F64>;
+  def LIBMVEC__ZGVnN2v_exp10 : LibmvecLibcall<EXP10_V2F64>;
+  def LIBMVEC__ZGVnN2v_exp10f : LibmvecLibcall<EXP10_V2F32>;
+  def LIBMVEC__ZGVnN2v_exp2 : LibmvecLibcall<EXP2_V2F64>;
+  def LIBMVEC__ZGVnN2v_exp2f : LibmvecLibcall<EXP2_V2F32>;
+  def LIBMVEC__ZGVnN2v_expf : LibmvecLibcall<EXP_V2F32>;
+  def LIBMVEC__ZGVnN2v_expm1 : LibmvecLibcall<EXPM1_V2F64>;
+  def LIBMVEC__ZGVnN2v_expm1f : LibmvecLibcall<EXPM1_V2F32>;
+  def LIBMVEC__ZGVnN2v_log : LibmvecLibcall<LOG_V2F64>;
+  def LIBMVEC__ZGVnN2v_log10 : LibmvecLibcall<LOG10_V2F64>;
+  def LIBMVEC__ZGVnN2v_log10f : LibmvecLibcall<LOG10_V2F32>;
+  def LIBMVEC__ZGVnN2v_log1p : LibmvecLibcall<LOG1P_V2F64>;
+  def LIBMVEC__ZGVnN2v_log1pf : LibmvecLibcall<LOG1P_V2F32>;
+  def LIBMVEC__ZGVnN2v_log2 : LibmvecLibcall<LOG2_V2F64>;
+  def LIBMVEC__ZGVnN2v_log2f : LibmvecLibcall<LOG2_V2F32>;
+  def LIBMVEC__ZGVnN2v_logf : LibmvecLibcall<LOG_V2F32>;
+  def LIBMVEC__ZGVnN2v_sin : LibmvecLibcall<SIN_V2F64>;
+  def LIBMVEC__ZGVnN2v_sinf : LibmvecLibcall<SIN_V2F32>;
+  def LIBMVEC__ZGVnN2v_sinh : LibmvecLibcall<SINH_V2F64>;
+  def LIBMVEC__ZGVnN2v_sinhf : LibmvecLibcall<SINH_V2F32>;
+  def LIBMVEC__ZGVnN2v_tan : LibmvecLibcall<TAN_V2F64>;
+  def LIBMVEC__ZGVnN2v_tanf : LibmvecLibcall<TAN_V2F32>;
+  def LIBMVEC__ZGVnN2v_tanh : LibmvecLibcall<TANH_V2F64>;
+  def LIBMVEC__ZGVnN2v_tanhf : LibmvecLibcall<TANH_V2F32>;
+  def LIBMVEC__ZGVnN2vv_atan2 : LibmvecLibcall<ATAN2_V2F64>;
+  def LIBMVEC__ZGVnN2vv_atan2f : LibmvecLibcall<ATAN2_V2F32>;
+  def LIBMVEC__ZGVnN2vv_hypot : LibmvecLibcall<HYPOT_V2F64>;
+  def LIBMVEC__ZGVnN2vv_hypotf : LibmvecLibcall<HYPOT_V2F32>;
+  def LIBMVEC__ZGVnN2vv_pow : LibmvecLibcall<POW_V2F64>;
+  def LIBMVEC__ZGVnN2vv_powf : LibmvecLibcall<POW_V2F32>;
+  def LIBMVEC__ZGVnN4v_acosf : LibmvecLibcall<ACOS_V4F32>;
+  def LIBMVEC__ZGVnN4v_acoshf : LibmvecLibcall<ACOSH_V4F32>;
+  def LIBMVEC__ZGVnN4v_asinf : LibmvecLibcall<ASIN_V4F32>;
+  def LIBMVEC__ZGVnN4v_asinhf : LibmvecLibcall<ASINH_V4F32>;
+  def LIBMVEC__ZGVnN4v_atanf : LibmvecLibcall<ATAN_V4F32>;
+  def LIBMVEC__ZGVnN4v_atanhf : LibmvecLibcall<ATANH_V4F32>;
+  def LIBMVEC__ZGVnN4v_cbrtf : LibmvecLibcall<CBRT_V4F32>;
+  def LIBMVEC__ZGVnN4v_cosf : LibmvecLibcall<COS_V4F32>;
+  def LIBMVEC__ZGVnN4v_coshf : LibmvecLibcall<COSH_V4F32>;
+  def LIBMVEC__ZGVnN4v_erfcf : LibmvecLibcall<ERFC_V4F32>;
+  def LIBMVEC__ZGVnN4v_erff : LibmvecLibcall<ERF_V4F32>;
+  def LIBMVEC__ZGVnN4v_exp10f : LibmvecLibcall<EXP10_V4F32>;
+  def LIBMVEC__ZGVnN4v_exp2f : LibmvecLibcall<EXP2_V4F32>;
+  def LIBMVEC__ZGVnN4v_expf : LibmvecLibcall<EXP_V4F32>;
+  def LIBMVEC__ZGVnN4v_expm1f : LibmvecLibcall<EXPM1_V4F32>;
+  def LIBMVEC__ZGVnN4v_log10f : LibmvecLibcall<LOG10_V4F32>;
+  def LIBMVEC__ZGVnN4v_log1pf : LibmvecLibcall<LOG1P_V4F32>;
+  def LIBMVEC__ZGVnN4v_log2f : LibmvecLibcall<LOG2_V4F32>;
+  def LIBMVEC__ZGVnN4v_logf : LibmvecLibcall<LOG_V4F32>;
+  def LIBMVEC__ZGVnN4v_sinf : LibmvecLibcall<SIN_V4F32>;
+  def LIBMVEC__ZGVnN4v_sinhf : LibmvecLibcall<SINH_V4F32>;
+  def LIBMVEC__ZGVnN4v_tanf : LibmvecLibcall<TAN_V4F32>;
+  def LIBMVEC__ZGVnN4v_tanhf : LibmvecLibcall<TANH_V4F32>;
+  def LIBMVEC__ZGVnN4vv_atan2f : LibmvecLibcall<ATAN2_V4F32>;
+  def LIBMVEC__ZGVnN4vv_hypotf : LibmvecLibcall<HYPOT_V4F32>;
+  def LIBMVEC__ZGVnN4vv_powf : LibmvecLibcall<POW_V4F32>;
+  def LIBMVEC__ZGVsMxv_acos : LibmvecLibcall<ACOS_NXV2F64>;
+  def LIBMVEC__ZGVsMxv_acosf : LibmvecLibcall<ACOS_NXV4F32>;
+  def LIBMVEC__ZGVsMxv_acosh : LibmvecLibcall<ACOSH_NXV2F64>;
+  def LIBMVEC__ZGVsMxv_acoshf : LibmvecLibcall<ACOSH_NXV4F32>;
+  def LIBMVEC__ZGVsMxv_asin : LibmvecLibcall<ASIN_NXV2F64>;
+  def LIBMVEC__ZGVsMxv_asinf : LibmvecLibcall<ASIN_NXV4F32>;
+  def LIBMVEC__ZGVsMxv_asinh : LibmvecLibcall<ASINH_NXV2F64>;
+  def LIBMVEC__ZGVsMxv_asinhf : LibmvecLibcall<ASINH_NXV4F32>;
+  def LIBMVEC__ZGVsMxv_atan : LibmvecLibcall<ATAN_NXV2F64>;
+  def LIBMVEC__ZGVsMxv_atanf : LibmvecLibcall<ATAN_NXV4F32>;
+  def LIBMVEC__ZGVsMxv_atanh : LibmvecLibcall<ATANH_NXV2F64>;
+  def LIBMVEC__ZGVsMxv_atanhf : LibmvecLibcall<ATANH_NXV4F32>;
+  def LIBMVEC__ZGVsMxv_cbrt : LibmvecLibcall<CBRT_NXV2F64>;
+  def LIBMVEC__ZGVsMxv_cbrtf : LibmvecLibcall<CBRT_NXV4F32>;
+  def LIBMVEC__ZGVsMxv_cos : LibmvecLibcall<COS_NXV2F64>;
+  def LIBMVEC__ZGVsMxv_cosf : LibmvecLibcall<COS_NXV4F32>;
+  def LIBMVEC__ZGVsMxv_cosh : LibmvecLibcall<COSH_NXV2F64>;
+  def LIBMVEC__ZGVsMxv_coshf : LibmvecLibcall<COSH_NXV4F32>;
+  def LIBMVEC__ZGVsMxv_erf : LibmvecLibcall<ERF_NXV2F64>;
+  def LIBMVEC__ZGVsMxv_erfc : LibmvecLibcall<ERFC_NXV2F64>;
+  def LIBMVEC__ZGVsMxv_erfcf : LibmvecLibcall<ERFC_NXV4F32>;
+  def LIBMVEC__ZGVsMxv_erff : LibmvecLibcall<ERF_NXV4F32>;
+  def LIBMVEC__ZGVsMxv_exp : LibmvecLibcall<EXP_NXV2F64>;
+  def LIBMVEC__ZGVsMxv_exp10 : LibmvecLibcall<EXP10_NXV2F64>;
+  def LIBMVEC__ZGVsMxv_exp10f : LibmvecLibcall<EXP10_NXV4F32>;
+  def LIBMVEC__ZGVsMxv_exp2 : LibmvecLibcall<EXP2_NXV2F64>;
+  def LIBMVEC__ZGVsMxv_exp2f : LibmvecLibcall<EXP2_NXV4F32>;
+  def LIBMVEC__ZGVsMxv_expf : LibmvecLibcall<EXP_NXV4F32>;
+  def LIBMVEC__ZGVsMxv_expm1 : LibmvecLibcall<EXPM1_NXV2F64>;
+  def LIBMVEC__ZGVsMxv_expm1f : LibmvecLibcall<EXPM1_NXV4F32>;
+  def LIBMVEC__ZGVsMxv_log : LibmvecLibcall<LOG_NXV2F64>;
+  def LIBMVEC__ZGVsMxv_log10 : LibmvecLibcall<LOG10_NXV2F64>;
+  def LIBMVEC__ZGVsMxv_log10f : LibmvecLibcall<LOG10_NXV4F32>;
+  def LIBMVEC__ZGVsMxv_log1p : LibmvecLibcall<LOG1P_NXV2F64>;
+  def LIBMVEC__ZGVsMxv_log1pf : LibmvecLibcall<LOG1P_NXV4F32>;
+  def LIBMVEC__ZGVsMxv_log2 : LibmvecLibcall<LOG2_NXV2F64>;
+  def LIBMVEC__ZGVsMxv_log2f : LibmvecLibcall<LOG2_NXV4F32>;
+  def LIBMVEC__ZGVsMxv_logf : LibmvecLibcall<LOG_NXV4F32>;
+  def LIBMVEC__ZGVsMxv_sin : LibmvecLibcall<SIN_NXV2F64>;
+  def LIBMVEC__ZGVsMxv_sinf : LibmvecLibcall<SIN_NXV4F32>;
+  def LIBMVEC__ZGVsMxv_sinh : LibmvecLibcall<SINH_NXV2F64>;
+  def LIBMVEC__ZGVsMxv_sinhf : LibmvecLibcall<SINH_NXV4F32>;
+  def LIBMVEC__ZGVsMxv_tan : LibmvecLibcall<TAN_NXV2F64>;
+  def LIBMVEC__ZGVsMxv_tanf : LibmvecLibcall<TAN_NXV4F32>;
+  def LIBMVEC__ZGVsMxv_tanh : LibmvecLibcall<TANH_NXV2F64>;
+  def LIBMVEC__ZGVsMxv_tanhf : LibmvecLibcall<TANH_NXV4F32>;
+  def LIBMVEC__ZGVsMxvv_atan2 : LibmvecLibcall<ATAN2_NXV2F64>;
+  def LIBMVEC__ZGVsMxvv_atan2f : LibmvecLibcall<ATAN2_NXV4F32>;
+  def LIBMVEC__ZGVsMxvv_hypot : LibmvecLibcall<HYPOT_NXV2F64>;
+  def LIBMVEC__ZGVsMxvv_hypotf : LibmvecLibcall<HYPOT_NXV4F32>;
+  def LIBMVEC__ZGVsMxvv_pow : LibmvecLibcall<POW_NXV2F64>;
+  def LIBMVEC__ZGVsMxvv_powf : LibmvecLibcall<POW_NXV4F32>;
+}
+
+//===----------------------------------------------------------------------===//
+// IBM MASS vector library (MASSV) functions
+//===----------------------------------------------------------------------===//
+
+defset list<RuntimeLibcallImpl> MASSV_VECFUNCS = {
+  def __acosd2 : RuntimeLibcallImpl<ACOS_V2F64>;
+  def __acosf4 : RuntimeLibcallImpl<ACOS_V4F32>;
+  def __acoshd2 : RuntimeLibcallImpl<ACOSH_V2F64>;
+  def __acoshf4 : RuntimeLibcallImpl<ACOSH_V4F32>;
+  def __asind2 : RuntimeLibcallImpl<ASIN_V2F64>;
+  def __asinf4 : RuntimeLibcallImpl<ASIN_V4F32>;
+  def __asinhd2 : RuntimeLibcallImpl<ASINH_V2F64>;
+  def __asinhf4 : RuntimeLibcallImpl<ASINH_V4F32>;
+  def __atan2d2 : RuntimeLibcallImpl<ATAN2_V2F64>;
+  def __atan2f4 : RuntimeLibcallImpl<ATAN2_V4F32>;
+  def __atand2 : RuntimeLibcallImpl<ATAN_V2F64>;
+  def __atanf4 : RuntimeLibcallImpl<ATAN_V4F32>;
+  def __atanhd2 : RuntimeLibcallImpl<ATANH_V2F64>;
+  def __atanhf4 : RuntimeLibcallImpl<ATANH_V4F32>;
+  def __cbrtd2 : RuntimeLibcallImpl<CBRT_V2F64>;
+  def __cbrtf4 : RuntimeLibcallImpl<CBRT_V4F32>;
+  def __cosd2 : RuntimeLibcallImpl<COS_V2F64>;
+  def __cosf4 : RuntimeLibcallImpl<COS_V4F32>;
+  def __coshd2 : RuntimeLibcallImpl<COSH_V2F64>;
+  def __coshf4 : RuntimeLibcallImpl<COSH_V4F32>;
+  def __exp2d2 : RuntimeLibcallImpl<EXP2_V2F64>;
+  def __exp2f4 : RuntimeLibcallImpl<EXP2_V4F32>;
+  def __expd2 : RuntimeLibcallImpl<EXP_V2F64>;
+  def __expf4 : RuntimeLibcallImpl<EXP_V4F32>;
+  def __expm1d2 : RuntimeLibcallImpl<EXPM1_V2F64>;
+  def __expm1f4 : RuntimeLibcallImpl<EXPM1_V4F32>;
+  def __log10d2 : RuntimeLibcallImpl<LOG10_V2F64>;
+  def __log10f4 : RuntimeLibcallImpl<LOG10_V4F32>;
+  def __log1pd2 : RuntimeLibcallImpl<LOG1P_V2F64>;
+  def __log1pf4 : RuntimeLibcallImpl<LOG1P_V4F32>;
+  def __log2d2 : RuntimeLibcallImpl<LOG2_V2F64>;
+  def __log2f4 : RuntimeLibcallImpl<LOG2_V4F32>;
+  def __logd2 : RuntimeLibcallImpl<LOG_V2F64>;
+  def __logf4 : RuntimeLibcallImpl<LOG_V4F32>;
+  def __powd2 : RuntimeLibcallImpl<POW_V2F64>;
+  def __powf4 : RuntimeLibcallImpl<POW_V4F32>;
+  def __sind2 : RuntimeLibcallImpl<SIN_V2F64>;
+  def __sinf4 : RuntimeLibcallImpl<SIN_V4F32>;
+  def __sinhd2 : RuntimeLibcallImpl<SINH_V2F64>;
+  def __sinhf4 : RuntimeLibcallImpl<SINH_V4F32>;
+  def __tand2 : RuntimeLibcallImpl<TAN_V2F64>;
+  def __tanf4 : RuntimeLibcallImpl<TAN_V4F32>;
+  def __tanhd2 : RuntimeLibcallImpl<TANH_V2F64>;
+  def __tanhf4 : RuntimeLibcallImpl<TANH_V4F32>;
+}
+
+//===----------------------------------------------------------------------===//
+// Intel SVML library functions
+//===----------------------------------------------------------------------===//
+
+defset list<RuntimeLibcallImpl> SVML_VECFUNCS = {
+  def __svml_cos2 : RuntimeLibcallImpl<COS_V2F64>;
+  def __svml_cos4 : RuntimeLibcallImpl<COS_V4F64>;
+  def __svml_cos8 : RuntimeLibcallImpl<COS_V8F64>;
+  def __svml_cosf16 : RuntimeLibcallImpl<COS_V16F32>;
+  def __svml_cosf4 : RuntimeLibcallImpl<COS_V4F32>;
+  def __svml_cosf8 : RuntimeLibcallImpl<COS_V8F32>;
+  def __svml_exp2 : RuntimeLibcallImpl<EXP_V2F64>;
+  def __svml_exp22 : RuntimeLibcallImpl<EXP2_V2F64>;
+  def __svml_exp24 : RuntimeLibcallImpl<EXP2_V4F64>;
+  def __svml_exp28 : RuntimeLibcallImpl<EXP2_V8F64>;
+  def __svml_exp2f16 : RuntimeLibcallImpl<EXP2_V16F32>;
+  def __svml_exp2f4 : RuntimeLibcallImpl<EXP2_V4F32>;
+  def __svml_exp2f8 : RuntimeLibcallImpl<EXP2_V8F32>;
+  def __svml_exp4 : RuntimeLibcallImpl<EXP_V4F64>;
+  def __svml_exp8 : RuntimeLibcallImpl<EXP_V8F64>;
+  def __svml_expf16 : RuntimeLibcallImpl<EXP_V16F32>;
+  def __svml_expf4 : RuntimeLibcallImpl<EXP_V4F32>;
+  def __svml_expf8 : RuntimeLibcallImpl<EXP_V8F32>;
+  def __svml_log102 : RuntimeLibcallImpl<LOG10_V2F64>;
+  def __svml_log104 : RuntimeLibcallImpl<LOG10_V4F64>;
+  def __svml_log108 : RuntimeLibcallImpl<LOG10_V8F64>;
+  def __svml_log10f16 : RuntimeLibcallImpl<LOG10_V16F32>;
+  def __svml_log10f4 : RuntimeLibcallImpl<LOG10_V4F32>;
+  def __svml_log10f8 : RuntimeLibcallImpl<LOG10_V8F32>;
+  def __svml_log2 : RuntimeLibcallImpl<LOG_V2F64>;
+  def __svml_log22 : RuntimeLibcallImpl<LOG2_V2F64>;
+  def __svml_log24 : RuntimeLibcallImpl<LOG2_V4F64>;
+  def __svml_log28 : RuntimeLibcallImpl<LOG2_V8F64>;
+  def __svml_log2f16 : RuntimeLibcallImpl<LOG2_V16F32>;
+  def __svml_log2f4 : RuntimeLibcallImpl<LOG2_V4F32>;
+  def __svml_log2f8 : RuntimeLibcallImpl<LOG2_V8F32>;
+  def __svml_log4 : RuntimeLibcallImpl<LOG_V4F64>;
+  def __svml_log8 : RuntimeLibcallImpl<LOG_V8F64>;
+  def __svml_logf16 : RuntimeLibcallImpl<LOG_V16F32>;
+  def __svml_logf4 : RuntimeLibcallImpl<LOG_V4F32>;
+  def __svml_logf8 : RuntimeLibcallImpl<LOG_V8F32>;
+  def __svml_pow2 : RuntimeLibcallImpl<POW_V2F64>;
+  def __svml_pow4 : RuntimeLibcallImpl<POW_V4F64>;
+  def __svml_pow8 : RuntimeLibcallImpl<POW_V8F64>;
+  def __svml_powf16 : RuntimeLibcallImpl<POW_V16F32>;
+  def __svml_powf4 : RuntimeLibcallImpl<POW_V4F32>;
+  def __svml_powf8 : RuntimeLibcallImpl<POW_V8F32>;
+  def __svml_sin2 : RuntimeLibcallImpl<SIN_V2F64>;
+  def __svml_sin4 : RuntimeLibcallImpl<SIN_V4F64>;
+  def __svml_sin8 : RuntimeLibcallImpl<SIN_V8F64>;
+  def __svml_sinf16 : RuntimeLibcallImpl<SIN_V16F32>;
+  def __svml_sinf4 : RuntimeLibcallImpl<SIN_V4F32>;
+  def __svml_sinf8 : RuntimeLibcallImpl<SIN_V8F32>;
+  def __svml_sqrt2 : RuntimeLibcallImpl<SQRT_V2F64>;
+  def __svml_sqrt4 : RuntimeLibcallImpl<SQRT_V4F64>;
+  def __svml_sqrt8 : RuntimeLibcallImpl<SQRT_V8F64>;
+  def __svml_sqrtf16 : RuntimeLibcallImpl<SQRT_V16F32>;
+  def __svml_sqrtf4 : RuntimeLibcallImpl<SQRT_V4F32>;
+  def __svml_sqrtf8 : RuntimeLibcallImpl<SQRT_V8F32>;
+  def __svml_tan2 : RuntimeLibcallImpl<TAN_V2F64>;
+  def __svml_tan4 : RuntimeLibcallImpl<TAN_V4F64>;
+  def __svml_tan8 : RuntimeLibcallImpl<TAN_V8F64>;
+  def __svml_tanf16 : RuntimeLibcallImpl<TAN_V16F32>;
+  def __svml_tanf4 : RuntimeLibcallImpl<TAN_V4F32>;
+  def __svml_tanf8 : RuntimeLibcallImpl<TAN_V8F32>;
+}
+
+//===----------------------------------------------------------------------===//
+// SLEEF (SIMD Library for Evaluating Elementary Functions), GNU ABI names
+//===----------------------------------------------------------------------===//
+
+defset list<RuntimeLibcallImpl> SLEEFGNUABI_VF2_VECFUNCS = {
+  def _ZGVnN2v_acos : RuntimeLibcallImpl<ACOS_V2F64>;
+  def _ZGVnN2v_acosh : RuntimeLibcallImpl<ACOSH_V2F64>;
+  def _ZGVnN2v_asin : RuntimeLibcallImpl<ASIN_V2F64>;
+  def _ZGVnN2v_asinh : RuntimeLibcallImpl<ASINH_V2F64>;
+  def _ZGVnN2v_atan : RuntimeLibcallImpl<ATAN_V2F64>;
+  def _ZGVnN2v_atanh : RuntimeLibcallImpl<ATANH_V2F64>;
+  def _ZGVnN2v_cbrt : RuntimeLibcallImpl<CBRT_V2F64>;
+  def _ZGVnN2v_cos : RuntimeLibcallImpl<COS_V2F64>;
+  def _ZGVnN2v_cosh : RuntimeLibcallImpl<COSH_V2F64>;
+  def _ZGVnN2v_cospi : RuntimeLibcallImpl<COSPI_V2F64>;
+  def _ZGVnN2v_erf : RuntimeLibcallImpl<ERF_V2F64>;
+  def _ZGVnN2v_erfc : RuntimeLibcallImpl<ERFC_V2F64>;
+  def _ZGVnN2v_exp : RuntimeLibcallImpl<EXP_V2F64>;
+  def _ZGVnN2v_exp10 : RuntimeLibcallImpl<EXP10_V2F64>;
+  def _ZGVnN2v_exp2 : RuntimeLibcallImpl<EXP2_V2F64>;
+  def _ZGVnN2v_expm1 : RuntimeLibcallImpl<EXPM1_V2F64>;
+  def _ZGVnN2v_ilogb : RuntimeLibcallImpl<ILOGB_V2F64>;
+  def _ZGVnN2v_lgamma : RuntimeLibcallImpl<LGAMMA_V2F64>;
+  def _ZGVnN2v_log : RuntimeLibcallImpl<LOG_V2F64>;
+  def _ZGVnN2v_log10 : RuntimeLibcallImpl<LOG10_V2F64>;
+  def _ZGVnN2v_log1p : RuntimeLibcallImpl<LOG1P_V2F64>;
+  def _ZGVnN2v_log2 : RuntimeLibcallImpl<LOG2_V2F64>;
+  def _ZGVnN2v_sin : RuntimeLibcallImpl<SIN_V2F64>;
+  def _ZGVnN2v_sinh : RuntimeLibcallImpl<SINH_V2F64>;
+  def _ZGVnN2v_sinpi : RuntimeLibcallImpl<SINPI_V2F64>;
+  def _ZGVnN2v_sqrt : RuntimeLibcallImpl<SQRT_V2F64>;
+  def _ZGVnN2v_tan : RuntimeLibcallImpl<TAN_V2F64>;
+  def _ZGVnN2v_tanh : RuntimeLibcallImpl<TANH_V2F64>;
+  def _ZGVnN2v_tgamma : RuntimeLibcallImpl<TGAMMA_V2F64>;
+  def _ZGVnN2vl8_modf : RuntimeLibcallImpl<MODF_V2F64>;
+  def _ZGVnN2vl8l8_sincos : RuntimeLibcallImpl<SINCOS_V2F64>;
+  def _ZGVnN2vl8l8_sincospi : RuntimeLibcallImpl<SINCOSPI_V2F64>;
+  def _ZGVnN2vv_atan2 : RuntimeLibcallImpl<ATAN2_V2F64>;
+  def _ZGVnN2vv_copysign : RuntimeLibcallImpl<COPYSIGN_V2F64>;
+  def _ZGVnN2vv_fdim : RuntimeLibcallImpl<FDIM_V2F64>;
+  def _ZGVnN2vv_fmax : RuntimeLibcallImpl<FMAX_V2F64>;
+  def _ZGVnN2vv_fmin : RuntimeLibcallImpl<FMIN_V2F64>;
+  def _ZGVnN2vv_fmod : RuntimeLibcallImpl<FMOD_V2F64>;
+  def _ZGVnN2vv_hypot : RuntimeLibcallImpl<HYPOT_V2F64>;
+  def _ZGVnN2vv_ldexp : RuntimeLibcallImpl<LDEXP_V2F64>;
+  def _ZGVnN2vv_nextafter : RuntimeLibcallImpl<NEXTAFTER_V2F64>;
+  def _ZGVnN2vv_pow : RuntimeLibcallImpl<POW_V2F64>;
+  def _ZGVnN2vvv_fma : RuntimeLibcallImpl<FMA_V2F64>;
+}
+
+defset list<RuntimeLibcallImpl> SLEEFGNUABI_VF4_VECFUNCS = {
+  def _ZGVnN4v_acosf : RuntimeLibcallImpl<ACOS_V4F32>;
+  def _ZGVnN4v_acoshf : RuntimeLibcallImpl<ACOSH_V4F32>;
+  def _ZGVnN4v_asinf : RuntimeLibcallImpl<ASIN_V4F32>;
+  def _ZGVnN4v_asinhf : RuntimeLibcallImpl<ASINH_V4F32>;
+  def _ZGVnN4v_atanf : RuntimeLibcallImpl<ATAN_V4F32>;
+  def _ZGVnN4v_atanhf : RuntimeLibcallImpl<ATANH_V4F32>;
+  def _ZGVnN4v_cbrtf : RuntimeLibcallImpl<CBRT_V4F32>;
+  def _ZGVnN4v_cosf : RuntimeLibcallImpl<COS_V4F32>;
+  def _ZGVnN4v_coshf : RuntimeLibcallImpl<COSH_V4F32>;
+  def _ZGVnN4v_cospif : RuntimeLibcallImpl<COSPI_V4F32>;
+  def _ZGVnN4v_erfcf : RuntimeLibcallImpl<ERFC_V4F32>;
+  def _ZGVnN4v_erff : RuntimeLibcallImpl<ERF_V4F32>;
+  def _ZGVnN4v_exp10f : RuntimeLibcallImpl<EXP10_V4F32>;
+  def _ZGVnN4v_exp2f : RuntimeLibcallImpl<EXP2_V4F32>;
+  def _ZGVnN4v_expf : RuntimeLibcallImpl<EXP_V4F32>;
+  def _ZGVnN4v_expm1f : RuntimeLibcallImpl<EXPM1_V4F32>;
+  def _ZGVnN4v_ilogbf : RuntimeLibcallImpl<ILOGB_V4F32>;
+  def _ZGVnN4v_lgammaf : RuntimeLibcallImpl<LGAMMA_V4F32>;
+  def _ZGVnN4v_log10f : RuntimeLibcallImpl<LOG10_V4F32>;
+  def _ZGVnN4v_log1pf : RuntimeLibcallImpl<LOG1P_V4F32>;
+  def _ZGVnN4v_log2f : RuntimeLibcallImpl<LOG2_V4F32>;
+  def _ZGVnN4v_logf : RuntimeLibcallImpl<LOG_V4F32>;
+  def _ZGVnN4v_sinf : RuntimeLibcallImpl<SIN_V4F32>;
+  def _ZGVnN4v_sinhf : RuntimeLibcallImpl<SINH_V4F32>;
+  def _ZGVnN4v_sinpif : RuntimeLibcallImpl<SINPI_V4F32>;
+  def _ZGVnN4v_sqrtf : RuntimeLibcallImpl<SQRT_V4F32>;
+  def _ZGVnN4v_tanf : RuntimeLibcallImpl<TAN_V4F32>;
+  def _ZGVnN4v_tanhf : RuntimeLibcallImpl<TANH_V4F32>;
+  def _ZGVnN4v_tgammaf : RuntimeLibcallImpl<TGAMMA_V4F32>;
+  def _ZGVnN4vl4_modff : RuntimeLibcallImpl<MODF_V4F32>;
+  def _ZGVnN4vl4l4_sincosf : RuntimeLibcallImpl<SINCOS_V4F32>;
+  def _ZGVnN4vl4l4_sincospif : RuntimeLibcallImpl<SINCOSPI_V4F32>;
+  def _ZGVnN4vv_atan2f : RuntimeLibcallImpl<ATAN2_V4F32>;
+  def _ZGVnN4vv_copysignf : RuntimeLibcallImpl<COPYSIGN_V4F32>;
+  def _ZGVnN4vv_fdimf : RuntimeLibcallImpl<FDIM_V4F32>;
+  def _ZGVnN4vv_fmaxf : RuntimeLibcallImpl<FMAX_V4F32>;
+  def _ZGVnN4vv_fminf : RuntimeLibcallImpl<FMIN_V4F32>;
+  def _ZGVnN4vv_fmodf : RuntimeLibcallImpl<FMOD_V4F32>;
+  def _ZGVnN4vv_hypotf : RuntimeLibcallImpl<HYPOT_V4F32>;
+  def _ZGVnN4vv_ldexpf : RuntimeLibcallImpl<LDEXP_V4F32>;
+  def _ZGVnN4vv_nextafterf : RuntimeLibcallImpl<NEXTAFTER_V4F32>;
+  def _ZGVnN4vv_powf : RuntimeLibcallImpl<POW_V4F32>;
+  def _ZGVnN4vvv_fmaf : RuntimeLibcallImpl<FMA_V4F32>;
+}
+
+defset list<RuntimeLibcallImpl> SLEEFGNUABI_SCALABLE_VECFUNCS = { // NXV* only
+  def _ZGVsMxv_acos : RuntimeLibcallImpl<ACOS_NXV2F64>;
+  def _ZGVsMxv_acosf : RuntimeLibcallImpl<ACOS_NXV4F32>;
+  def _ZGVsMxv_acosh : RuntimeLibcallImpl<ACOSH_NXV2F64>;
+  def _ZGVsMxv_acoshf : RuntimeLibcallImpl<ACOSH_NXV4F32>;
+  def _ZGVsMxv_asin : RuntimeLibcallImpl<ASIN_NXV2F64>;
+  def _ZGVsMxv_asinf : RuntimeLibcallImpl<ASIN_NXV4F32>;
+  def _ZGVsMxv_asinh : RuntimeLibcallImpl<ASINH_NXV2F64>;
+  def _ZGVsMxv_asinhf : RuntimeLibcallImpl<ASINH_NXV4F32>;
+  def _ZGVsMxv_atan : RuntimeLibcallImpl<ATAN_NXV2F64>;
+  def _ZGVsMxv_atanf : RuntimeLibcallImpl<ATAN_NXV4F32>;
+  def _ZGVsMxv_atanh : RuntimeLibcallImpl<ATANH_NXV2F64>;
+  def _ZGVsMxv_atanhf : RuntimeLibcallImpl<ATANH_NXV4F32>;
+  def _ZGVsMxv_cbrt : RuntimeLibcallImpl<CBRT_NXV2F64>;
+  def _ZGVsMxv_cbrtf : RuntimeLibcallImpl<CBRT_NXV4F32>;
+  def _ZGVsMxv_cos : RuntimeLibcallImpl<COS_NXV2F64>;
+  def _ZGVsMxv_cosf : RuntimeLibcallImpl<COS_NXV4F32>;
+  def _ZGVsMxv_cosh : RuntimeLibcallImpl<COSH_NXV2F64>;
+  def _ZGVsMxv_coshf : RuntimeLibcallImpl<COSH_NXV4F32>;
+  def _ZGVsMxv_cospi : RuntimeLibcallImpl<COSPI_NXV2F64>;
+  def _ZGVsMxv_cospif : RuntimeLibcallImpl<COSPI_NXV4F32>;
+  def _ZGVsMxv_erf : RuntimeLibcallImpl<ERF_NXV2F64>;
+  def _ZGVsMxv_erfc : RuntimeLibcallImpl<ERFC_NXV2F64>;
+  def _ZGVsMxv_erfcf : RuntimeLibcallImpl<ERFC_NXV4F32>;
+  def _ZGVsMxv_erff : RuntimeLibcallImpl<ERF_NXV4F32>;
+  def _ZGVsMxv_exp : RuntimeLibcallImpl<EXP_NXV2F64>;
+  def _ZGVsMxv_exp10 : RuntimeLibcallImpl<EXP10_NXV2F64>;
+  def _ZGVsMxv_exp10f : RuntimeLibcallImpl<EXP10_NXV4F32>;
+  def _ZGVsMxv_exp2 : RuntimeLibcallImpl<EXP2_NXV2F64>;
+  def _ZGVsMxv_exp2f : RuntimeLibcallImpl<EXP2_NXV4F32>;
+  def _ZGVsMxv_expf : RuntimeLibcallImpl<EXP_NXV4F32>;
+  def _ZGVsMxv_expm1 : RuntimeLibcallImpl<EXPM1_NXV2F64>;
+  def _ZGVsMxv_expm1f : RuntimeLibcallImpl<EXPM1_NXV4F32>;
+  def _ZGVsMxv_ilogb : RuntimeLibcallImpl<ILOGB_NXV2F64>;
+  def _ZGVsMxv_ilogbf : RuntimeLibcallImpl<ILOGB_NXV4F32>;
+  def _ZGVsMxv_lgamma : RuntimeLibcallImpl<LGAMMA_NXV2F64>;
+  def _ZGVsMxv_lgammaf : RuntimeLibcallImpl<LGAMMA_NXV4F32>;
+  def _ZGVsMxv_log : RuntimeLibcallImpl<LOG_NXV2F64>;
+  def _ZGVsMxv_log10 : RuntimeLibcallImpl<LOG10_NXV2F64>;
+  def _ZGVsMxv_log10f : RuntimeLibcallImpl<LOG10_NXV4F32>;
+  def _ZGVsMxv_log1p : RuntimeLibcallImpl<LOG1P_NXV2F64>;
+  def _ZGVsMxv_log1pf : RuntimeLibcallImpl<LOG1P_NXV4F32>;
+  def _ZGVsMxv_log2 : RuntimeLibcallImpl<LOG2_NXV2F64>;
+  def _ZGVsMxv_log2f : RuntimeLibcallImpl<LOG2_NXV4F32>;
+  def _ZGVsMxv_logf : RuntimeLibcallImpl<LOG_NXV4F32>;
+  def _ZGVsMxv_sin : RuntimeLibcallImpl<SIN_NXV2F64>;
+  def _ZGVsMxv_sinf : RuntimeLibcallImpl<SIN_NXV4F32>;
+  def _ZGVsMxv_sinh : RuntimeLibcallImpl<SINH_NXV2F64>;
+  def _ZGVsMxv_sinhf : RuntimeLibcallImpl<SINH_NXV4F32>;
+  def _ZGVsMxv_sinpi : RuntimeLibcallImpl<SINPI_NXV2F64>;
+  def _ZGVsMxv_sinpif : RuntimeLibcallImpl<SINPI_NXV4F32>;
+  def _ZGVsMxv_sqrt : RuntimeLibcallImpl<SQRT_NXV2F64>;
+  def _ZGVsMxv_sqrtf : RuntimeLibcallImpl<SQRT_NXV4F32>;
+  def _ZGVsMxv_tan : RuntimeLibcallImpl<TAN_NXV2F64>;
+  def _ZGVsMxv_tanf : RuntimeLibcallImpl<TAN_NXV4F32>;
+  def _ZGVsMxv_tanh : RuntimeLibcallImpl<TANH_NXV2F64>;
+  def _ZGVsMxv_tanhf : RuntimeLibcallImpl<TANH_NXV4F32>;
+  def _ZGVsMxv_tgamma : RuntimeLibcallImpl<TGAMMA_NXV2F64>;
+  def _ZGVsMxv_tgammaf : RuntimeLibcallImpl<TGAMMA_NXV4F32>;
+  def _ZGVsMxvv_atan2 : RuntimeLibcallImpl<ATAN2_NXV2F64>;
+  def _ZGVsMxvv_atan2f : RuntimeLibcallImpl<ATAN2_NXV4F32>;
+  def _ZGVsMxvv_copysign : RuntimeLibcallImpl<COPYSIGN_NXV2F64>;
+  def _ZGVsMxvv_copysignf : RuntimeLibcallImpl<COPYSIGN_NXV4F32>;
+  def _ZGVsMxvv_fdim : RuntimeLibcallImpl<FDIM_NXV2F64>;
+  def _ZGVsMxvv_fdimf : RuntimeLibcallImpl<FDIM_NXV4F32>;
+  def _ZGVsMxvv_fmax : RuntimeLibcallImpl<FMAX_NXV2F64>;
+  def _ZGVsMxvv_fmaxf : RuntimeLibcallImpl<FMAX_NXV4F32>;
+  def _ZGVsMxvv_fmin : RuntimeLibcallImpl<FMIN_NXV2F64>;
+  def _ZGVsMxvv_fminf : RuntimeLibcallImpl<FMIN_NXV4F32>;
+  def _ZGVsMxvv_fmod : RuntimeLibcallImpl<FMOD_NXV2F64>;
+  def _ZGVsMxvv_fmodf : RuntimeLibcallImpl<FMOD_NXV4F32>;
+  def _ZGVsMxvv_hypot : RuntimeLibcallImpl<HYPOT_NXV2F64>;
+  def _ZGVsMxvv_hypotf : RuntimeLibcallImpl<HYPOT_NXV4F32>;
+  def _ZGVsMxvv_ldexp : RuntimeLibcallImpl<LDEXP_NXV2F64>;
+  def _ZGVsMxvv_ldexpf : RuntimeLibcallImpl<LDEXP_NXV4F32>;
+  def _ZGVsMxvv_nextafter : RuntimeLibcallImpl<NEXTAFTER_NXV2F64>;
+  def _ZGVsMxvv_nextafterf : RuntimeLibcallImpl<NEXTAFTER_NXV4F32>;
+  def _ZGVsMxvv_pow : RuntimeLibcallImpl<POW_NXV2F64>;
+  def _ZGVsMxvv_powf : RuntimeLibcallImpl<POW_NXV4F32>;
+  def _ZGVsMxvvv_fma : RuntimeLibcallImpl<FMA_NXV2F64>;
+  def _ZGVsMxvvv_fmaf : RuntimeLibcallImpl<FMA_NXV4F32>;
+  def _ZGVsNxvl8_modf : RuntimeLibcallImpl<MODF_NXV2F64>;
+  def _ZGVsNxvl4_modff : RuntimeLibcallImpl<MODF_NXV4F32>;
+  def _ZGVsNxvl4l4_sincosf : RuntimeLibcallImpl<SINCOS_NXV4F32>;
+  def _ZGVsNxvl4l4_sincospif : RuntimeLibcallImpl<SINCOSPI_NXV4F32>;
+  def _ZGVsNxvl8l8_sincos : RuntimeLibcallImpl<SINCOS_NXV2F64>;
+  def _ZGVsNxvl8l8_sincospi : RuntimeLibcallImpl<SINCOSPI_NXV2F64>;
+}
+
+defset list<RuntimeLibcallImpl> SLEEFGNUABI_SCALABLE_VECFUNCS_RISCV = { // RVV
+  def Sleef_acosdx_u10rvvm2 : RuntimeLibcallImpl<ACOS_NXV2F64>;
+  def Sleef_acosfx_u10rvvm2 : RuntimeLibcallImpl<ACOS_NXV4F32>;
+  def Sleef_acoshdx_u10rvvm2 : RuntimeLibcallImpl<ACOSH_NXV2F64>;
+  def Sleef_acoshfx_u10rvvm2 : RuntimeLibcallImpl<ACOSH_NXV4F32>;
+  def Sleef_asindx_u10rvvm2 : RuntimeLibcallImpl<ASIN_NXV2F64>;
+  def Sleef_asinfx_u10rvvm2 : RuntimeLibcallImpl<ASIN_NXV4F32>;
+  def Sleef_asinhdx_u10rvvm2 : RuntimeLibcallImpl<ASINH_NXV2F64>;
+  def Sleef_asinhfx_u10rvvm2 : RuntimeLibcallImpl<ASINH_NXV4F32>;
+  def Sleef_atan2dx_u10rvvm2 : RuntimeLibcallImpl<ATAN2_NXV2F64>;
+  def Sleef_atan2fx_u10rvvm2 : RuntimeLibcallImpl<ATAN2_NXV4F32>;
+  def Sleef_atandx_u10rvvm2 : RuntimeLibcallImpl<ATAN_NXV2F64>;
+  def Sleef_atanfx_u10rvvm2 : RuntimeLibcallImpl<ATAN_NXV4F32>;
+  def Sleef_atanhdx_u10rvvm2 : RuntimeLibcallImpl<ATANH_NXV2F64>;
+  def Sleef_atanhfx_u10rvvm2 : RuntimeLibcallImpl<ATANH_NXV4F32>;
+  def Sleef_cbrtdx_u10rvvm2 : RuntimeLibcallImpl<CBRT_NXV2F64>;
+  def Sleef_cbrtfx_u10rvvm2 : RuntimeLibcallImpl<CBRT_NXV4F32>;
+  def Sleef_copysigndx_rvvm2 : RuntimeLibcallImpl<COPYSIGN_NXV2F64>;
+  def Sleef_copysignfx_rvvm2 : RuntimeLibcallImpl<COPYSIGN_NXV4F32>;
+  def Sleef_cosdx_u10rvvm2 : RuntimeLibcallImpl<COS_NXV2F64>;
+  def Sleef_cosfx_u10rvvm2 : RuntimeLibcallImpl<COS_NXV4F32>;
+  def Sleef_coshdx_u10rvvm2 : RuntimeLibcallImpl<COSH_NXV2F64>;
+  def Sleef_coshfx_u10rvvm2 : RuntimeLibcallImpl<COSH_NXV4F32>;
+  def Sleef_cospidx_u05rvvm2 : RuntimeLibcallImpl<COSPI_NXV2F64>;
+  def Sleef_cospifx_u05rvvm2 : RuntimeLibcallImpl<COSPI_NXV4F32>;
+  def Sleef_erfcdx_u15rvvm2 : RuntimeLibcallImpl<ERFC_NXV2F64>;
+  def Sleef_erfcfx_u15rvvm2 : RuntimeLibcallImpl<ERFC_NXV4F32>;
+  def Sleef_erfdx_u10rvvm2 : RuntimeLibcallImpl<ERF_NXV2F64>;
+  def Sleef_erffx_u10rvvm2 : RuntimeLibcallImpl<ERF_NXV4F32>;
+  def Sleef_exp10dx_u10rvvm2 : RuntimeLibcallImpl<EXP10_NXV2F64>;
+  def Sleef_exp10fx_u10rvvm2 : RuntimeLibcallImpl<EXP10_NXV4F32>;
+  def Sleef_exp2dx_u10rvvm2 : RuntimeLibcallImpl<EXP2_NXV2F64>;
+  def Sleef_exp2fx_u10rvvm2 : RuntimeLibcallImpl<EXP2_NXV4F32>;
+  def Sleef_expdx_u10rvvm2 : RuntimeLibcallImpl<EXP_NXV2F64>;
+  def Sleef_expfx_u10rvvm2 : RuntimeLibcallImpl<EXP_NXV4F32>;
+  def Sleef_expm1dx_u10rvvm2 : RuntimeLibcallImpl<EXPM1_NXV2F64>;
+  def Sleef_expm1fx_u10rvvm2 : RuntimeLibcallImpl<EXPM1_NXV4F32>;
+  def Sleef_fdimdx_rvvm2 : RuntimeLibcallImpl<FDIM_NXV2F64>;
+  def Sleef_fdimfx_rvvm2 : RuntimeLibcallImpl<FDIM_NXV4F32>;
+  def Sleef_fmadx_rvvm2 : RuntimeLibcallImpl<FMA_NXV2F64>;
+  def Sleef_fmafx_rvvm2 : RuntimeLibcallImpl<FMA_NXV4F32>;
+  def Sleef_fmaxdx_rvvm2 : RuntimeLibcallImpl<FMAX_NXV2F64>;
+  def Sleef_fmaxfx_rvvm2 : RuntimeLibcallImpl<FMAX_NXV4F32>;
+  def Sleef_fmindx_u10rvvm2 : RuntimeLibcallImpl<FMIN_NXV2F64>;
+  def Sleef_fminfx_u10rvvm2 : RuntimeLibcallImpl<FMIN_NXV4F32>;
+  def Sleef_fmoddx_rvvm2 : RuntimeLibcallImpl<FMOD_NXV2F64>;
+  def Sleef_fmodfx_rvvm2 : RuntimeLibcallImpl<FMOD_NXV4F32>;
+  def Sleef_hypotdx_u05rvvm2 : RuntimeLibcallImpl<HYPOT_NXV2F64>;
+  def Sleef_hypotfx_u05rvvm2 : RuntimeLibcallImpl<HYPOT_NXV4F32>;
+  def Sleef_ilogbdx_rvvm2 : RuntimeLibcallImpl<ILOGB_NXV2F64>;
+  def Sleef_ilogbfx_rvvm2 : RuntimeLibcallImpl<ILOGB_NXV4F32>;
+  def Sleef_ldexpdx_rvvm2 : RuntimeLibcallImpl<LDEXP_NXV2F64>;
+  def Sleef_ldexpfx_rvvm2 : RuntimeLibcallImpl<LDEXP_NXV4F32>;
+  def Sleef_lgammadx_u10rvvm2 : RuntimeLibcallImpl<LGAMMA_NXV2F64>;
+  def Sleef_lgammafx_u10rvvm2 : RuntimeLibcallImpl<LGAMMA_NXV4F32>;
+  def Sleef_log10dx_u10rvvm2 : RuntimeLibcallImpl<LOG10_NXV2F64>;
+  def Sleef_log10fx_u10rvvm2 : RuntimeLibcallImpl<LOG10_NXV4F32>;
+  def Sleef_log1pdx_u10rvvm2 : RuntimeLibcallImpl<LOG1P_NXV2F64>;
+  def Sleef_log1pfx_u10rvvm2 : RuntimeLibcallImpl<LOG1P_NXV4F32>;
+  def Sleef_log2dx_u10rvvm2 : RuntimeLibcallImpl<LOG2_NXV2F64>;
+  def Sleef_log2fx_u10rvvm2 : RuntimeLibcallImpl<LOG2_NXV4F32>;
+  def Sleef_logdx_u10rvvm2 : RuntimeLibcallImpl<LOG_NXV2F64>;
+  def Sleef_logfx_u10rvvm2 : RuntimeLibcallImpl<LOG_NXV4F32>;
+  def Sleef_modfdx_rvvm2 : RuntimeLibcallImpl<MODF_NXV2F64>;
+  def Sleef_modffx_rvvm2 : RuntimeLibcallImpl<MODF_NXV4F32>;
+  def Sleef_nextafterdx_rvvm2 : RuntimeLibcallImpl<NEXTAFTER_NXV2F64>;
+  def Sleef_nextafterfx_rvvm2 : RuntimeLibcallImpl<NEXTAFTER_NXV4F32>;
+  def Sleef_powdx_u10rvvm2 : RuntimeLibcallImpl<POW_NXV2F64>;
+  def Sleef_powfx_u10rvvm2 : RuntimeLibcallImpl<POW_NXV4F32>;
+  def Sleef_sincosdx_u10rvvm2 : RuntimeLibcallImpl<SINCOS_NXV2F64>;
+  def Sleef_sincosfx_u10rvvm2 : RuntimeLibcallImpl<SINCOS_NXV4F32>;
+  def Sleef_sincospidx_u10rvvm2 : RuntimeLibcallImpl<SINCOSPI_NXV2F64>;
+  def Sleef_sincospifx_u10rvvm2 : RuntimeLibcallImpl<SINCOSPI_NXV4F32>;
+  def Sleef_sindx_u10rvvm2 : RuntimeLibcallImpl<SIN_NXV2F64>;
+  def Sleef_sinfx_u10rvvm2 : RuntimeLibcallImpl<SIN_NXV4F32>;
+  def Sleef_sinhdx_u10rvvm2 : RuntimeLibcallImpl<SINH_NXV2F64>;
+  def Sleef_sinhfx_u10rvvm2 : RuntimeLibcallImpl<SINH_NXV4F32>;
+  def Sleef_sinpidx_u05rvvm2 : RuntimeLibcallImpl<SINPI_NXV2F64>;
+  def Sleef_sinpifx_u05rvvm2 : RuntimeLibcallImpl<SINPI_NXV4F32>;
+  def Sleef_sqrtdx_u05rvvm2 : RuntimeLibcallImpl<SQRT_NXV2F64>;
+  def Sleef_sqrtfx_u05rvvm2 : RuntimeLibcallImpl<SQRT_NXV4F32>;
+  def Sleef_tandx_u10rvvm2 : RuntimeLibcallImpl<TAN_NXV2F64>;
+  def Sleef_tanfx_u10rvvm2 : RuntimeLibcallImpl<TAN_NXV4F32>;
+  def Sleef_tanhdx_u10rvvm2 : RuntimeLibcallImpl<TANH_NXV2F64>;
+  def Sleef_tanhfx_u10rvvm2 : RuntimeLibcallImpl<TANH_NXV4F32>;
+  def Sleef_tgammadx_u10rvvm2 : RuntimeLibcallImpl<TGAMMA_NXV2F64>;
+  def Sleef_tgammafx_u10rvvm2 : RuntimeLibcallImpl<TGAMMA_NXV4F32>;
+}
+
+//===----------------------------------------------------------------------===//
+// Arm Performance Libraries (ARMPL) functions
+//===----------------------------------------------------------------------===//
+
+defset list<RuntimeLibcallImpl> ARMPL_VECFUNCS = { // sv*: scalable, v*q: fixed
+  def armpl_svacos_f32_x : RuntimeLibcallImpl<ACOS_NXV4F32>;
+  def armpl_svacos_f64_x : RuntimeLibcallImpl<ACOS_NXV2F64>;
+  def armpl_svacosh_f32_x : RuntimeLibcallImpl<ACOSH_NXV4F32>;
+  def armpl_svacosh_f64_x : RuntimeLibcallImpl<ACOSH_NXV2F64>;
+  def armpl_svasin_f32_x : RuntimeLibcallImpl<ASIN_NXV4F32>;
+  def armpl_svasin_f64_x : RuntimeLibcallImpl<ASIN_NXV2F64>;
+  def armpl_svasinh_f32_x : RuntimeLibcallImpl<ASINH_NXV4F32>;
+  def armpl_svasinh_f64_x : RuntimeLibcallImpl<ASINH_NXV2F64>;
+  def armpl_svatan2_f32_x : RuntimeLibcallImpl<ATAN2_NXV4F32>;
+  def armpl_svatan2_f64_x : RuntimeLibcallImpl<ATAN2_NXV2F64>;
+  def armpl_svatan_f32_x : RuntimeLibcallImpl<ATAN_NXV4F32>;
+  def armpl_svatan_f64_x : RuntimeLibcallImpl<ATAN_NXV2F64>;
+  def armpl_svatanh_f32_x : RuntimeLibcallImpl<ATANH_NXV4F32>;
+  def armpl_svatanh_f64_x : RuntimeLibcallImpl<ATANH_NXV2F64>;
+  def armpl_svcbrt_f32_x : RuntimeLibcallImpl<CBRT_NXV4F32>;
+  def armpl_svcbrt_f64_x : RuntimeLibcallImpl<CBRT_NXV2F64>;
+  def armpl_svcopysign_f32_x : RuntimeLibcallImpl<COPYSIGN_NXV4F32>;
+  def armpl_svcopysign_f64_x : RuntimeLibcallImpl<COPYSIGN_NXV2F64>;
+  def armpl_svcos_f32_x : RuntimeLibcallImpl<COS_NXV4F32>;
+  def armpl_svcos_f64_x : RuntimeLibcallImpl<COS_NXV2F64>;
+  def armpl_svcosh_f32_x : RuntimeLibcallImpl<COSH_NXV4F32>;
+  def armpl_svcosh_f64_x : RuntimeLibcallImpl<COSH_NXV2F64>;
+  def armpl_svcospi_f32_x : RuntimeLibcallImpl<COSPI_NXV4F32>;
+  def armpl_svcospi_f64_x : RuntimeLibcallImpl<COSPI_NXV2F64>;
+  def armpl_sverf_f32_x : RuntimeLibcallImpl<ERF_NXV4F32>;
+  def armpl_sverf_f64_x : RuntimeLibcallImpl<ERF_NXV2F64>;
+  def armpl_sverfc_f32_x : RuntimeLibcallImpl<ERFC_NXV4F32>;
+  def armpl_sverfc_f64_x : RuntimeLibcallImpl<ERFC_NXV2F64>;
+  def armpl_svexp10_f32_x : RuntimeLibcallImpl<EXP10_NXV4F32>;
+  def armpl_svexp10_f64_x : RuntimeLibcallImpl<EXP10_NXV2F64>;
+  def armpl_svexp2_f32_x : RuntimeLibcallImpl<EXP2_NXV4F32>;
+  def armpl_svexp2_f64_x : RuntimeLibcallImpl<EXP2_NXV2F64>;
+  def armpl_svexp_f32_x : RuntimeLibcallImpl<EXP_NXV4F32>;
+  def armpl_svexp_f64_x : RuntimeLibcallImpl<EXP_NXV2F64>;
+  def armpl_svexpm1_f32_x : RuntimeLibcallImpl<EXPM1_NXV4F32>;
+  def armpl_svexpm1_f64_x : RuntimeLibcallImpl<EXPM1_NXV2F64>;
+  def armpl_svfdim_f32_x : RuntimeLibcallImpl<FDIM_NXV4F32>;
+  def armpl_svfdim_f64_x : RuntimeLibcallImpl<FDIM_NXV2F64>;
+  def armpl_svfma_f32_x : RuntimeLibcallImpl<FMA_NXV4F32>;
+  def armpl_svfma_f64_x : RuntimeLibcallImpl<FMA_NXV2F64>;
+  def armpl_svfmax_f32_x : RuntimeLibcallImpl<FMAX_NXV4F32>;
+  def armpl_svfmax_f64_x : RuntimeLibcallImpl<FMAX_NXV2F64>;
+  def armpl_svfmin_f32_x : RuntimeLibcallImpl<FMIN_NXV4F32>;
+  def armpl_svfmin_f64_x : RuntimeLibcallImpl<FMIN_NXV2F64>;
+  def armpl_svfmod_f32_x : RuntimeLibcallImpl<FMOD_NXV4F32>;
+  def armpl_svfmod_f64_x : RuntimeLibcallImpl<FMOD_NXV2F64>;
+  def armpl_svhypot_f32_x : RuntimeLibcallImpl<HYPOT_NXV4F32>;
+  def armpl_svhypot_f64_x : RuntimeLibcallImpl<HYPOT_NXV2F64>;
+  def armpl_svilogb_f32_x : RuntimeLibcallImpl<ILOGB_NXV4F32>;
+  def armpl_svilogb_f64_x : RuntimeLibcallImpl<ILOGB_NXV2F64>;
+  def armpl_svldexp_f32_x : RuntimeLibcallImpl<LDEXP_NXV4F32>;
+  def armpl_svldexp_f64_x : RuntimeLibcallImpl<LDEXP_NXV2F64>;
+  def armpl_svlgamma_f32_x : RuntimeLibcallImpl<LGAMMA_NXV4F32>;
+  def armpl_svlgamma_f64_x : RuntimeLibcallImpl<LGAMMA_NXV2F64>;
+  def armpl_svlog10_f32_x : RuntimeLibcallImpl<LOG10_NXV4F32>;
+  def armpl_svlog10_f64_x : RuntimeLibcallImpl<LOG10_NXV2F64>;
+  def armpl_svlog1p_f32_x : RuntimeLibcallImpl<LOG1P_NXV4F32>;
+  def armpl_svlog1p_f64_x : RuntimeLibcallImpl<LOG1P_NXV2F64>;
+  def armpl_svlog2_f32_x : RuntimeLibcallImpl<LOG2_NXV4F32>;
+  def armpl_svlog2_f64_x : RuntimeLibcallImpl<LOG2_NXV2F64>;
+  def armpl_svlog_f32_x : RuntimeLibcallImpl<LOG_NXV4F32>;
+  def armpl_svlog_f64_x : RuntimeLibcallImpl<LOG_NXV2F64>;
+  def armpl_svmodf_f32_x : RuntimeLibcallImpl<MODF_NXV4F32>;
+  def armpl_svmodf_f64_x : RuntimeLibcallImpl<MODF_NXV2F64>;
+  def armpl_svnextafter_f32_x : RuntimeLibcallImpl<NEXTAFTER_NXV4F32>;
+  def armpl_svnextafter_f64_x : RuntimeLibcallImpl<NEXTAFTER_NXV2F64>;
+  def armpl_svpow_f32_x : RuntimeLibcallImpl<POW_NXV4F32>;
+  def armpl_svpow_f64_x : RuntimeLibcallImpl<POW_NXV2F64>;
+  def armpl_svsin_f32_x : RuntimeLibcallImpl<SIN_NXV4F32>;
+  def armpl_svsin_f64_x : RuntimeLibcallImpl<SIN_NXV2F64>;
+  def armpl_svsincos_f32_x : RuntimeLibcallImpl<SINCOS_NXV4F32>;
+  def armpl_svsincos_f64_x : RuntimeLibcallImpl<SINCOS_NXV2F64>;
+  def armpl_svsincospi_f32_x : RuntimeLibcallImpl<SINCOSPI_NXV4F32>;
+  def armpl_svsincospi_f64_x : RuntimeLibcallImpl<SINCOSPI_NXV2F64>;
+  def armpl_svsinh_f32_x : RuntimeLibcallImpl<SINH_NXV4F32>;
+  def armpl_svsinh_f64_x : RuntimeLibcallImpl<SINH_NXV2F64>;
+  def armpl_svsinpi_f32_x : RuntimeLibcallImpl<SINPI_NXV4F32>;
+  def armpl_svsinpi_f64_x : RuntimeLibcallImpl<SINPI_NXV2F64>;
+  def armpl_svsqrt_f32_x : RuntimeLibcallImpl<SQRT_NXV4F32>;
+  def armpl_svsqrt_f64_x : RuntimeLibcallImpl<SQRT_NXV2F64>;
+  def armpl_svtan_f32_x : RuntimeLibcallImpl<TAN_NXV4F32>;
+  def armpl_svtan_f64_x : RuntimeLibcallImpl<TAN_NXV2F64>;
+  def armpl_svtanh_f32_x : RuntimeLibcallImpl<TANH_NXV4F32>;
+  def armpl_svtanh_f64_x : RuntimeLibcallImpl<TANH_NXV2F64>;
+  def armpl_svtgamma_f32_x : RuntimeLibcallImpl<TGAMMA_NXV4F32>;
+  def armpl_svtgamma_f64_x : RuntimeLibcallImpl<TGAMMA_NXV2F64>;
+  def armpl_vacoshq_f32 : RuntimeLibcallImpl<ACOSH_V4F32>;
+  def armpl_vacoshq_f64 : RuntimeLibcallImpl<ACOSH_V2F64>;
+  def armpl_vacosq_f32 : RuntimeLibcallImpl<ACOS_V4F32>;
+  def armpl_vacosq_f64 : RuntimeLibcallImpl<ACOS_V2F64>;
+  def armpl_vasinhq_f32 : RuntimeLibcallImpl<ASINH_V4F32>;
+  def armpl_vasinhq_f64 : RuntimeLibcallImpl<ASINH_V2F64>;
+  def armpl_vasinq_f32 : RuntimeLibcallImpl<ASIN_V4F32>;
+  def armpl_vasinq_f64 : RuntimeLibcallImpl<ASIN_V2F64>;
+  def armpl_vatan2q_f32 : RuntimeLibcallImpl<ATAN2_V4F32>;
+  def armpl_vatan2q_f64 : RuntimeLibcallImpl<ATAN2_V2F64>;
+  def armpl_vatanhq_f32 : RuntimeLibcallImpl<ATANH_V4F32>;
+  def armpl_vatanhq_f64 : RuntimeLibcallImpl<ATANH_V2F64>;
+  def armpl_vatanq_f32 : RuntimeLibcallImpl<ATAN_V4F32>;
+  def armpl_vatanq_f64 : RuntimeLibcallImpl<ATAN_V2F64>;
+  def armpl_vcbrtq_f32 : RuntimeLibcallImpl<CBRT_V4F32>;
+  def armpl_vcbrtq_f64 : RuntimeLibcallImpl<CBRT_V2F64>;
+  def armpl_vcopysignq_f32 : RuntimeLibcallImpl<COPYSIGN_V4F32>;
+  def armpl_vcopysignq_f64 : RuntimeLibcallImpl<COPYSIGN_V2F64>;
+  def armpl_vcoshq_f32 : RuntimeLibcallImpl<COSH_V4F32>;
+  def armpl_vcoshq_f64 : RuntimeLibcallImpl<COSH_V2F64>;
+  def armpl_vcospiq_f32 : RuntimeLibcallImpl<COSPI_V4F32>;
+  def armpl_vcospiq_f64 : RuntimeLibcallImpl<COSPI_V2F64>;
+  def armpl_vcosq_f32 : RuntimeLibcallImpl<COS_V4F32>;
+  def armpl_vcosq_f64 : RuntimeLibcallImpl<COS_V2F64>;
+  def armpl_verfcq_f32 : RuntimeLibcallImpl<ERFC_V4F32>;
+  def armpl_verfcq_f64 : RuntimeLibcallImpl<ERFC_V2F64>;
+  def armpl_verfq_f32 : RuntimeLibcallImpl<ERF_V4F32>;
+  def armpl_verfq_f64 : RuntimeLibcallImpl<ERF_V2F64>;
+  def armpl_vexp10q_f32 : RuntimeLibcallImpl<EXP10_V4F32>;
+  def armpl_vexp10q_f64 : RuntimeLibcallImpl<EXP10_V2F64>;
+  def armpl_vexp2q_f32 : RuntimeLibcallImpl<EXP2_V4F32>;
+  def armpl_vexp2q_f64 : RuntimeLibcallImpl<EXP2_V2F64>;
+  def armpl_vexpm1q_f32 : RuntimeLibcallImpl<EXPM1_V4F32>;
+  def armpl_vexpm1q_f64 : RuntimeLibcallImpl<EXPM1_V2F64>;
+  def armpl_vexpq_f32 : RuntimeLibcallImpl<EXP_V4F32>;
+  def armpl_vexpq_f64 : RuntimeLibcallImpl<EXP_V2F64>;
+  def armpl_vfdimq_f32 : RuntimeLibcallImpl<FDIM_V4F32>;
+  def armpl_vfdimq_f64 : RuntimeLibcallImpl<FDIM_V2F64>;
+  def armpl_vfmaq_f32 : RuntimeLibcallImpl<FMA_V4F32>;
+  def armpl_vfmaq_f64 : RuntimeLibcallImpl<FMA_V2F64>;
+  def armpl_vfmaxq_f32 : RuntimeLibcallImpl<FMAX_V4F32>;
+  def armpl_vfmaxq_f64 : RuntimeLibcallImpl<FMAX_V2F64>;
+  def armpl_vfminq_f32 : RuntimeLibcallImpl<FMIN_V4F32>;
+  def armpl_vfminq_f64 : RuntimeLibcallImpl<FMIN_V2F64>;
+  def armpl_vfmodq_f32 : RuntimeLibcallImpl<FMOD_V4F32>;
+  def armpl_vfmodq_f64 : RuntimeLibcallImpl<FMOD_V2F64>;
+  def armpl_vhypotq_f32 : RuntimeLibcallImpl<HYPOT_V4F32>;
+  def armpl_vhypotq_f64 : RuntimeLibcallImpl<HYPOT_V2F64>;
+  def armpl_vilogbq_f32 : RuntimeLibcallImpl<ILOGB_V4F32>;
+  def armpl_vilogbq_f64 : RuntimeLibcallImpl<ILOGB_V2F64>;
+  def armpl_vldexpq_f32 : RuntimeLibcallImpl<LDEXP_V4F32>;
+  def armpl_vldexpq_f64 : RuntimeLibcallImpl<LDEXP_V2F64>;
+  def armpl_vlgammaq_f32 : RuntimeLibcallImpl<LGAMMA_V4F32>;
+  def armpl_vlgammaq_f64 : RuntimeLibcallImpl<LGAMMA_V2F64>;
+  def armpl_vlog10q_f32 : RuntimeLibcallImpl<LOG10_V4F32>;
+  def armpl_vlog10q_f64 : RuntimeLibcallImpl<LOG10_V2F64>;
+  def armpl_vlog1pq_f32 : RuntimeLibcallImpl<LOG1P_V4F32>;
+  def armpl_vlog1pq_f64 : RuntimeLibcallImpl<LOG1P_V2F64>;
+  def armpl_vlog2q_f32 : RuntimeLibcallImpl<LOG2_V4F32>;
+  def armpl_vlog2q_f64 : RuntimeLibcallImpl<LOG2_V2F64>;
+  def armpl_vlogq_f32 : RuntimeLibcallImpl<LOG_V4F32>;
+  def armpl_vlogq_f64 : RuntimeLibcallImpl<LOG_V2F64>;
+  def armpl_vmodfq_f32 : RuntimeLibcallImpl<MODF_V4F32>;
+  def armpl_vmodfq_f64 : RuntimeLibcallImpl<MODF_V2F64>;
+  def armpl_vnextafterq_f32 : RuntimeLibcallImpl<NEXTAFTER_V4F32>;
+  def armpl_vnextafterq_f64 : RuntimeLibcallImpl<NEXTAFTER_V2F64>;
+  def armpl_vpowq_f32 : RuntimeLibcallImpl<POW_V4F32>;
+  def armpl_vpowq_f64 : RuntimeLibcallImpl<POW_V2F64>;
+  def armpl_vsincospiq_f32 : RuntimeLibcallImpl<SINCOSPI_V4F32>;
+  def armpl_vsincospiq_f64 : RuntimeLibcallImpl<SINCOSPI_V2F64>;
+  def armpl_vsincosq_f32 : RuntimeLibcallImpl<SINCOS_V4F32>;
+  def armpl_vsincosq_f64 : RuntimeLibcallImpl<SINCOS_V2F64>;
+  def armpl_vsinhq_f32 : RuntimeLibcallImpl<SINH_V4F32>;
+  def armpl_vsinhq_f64 : RuntimeLibcallImpl<SINH_V2F64>;
+  def armpl_vsinpiq_f32 : RuntimeLibcallImpl<SINPI_V4F32>;
+  def armpl_vsinpiq_f64 : RuntimeLibcallImpl<SINPI_V2F64>;
+  def armpl_vsinq_f32 : RuntimeLibcallImpl<SIN_V4F32>;
+  def armpl_vsinq_f64 : RuntimeLibcallImpl<SIN_V2F64>;
+  def armpl_vsqrtq_f32 : RuntimeLibcallImpl<SQRT_V4F32>;
+  def armpl_vsqrtq_f64 : RuntimeLibcallImpl<SQRT_V2F64>;
+  def armpl_vtanhq_f32 : RuntimeLibcallImpl<TANH_V4F32>;
+  def armpl_vtanhq_f64 : RuntimeLibcallImpl<TANH_V2F64>;
+  def armpl_vtanq_f32 : RuntimeLibcallImpl<TAN_V4F32>;
+  def armpl_vtanq_f64 : RuntimeLibcallImpl<TAN_V2F64>;
+  def armpl_vtgammaq_f32 : RuntimeLibcallImpl<TGAMMA_V4F32>;
+  def armpl_vtgammaq_f64 : RuntimeLibcallImpl<TGAMMA_V2F64>;
+}
+
+//===----------------------------------------------------------------------===//
+// AMD vector math library (AMDLIBM) functions
+//===----------------------------------------------------------------------===//
+
+defset list<RuntimeLibcallImpl> AMDLIBM_VECFUNCS = { // vrd*: f64, vrs*: f32
+  def amd_vrd2_atan : RuntimeLibcallImpl<ATAN_V2F64>;
+  def amd_vrd2_cbrt : RuntimeLibcallImpl<CBRT_V2F64>;
+  def amd_vrd2_cos : RuntimeLibcallImpl<COS_V2F64>;
+  def amd_vrd2_erf : RuntimeLibcallImpl<ERF_V2F64>;
+  def amd_vrd2_exp : RuntimeLibcallImpl<EXP_V2F64>;
+  def amd_vrd2_exp10 : RuntimeLibcallImpl<EXP10_V2F64>;
+  def amd_vrd2_exp2 : RuntimeLibcallImpl<EXP2_V2F64>;
+  def amd_vrd2_expm1 : RuntimeLibcallImpl<EXPM1_V2F64>;
+  def amd_vrd2_log : RuntimeLibcallImpl<LOG_V2F64>;
+  def amd_vrd2_log10 : RuntimeLibcallImpl<LOG10_V2F64>;
+  def amd_vrd2_log1p : RuntimeLibcallImpl<LOG1P_V2F64>;
+  def amd_vrd2_log2 : RuntimeLibcallImpl<LOG2_V2F64>;
+  def amd_vrd2_pow : RuntimeLibcallImpl<POW_V2F64>;
+  def amd_vrd2_sin : RuntimeLibcallImpl<SIN_V2F64>;
+  def amd_vrd2_tan : RuntimeLibcallImpl<TAN_V2F64>;
+  def amd_vrd4_atan : RuntimeLibcallImpl<ATAN_V4F64>;
+  def amd_vrd4_cos : RuntimeLibcallImpl<COS_V4F64>;
+  def amd_vrd4_erf : RuntimeLibcallImpl<ERF_V4F64>;
+  def amd_vrd4_exp : RuntimeLibcallImpl<EXP_V4F64>;
+  def amd_vrd4_exp2 : RuntimeLibcallImpl<EXP2_V4F64>;
+  def amd_vrd4_log : RuntimeLibcallImpl<LOG_V4F64>;
+  def amd_vrd4_log2 : RuntimeLibcallImpl<LOG2_V4F64>;
+  def amd_vrd4_pow : RuntimeLibcallImpl<POW_V4F64>;
+  def amd_vrd4_sin : RuntimeLibcallImpl<SIN_V4F64>;
+  def amd_vrd4_sincos : RuntimeLibcallImpl<SINCOS_V4F64>;
+  def amd_vrd4_tan : RuntimeLibcallImpl<TAN_V4F64>;
+  def amd_vrd8_asin : RuntimeLibcallImpl<ASIN_V8F64>;
+  def amd_vrd8_atan : RuntimeLibcallImpl<ATAN_V8F64>;
+  def amd_vrd8_cos : RuntimeLibcallImpl<COS_V8F64>;
+  def amd_vrd8_erf : RuntimeLibcallImpl<ERF_V8F64>;
+  def amd_vrd8_exp : RuntimeLibcallImpl<EXP_V8F64>;
+  def amd_vrd8_exp2 : RuntimeLibcallImpl<EXP2_V8F64>;
+  def amd_vrd8_log : RuntimeLibcallImpl<LOG_V8F64>;
+  def amd_vrd8_log2 : RuntimeLibcallImpl<LOG2_V8F64>;
+  def amd_vrd8_pow : RuntimeLibcallImpl<POW_V8F64>;
+  def amd_vrd8_sin : RuntimeLibcallImpl<SIN_V8F64>;
+  def amd_vrd8_sincos : RuntimeLibcallImpl<SINCOS_V8F64>;
+  def amd_vrd8_tan : RuntimeLibcallImpl<TAN_V8F64>;
+  def amd_vrs16_acosf : RuntimeLibcallImpl<ACOS_V16F32>;
+  def amd_vrs16_asinf : RuntimeLibcallImpl<ASIN_V16F32>;
+  def amd_vrs16_atanf : RuntimeLibcallImpl<ATAN_V16F32>;
+  def amd_vrs16_cosf : RuntimeLibcallImpl<COS_V16F32>;
+  def amd_vrs16_erff : RuntimeLibcallImpl<ERF_V16F32>;
+  def amd_vrs16_exp2f : RuntimeLibcallImpl<EXP2_V16F32>;
+  def amd_vrs16_expf : RuntimeLibcallImpl<EXP_V16F32>;
+  def amd_vrs16_log10f : RuntimeLibcallImpl<LOG10_V16F32>;
+  def amd_vrs16_log2f : RuntimeLibcallImpl<LOG2_V16F32>;
+  def amd_vrs16_logf : RuntimeLibcallImpl<LOG_V16F32>;
+  def amd_vrs16_powf : RuntimeLibcallImpl<POW_V16F32>;
+  def amd_vrs16_sincosf : RuntimeLibcallImpl<SINCOS_V16F32>;
+  def amd_vrs16_sinf : RuntimeLibcallImpl<SIN_V16F32>;
+  def amd_vrs16_tanf : RuntimeLibcallImpl<TAN_V16F32>;
+  def amd_vrs16_tanhf : RuntimeLibcallImpl<TANH_V16F32>;
+  def amd_vrs4_acosf : RuntimeLibcallImpl<ACOS_V4F32>;
+  def amd_vrs4_asinf : RuntimeLibcallImpl<ASIN_V4F32>;
+  def amd_vrs4_atanf : RuntimeLibcallImpl<ATAN_V4F32>;
+  def amd_vrs4_cbrtf : RuntimeLibcallImpl<CBRT_V4F32>;
+  def amd_vrs4_cosf : RuntimeLibcallImpl<COS_V4F32>;
+  def amd_vrs4_coshf : RuntimeLibcallImpl<COSH_V4F32>;
+  def amd_vrs4_erff : RuntimeLibcallImpl<ERF_V4F32>;
+  def amd_vrs4_exp10f : RuntimeLibcallImpl<EXP10_V4F32>;
+  def amd_vrs4_exp2f : RuntimeLibcallImpl<EXP2_V4F32>;
+  def amd_vrs4_expf : RuntimeLibcallImpl<EXP_V4F32>;
+  def amd_vrs4_expm1f : RuntimeLibcallImpl<EXPM1_V4F32>;
+  def amd_vrs4_log10f : RuntimeLibcallImpl<LOG10_V4F32>;
+  def amd_vrs4_log1pf : RuntimeLibcallImpl<LOG1P_V4F32>;
+  def amd_vrs4_log2f : RuntimeLibcallImpl<LOG2_V4F32>;
+  def amd_vrs4_logf : RuntimeLibcallImpl<LOG_V4F32>;
+  def amd_vrs4_powf : RuntimeLibcallImpl<POW_V4F32>;
+  def amd_vrs4_sincosf : RuntimeLibcallImpl<SINCOS_V4F32>;
+  def amd_vrs4_sinf : RuntimeLibcallImpl<SIN_V4F32>;
+  def amd_vrs4_tanf : RuntimeLibcallImpl<TAN_V4F32>;
+  def amd_vrs4_tanhf : RuntimeLibcallImpl<TANH_V4F32>;
+  def amd_vrs8_acosf : RuntimeLibcallImpl<ACOS_V8F32>;
+  def amd_vrs8_asinf : RuntimeLibcallImpl<ASIN_V8F32>;
+  def amd_vrs8_atanf : RuntimeLibcallImpl<ATAN_V8F32>;
+  def amd_vrs8_cosf : RuntimeLibcallImpl<COS_V8F32>;
+  def amd_vrs8_coshf : RuntimeLibcallImpl<COSH_V8F32>;
+  def amd_vrs8_erff : RuntimeLibcallImpl<ERF_V8F32>;
+  def amd_vrs8_exp2f : RuntimeLibcallImpl<EXP2_V8F32>;
+  def amd_vrs8_expf : RuntimeLibcallImpl<EXP_V8F32>;
+  def amd_vrs8_log10f : RuntimeLibcallImpl<LOG10_V8F32>;
+  def amd_vrs8_log2f : RuntimeLibcallImpl<LOG2_V8F32>;
+  def amd_vrs8_logf : RuntimeLibcallImpl<LOG_V8F32>;
+  def amd_vrs8_powf : RuntimeLibcallImpl<POW_V8F32>;
+  def amd_vrs8_sincosf : RuntimeLibcallImpl<SINCOS_V8F32>;
+  def amd_vrs8_sinf : RuntimeLibcallImpl<SIN_V8F32>;
+  def amd_vrs8_tanf : RuntimeLibcallImpl<TAN_V8F32>;
+  def amd_vrs8_tanhf : RuntimeLibcallImpl<TANH_V8F32>;
+}
diff --git a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
index 86060d1d2b0b3..288fa10fc04bb 100644
--- a/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
+++ b/llvm/lib/Frontend/Offloading/OffloadWrapper.cpp
@@ -640,15 +640,8 @@ void createRegisterFatbinFunction(Module &M, GlobalVariable *FatbinDesc,
 }
 
 /// SYCLWrapper helper class that creates all LLVM IRs wrapping given images.
-struct SYCLWrapper {
-  Module &M;
-  LLVMContext &C;
-  SYCLJITOptions Options;
-
-  StructType *EntryTy = nullptr;
-  StructType *SyclDeviceImageTy = nullptr;
-  StructType *SyclBinDescTy = nullptr;
-
+class SYCLWrapper {
+public:
   SYCLWrapper(Module &M, const SYCLJITOptions &Options)
       : M(M), C(M.getContext()), Options(Options) {
     EntryTy = offloading::getEntryTy(M);
@@ -656,6 +649,115 @@ struct SYCLWrapper {
     SyclBinDescTy = getSyclBinDescTy();
   }
 
+  /// Creates binary descriptor for the given device images. Binary descriptor
+  /// is an object that is passed to the offloading runtime at program startup
+  /// and it describes all device images available in the executable or shared
+  /// library. It is defined as follows:
+  ///
+  /// \code
+  /// __attribute__((visibility("hidden")))
+  /// __tgt_offload_entry *__sycl_offload_entries_arr0[];
+  /// ...
+  /// __attribute__((visibility("hidden")))
+  /// __tgt_offload_entry *__sycl_offload_entries_arrN[];
+  ///
+  /// __attribute__((visibility("hidden")))
+  /// extern const char *CompileOptions = "...";
+  /// ...
+  /// __attribute__((visibility("hidden")))
+  /// extern const char *LinkOptions = "...";
+  /// ...
+  ///
+  /// static const char Image0[] = { ... };
+  ///  ...
+  /// static const char ImageN[] = { ... };
+  ///
+  /// static const __sycl.tgt_device_image Images[] = {
+  ///   {
+  ///     Version,                                      // Version
+  ///     OffloadKind,                                  // OffloadKind
+  ///     Format,                                       // Format of the image.
+  ///     TripleString,                                 // Arch
+  ///     CompileOptions,                               // CompileOptions
+  ///     LinkOptions,                                  // LinkOptions
+  ///     Image0,                                       // ImageStart
+  ///     Image0 + IMAGE0_SIZE,                         // ImageEnd
+  ///     __sycl_offload_entries_arr0,                  // EntriesBegin
+  ///     __sycl_offload_entries_arr0 + ENTRIES0_SIZE,  // EntriesEnd
+  ///     NULL,                                         // PropertiesBegin
+  ///     NULL,                                         // PropertiesEnd
+  ///   },
+  ///   ...
+  /// };
+  ///
+  /// static const __sycl.tgt_bin_desc FatbinDesc = {
+  ///   Version,                             // Version
+  ///   sizeof(Images) / sizeof(Images[0]),  // NumDeviceImages
+  ///   Images,                              // DeviceImages
+  ///   NULL,                                // HostEntriesBegin
+  ///   NULL                                 // HostEntriesEnd
+  /// };
+  /// \endcode
+  ///
+  /// \returns Global variable that represents FatbinDesc.
+  GlobalVariable *createFatbinDesc(ArrayRef<OffloadFile> OffloadFiles) {
+    StringRef OffloadKindTag = ".sycl_offloading.";
+    SmallVector<Constant *> WrappedImages;
+    WrappedImages.reserve(OffloadFiles.size());
+    for (size_t I = 0, E = OffloadFiles.size(); I != E; ++I)
+      WrappedImages.push_back(
+          wrapImage(*OffloadFiles[I].getBinary(), Twine(I), OffloadKindTag));
+
+    return combineWrappedImages(WrappedImages, OffloadKindTag);
+  }
+
+  /// Emits the internal constructor `sycl.descriptor_reg`, which passes
+  /// \p FatbinDesc to `__sycl_register_lib`; registered as a global ctor.
+  void createRegisterFatbinFunction(GlobalVariable *FatbinDesc) {
+    Type *VoidTy = Type::getVoidTy(C);
+    // Create the internal void() wrapper that performs the registration.
+    Function *Ctor = Function::Create(
+        FunctionType::get(VoidTy, /*isVarArg=*/false),
+        GlobalValue::InternalLinkage, "sycl.descriptor_reg", &M);
+    Ctor->setSection(".text.startup");
+
+    // Declare (or reuse) the runtime entry point: void(ptr).
+    FunctionCallee RegisterLib = M.getOrInsertFunction(
+        "__sycl_register_lib",
+        FunctionType::get(VoidTy, PointerType::getUnqual(C),
+                          /*isVarArg=*/false));
+
+    // Emit `call __sycl_register_lib(FatbinDesc); ret void`.
+    IRBuilder<> Builder(BasicBlock::Create(C, "entry", Ctor));
+    Builder.CreateCall(RegisterLib, FatbinDesc);
+    Builder.CreateRetVoid();
+    appendToGlobalCtors(M, Ctor, /*Priority=*/1);
+  }
+
+  /// Emits the internal destructor `sycl.descriptor_unreg`, which passes
+  /// \p FatbinDesc to `__sycl_unregister_lib`; registered as a global dtor.
+  void createUnregisterFunction(GlobalVariable *FatbinDesc) {
+    Type *VoidTy = Type::getVoidTy(C);
+    // Create the internal void() wrapper that performs the unregistration.
+    Function *Dtor = Function::Create(
+        FunctionType::get(VoidTy, /*isVarArg=*/false),
+        GlobalValue::InternalLinkage, "sycl.descriptor_unreg", &M);
+    Dtor->setSection(".text.startup");
+
+    // Declare (or reuse) the runtime entry point: void(ptr).
+    FunctionCallee UnregisterLib = M.getOrInsertFunction(
+        "__sycl_unregister_lib",
+        FunctionType::get(VoidTy, PointerType::getUnqual(C),
+                          /*isVarArg=*/false));
+
+    // Emit `call __sycl_unregister_lib(FatbinDesc); ret void`.
+    IRBuilder<> Builder(BasicBlock::Create(C, "entry", Dtor));
+    Builder.CreateCall(UnregisterLib, FatbinDesc);
+    Builder.CreateRetVoid();
+    appendToGlobalDtors(M, Dtor, /*Priority=*/1);
+  }
+
+private:
   IntegerType *getSizeTTy() {
     switch (M.getDataLayout().getPointerSize()) {
     case 4:
@@ -678,28 +780,28 @@ struct SYCLWrapper {
   /// SYCL specific image descriptor type.
   /// \code
   /// struct __sycl.tgt_device_image {
-  ///   // version of this structure - for backward compatibility;
+  ///   // Version of this structure - for backward compatibility;
   ///   // all modifications which change order/type/offsets of existing fields
   ///   // should increment the version.
   ///   uint16_t Version;
-  ///   // the kind of offload model the image employs.
+  ///   // The kind of offload model the image employs.
   ///   uint8_t OffloadKind;
-  ///   // format of the image data - SPIRV, LLVMIR bitcode, etc
+  ///   // Format of the image data - SPIRV, LLVMIR bitcode, etc.
   ///   uint8_t Format;
-  ///   // null-terminated string representation of the device's target
-  ///   // architecture
+  ///   // Null-terminated string representation of the device's target
+  ///   // architecture.
   ///   const char *Arch;
-  ///   // a null-terminated string; target- and compiler-specific options
-  ///   // which are suggested to use to "compile" program at runtime
+  ///   // A null-terminated string; target- and compiler-specific options
+  ///   // which are passed to the device compiler at runtime.
   ///   const char *CompileOptions;
-  ///   // a null-terminated string; target- and compiler-specific options
-  ///   // which are suggested to use to "link" program at runtime
+  ///   // A null-terminated string; target- and compiler-specific options
+  ///   // which are passed to the device linker at runtime.
   ///   const char *LinkOptions;
-  ///   // Pointer to the device binary image start
+  ///   // Pointer to the device binary image start.
   ///   void *ImageStart;
-  ///   // Pointer to the device binary image end
+  ///   // Pointer to the device binary image end.
   ///   void *ImageEnd;
-  ///   // the entry table
+  ///   // The entry table.
   ///   __tgt_offload_entry *EntriesBegin;
   ///   __tgt_offload_entry *EntriesEnd;
   ///   const char *PropertiesBegin;
@@ -753,13 +855,14 @@ struct SYCLWrapper {
   GlobalVariable *addGlobalArrayVariable(const Twine &Name,
                                          ArrayRef<char> Initializer,
                                          const Twine &Section = "") {
-    auto *Arr = ConstantDataArray::get(M.getContext(), Initializer);
-    auto *Var = new GlobalVariable(M, Arr->getType(), /*isConstant*/ true,
-                                   GlobalVariable::InternalLinkage, Arr, Name);
+    Constant *Arr = ConstantDataArray::get(M.getContext(), Initializer);
+    GlobalVariable *Var =
+        new GlobalVariable(M, Arr->getType(), /*isConstant*/ true,
+                           GlobalVariable::InternalLinkage, Arr, Name);
     Var->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
 
     SmallVector<char, 32> NameBuf;
-    auto SectionName = Section.toStringRef(NameBuf);
+    StringRef SectionName = Section.toStringRef(NameBuf);
     if (!SectionName.empty())
       Var->setSection(SectionName);
     return Var;
@@ -771,10 +874,10 @@ struct SYCLWrapper {
   std::pair<Constant *, Constant *>
   addArrayToModule(ArrayRef<char> Buf, const Twine &Name,
                    const Twine &Section = "") {
-    auto *Var = addGlobalArrayVariable(Name, Buf, Section);
-    auto *ImageB = ConstantExpr::getGetElementPtr(Var->getValueType(), Var,
-                                                  getSizetConstPair(0, 0));
-    auto *ImageE = ConstantExpr::getGetElementPtr(
+    GlobalVariable *Var = addGlobalArrayVariable(Name, Buf, Section);
+    Constant *ImageB = ConstantExpr::getGetElementPtr(Var->getValueType(), Var,
+                                                      getSizetConstPair(0, 0));
+    Constant *ImageE = ConstantExpr::getGetElementPtr(
         Var->getValueType(), Var, getSizetConstPair(0, Buf.size()));
     return std::make_pair(ImageB, ImageE);
   }
@@ -783,9 +886,9 @@ struct SYCLWrapper {
   /// \returns Constant pointer to the added data. The pointer type does not
   /// carry size information.
   Constant *addRawDataToModule(ArrayRef<char> Data, const Twine &Name) {
-    auto *Var = addGlobalArrayVariable(Name, Data);
-    auto *DataPtr = ConstantExpr::getGetElementPtr(Var->getValueType(), Var,
-                                                   getSizetConstPair(0, 0));
+    GlobalVariable *Var = addGlobalArrayVariable(Name, Data);
+    Constant *DataPtr = ConstantExpr::getGetElementPtr(Var->getValueType(), Var,
+                                                       getSizetConstPair(0, 0));
     return DataPtr;
   }
 
@@ -795,11 +898,12 @@ struct SYCLWrapper {
   /// \returns Link-time constant pointer (constant expr) to that
   /// variable.
   Constant *addStringToModule(StringRef Str, const Twine &Name) {
-    auto *Arr = ConstantDataArray::getString(C, Str);
-    auto *Var = new GlobalVariable(M, Arr->getType(), /*isConstant*/ true,
-                                   GlobalVariable::InternalLinkage, Arr, Name);
+    Constant *Arr = ConstantDataArray::getString(C, Str);
+    GlobalVariable *Var =
+        new GlobalVariable(M, Arr->getType(), /*isConstant*/ true,
+                           GlobalVariable::InternalLinkage, Arr, Name);
     Var->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
-    auto *Zero = ConstantInt::get(getSizeTTy(), 0);
+    ConstantInt *Zero = ConstantInt::get(getSizeTTy(), 0);
     Constant *ZeroZero[] = {Zero, Zero};
     return ConstantExpr::getGetElementPtr(Var->getValueType(), Var, ZeroZero);
   }
@@ -823,15 +927,15 @@ struct SYCLWrapper {
       EntriesInits.push_back(GV->getInitializer());
     }
 
-    auto *Arr = ConstantArray::get(ArrayType::get(EntryTy, EntriesInits.size()),
-                                   EntriesInits);
-    auto *EntriesGV = new GlobalVariable(M, Arr->getType(), /*isConstant*/ true,
-                                         GlobalVariable::InternalLinkage, Arr,
-                                         OffloadKindTag + "entries_arr");
+    Constant *Arr = ConstantArray::get(
+        ArrayType::get(EntryTy, EntriesInits.size()), EntriesInits);
+    GlobalVariable *EntriesGV = new GlobalVariable(
+        M, Arr->getType(), /*isConstant*/ true, GlobalVariable::InternalLinkage,
+        Arr, OffloadKindTag + "entries_arr");
 
-    auto *EntriesB = ConstantExpr::getGetElementPtr(
+    Constant *EntriesB = ConstantExpr::getGetElementPtr(
         EntriesGV->getValueType(), EntriesGV, getSizetConstPair(0, 0));
-    auto *EntriesE = ConstantExpr::getGetElementPtr(
+    Constant *EntriesE = ConstantExpr::getGetElementPtr(
         EntriesGV->getValueType(), EntriesGV,
         getSizetConstPair(0, EntriesInits.size()));
     return std::make_pair(EntriesB, EntriesE);
@@ -871,6 +975,9 @@ struct SYCLWrapper {
     // For SYCL images offload entries are defined here per image.
     std::pair<Constant *, Constant *> ImageEntriesPtrs =
         initOffloadEntriesPerImage(OB.getString("symbols"), OffloadKindTag);
+
+    // .first and .second arguments below correspond to start and end pointers
+    // respectively.
     Constant *WrappedBinary = ConstantStruct::get(
         SyclDeviceImageTy, Version, OffloadKindConstant, ImageKindConstant,
         TripleConstant, CompileOptions, LinkOptions, Binary.first,
@@ -882,23 +989,23 @@ struct SYCLWrapper {
 
   GlobalVariable *combineWrappedImages(ArrayRef<Constant *> WrappedImages,
                                        StringRef OffloadKindTag) {
-    auto *ImagesData = ConstantArray::get(
+    Constant *ImagesData = ConstantArray::get(
         ArrayType::get(SyclDeviceImageTy, WrappedImages.size()), WrappedImages);
-    auto *ImagesGV =
+    GlobalVariable *ImagesGV =
         new GlobalVariable(M, ImagesData->getType(), /*isConstant*/ true,
                            GlobalValue::InternalLinkage, ImagesData,
                            Twine(OffloadKindTag) + "device_images");
     ImagesGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
 
-    auto *Zero = ConstantInt::get(getSizeTTy(), 0);
+    ConstantInt *Zero = ConstantInt::get(getSizeTTy(), 0);
     Constant *ZeroZero[] = {Zero, Zero};
-    auto *ImagesB = ConstantExpr::getGetElementPtr(ImagesGV->getValueType(),
-                                                   ImagesGV, ZeroZero);
+    Constant *ImagesB = ConstantExpr::getGetElementPtr(ImagesGV->getValueType(),
+                                                       ImagesGV, ZeroZero);
 
     Constant *EntriesB = Constant::getNullValue(PointerType::getUnqual(C));
     Constant *EntriesE = Constant::getNullValue(PointerType::getUnqual(C));
     static constexpr uint16_t BinDescStructVersion = 1;
-    auto *DescInit = ConstantStruct::get(
+    Constant *DescInit = ConstantStruct::get(
         SyclBinDescTy,
         ConstantInt::get(Type::getInt16Ty(C), BinDescStructVersion),
         ConstantInt::get(Type::getInt16Ty(C), WrappedImages.size()), ImagesB,
@@ -909,111 +1016,13 @@ struct SYCLWrapper {
                               Twine(OffloadKindTag) + "descriptor");
   }
 
-  /// Creates binary descriptor for the given device images. Binary descriptor
-  /// is an object that is passed to the offloading runtime at program startup
-  /// and it describes all device images available in the executable or shared
-  /// library. It is defined as follows:
-  ///
-  /// \code
-  /// __attribute__((visibility("hidden")))
-  /// __tgt_offload_entry *__sycl_offload_entries_arr0[];
-  /// ...
-  /// __attribute__((visibility("hidden")))
-  /// __tgt_offload_entry *__sycl_offload_entries_arrN[];
-  ///
-  /// __attribute__((visibility("hidden")))
-  /// extern const char *CompileOptions = "...";
-  /// ...
-  /// __attribute__((visibility("hidden")))
-  /// extern const char *LinkOptions = "...";
-  /// ...
-  ///
-  /// static const char Image0[] = { ... };
-  ///  ...
-  /// static const char ImageN[] = { ... };
-  ///
-  /// static const __sycl.tgt_device_image Images[] = {
-  ///   {
-  ///     Version,                                      // Version
-  ///     OffloadKind,                                  // OffloadKind
-  ///     Format,                                       // Format of the image.
-  //      TripleString,                                 // Arch
-  ///     CompileOptions,                               // CompileOptions
-  ///     LinkOptions,                                  // LinkOptions
-  ///     Image0,                                       // ImageStart
-  ///     Image0 + IMAGE0_SIZE,                         // ImageEnd
-  ///     __sycl_offload_entries_arr0,                  // EntriesBegin
-  ///     __sycl_offload_entries_arr0 + ENTRIES0_SIZE,  // EntriesEnd
-  ///     NULL,                                         // PropertiesBegin
-  ///     NULL,                                         // PropertiesEnd
-  ///   },
-  ///   ...
-  /// };
-  ///
-  /// static const __sycl.tgt_bin_desc FatbinDesc = {
-  ///   Version,                             //Version
-  ///   sizeof(Images) / sizeof(Images[0]),  //NumDeviceImages
-  ///   Images,                              //DeviceImages
-  ///   NULL,                                //HostEntriesBegin
-  ///   NULL                                 //HostEntriesEnd
-  /// };
-  /// \endcode
-  ///
-  /// \returns Global variable that represents FatbinDesc.
-  GlobalVariable *createFatbinDesc(ArrayRef<OffloadFile> OffloadFiles) {
-    StringRef OffloadKindTag = ".sycl_offloading.";
-    SmallVector<Constant *> WrappedImages;
-    WrappedImages.reserve(OffloadFiles.size());
-    for (size_t I = 0, E = OffloadFiles.size(); I != E; ++I)
-      WrappedImages.push_back(
-          wrapImage(*OffloadFiles[I].getBinary(), Twine(I), OffloadKindTag));
-
-    return combineWrappedImages(WrappedImages, OffloadKindTag);
-  }
-
-  void createRegisterFatbinFunction(GlobalVariable *FatbinDesc) {
-    auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false);
-    auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage,
-                                  Twine("sycl") + ".descriptor_reg", &M);
-    Func->setSection(".text.startup");
-
-    // Get RegFuncName function declaration.
-    auto *RegFuncTy =
-        FunctionType::get(Type::getVoidTy(C), PointerType::getUnqual(C),
-                          /*isVarArg=*/false);
-    FunctionCallee RegFuncC =
-        M.getOrInsertFunction("__sycl_register_lib", RegFuncTy);
-
-    // Construct function body
-    IRBuilder Builder(BasicBlock::Create(C, "entry", Func));
-    Builder.CreateCall(RegFuncC, FatbinDesc);
-    Builder.CreateRetVoid();
-
-    // Add this function to constructors.
-    appendToGlobalCtors(M, Func, /*Priority*/ 1);
-  }
-
-  void createUnregisterFunction(GlobalVariable *FatbinDesc) {
-    auto *FuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false);
-    auto *Func = Function::Create(FuncTy, GlobalValue::InternalLinkage,
-                                  "sycl.descriptor_unreg", &M);
-    Func->setSection(".text.startup");
-
-    // Get UnregFuncName function declaration.
-    auto *UnRegFuncTy =
-        FunctionType::get(Type::getVoidTy(C), PointerType::getUnqual(C),
-                          /*isVarArg=*/false);
-    FunctionCallee UnRegFuncC =
-        M.getOrInsertFunction("__sycl_unregister_lib", UnRegFuncTy);
-
-    // Construct function body
-    IRBuilder<> Builder(BasicBlock::Create(C, "entry", Func));
-    Builder.CreateCall(UnRegFuncC, FatbinDesc);
-    Builder.CreateRetVoid();
+  Module &M;
+  LLVMContext &C;
+  SYCLJITOptions Options;
 
-    // Add this function to global destructors.
-    appendToGlobalDtors(M, Func, /*Priority*/ 1);
-  }
+  StructType *EntryTy = nullptr;
+  StructType *SyclDeviceImageTy = nullptr;
+  StructType *SyclBinDescTy = nullptr;
 }; // end of SYCLWrapper
 
 } // namespace
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 24604391af33e..5150df5346c47 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -5569,9 +5569,10 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
         Desc.getNumOperands() + Desc.implicit_uses().size();
     const unsigned NumImplicitOps = IsDst ? 2 : 1;
 
-    // Allow additional implicit operands. This allows a fixup done by the post
-    // RA scheduler where the main implicit operand is killed and implicit-defs
-    // are added for sub-registers that remain live after this instruction.
+    // Allow additional implicit operands. This allows a fixup done by the
+    // post RA scheduler where the main implicit operand is killed and
+    // implicit-defs are added for sub-registers that remain live after this
+    // instruction.
     if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
       ErrInfo = "missing implicit register operands";
       return false;
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index ac95ef5f30888..b8c1c261fa6db 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2886,11 +2886,13 @@ static SDValue lowerBUILD_VECTORAsBroadCastLoad(BuildVectorSDNode *BVOp,
 
   if ((ExtType == ISD::EXTLOAD || ExtType == ISD::NON_EXTLOAD) &&
       VT.getScalarSizeInBits() == LN->getMemoryVT().getScalarSizeInBits()) {
-    SDVTList Tys =
-        LN->isIndexed()
-            ? DAG.getVTList(VT, LN->getBasePtr().getValueType(), MVT::Other)
-            : DAG.getVTList(VT, MVT::Other);
-    SDValue Ops[] = {LN->getChain(), LN->getBasePtr(), LN->getOffset()};
+    // Indexed loads and stores are not supported on LoongArch.
+    assert(LN->isUnindexed() && "Unexpected indexed load.");
+
+    SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+    // The offset operand of an unindexed load is always undefined, so there is
+    // no need to pass it to VLDREPL.
+    SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
     SDValue BCast = DAG.getNode(LoongArchISD::VLDREPL, DL, Tys, Ops);
     DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
     return BCast;
diff --git a/llvm/lib/Target/LoongArch/LoongArchSelectionDAGInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchSelectionDAGInfo.cpp
index 11d05042c94f8..c07adfc48a0f5 100644
--- a/llvm/lib/Target/LoongArch/LoongArchSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchSelectionDAGInfo.cpp
@@ -17,13 +17,3 @@ LoongArchSelectionDAGInfo::LoongArchSelectionDAGInfo()
     : SelectionDAGGenTargetInfo(LoongArchGenSDNodeInfo) {}
 
 LoongArchSelectionDAGInfo::~LoongArchSelectionDAGInfo() = default;
-
-void LoongArchSelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG,
-                                                 const SDNode *N) const {
-  switch (N->getOpcode()) {
-  case LoongArchISD::VLDREPL:
-    // invalid number of operands; expected 2, got 3
-    return;
-  }
-  SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N);
-}
diff --git a/llvm/lib/Target/LoongArch/LoongArchSelectionDAGInfo.h b/llvm/lib/Target/LoongArch/LoongArchSelectionDAGInfo.h
index ba5657080b3e4..7210a15297a3e 100644
--- a/llvm/lib/Target/LoongArch/LoongArchSelectionDAGInfo.h
+++ b/llvm/lib/Target/LoongArch/LoongArchSelectionDAGInfo.h
@@ -21,9 +21,6 @@ class LoongArchSelectionDAGInfo : public SelectionDAGGenTargetInfo {
   LoongArchSelectionDAGInfo();
 
   ~LoongArchSelectionDAGInfo() override;
-
-  void verifyTargetNode(const SelectionDAG &DAG,
-                        const SDNode *N) const override;
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index a77eb0240e677..454a237b1be78 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -713,8 +713,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
                        Custom);
   }
 
-  setOperationAction(ISD::BSWAP, MVT::i16, Expand);
-
   setOperationAction(ISD::BR_JT, MVT::Other, Custom);
   setOperationAction(ISD::BRIND, MVT::Other, Expand);
 
@@ -1106,6 +1104,10 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   //   * MVT::Other - internal.addrspace.wrap
   setOperationAction(ISD::INTRINSIC_WO_CHAIN,
                      {MVT::i32, MVT::i128, MVT::v4f32, MVT::Other}, Custom);
+
+  // Custom lowering for bswap
+  setOperationAction(ISD::BSWAP, {MVT::i16, MVT::i32, MVT::i64, MVT::v2i16},
+                     Custom);
 }
 
 TargetLoweringBase::LegalizeTypeAction
@@ -2570,6 +2572,44 @@ static SDValue lowerTcgen05St(SDValue Op, SelectionDAG &DAG) {
   return Tcgen05StNode;
 }
 
+static SDValue lowerBSWAP(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  SDValue Src = Op.getOperand(0);
+  EVT VT = Op.getValueType();
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::i16: {
+    SDValue Extended = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
+    SDValue Swapped =
+        getPRMT(Extended, DAG.getConstant(0, DL, MVT::i32), 0x7701, DL, DAG);
+    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Swapped);
+  }
+  case MVT::i32: {
+    return getPRMT(Src, DAG.getConstant(0, DL, MVT::i32), 0x0123, DL, DAG);
+  }
+  case MVT::v2i16: {
+    SDValue Converted = DAG.getBitcast(MVT::i32, Src);
+    SDValue Swapped =
+        getPRMT(Converted, DAG.getConstant(0, DL, MVT::i32), 0x2301, DL, DAG);
+    return DAG.getNode(ISD::BITCAST, DL, MVT::v2i16, Swapped);
+  }
+  case MVT::i64: {
+    SDValue UnpackSrc =
+        DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, Src);
+    SDValue SwappedLow =
+        getPRMT(UnpackSrc.getValue(0), DAG.getConstant(0, DL, MVT::i32), 0x0123,
+                DL, DAG);
+    SDValue SwappedHigh =
+        getPRMT(UnpackSrc.getValue(1), DAG.getConstant(0, DL, MVT::i32), 0x0123,
+                DL, DAG);
+    return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64,
+                       {SwappedHigh, SwappedLow});
+  }
+  default:
+    llvm_unreachable("unsupported type for bswap");
+  }
+}
+
 static unsigned getTcgen05MMADisableOutputLane(unsigned IID) {
   switch (IID) {
   case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
@@ -3193,7 +3233,8 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return lowerCTLZCTPOP(Op, DAG);
   case ISD::FREM:
     return lowerFREM(Op, DAG);
-
+  case ISD::BSWAP:
+    return lowerBSWAP(Op, DAG);
   default:
     llvm_unreachable("Custom lowering not defined for operation");
   }
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 8b129e7e5eeae..04e2dd435cdf0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2468,38 +2468,6 @@ let Predicates = [hasPTX<73>, hasSM<52>] in {
 
 include "NVPTXIntrinsics.td"
 
-//-----------------------------------
-// Notes
-//-----------------------------------
-// BSWAP is currently expanded. The following is a more efficient
-// - for < sm_20, use vector scalar mov, as tesla support native 16-bit register
-// - for sm_20, use pmpt (use vector scalar mov to get the pack and
-//   unpack). sm_20 supports native 32-bit register, but not native 16-bit
-// register.
-
-def : Pat <
-  (i32 (bswap i32:$a)),
-  (PRMT_B32rii $a, (i32 0), (i32 0x0123), PrmtNONE)>;
-
-def : Pat <
-  (v2i16 (bswap v2i16:$a)),
-  (PRMT_B32rii $a, (i32 0), (i32 0x2301), PrmtNONE)>;
-
-def : Pat <
-  (i64 (bswap i64:$a)),
-  (V2I32toI64
-    (PRMT_B32rii (I64toI32H_Sink $a), (i32 0), (i32 0x0123), PrmtNONE),
-    (PRMT_B32rii (I64toI32L_Sink $a), (i32 0), (i32 0x0123), PrmtNONE))>,
-  Requires<[hasPTX<71>]>;
-
-// Fall back to the old way if we don't have PTX 7.1.
-def : Pat <
-  (i64 (bswap i64:$a)),
-  (V2I32toI64
-    (PRMT_B32rii (I64toI32H $a), (i32 0), (i32 0x0123), PrmtNONE),
-    (PRMT_B32rii (I64toI32L $a), (i32 0), (i32 0x0123), PrmtNONE))>;
-
-
 ////////////////////////////////////////////////////////////////////////////////
 // PTX Fence instructions
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index e5819d90526d9..2ddc9b0adb9e1 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -838,16 +838,23 @@ class VSETVLIInfo {
   /// @{
   void print(raw_ostream &OS) const {
     OS << "{";
-    if (!isValid())
+    switch (State) {
+    case Uninitialized:
       OS << "Uninitialized";
-    if (isUnknown())
+      break;
+    case Unknown:
       OS << "unknown";
-    if (hasAVLReg())
+      break;
+    case AVLIsReg:
       OS << "AVLReg=" << llvm::printReg(getAVLReg());
-    if (hasAVLImm())
+      break;
+    case AVLIsImm:
       OS << "AVLImm=" << (unsigned)AVLImm;
-    if (hasAVLVLMAX())
+      break;
+    case AVLIsVLMAX:
       OS << "AVLVLMAX";
+      break;
+    }
     OS << ", ";
 
     unsigned LMul;
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index d8620b9865508..8d1820cac0ee5 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -2271,20 +2271,81 @@ StringMap<bool> sys::getHostCPUFeatures() {
   uint32_t Sha2 = CAP_SHA1 | CAP_SHA2;
   Features["aes"] = (crypto & Aes) == Aes;
   Features["sha2"] = (crypto & Sha2) == Sha2;
+
+  // Even if an underlying core supports SVE, it might not be available if
+  // it's disabled by the OS, or some other layer. Disable SVE if we don't
+  // detect support at runtime.
+  if (!Features.contains("sve"))
+    Features["sve"] = false;
 #endif
 
   return Features;
 }
 #elif defined(_WIN32) && (defined(__aarch64__) || defined(_M_ARM64) ||         \
                           defined(__arm64ec__) || defined(_M_ARM64EC))
+#ifndef PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE
+#define PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE 43
+#endif
+#ifndef PF_ARM_V83_JSCVT_INSTRUCTIONS_AVAILABLE
+#define PF_ARM_V83_JSCVT_INSTRUCTIONS_AVAILABLE 44
+#endif
+#ifndef PF_ARM_V83_LRCPC_INSTRUCTIONS_AVAILABLE
+#define PF_ARM_V83_LRCPC_INSTRUCTIONS_AVAILABLE 45
+#endif
+#ifndef PF_ARM_SVE_INSTRUCTIONS_AVAILABLE
+#define PF_ARM_SVE_INSTRUCTIONS_AVAILABLE 46
+#endif
+#ifndef PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE
+#define PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE 47
+#endif
+#ifndef PF_ARM_SVE_PMULL128_INSTRUCTIONS_AVAILABLE
+#define PF_ARM_SVE_PMULL128_INSTRUCTIONS_AVAILABLE 50
+#endif
+#ifndef PF_ARM_SVE_SHA3_INSTRUCTIONS_AVAILABLE
+#define PF_ARM_SVE_SHA3_INSTRUCTIONS_AVAILABLE 55
+#endif
+#ifndef PF_ARM_SVE_SM4_INSTRUCTIONS_AVAILABLE
+#define PF_ARM_SVE_SM4_INSTRUCTIONS_AVAILABLE 56
+#endif
+#ifndef PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE
+#define PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE 57
+#endif
+#ifndef PF_ARM_SVE_F32MM_INSTRUCTIONS_AVAILABLE
+#define PF_ARM_SVE_F32MM_INSTRUCTIONS_AVAILABLE 58
+#endif
+#ifndef PF_ARM_SVE_F64MM_INSTRUCTIONS_AVAILABLE
+#define PF_ARM_SVE_F64MM_INSTRUCTIONS_AVAILABLE 59
+#endif
 StringMap<bool> sys::getHostCPUFeatures() {
   StringMap<bool> Features;
 
   // If we're asking the OS at runtime, believe what the OS says
-  Features["neon"] =
-      IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE);
   Features["crc"] =
       IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE);
+  Features["lse"] =
+      IsProcessorFeaturePresent(PF_ARM_V81_ATOMIC_INSTRUCTIONS_AVAILABLE);
+  Features["dotprod"] =
+      IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE);
+  Features["jsconv"] =
+      IsProcessorFeaturePresent(PF_ARM_V83_JSCVT_INSTRUCTIONS_AVAILABLE);
+  Features["rcpc"] =
+      IsProcessorFeaturePresent(PF_ARM_V83_LRCPC_INSTRUCTIONS_AVAILABLE);
+  Features["sve"] =
+      IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE);
+  Features["sve2"] =
+      IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE);
+  Features["sve-aes"] =
+      IsProcessorFeaturePresent(PF_ARM_SVE_PMULL128_INSTRUCTIONS_AVAILABLE);
+  Features["sve-sha3"] =
+      IsProcessorFeaturePresent(PF_ARM_SVE_SHA3_INSTRUCTIONS_AVAILABLE);
+  Features["sve-sm4"] =
+      IsProcessorFeaturePresent(PF_ARM_SVE_SM4_INSTRUCTIONS_AVAILABLE);
+  Features["f32mm"] =
+      IsProcessorFeaturePresent(PF_ARM_SVE_F32MM_INSTRUCTIONS_AVAILABLE);
+  Features["f64mm"] =
+      IsProcessorFeaturePresent(PF_ARM_SVE_F64MM_INSTRUCTIONS_AVAILABLE);
+  Features["i8mm"] =
+      IsProcessorFeaturePresent(PF_ARM_SVE_I8MM_INSTRUCTIONS_AVAILABLE);
 
   // Avoid inferring "crypto" means more than the traditional AES + SHA2
   bool TradCrypto =
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7ac132a99fbec..a63956c0cba6b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9257,6 +9257,7 @@ static InstructionCost calculateEarlyExitCost(VPCostContext &CostCtx,
 ///  2. In the case of loops with uncountable early exits, we may have to do
 ///     extra work when exiting the loop early, such as calculating the final
 ///     exit values of variables used outside the loop.
+///  3. The middle block, if expected TC <= VF.Width.
 static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
                                         VectorizationFactor &VF, Loop *L,
                                         PredicatedScalarEvolution &PSE,
@@ -9271,6 +9272,14 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
   // one exists.
   TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF.Width);
 
+  // If the expected trip count is at most the VF, the vector loop will only
+  // execute a single iteration. Then the middle block is executed the same
+  // number of times as the vector region.
+  // TODO: Extend logic to always account for the cost of the middle block.
+  auto ExpectedTC = getSmallBestKnownTC(PSE, L);
+  if (ExpectedTC && ElementCount::isKnownLE(*ExpectedTC, VF.Width))
+    TotalCost += Plan.getMiddleBlock()->cost(VF.Width, CostCtx);
+
   // When interleaving only scalar and vector cost will be equal, which in turn
   // would lead to a divide by 0. Fall back to hard threshold.
   if (VF.Width.isScalar()) {
@@ -9301,9 +9310,11 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
   //  The total cost of the vector loop is
   //    RtC + VecC * (TC / VF) + EpiC
   //  where
-  //  * RtC is the cost of the generated runtime checks plus the cost of
-  //    performing any additional work in the vector.early.exit block for loops
-  //    with uncountable early exits.
+  //  * RtC is the sum of the costs of
+  //    - the generated runtime checks,
+  //    - performing any additional work in the vector.early.exit block for
+  //      loops with uncountable early exits, and
+  //    - the middle block, if ExpectedTC <= VF.Width.
   //  * VecC is the cost of a single vector iteration.
   //  * TC is the actual trip count of the loop
   //  * VF is the vectorization factor
diff --git a/llvm/test/CodeGen/NVPTX/bswap.ll b/llvm/test/CodeGen/NVPTX/bswap.ll
index e3d1c80922609..8050c6f1c7031 100644
--- a/llvm/test/CodeGen/NVPTX/bswap.ll
+++ b/llvm/test/CodeGen/NVPTX/bswap.ll
@@ -1,25 +1,18 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx70 | FileCheck -check-prefixes CHECK,PTX70 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
-; RUN: %if ptxas-isa-7.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx70 | %ptxas-verify %}
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | FileCheck -check-prefixes CHECK,PTX71 %s
-; RUN: %if ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -mattr=+ptx71 | %ptxas-verify %}
 
 target triple = "nvptx64-nvidia-cuda"
 
 define i16 @bswap16(i16 %a) {
 ; CHECK-LABEL: bswap16(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b16 %rs1, [bswap16_param_0];
-; CHECK-NEXT:    shr.u16 %rs2, %rs1, 8;
-; CHECK-NEXT:    shl.b16 %rs3, %rs1, 8;
-; CHECK-NEXT:    or.b16 %rs4, %rs3, %rs2;
-; CHECK-NEXT:    cvt.u32.u16 %r1, %rs4;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ld.param.b16 %r1, [bswap16_param_0];
+; CHECK-NEXT:    prmt.b32 %r2, %r1, 0, 0x7701U;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
 ; CHECK-NEXT:    ret;
   %b = tail call i16 @llvm.bswap.i16(i16 %a)
   ret i16 %b
@@ -56,40 +49,39 @@ define <2 x i16> @bswapv2i16(<2 x i16> %a) #0 {
 }
 
 define i64 @bswap64(i64 %a) {
-; PTX70-LABEL: bswap64(
-; PTX70:       {
-; PTX70-NEXT:    .reg .b32 %r<5>;
-; PTX70-NEXT:    .reg .b64 %rd<3>;
-; PTX70-EMPTY:
-; PTX70-NEXT:  // %bb.0:
-; PTX70-NEXT:    ld.param.b64 %rd1, [bswap64_param_0];
-; PTX70-NEXT:    { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
-; PTX70-NEXT:    prmt.b32 %r2, %r1, 0, 0x123U;
-; PTX70-NEXT:    { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; }
-; PTX70-NEXT:    prmt.b32 %r4, %r3, 0, 0x123U;
-; PTX70-NEXT:    mov.b64 %rd2, {%r4, %r2};
-; PTX70-NEXT:    st.param.b64 [func_retval0], %rd2;
-; PTX70-NEXT:    ret;
-;
-; PTX71-LABEL: bswap64(
-; PTX71:       {
-; PTX71-NEXT:    .reg .b32 %r<5>;
-; PTX71-NEXT:    .reg .b64 %rd<3>;
-; PTX71-EMPTY:
-; PTX71-NEXT:  // %bb.0:
-; PTX71-NEXT:    ld.param.b64 %rd1, [bswap64_param_0];
-; PTX71-NEXT:    mov.b64 {%r1, _}, %rd1;
-; PTX71-NEXT:    prmt.b32 %r2, %r1, 0, 0x123U;
-; PTX71-NEXT:    mov.b64 {_, %r3}, %rd1;
-; PTX71-NEXT:    prmt.b32 %r4, %r3, 0, 0x123U;
-; PTX71-NEXT:    mov.b64 %rd2, {%r4, %r2};
-; PTX71-NEXT:    st.param.b64 [func_retval0], %rd2;
-; PTX71-NEXT:    ret;
+; CHECK-LABEL: bswap64(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.b64 %rd1, [bswap64_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NEXT:    prmt.b32 %r3, %r1, 0, 0x123U;
+; CHECK-NEXT:    prmt.b32 %r4, %r2, 0, 0x123U;
+; CHECK-NEXT:    mov.b64 %rd2, {%r4, %r3};
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
+; CHECK-NEXT:    ret;
   %b = tail call i64 @llvm.bswap.i64(i64 %a)
   ret i64 %b
 }
 
+define <2 x i32> @bswapv2i32(<2 x i32> %a) {
+; CHECK-LABEL: bswapv2i32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [bswapv2i32_param_0];
+; CHECK-NEXT:    prmt.b32 %r3, %r2, 0, 0x123U;
+; CHECK-NEXT:    prmt.b32 %r4, %r1, 0, 0x123U;
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
+; CHECK-NEXT:    ret;
+  %b = tail call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %a)
+  ret <2 x i32> %b
+}
 declare i16 @llvm.bswap.i16(i16)
 declare i32 @llvm.bswap.i32(i32)
 declare <2 x i16> @llvm.bswap.v2i16(<2 x i16>)
 declare i64 @llvm.bswap.i64(i64)
+declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>)
diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll
index 803c472a35a51..f44d3008cbaa5 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll
@@ -569,53 +569,36 @@ define double @test_load_used_by_other_load_scev_low_trip_count(ptr %ptr.a, ptr
 ; I64-NEXT:  [[ENTRY:.*]]:
 ; I64-NEXT:    br label %[[OUTER_LOOP:.*]]
 ; I64:       [[OUTER_LOOP_LOOPEXIT:.*]]:
+; I64-NEXT:    [[RESULT_LCSSA:%.*]] = phi double [ [[RESULT:%.*]], %[[INNER_LOOP:.*]] ]
 ; I64-NEXT:    br label %[[OUTER_LOOP]]
 ; I64:       [[OUTER_LOOP]]:
-; I64-NEXT:    [[ACCUM:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP29:%.*]], %[[OUTER_LOOP_LOOPEXIT]] ]
+; I64-NEXT:    [[ACCUM:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[RESULT_LCSSA]], %[[OUTER_LOOP_LOOPEXIT]] ]
 ; I64-NEXT:    [[COND:%.*]] = call i1 @cond()
 ; I64-NEXT:    br i1 [[COND]], label %[[INNER_LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
 ; I64:       [[INNER_LOOP_PREHEADER]]:
-; I64-NEXT:    br label %[[VECTOR_PH:.*]]
-; I64:       [[VECTOR_PH]]:
-; I64-NEXT:    br label %[[VECTOR_BODY:.*]]
-; I64:       [[VECTOR_BODY]]:
-; I64-NEXT:    [[TMP0:%.*]] = add i64 0, 1
-; I64-NEXT:    [[TMP1:%.*]] = add i64 1, 1
-; I64-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[PTR_C]], i64 [[TMP0]]
+; I64-NEXT:    br label %[[INNER_LOOP]]
+; I64:       [[INNER_LOOP]]:
+; I64-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[INNER_LOOP]] ], [ 0, %[[INNER_LOOP_PREHEADER]] ]
+; I64-NEXT:    [[ACCUM_INNER:%.*]] = phi double [ [[MUL1:%.*]], %[[INNER_LOOP]] ], [ [[ACCUM]], %[[INNER_LOOP_PREHEADER]] ]
+; I64-NEXT:    [[TMP1:%.*]] = add i64 [[IV]], 1
 ; I64-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[PTR_C]], i64 [[TMP1]]
-; I64-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[PTR_A]], i64 [[TMP0]]
 ; I64-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[PTR_A]], i64 [[TMP1]]
-; I64-NEXT:    [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8
 ; I64-NEXT:    [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8
-; I64-NEXT:    [[TMP8:%.*]] = getelementptr double, ptr [[PTR_B]], i64 [[TMP6]]
 ; I64-NEXT:    [[TMP9:%.*]] = getelementptr double, ptr [[PTR_B]], i64 [[TMP7]]
 ; I64-NEXT:    [[TMP10:%.*]] = load double, ptr [[PTR_A]], align 8
-; I64-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP10]], i64 0
-; I64-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
-; I64-NEXT:    [[TMP11:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], zeroinitializer
-; I64-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[TMP2]], i64 8
+; I64-NEXT:    [[ADD1:%.*]] = fadd double [[TMP10]], 0.000000e+00
 ; I64-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8
-; I64-NEXT:    [[TMP14:%.*]] = load double, ptr [[TMP12]], align 8
 ; I64-NEXT:    [[TMP15:%.*]] = load double, ptr [[TMP13]], align 8
-; I64-NEXT:    [[TMP16:%.*]] = insertelement <2 x double> poison, double [[TMP14]], i32 0
-; I64-NEXT:    [[TMP17:%.*]] = insertelement <2 x double> [[TMP16]], double [[TMP15]], i32 1
-; I64-NEXT:    [[TMP18:%.*]] = fmul <2 x double> [[TMP11]], zeroinitializer
-; I64-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x double> poison, double [[ACCUM]], i64 0
-; I64-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT1]], <2 x double> poison, <2 x i32> zeroinitializer
-; I64-NEXT:    [[TMP19:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLAT2]], <2 x double> [[TMP18]], <2 x i32> <i32 1, i32 2>
-; I64-NEXT:    [[TMP20:%.*]] = fmul <2 x double> [[TMP17]], zeroinitializer
-; I64-NEXT:    [[TMP21:%.*]] = fadd <2 x double> [[TMP20]], zeroinitializer
-; I64-NEXT:    [[TMP22:%.*]] = fadd <2 x double> [[TMP21]], splat (double 1.000000e+00)
-; I64-NEXT:    [[TMP23:%.*]] = load double, ptr [[TMP8]], align 8
+; I64-NEXT:    [[MUL1]] = fmul double [[ADD1]], 0.000000e+00
+; I64-NEXT:    [[MUL2:%.*]] = fmul double [[TMP15]], 0.000000e+00
+; I64-NEXT:    [[ADD2:%.*]] = fadd double [[MUL2]], 0.000000e+00
+; I64-NEXT:    [[ADD3:%.*]] = fadd double [[ADD2]], 1.000000e+00
 ; I64-NEXT:    [[TMP24:%.*]] = load double, ptr [[TMP9]], align 8
-; I64-NEXT:    [[TMP25:%.*]] = insertelement <2 x double> poison, double [[TMP23]], i32 0
-; I64-NEXT:    [[TMP26:%.*]] = insertelement <2 x double> [[TMP25]], double [[TMP24]], i32 1
-; I64-NEXT:    [[TMP27:%.*]] = fdiv <2 x double> [[TMP26]], [[TMP22]]
-; I64-NEXT:    [[TMP28:%.*]] = fsub <2 x double> [[TMP19]], [[TMP27]]
-; I64-NEXT:    br label %[[MIDDLE_BLOCK:.*]]
-; I64:       [[MIDDLE_BLOCK]]:
-; I64-NEXT:    [[TMP29]] = extractelement <2 x double> [[TMP28]], i32 1
-; I64-NEXT:    br label %[[OUTER_LOOP_LOOPEXIT]]
+; I64-NEXT:    [[DIV:%.*]] = fdiv double [[TMP24]], [[ADD3]]
+; I64-NEXT:    [[RESULT]] = fsub double [[ACCUM_INNER]], [[DIV]]
+; I64-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; I64-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1
+; I64-NEXT:    br i1 [[EXITCOND]], label %[[OUTER_LOOP_LOOPEXIT]], label %[[INNER_LOOP]]
 ; I64:       [[EXIT]]:
 ; I64-NEXT:    ret double [[ACCUM]]
 ;
diff --git a/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn
index 9b524e2ef7cd5..66dbf6152472a 100644
--- a/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn
@@ -29,8 +29,6 @@ static_library("Driver") {
   sources = [
     "Action.cpp",
     "Compilation.cpp",
-    "CreateASTUnitFromArgs.cpp",
-    "CreateInvocationFromArgs.cpp",
     "Distro.cpp",
     "Driver.cpp",
     "Job.cpp",
diff --git a/llvm/utils/gn/secondary/clang/lib/Frontend/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Frontend/BUILD.gn
index cdf39d645bc52..4009cfc609f4a 100644
--- a/llvm/utils/gn/secondary/clang/lib/Frontend/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Frontend/BUILD.gn
@@ -28,6 +28,7 @@ static_library("Frontend") {
     "ChainedIncludesSource.cpp",
     "CompilerInstance.cpp",
     "CompilerInvocation.cpp",
+    "CreateInvocationFromCommandLine.cpp",
     "DependencyFile.cpp",
     "DependencyGraph.cpp",
     "DiagnosticRenderer.cpp",
@@ -47,7 +48,6 @@ static_library("Frontend") {
     "SARIFDiagnosticPrinter.cpp",
     "SerializedDiagnosticPrinter.cpp",
     "SerializedDiagnosticReader.cpp",
-    "StandaloneDiagnostic.cpp",
     "TestModuleFileExtension.cpp",
     "TextDiagnostic.cpp",
     "TextDiagnosticBuffer.cpp",
diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
index 82fe916645635..3985e4b73fcfc 100644
--- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
+++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn
@@ -1216,7 +1216,6 @@ if (current_toolchain == default_toolchain) {
       "__mdspan/mdspan.h",
       "__memory/addressof.h",
       "__memory/align.h",
-      "__memory/aligned_alloc.h",
       "__memory/allocate_at_least.h",
       "__memory/allocation_guard.h",
       "__memory/allocator.h",
diff --git a/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp b/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp
index 699edb188a70a..81fbdb1611deb 100644
--- a/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp
+++ b/mlir/lib/Conversion/ArithToAPFloat/ArithToAPFloat.cpp
@@ -41,24 +41,15 @@ static FuncOp createFnDecl(OpBuilder &b, SymbolOpInterface symTable,
 }
 
 /// Helper function to look up or create the symbol for a runtime library
-/// function for a binary arithmetic operation.
-///
-/// Parameter 1: APFloat semantics
-/// Parameter 2: Left-hand side operand
-/// Parameter 3: Right-hand side operand
-///
-/// This function will return a failure if the function is found but has an
-/// unexpected signature.
-///
+/// function with the given parameter types. Always returns an int64_t.
 static FailureOr<FuncOp>
-lookupOrCreateBinaryFn(OpBuilder &b, SymbolOpInterface symTable, StringRef name,
-                       SymbolTableCollection *symbolTables = nullptr) {
-  auto i32Type = IntegerType::get(symTable->getContext(), 32);
+lookupOrCreateApFloatFn(OpBuilder &b, SymbolOpInterface symTable,
+                        StringRef name, TypeRange paramTypes,
+                        SymbolTableCollection *symbolTables = nullptr) {
   auto i64Type = IntegerType::get(symTable->getContext(), 64);
 
   std::string funcName = (llvm::Twine("_mlir_apfloat_") + name).str();
-  FunctionType funcT =
-      FunctionType::get(b.getContext(), {i32Type, i64Type, i64Type}, {i64Type});
+  auto funcT = FunctionType::get(b.getContext(), paramTypes, {i64Type});
   FailureOr<FuncOp> func =
       lookupFnDecl(symTable, funcName, funcT, symbolTables);
   // Failed due to type mismatch.
@@ -72,6 +63,31 @@ lookupOrCreateBinaryFn(OpBuilder &b, SymbolOpInterface symTable, StringRef name,
                       /*setPrivate=*/true, symbolTables);
 }
 
+/// Looks up the runtime library function `_mlir_apfloat_<name>` for a binary
+/// arithmetic operation, creating its declaration if none is found.
+///
+/// Parameter 1: APFloat semantics (i32)
+/// Parameter 2: Left-hand side operand (i64)
+/// Parameter 3: Right-hand side operand (i64)
+///
+/// Fails if a function with that name is found but has an unexpected
+/// signature.
+///
+static FailureOr<FuncOp>
+lookupOrCreateBinaryFn(OpBuilder &b, SymbolOpInterface symTable, StringRef name,
+                       SymbolTableCollection *symbolTables = nullptr) {
+  Type semTy = IntegerType::get(symTable->getContext(), 32);
+  Type bitsTy = IntegerType::get(symTable->getContext(), 64);
+  return lookupOrCreateApFloatFn(b, symTable, name, {semTy, bitsTy, bitsTy},
+                                 symbolTables);
+}
+
+/// Returns an i32 constant holding the APFloat semantics enum of `floatTy`.
+static Value getSemanticsValue(OpBuilder &b, Location loc, FloatType floatTy) {
+  int32_t sem = llvm::APFloatBase::SemanticsToEnum(floatTy.getFloatSemantics());
+  return arith::ConstantOp::create(b, loc, b.getI32IntegerAttr(sem));
+}
+
 /// Rewrite a binary arithmetic operation to an APFloat function call.
 template <typename OpTy>
 struct BinaryArithOpToAPFloatConversion final : OpRewritePattern<OpTy> {
@@ -104,11 +120,7 @@ struct BinaryArithOpToAPFloatConversion final : OpRewritePattern<OpTy> {
         arith::BitcastOp::create(rewriter, loc, intWType, op.getRhs()));
 
     // Call APFloat function.
-    int32_t sem =
-        llvm::APFloatBase::SemanticsToEnum(floatTy.getFloatSemantics());
-    Value semValue = arith::ConstantOp::create(
-        rewriter, loc, rewriter.getI32Type(),
-        rewriter.getIntegerAttr(rewriter.getI32Type(), sem));
+    Value semValue = getSemanticsValue(rewriter, loc, floatTy);
     SmallVector<Value> params = {semValue, lhsBits, rhsBits};
     auto resultOp =
         func::CallOp::create(rewriter, loc, TypeRange(rewriter.getI64Type()),
@@ -126,6 +138,216 @@ struct BinaryArithOpToAPFloatConversion final : OpRewritePattern<OpTy> {
   const char *APFloatName;
 };
 
+/// Rewrite a float-to-float cast (arith.extf, arith.truncf) to an APFloat
+/// function call.
+template <typename OpTy>
+struct FpToFpConversion final : OpRewritePattern<OpTy> {
+  FpToFpConversion(MLIRContext *context, SymbolOpInterface symTable,
+                   PatternBenefit benefit = 1)
+      : OpRewritePattern<OpTy>(context, benefit), symTable(symTable) {}
+
+  LogicalResult matchAndRewrite(OpTy op,
+                                PatternRewriter &rewriter) const override {
+    // The bit patterns are passed through i64 values, so only scalar floats
+    // of at most 64 bits can be handled.
+    auto inFloatTy = dyn_cast<FloatType>(op.getOperand().getType());
+    auto outFloatTy = dyn_cast<FloatType>(op.getType());
+    if (!inFloatTy || !outFloatTy || inFloatTy.getWidth() > 64 ||
+        outFloatTy.getWidth() > 64)
+      return rewriter.notifyMatchFailure(
+          op, "only scalar floats of at most 64 bits are supported");
+
+    // Get APFloat function from runtime library.
+    auto i32Type = IntegerType::get(symTable->getContext(), 32);
+    auto i64Type = IntegerType::get(symTable->getContext(), 64);
+    FailureOr<FuncOp> fn = lookupOrCreateApFloatFn(
+        rewriter, symTable, "convert", {i32Type, i32Type, i64Type});
+    if (failed(fn))
+      return fn;
+
+    rewriter.setInsertionPoint(op);
+    // Cast the operand to a 64-bit integer. arith.extui requires a strictly
+    // wider result type, so a 64-bit operand is only bitcast.
+    Location loc = op.getLoc();
+    auto inIntWType = rewriter.getIntegerType(inFloatTy.getWidth());
+    Value operandBits =
+        arith::BitcastOp::create(rewriter, loc, inIntWType, op.getOperand());
+    if (inFloatTy.getWidth() < 64)
+      operandBits = arith::ExtUIOp::create(rewriter, loc, i64Type, operandBits);
+
+    // Call APFloat function.
+    Value inSemValue = getSemanticsValue(rewriter, loc, inFloatTy);
+    Value outSemValue = getSemanticsValue(rewriter, loc, outFloatTy);
+    SmallVector<Value> params = {inSemValue, outSemValue, operandBits};
+    auto resultOp =
+        func::CallOp::create(rewriter, loc, TypeRange(rewriter.getI64Type()),
+                             SymbolRefAttr::get(*fn), params);
+
+    // Truncate result to the original width. arith.trunci requires a strictly
+    // narrower result type, so a 64-bit result is only bitcast.
+    Value resultBits = resultOp->getResult(0);
+    if (outFloatTy.getWidth() < 64) {
+      auto outIntWType = rewriter.getIntegerType(outFloatTy.getWidth());
+      resultBits =
+          arith::TruncIOp::create(rewriter, loc, outIntWType, resultBits);
+    }
+    rewriter.replaceOp(
+        op, arith::BitcastOp::create(rewriter, loc, outFloatTy, resultBits));
+    return success();
+  }
+
+  SymbolOpInterface symTable;
+};
+
+/// Rewrite a float-to-integer cast (arith.fptosi, arith.fptoui) to an APFloat
+/// function call.
+template <typename OpTy>
+struct FpToIntConversion final : OpRewritePattern<OpTy> {
+  FpToIntConversion(MLIRContext *context, SymbolOpInterface symTable,
+                    bool isUnsigned, PatternBenefit benefit = 1)
+      : OpRewritePattern<OpTy>(context, benefit), symTable(symTable),
+        isUnsigned(isUnsigned) {}
+
+  LogicalResult matchAndRewrite(OpTy op,
+                                PatternRewriter &rewriter) const override {
+    // Operand and result are passed through i64 values, so only scalar types
+    // of at most 64 bits can be handled.
+    auto inFloatTy = dyn_cast<FloatType>(op.getOperand().getType());
+    auto outIntTy = dyn_cast<IntegerType>(op.getType());
+    if (!inFloatTy || !outIntTy)
+      return rewriter.notifyMatchFailure(op, "only scalars are supported");
+    if (inFloatTy.getWidth() > 64)
+      return rewriter.notifyMatchFailure(
+          op, "operand type > 64 bits is not supported");
+    if (outIntTy.getWidth() > 64)
+      return rewriter.notifyMatchFailure(
+          op, "result type > 64 bits is not supported");
+
+    // Get APFloat function from runtime library.
+    auto i1Type = IntegerType::get(symTable->getContext(), 1);
+    auto i32Type = IntegerType::get(symTable->getContext(), 32);
+    auto i64Type = IntegerType::get(symTable->getContext(), 64);
+    FailureOr<FuncOp> fn =
+        lookupOrCreateApFloatFn(rewriter, symTable, "convert_to_int",
+                                {i32Type, i32Type, i1Type, i64Type});
+    if (failed(fn))
+      return fn;
+
+    rewriter.setInsertionPoint(op);
+    // Cast the operand to a 64-bit integer. arith.extui requires a strictly
+    // wider result type, so a 64-bit operand is only bitcast.
+    Location loc = op.getLoc();
+    auto inIntWType = rewriter.getIntegerType(inFloatTy.getWidth());
+    Value operandBits =
+        arith::BitcastOp::create(rewriter, loc, inIntWType, op.getOperand());
+    if (inFloatTy.getWidth() < 64)
+      operandBits = arith::ExtUIOp::create(rewriter, loc, i64Type, operandBits);
+
+    // Call APFloat function.
+    Value inSemValue = getSemanticsValue(rewriter, loc, inFloatTy);
+    Value outWidthValue = arith::ConstantOp::create(
+        rewriter, loc, i32Type,
+        rewriter.getIntegerAttr(i32Type, outIntTy.getWidth()));
+    Value isUnsignedValue = arith::ConstantOp::create(
+        rewriter, loc, i1Type, rewriter.getIntegerAttr(i1Type, isUnsigned));
+    SmallVector<Value> params = {inSemValue, outWidthValue, isUnsignedValue,
+                                 operandBits};
+    auto resultOp =
+        func::CallOp::create(rewriter, loc, TypeRange(rewriter.getI64Type()),
+                             SymbolRefAttr::get(*fn), params);
+
+    // Truncate result to the original width. arith.trunci requires a strictly
+    // narrower result type, so an i64 result is used as is.
+    Value result = resultOp->getResult(0);
+    if (outIntTy.getWidth() < 64)
+      result = arith::TruncIOp::create(rewriter, loc, outIntTy, result);
+    rewriter.replaceOp(op, result);
+    return success();
+  }
+
+  SymbolOpInterface symTable;
+  bool isUnsigned;
+};
+
+/// Rewrite an integer-to-float cast (arith.sitofp, arith.uitofp) to an APFloat
+/// function call.
+template <typename OpTy>
+struct IntToFpConversion final : OpRewritePattern<OpTy> {
+  IntToFpConversion(MLIRContext *context, SymbolOpInterface symTable,
+                    bool isUnsigned, PatternBenefit benefit = 1)
+      : OpRewritePattern<OpTy>(context, benefit), symTable(symTable),
+        isUnsigned(isUnsigned) {}
+
+  LogicalResult matchAndRewrite(OpTy op,
+                                PatternRewriter &rewriter) const override {
+    // Operand and result are passed through i64 values, so only scalar types
+    // of at most 64 bits can be handled.
+    auto inIntTy = dyn_cast<IntegerType>(op.getOperand().getType());
+    auto outFloatTy = dyn_cast<FloatType>(op.getType());
+    if (!inIntTy || !outFloatTy)
+      return rewriter.notifyMatchFailure(op, "only scalars are supported");
+    if (inIntTy.getWidth() > 64)
+      return rewriter.notifyMatchFailure(
+          op, "integer bitwidth > 64 is not supported");
+    if (outFloatTy.getWidth() > 64)
+      return rewriter.notifyMatchFailure(
+          op, "result type > 64 bits is not supported");
+
+    // Get APFloat function from runtime library.
+    auto i1Type = IntegerType::get(symTable->getContext(), 1);
+    auto i32Type = IntegerType::get(symTable->getContext(), 32);
+    auto i64Type = IntegerType::get(symTable->getContext(), 64);
+    FailureOr<FuncOp> fn =
+        lookupOrCreateApFloatFn(rewriter, symTable, "convert_from_int",
+                                {i32Type, i32Type, i1Type, i64Type});
+    if (failed(fn))
+      return fn;
+
+    rewriter.setInsertionPoint(op);
+    // Extend the operand to a 64-bit integer according to its signedness.
+    Location loc = op.getLoc();
+    Value operandBits = op.getOperand();
+    if (inIntTy.getWidth() < 64) {
+      if (isUnsigned) {
+        operandBits =
+            arith::ExtUIOp::create(rewriter, loc, i64Type, operandBits);
+      } else {
+        operandBits =
+            arith::ExtSIOp::create(rewriter, loc, i64Type, operandBits);
+      }
+    }
+
+    // Call APFloat function.
+    Value outSemValue = getSemanticsValue(rewriter, loc, outFloatTy);
+    Value inWidthValue = arith::ConstantOp::create(
+        rewriter, loc, i32Type,
+        rewriter.getIntegerAttr(i32Type, inIntTy.getWidth()));
+    Value isUnsignedValue = arith::ConstantOp::create(
+        rewriter, loc, i1Type, rewriter.getIntegerAttr(i1Type, isUnsigned));
+    SmallVector<Value> params = {outSemValue, inWidthValue, isUnsignedValue,
+                                 operandBits};
+    auto resultOp =
+        func::CallOp::create(rewriter, loc, TypeRange(rewriter.getI64Type()),
+                             SymbolRefAttr::get(*fn), params);
+
+    // Truncate result to the original width. arith.trunci requires a strictly
+    // narrower result type, so a 64-bit result is only bitcast.
+    Value resultBits = resultOp->getResult(0);
+    if (outFloatTy.getWidth() < 64) {
+      auto outIntWType = rewriter.getIntegerType(outFloatTy.getWidth());
+      resultBits =
+          arith::TruncIOp::create(rewriter, loc, outIntWType, resultBits);
+    }
+    Value result =
+        arith::BitcastOp::create(rewriter, loc, outFloatTy, resultBits);
+    rewriter.replaceOp(op, result);
+    return success();
+  }
+
+  SymbolOpInterface symTable;
+  bool isUnsigned;
+};
+
 namespace {
 struct ArithToAPFloatConversionPass final
     : impl::ArithToAPFloatConversionPassBase<ArithToAPFloatConversionPass> {
@@ -147,6 +329,17 @@ void ArithToAPFloatConversionPass::runOnOperation() {
       context, "divide", getOperation());
   patterns.add<BinaryArithOpToAPFloatConversion<arith::RemFOp>>(
       context, "remainder", getOperation());
+  patterns
+      .add<FpToFpConversion<arith::ExtFOp>, FpToFpConversion<arith::TruncFOp>>(
+          context, getOperation());
+  patterns.add<FpToIntConversion<arith::FPToSIOp>>(context, getOperation(),
+                                                   /*isUnsigned=*/false);
+  patterns.add<FpToIntConversion<arith::FPToUIOp>>(context, getOperation(),
+                                                   /*isUnsigned=*/true);
+  patterns.add<IntToFpConversion<arith::SIToFPOp>>(context, getOperation(),
+                                                   /*isUnsigned=*/false);
+  patterns.add<IntToFpConversion<arith::UIToFPOp>>(context, getOperation(),
+                                                   /*isUnsigned=*/true);
   LogicalResult result = success();
   ScopedDiagnosticHandler scopedHandler(context, [&result](Diagnostic &diag) {
     if (diag.getSeverity() == DiagnosticSeverity::Error) {
diff --git a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
index 1c21a2f270da6..1035d7cb46e6e 100644
--- a/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
+++ b/mlir/lib/Dialect/MemRef/IR/MemRefOps.cpp
@@ -1074,13 +1074,6 @@ OpFoldResult DimOp::fold(FoldAdaptor adaptor) {
     return subview.getDynamicSize(sourceIndex);
   }
 
-  if (auto sizeInterface =
-          dyn_cast_or_null<OffsetSizeAndStrideOpInterface>(definingOp)) {
-    assert(sizeInterface.isDynamicSize(unsignedIndex) &&
-           "Expected dynamic subview size");
-    return sizeInterface.getDynamicSize(unsignedIndex);
-  }
-
   // dim(memrefcast) -> dim
   if (succeeded(foldMemRefCast(*this)))
     return getResult();
diff --git a/mlir/lib/ExecutionEngine/APFloatWrappers.cpp b/mlir/lib/ExecutionEngine/APFloatWrappers.cpp
index 0a05f7369e556..44980ccd77491 100644
--- a/mlir/lib/ExecutionEngine/APFloatWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/APFloatWrappers.cpp
@@ -20,6 +20,7 @@
 // APFloatBase::Semantics enum value.
 //
 #include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APSInt.h"
 
 #ifdef _WIN32
 #ifndef MLIR_APFLOAT_WRAPPERS_EXPORT
@@ -51,7 +52,7 @@
 
 /// Binary operations with rounding mode.
 #define APFLOAT_BINARY_OP_ROUNDING_MODE(OP, ROUNDING_MODE)                     \
-  MLIR_APFLOAT_WRAPPERS_EXPORT int64_t _mlir_apfloat_##OP(                     \
+  MLIR_APFLOAT_WRAPPERS_EXPORT uint64_t _mlir_apfloat_##OP(                    \
       int32_t semantics, uint64_t a, uint64_t b) {                             \
     const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics(        \
         static_cast<llvm::APFloatBase::Semantics>(semantics));                 \
@@ -86,4 +87,48 @@ MLIR_APFLOAT_WRAPPERS_EXPORT void printApFloat(int32_t semantics, uint64_t a) {
   double d = x.convertToDouble();
   fprintf(stdout, "%lg", d);
 }
+
+/// Convert the float bit pattern `a` from `inSemantics` to `outSemantics`.
+MLIR_APFLOAT_WRAPPERS_EXPORT uint64_t
+_mlir_apfloat_convert(int32_t inSemantics, int32_t outSemantics, uint64_t a) {
+  const auto &srcSem = llvm::APFloatBase::EnumToSemantics(
+      static_cast<llvm::APFloatBase::Semantics>(inSemantics));
+  const auto &dstSem = llvm::APFloatBase::EnumToSemantics(
+      static_cast<llvm::APFloatBase::Semantics>(outSemantics));
+  llvm::APInt srcBits(llvm::APFloatBase::semanticsSizeInBits(srcSem), a);
+  llvm::APFloat val(srcSem, srcBits);
+  // TODO: Custom rounding modes are not supported yet.
+  bool losesInfo;
+  val.convert(dstSem, llvm::RoundingMode::NearestTiesToEven, &losesInfo);
+  return val.bitcastToAPInt().getZExtValue();
+}
+
+/// Convert the float bit pattern `a` to an integer of `resultWidth` bits,
+/// rounding towards zero as arith.fptosi and arith.fptoui require.
+MLIR_APFLOAT_WRAPPERS_EXPORT uint64_t _mlir_apfloat_convert_to_int(
+    int32_t semantics, int32_t resultWidth, bool isUnsigned, uint64_t a) {
+  const llvm::fltSemantics &sem = llvm::APFloatBase::EnumToSemantics(
+      static_cast<llvm::APFloatBase::Semantics>(semantics));
+  unsigned inputWidth = llvm::APFloatBase::semanticsSizeInBits(sem);
+  llvm::APFloat val(sem, llvm::APInt(inputWidth, a));
+  llvm::APSInt result(resultWidth, isUnsigned);
+  bool isExact;
+  // Float-to-integer casts truncate; this is not a configurable rounding mode.
+  val.convertToInteger(result, llvm::RoundingMode::TowardZero, &isExact);
+  // The caller truncates the returned value to `resultWidth` bits, so it does
+  // not matter whether the APSInt is zero- or sign-extended to 64 bits here.
+  return result.getZExtValue();
+}
+
+/// Convert the `inputWidth`-bit integer `a` to float bits of `semantics`.
+MLIR_APFLOAT_WRAPPERS_EXPORT uint64_t _mlir_apfloat_convert_from_int(
+    int32_t semantics, int32_t inputWidth, bool isUnsigned, uint64_t a) {
+  llvm::APFloat result(llvm::APFloatBase::EnumToSemantics(
+      static_cast<llvm::APFloatBase::Semantics>(semantics)));
+  llvm::APInt intVal(inputWidth, a, /*isSigned=*/!isUnsigned);
+  // TODO: Custom rounding modes are not supported yet.
+  result.convertFromAPInt(intVal, /*IsSigned=*/!isUnsigned,
+                          llvm::RoundingMode::NearestTiesToEven);
+  return result.bitcastToAPInt().getZExtValue();
+}
 }
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 0b6fc30b8db70..ef068b3e8dc67 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1217,6 +1217,7 @@ allocReductionVars(T op, ArrayRef<BlockArgument> reductionArgs,
 template <typename T>
 static void
 mapInitializationArgs(T loop, LLVM::ModuleTranslation &moduleTranslation,
+                      llvm::IRBuilderBase &builder,
                       SmallVectorImpl<omp::DeclareReductionOp> &reductionDecls,
                       DenseMap<Value, llvm::Value *> &reductionVariableMap,
                       unsigned i) {
@@ -1227,8 +1228,17 @@ mapInitializationArgs(T loop, LLVM::ModuleTranslation &moduleTranslation,
 
   mlir::Value mlirSource = loop.getReductionVars()[i];
   llvm::Value *llvmSource = moduleTranslation.lookupValue(mlirSource);
-  assert(llvmSource && "lookup reduction var");
-  moduleTranslation.mapValue(reduction.getInitializerMoldArg(), llvmSource);
+  llvm::Value *origVal = llvmSource;
+  // If a non-pointer value is expected, load the value from the source pointer.
+  if (!isa<LLVM::LLVMPointerType>(
+          reduction.getInitializerMoldArg().getType()) &&
+      isa<LLVM::LLVMPointerType>(mlirSource.getType())) {
+    origVal =
+        builder.CreateLoad(moduleTranslation.convertType(
+                               reduction.getInitializerMoldArg().getType()),
+                           llvmSource, "omp_orig");
+  }
+  moduleTranslation.mapValue(reduction.getInitializerMoldArg(), origVal);
 
   if (entry.getNumArguments() > 1) {
     llvm::Value *allocation =
@@ -1308,7 +1318,7 @@ initReductionVars(OP op, ArrayRef<BlockArgument> reductionArgs,
     SmallVector<llvm::Value *, 1> phis;
 
     // map block argument to initializer region
-    mapInitializationArgs(op, moduleTranslation, reductionDecls,
+    mapInitializationArgs(op, moduleTranslation, builder, reductionDecls,
                           reductionVariableMap, i);
 
     if (failed(inlineConvertOmpRegions(reductionDecls[i].getInitializerRegion(),
diff --git a/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir b/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir
index 797f42c37a26f..d71d81dddcd4f 100644
--- a/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir
+++ b/mlir/test/Conversion/ArithToApfloat/arith-to-apfloat.mlir
@@ -126,3 +126,75 @@ func.func @remf(%arg0: f4E2M1FN, %arg1: f4E2M1FN) {
   %0 = arith.remf %arg0, %arg1 : f4E2M1FN
   return
 }
+
+// -----
+
+// CHECK: func.func private @_mlir_apfloat_convert(i32, i32, i64) -> i64
+// CHECK: %[[sem_in:.*]] = arith.constant 18 : i32
+// CHECK: %[[sem_out:.*]] = arith.constant 2 : i32
+// CHECK: call @_mlir_apfloat_convert(%[[sem_in]], %[[sem_out]], %{{.*}}) : (i32, i32, i64) -> i64
+func.func @extf(%arg0: f4E2M1FN) {
+  %0 = arith.extf %arg0 : f4E2M1FN to f32
+  return
+}
+
+// -----
+
+// CHECK: func.func private @_mlir_apfloat_convert(i32, i32, i64) -> i64
+// CHECK: %[[sem_in:.*]] = arith.constant 1 : i32
+// CHECK: %[[sem_out:.*]] = arith.constant 18 : i32
+// CHECK: call @_mlir_apfloat_convert(%[[sem_in]], %[[sem_out]], %{{.*}}) : (i32, i32, i64) -> i64
+func.func @truncf(%arg0: bf16) {
+  %0 = arith.truncf %arg0 : bf16 to f4E2M1FN
+  return
+}
+
+// -----
+
+// CHECK: func.func private @_mlir_apfloat_convert_to_int(i32, i32, i1, i64) -> i64
+// CHECK: %[[sem_in:.*]] = arith.constant 0 : i32
+// CHECK: %[[out_width:.*]] = arith.constant 4 : i32
+// CHECK: %[[is_unsigned:.*]] = arith.constant false
+// CHECK: %[[res:.*]] = call @_mlir_apfloat_convert_to_int(%[[sem_in]], %[[out_width]], %[[is_unsigned]], %{{.*}}) : (i32, i32, i1, i64) -> i64
+// CHECK: arith.trunci %[[res]] : i64 to i4
+func.func @fptosi(%arg0: f16) {
+  %0 = arith.fptosi %arg0 : f16 to i4
+  return
+}
+
+// -----
+
+// CHECK: func.func private @_mlir_apfloat_convert_to_int(i32, i32, i1, i64) -> i64
+// CHECK: %[[sem_in:.*]] = arith.constant 0 : i32
+// CHECK: %[[out_width:.*]] = arith.constant 4 : i32
+// CHECK: %[[is_unsigned:.*]] = arith.constant true
+// CHECK: %[[res:.*]] = call @_mlir_apfloat_convert_to_int(%[[sem_in]], %[[out_width]], %[[is_unsigned]], %{{.*}}) : (i32, i32, i1, i64) -> i64
+// CHECK: arith.trunci %[[res]] : i64 to i4
+func.func @fptoui(%arg0: f16) {
+  %0 = arith.fptoui %arg0 : f16 to i4
+  return
+}
+
+// -----
+
+// CHECK: func.func private @_mlir_apfloat_convert_from_int(i32, i32, i1, i64) -> i64
+// CHECK: %[[sem_out:.*]] = arith.constant 18 : i32
+// CHECK: %[[in_width:.*]] = arith.constant 32 : i32
+// CHECK: %[[is_unsigned:.*]] = arith.constant false
+// CHECK: %[[res:.*]] = call @_mlir_apfloat_convert_from_int(%[[sem_out]], %[[in_width]], %[[is_unsigned]], %{{.*}}) : (i32, i32, i1, i64) -> i64
+func.func @sitofp(%arg0: i32) {
+  %0 = arith.sitofp %arg0 : i32 to f4E2M1FN
+  return
+}
+
+// -----
+
+// CHECK: func.func private @_mlir_apfloat_convert_from_int(i32, i32, i1, i64) -> i64
+// CHECK: %[[sem_out:.*]] = arith.constant 18 : i32
+// CHECK: %[[in_width:.*]] = arith.constant 32 : i32
+// CHECK: %[[is_unsigned:.*]] = arith.constant true
+// CHECK: %[[res:.*]] = call @_mlir_apfloat_convert_from_int(%[[sem_out]], %[[in_width]], %[[is_unsigned]], %{{.*}}) : (i32, i32, i1, i64) -> i64
+func.func @uitofp(%arg0: i32) {
+  %0 = arith.uitofp %arg0 : i32 to f4E2M1FN
+  return
+}
diff --git a/mlir/test/Dialect/MemRef/canonicalize.mlir b/mlir/test/Dialect/MemRef/canonicalize.mlir
index 313090272ef90..e02717a2f5689 100644
--- a/mlir/test/Dialect/MemRef/canonicalize.mlir
+++ b/mlir/test/Dialect/MemRef/canonicalize.mlir
@@ -208,19 +208,6 @@ func.func @subview_negative_stride2(%arg0 : memref<7xf32>) -> memref<?xf32, stri
 
 // -----
 
-// CHECK-LABEL: func @dim_of_sized_view
-//  CHECK-SAME:   %{{[a-z0-9A-Z_]+}}: memref<?xi8>
-//  CHECK-SAME:   %[[SIZE:.[a-z0-9A-Z_]+]]: index
-//       CHECK:   return %[[SIZE]] : index
-func.func @dim_of_sized_view(%arg : memref<?xi8>, %size: index) -> index {
-  %c0 = arith.constant 0 : index
-  %0 = memref.reinterpret_cast %arg to offset: [0], sizes: [%size], strides: [1] : memref<?xi8> to memref<?xi8>
-  %1 = memref.dim %0, %c0 : memref<?xi8>
-  return %1 : index
-}
-
-// -----
-
 // CHECK-LABEL: func @no_fold_subview_negative_size
 //  CHECK:        %[[SUBVIEW:.+]] = memref.subview
 //  CHECK:        return %[[SUBVIEW]]
diff --git a/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir b/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir
index dbaa20346a03a..8046610d479a8 100644
--- a/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir
+++ b/mlir/test/Integration/Dialect/Arith/CPU/test-apfloat-emulation.mlir
@@ -27,14 +27,44 @@ func.func @entry() {
   %a1 = arith.constant 1.4 : f8E4M3FN
   %a2 = arith.constant 1.4 : f32
   %b1, %b2 = func.call @foo() : () -> (f8E4M3FN, f32)
-  %c1 = arith.addf %a1, %b1 : f8E4M3FN  // not supported by LLVM
-  %c2 = arith.addf %a2, %b2 : f32       // supported by LLVM
 
-  // CHECK: 3.5
+  // CHECK: 2.2
+  vector.print %b2 : f32
+
+  // CHECK-NEXT: 3.5
+  %c1 = arith.addf %a1, %b1 : f8E4M3FN  // not supported by LLVM
   vector.print %c1 : f8E4M3FN
 
-  // CHECK: 3.6
+  // CHECK-NEXT: 3.6
+  %c2 = arith.addf %a2, %b2 : f32       // supported by LLVM
   vector.print %c2 : f32
 
+  // CHECK-NEXT: 2.25
+  %cvt = arith.truncf %b2 : f32 to f8E4M3FN
+  vector.print %cvt : f8E4M3FN
+
+  // CHECK-NEXT: 1
+  // 2.25 -> 2 overflows signed i2; result saturates to 1 (bit pattern: 01)
+  %cvt_int_signed = arith.fptosi %cvt : f8E4M3FN to i2
+  vector.print %cvt_int_signed : i2
+
+  // CHECK-NEXT: -2
+  // Unsigned result 2 (bit pattern: 10) is printed as signed integer: -2
+  %cvt_int_unsigned = arith.fptoui %cvt : f8E4M3FN to i2
+  vector.print %cvt_int_unsigned : i2
+
+  // CHECK-NEXT: -6
+  // Bit pattern: 1...11110111, interpreted as signed: -9
+  // Closest f4E2M1FN value: -6.0
+  %c9 = arith.constant -9 : i16
+  %cvt_from_signed_int = arith.sitofp %c9 : i16 to f4E2M1FN
+  vector.print %cvt_from_signed_int : f4E2M1FN
+
+  // CHECK-NEXT: 6
+  // Bit pattern: 1...11110111, interpreted as unsigned: 65527
+  // Closest f4E2M1FN value: 6.0
+  %cvt_from_unsigned_int = arith.uitofp %c9 : i16 to f4E2M1FN
+  vector.print %cvt_from_unsigned_int : f4E2M1FN
+
   return
 }
diff --git a/mlir/test/Target/LLVMIR/omptarget-declare-target-to-host.mlir b/mlir/test/Target/LLVMIR/omptarget-declare-target-to-host.mlir
new file mode 100644
index 0000000000000..4202421aed5ac
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-declare-target-to-host.mlir
@@ -0,0 +1,34 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+module attributes {llvm.target_triple = "x86_64-unknown-linux-gnu", omp.is_gpu = false, omp.is_target_device = false, omp.target_triples = ["amdgcn-amd-amdhsa"]} {
+  // CHECK-DAG: @_QMtest_0Ezii = global [11 x float] zeroinitializer
+  // CHECK-DAG: @.offload_sizes = private unnamed_addr constant [1 x i64] [i64 48]
+  // CHECK-DAG: @.offload_maptypes = private unnamed_addr constant [1 x i64] [i64 3]
+  // CHECK-DAG: @.offloading.entry._QMtest_0Ezii = weak constant %struct.__tgt_offload_entry {{.*}} ptr @_QMtest_0Ezii, {{.*}}, i64 44,{{.*}}
+  llvm.mlir.global external @_QMtest_0Ezii() {addr_space = 0 : i32, omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} : !llvm.array<11 x f32> {
+    %0 = llvm.mlir.zero : !llvm.array<11 x f32>
+    llvm.return %0 : !llvm.array<11 x f32>
+  }
+
+  // CHECK-DAG: %[[BASEPTR:.*]] = getelementptr inbounds [1 x ptr], ptr %.offload_baseptrs, i32 0, i32 0
+  // CHECK-DAG: store ptr @_QMtest_0Ezii, ptr %[[BASEPTR]], align 8
+  // CHECK-DAG: %[[OFFLOADPTR:.*]] = getelementptr inbounds [1 x ptr], ptr %.offload_ptrs, i32 0, i32 0
+  // CHECK-DAG: store ptr @_QMtest_0Ezii, ptr %[[OFFLOADPTR]], align 8
+  llvm.func @_QQmain() {
+    %0 = llvm.mlir.constant(1 : index) : i64
+    %1 = llvm.mlir.constant(0 : index) : i64
+    %2 = llvm.mlir.constant(11 : index) : i64
+    %3 = llvm.mlir.addressof @_QMtest_0Ezii : !llvm.ptr
+    %4 = omp.map.bounds lower_bound(%1 : i64) upper_bound(%2 : i64) extent(%2 : i64) stride(%0 : i64) start_idx(%1 : i64) {stride_in_bytes = true}
+    %5 = omp.map.info var_ptr(%3 : !llvm.ptr, !llvm.array<11 x f32>) map_clauses(tofrom) capture(ByRef) bounds(%4) -> !llvm.ptr
+    omp.target map_entries(%5 -> %arg0 : !llvm.ptr) {
+      %6 = llvm.mlir.constant(1.0 : f32) : f32
+      %7 = llvm.mlir.constant(0 : i64) : i64
+      %8 = llvm.getelementptr %arg0[%7] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+      llvm.store %6, %8 : f32, !llvm.ptr
+      omp.terminator
+    }
+    llvm.return
+  }
+  // CHECK-DAG: !{{.*}} = !{i32 {{.*}}, !"_QMtest_0Ezii", i32 {{.*}}, i32 {{.*}}}
+}
diff --git a/mlir/test/Target/LLVMIR/omptarget-overlapping-record-member-map.mlir b/mlir/test/Target/LLVMIR/omptarget-overlapping-record-member-map.mlir
new file mode 100644
index 0000000000000..ab488a2b0ec57
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-overlapping-record-member-map.mlir
@@ -0,0 +1,65 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+module attributes {llvm.target_triple = "x86_64-unknown-linux-gnu", omp.is_gpu = false, omp.is_target_device = false, omp.target_triples = ["amdgcn-amd-amdhsa"]} {
+    llvm.func @_QQmain() attributes {fir.bindc_name = "main"} {
+        %0 = llvm.mlir.constant(1 : i64) : i64
+        %1 = llvm.alloca %0 x !llvm.struct<"_QFTdtype", (f32, i32)> {bindc_name = "dtypev"} : (i64) -> !llvm.ptr
+        %2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"_QFTdtype", (f32, i32)>
+        %3 = omp.map.info var_ptr(%2 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "dtypev%value2"}
+        %4 = omp.map.info var_ptr(%1 : !llvm.ptr, !llvm.struct<"_QFTdtype", (f32, i32)>) map_clauses(to) capture(ByRef) members(%3 : [1] : !llvm.ptr) -> !llvm.ptr {name = "dtypev"}
+        omp.target map_entries(%4 -> %arg0, %3 -> %arg1 : !llvm.ptr, !llvm.ptr) {
+          omp.terminator
+        }
+        llvm.return
+    }
+}
+
+// CHECK: @.offload_sizes = private unnamed_addr constant [4 x i64] [i64 0, i64 0, i64 0, i64 4]
+// CHECK: @.offload_maptypes = private unnamed_addr constant [4 x i64] [i64 32, i64 281474976710657, i64 281474976710657, i64 844424930131971]
+
+// CHECK: %[[ALLOCA:.*]] = alloca %_QFTdtype, i64 1, align 8
+// CHECK: %[[ELEMENT_ACC:.*]] = getelementptr %_QFTdtype, ptr %[[ALLOCA]], i32 0, i32 1
+
+// CHECK: %[[SIZE1_CALC_1:.*]] = getelementptr %_QFTdtype, ptr %[[ALLOCA]], i32 1
+// CHECK: %[[SIZE1_CALC_2:.*]] = ptrtoint ptr %[[SIZE1_CALC_1]] to i64
+// CHECK: %[[SIZE1_CALC_3:.*]] = ptrtoint ptr %[[ALLOCA]] to i64
+// CHECK: %[[SIZE1_CALC_4:.*]] = sub i64 %[[SIZE1_CALC_2]], %[[SIZE1_CALC_3]]
+// CHECK: %[[SIZE1_CALC_5:.*]] = sdiv exact i64 %[[SIZE1_CALC_4]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
+
+// CHECK:  %[[SIZE2_CALC_1:.*]] = getelementptr %_QFTdtype, ptr %[[ALLOCA]], i32 1
+// CHECK:  %[[SIZE2_CALC_2:.*]] = ptrtoint ptr %[[ELEMENT_ACC]] to i64
+// CHECK:  %[[SIZE2_CALC_3:.*]] = ptrtoint ptr %[[ALLOCA]] to i64
+// CHECK:  %[[SIZE2_CALC_4:.*]] = sub i64 %[[SIZE2_CALC_2]], %[[SIZE2_CALC_3]]
+// CHECK:  %[[SIZE2_CALC_5:.*]] = sdiv exact i64 %[[SIZE2_CALC_4]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
+
+// CHECK:  %[[SIZE3_CALC_1:.*]] = getelementptr i32, ptr %[[ELEMENT_ACC]], i32 1
+// CHECK:  %[[SIZE3_CALC_2:.*]] = ptrtoint ptr %[[SIZE2_CALC_1]] to i64
+// CHECK:  %[[SIZE3_CALC_3:.*]] = ptrtoint ptr %[[SIZE3_CALC_1]] to i64
+// CHECK:  %[[SIZE3_CALC_4:.*]] = sub i64 %[[SIZE3_CALC_2]], %[[SIZE3_CALC_3]]
+// CHECK:  %[[SIZE3_CALC_5:.*]] = sdiv exact i64 %[[SIZE3_CALC_4]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
+
+// CHECK: %[[BASEPTR:.*]] = getelementptr inbounds [4 x ptr], ptr %.offload_baseptrs, i32 0, i32 0
+// CHECK: store ptr %[[ALLOCA]], ptr %[[BASEPTR]], align 8
+// CHECK: %[[PTRS:.*]] = getelementptr inbounds [4 x ptr], ptr %.offload_ptrs, i32 0, i32 0
+// CHECK: store ptr %[[ALLOCA]], ptr %[[PTRS]], align 8
+// CHECK: %[[SIZES:.*]] = getelementptr inbounds [4 x i64], ptr %.offload_sizes, i32 0, i32 0
+// CHECK: store i64 %[[SIZE1_CALC_5]], ptr %[[SIZES]], align 8
+
+// CHECK: %[[BASEPTR:.*]] = getelementptr inbounds [4 x ptr], ptr %.offload_baseptrs, i32 0, i32 1
+// CHECK: store ptr %[[ALLOCA]], ptr %[[BASEPTR]], align 8
+// CHECK: %[[PTRS:.*]] = getelementptr inbounds [4 x ptr], ptr %.offload_ptrs, i32 0, i32 1
+// CHECK: store ptr %[[ALLOCA]], ptr %[[PTRS]], align 8
+// CHECK: %[[SIZES:.*]] = getelementptr inbounds [4 x i64], ptr %.offload_sizes, i32 0, i32 1
+// CHECK: store i64 %[[SIZE2_CALC_5]], ptr %[[SIZES]], align 8
+
+// CHECK: %[[BASEPTR:.*]] = getelementptr inbounds [4 x ptr], ptr %.offload_baseptrs, i32 0, i32 2
+// CHECK: store ptr %[[ALLOCA]], ptr %[[BASEPTR]], align 8
+// CHECK: %[[PTRS:.*]] = getelementptr inbounds [4 x ptr], ptr %.offload_ptrs, i32 0, i32 2
+// CHECK: store ptr %[[SIZE3_CALC_1]], ptr %[[PTRS]], align 8
+// CHECK: %[[SIZES:.*]] = getelementptr inbounds [4 x i64], ptr %.offload_sizes, i32 0, i32 2
+// CHECK: store i64 %[[SIZE3_CALC_5]], ptr %[[SIZES]], align 8
+
+// CHECK: %[[BASEPTR:.*]] = getelementptr inbounds [4 x ptr], ptr %.offload_baseptrs, i32 0, i32 3
+// CHECK: store ptr %[[ALLOCA]], ptr %[[BASEPTR]], align 8
+// CHECK: %[[PTRS:.*]] = getelementptr inbounds [4 x ptr], ptr %.offload_ptrs, i32 0, i32 3
+// CHECK: store ptr %[[ELEMENT_ACC]], ptr %[[PTRS]], align 8
diff --git a/mlir/test/Target/LLVMIR/omptarget-record-type-with-ptr-member-host.mlir b/mlir/test/Target/LLVMIR/omptarget-record-type-with-ptr-member-host.mlir
index 02f2fef480303..f18ae06b898a2 100644
--- a/mlir/test/Target/LLVMIR/omptarget-record-type-with-ptr-member-host.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-record-type-with-ptr-member-host.mlir
@@ -59,9 +59,9 @@ module attributes {omp.is_target_device = false, omp.target_triples = ["amdgcn-a
 
 // CHECK: @[[FULL_ARR_GLOB:.*]] = internal global { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } undef
 // CHECK: @[[ARR_SECT_GLOB:.*]] = internal global { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } undef
-// CHECK: @.offload_sizes = private unnamed_addr constant [12 x i64] [i64 0, i64 48, i64 8, i64 0, i64 0, i64 48, i64 8, i64 0, i64 0, i64 24, i64 8, i64 0]
-// CHECK: @.offload_maptypes = private unnamed_addr constant [12 x i64] [i64 32, i64 281474976710659, i64 281474976710659, i64 281474976710675, i64 32, i64 1407374883553283, i64 1407374883553283, i64 1407374883553299, i64 32, i64 2533274790395907, i64 2533274790395907, i64 2533274790395923]
-// CHECK: @.offload_mapnames = private constant [12 x ptr] [ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}]
+// CHECK: @.offload_sizes = private unnamed_addr constant [15 x i64] [i64 0, i64 0, i64 0, i64 8, i64 0, i64 0, i64 0, i64 0, i64 8, i64 0, i64 0, i64 0, i64 0, i64 8, i64 0]
+// CHECK: @.offload_maptypes = private unnamed_addr constant [15 x i64] [i64 32, i64 281474976710659, i64 281474976710659, i64 281474976710659, i64 281474976710675, i64 32, i64 1688849860263939, i64 1688849860263939, i64 1688849860263939, i64 1688849860263955, i64 32, i64 3096224743817219, i64 3096224743817219, i64 3096224743817219, i64 3096224743817235]
+// CHECK: @.offload_mapnames = private constant [15 x ptr] [ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}]
 
 // CHECK: define void @main()
 // CHECK: %[[SCALAR_ALLOCA:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, i64 1, align 8
@@ -85,74 +85,97 @@ module attributes {omp.is_target_device = false, omp.target_triples = ["amdgcn-a
 // CHECK: %[[ARR_SECT_PTR:.*]] = getelementptr inbounds i32, ptr %[[LARR_SECT]], i64 %[[ARR_SECT_OFFSET2]]
 // CHECK: %[[SCALAR_PTR_LOAD:.*]] = load ptr, ptr %[[SCALAR_BASE]], align 8
 // CHECK: %[[FULL_ARR_DESC_SIZE:.*]] = sdiv exact i64 48, ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
-// CHECK: %[[FULL_ARR_SIZE_CMP:.*]] = icmp eq ptr %[[FULL_ARR_PTR]], null
-// CHECK: %[[FULL_ARR_SIZE_SEL:.*]] = select i1 %[[FULL_ARR_SIZE_CMP]], i64 0, i64 %[[FULL_ARR_SIZE]]
+// CHECK: %[[FULL_ARR_SZ:.*]] = sdiv exact i64 40, ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
+// CHECK: %[[NULL_CMP:.*]] = icmp eq ptr %[[FULL_ARR_PTR]], null
+// CHECK: %[[IS_NULL:.*]] = select i1 %[[NULL_CMP]], i64 0, i64 %[[FULL_ARR_SIZE]]
 // CHECK: %[[ARR_SECT_DESC_SIZE:.*]] = sdiv exact i64 48, ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
-// CHECK: %[[ARR_SECT_SIZE_CMP:.*]] = icmp eq ptr %[[ARR_SECT_PTR]], null
-// CHECK: %[[ARR_SECT_SIZE_SEL:.*]] = select i1 %[[ARR_SECT_SIZE_CMP]], i64 0, i64 %[[ARR_SECT_SIZE]]
+// CHECK: %[[ARR_SECT_SZ:.*]] = sdiv exact i64 40, ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
+// CHECK: %[[NULL_CMP2:.*]] = icmp eq ptr %[[ARR_SECT_PTR]], null
+// CHECK: %[[IS_NULL2:.*]] = select i1 %[[NULL_CMP2]], i64 0, i64 %[[ARR_SECT_SIZE]]
 // CHECK: %[[SCALAR_DESC_SZ4:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[SCALAR_ALLOCA]], i32 1
 // CHECK: %[[SCALAR_DESC_SZ3:.*]] = ptrtoint ptr %[[SCALAR_DESC_SZ4]] to i64
 // CHECK: %[[SCALAR_DESC_SZ2:.*]] = ptrtoint ptr %[[SCALAR_ALLOCA]] to i64
 // CHECK: %[[SCALAR_DESC_SZ1:.*]] = sub i64 %[[SCALAR_DESC_SZ3]], %[[SCALAR_DESC_SZ2]]
 // CHECK: %[[SCALAR_DESC_SZ:.*]] = sdiv exact i64 %[[SCALAR_DESC_SZ1]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
-
-// CHECK: %[[OFFLOADBASEPTRS:.*]] = getelementptr inbounds [12 x ptr], ptr %.offload_baseptrs, i32 0, i32 0
+// CHECK: %[[SCALAR_BASE_2:.*]] = getelementptr { ptr, i64, i32, i8, i8, i8, i8 }, ptr %[[SCALAR_ALLOCA]], i32 1
+// CHECK: %[[SCALAR_BASE_OFF:.*]] = getelementptr ptr, ptr %[[SCALAR_BASE]], i32 1
+// CHECK: %[[SCALAR_BASE_OFF_SZ1:.*]] = ptrtoint ptr %[[SCALAR_BASE_2]] to i64
+// CHECK: %[[SCALAR_BASE_OFF_SZ2:.*]] = ptrtoint ptr %[[SCALAR_BASE_OFF]] to i64
+// CHECK: %[[SCALAR_BASE_OFF_SZ3:.*]] = sub i64 %[[SCALAR_BASE_OFF_SZ1]], %[[SCALAR_BASE_OFF_SZ2]]
+// CHECK: %[[SCALAR_BASE_OFF_SZ4:.*]] = sdiv exact i64 %[[SCALAR_BASE_OFF_SZ3]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64)
+// CHECK: %[[OFFLOADBASEPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_baseptrs, i32 0, i32 0
 // CHECK: store ptr @full_arr, ptr %[[OFFLOADBASEPTRS]], align 8
-// CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [12 x ptr], ptr %.offload_ptrs, i32 0, i32 0
+// CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_ptrs, i32 0, i32 0
 // CHECK: store ptr @full_arr, ptr %[[OFFLOADPTRS]], align 8
-// CHECK: %[[OFFLOADSIZES:.*]] = getelementptr inbounds [12 x i64], ptr %.offload_sizes, i32 0, i32 0
+// CHECK: %[[OFFLOADSIZES:.*]] = getelementptr inbounds [15 x i64], ptr %.offload_sizes, i32 0, i32 0
 // CHECK: store i64 %[[FULL_ARR_DESC_SIZE]], ptr %[[OFFLOADSIZES]], align 8
-// CHECK: %[[OFFLOADBASEPTRS:.*]] = getelementptr inbounds [12 x ptr], ptr %.offload_baseptrs, i32 0, i32 1
+// CHECK: %[[OFFLOADBASEPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_baseptrs, i32 0, i32 1
 // CHECK: store ptr @full_arr, ptr %[[OFFLOADBASEPTRS]], align 8
-// CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [12 x ptr], ptr %.offload_ptrs, i32 0, i32 1
+// CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_ptrs, i32 0, i32 1
 // CHECK: store ptr @full_arr, ptr %[[OFFLOADPTRS]], align 8
-// CHECK: %[[OFFLOADBASEPTRS:.*]] = getelementptr inbounds [12 x ptr], ptr %.offload_baseptrs, i32 0, i32 2
+// CHECK: %[[OFFLOADBASEPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_baseptrs, i32 0, i32 2
+// CHECK: store ptr @full_arr, ptr %[[OFFLOADBASEPTRS]], align 8
+// CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_ptrs, i32 0, i32 2
+// CHECK: store ptr getelementptr inbounds nuw (i8, ptr @full_arr, i64 8), ptr %[[OFFLOADPTRS]], align 8
+// CHECK: %[[OFFLOADSIZES:.*]] = getelementptr inbounds [15 x i64], ptr %.offload_sizes, i32 0, i32 2
+// CHECK: store i64 %[[FULL_ARR_SZ]], ptr %[[OFFLOADSIZES]], align 8
+// CHECK: %[[OFFLOADBASEPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_baseptrs, i32 0, i32 3
 // CHECK: store ptr @full_arr, ptr %[[OFFLOADBASEPTRS]], align 8
-// CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [12 x ptr], ptr %.offload_ptrs, i32 0, i32 2
+// CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_ptrs, i32 0, i32 3
 // CHECK: store ptr @full_arr, ptr %[[OFFLOADPTRS]], align 8
-// CHECK: %[[OFFLOADBASEPTRS:.*]] = getelementptr inbounds [12 x ptr], ptr %.offload_baseptrs, i32 0, i32 3
+// CHECK: %[[OFFLOADBASEPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_baseptrs, i32 0, i32 4
 // CHECK: store ptr @full_arr, ptr %[[OFFLOADBASEPTRS]], align 8
-// CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [12 x ptr], ptr %.offload_ptrs, i32 0, i32 3
+// CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_ptrs, i32 0, i32 4
 // CHECK: store ptr %[[FULL_ARR_PTR]], ptr %[[OFFLOADPTRS]], align 8
-// CHECK: %[[OFFLOADSIZES:.*]] = getelementptr inbounds [12 x i64], ptr %.offload_sizes, i32 0, i32 3
-// CHECK: store i64 %[[FULL_ARR_SIZE_SEL]], ptr %[[OFFLOADSIZES]], align 8
-
-// CHECK: %[[OFFLOADBASEPTRS:.*]] = getelementptr inbounds [12 x ptr], ptr %.offload_baseptrs, i32 0, i32 4
+// CHECK: %[[OFFLOADSIZES:.*]] = getelementptr inbounds [15 x i64], ptr %.offload_sizes, i32 0, i32 4
+// CHECK: store i64 %[[IS_NULL]], ptr %[[OFFLOADSIZES]], align 8
+// CHECK: %[[OFFLOADBASEPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_baseptrs, i32 0, i32 5
 // CHECK: store ptr @sect_arr, ptr %[[OFFLOADBASEPTRS]], align 8
-// CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [12 x ptr], ptr %.offload_ptrs, i32 0, i32 4
+// CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_ptrs, i32 0, i32 5
 // CHECK: store ptr @sect_arr, ptr %[[OFFLOADPTRS]], align 8
-// CHECK: %[[OFFLOADSIZES:.*]] = getelementptr inbounds [12 x i64], ptr %.offload_sizes, i32 0, i32 4
+// CHECK: %[[OFFLOADSIZES:.*]] = getelementptr inbounds [15 x i64], ptr %.offload_sizes, i32 0, i32 5
 // CHECK: store i64 %[[ARR_SECT_DESC_SIZE]], ptr %[[OFFLOADSIZES]], align 8
-// CHECK: %[[OFFLOADBASEPTRS:.*]] = getelementptr inbounds [12 x ptr], ptr %.offload_baseptrs, i32 0, i32 5
+// CHECK: %[[OFFLOADBASEPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_baseptrs, i32 0, i32 6
 // CHECK: store ptr @sect_arr, ptr %[[OFFLOADBASEPTRS]], align 8
-// CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [12 x ptr], ptr %.offload_ptrs, i32 0, i32 5
+// CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_ptrs, i32 0, i32 6
 // CHECK: store ptr @sect_arr, ptr %[[OFFLOADPTRS]], align 8
-// CHECK: %[[OFFLOADBASEPTRS:.*]] = getelementptr inbounds [12 x ptr], ptr %.offload_baseptrs, i32 0, i32 6
+// CHECK: %[[OFFLOADBASEPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_baseptrs, i32 0, i32 7
 // CHECK: store ptr @sect_arr, ptr %[[OFFLOADBASEPTRS]], align 8
-// CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [12 x ptr], ptr %.offload_ptrs, i32 0, i32 6
+// CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_ptrs, i32 0, i32 7
+// CHECK: store ptr getelementptr inbounds nuw (i8, ptr @sect_arr, i64 8), ptr %[[OFFLOADPTRS]], align 8
+// CHECK: %[[OFFLOADSIZES:.*]] = getelementptr inbounds [15 x i64], ptr %.offload_sizes, i32 0, i32 7
+// CHECK: store i64 %[[ARR_SECT_SZ]], ptr %[[OFFLOADSIZES]], align 8
+// CHECK: %[[OFFLOADBASEPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_baseptrs, i32 0, i32 8
+// CHECK: store ptr @sect_arr, ptr %[[OFFLOADBASEPTRS]], align 8
+// CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_ptrs, i32 0, i32 8
 // CHECK: store ptr @sect_arr, ptr %[[OFFLOADPTRS]], align 8
-// CHECK: %[[OFFLOADBASEPTRS:.*]] = getelementptr inbounds [12 x ptr], ptr %.offload_baseptrs, i32 0, i32 7
+// CHECK: %[[OFFLOADBASEPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_baseptrs, i32 0, i32 9
 // CHECK: store ptr @sect_arr, ptr %[[OFFLOADBASEPTRS]], align 8
-// CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [12 x ptr], ptr %.offload_ptrs, i32 0, i32 7
+// CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_ptrs, i32 0, i32 9
 // CHECK: store ptr %[[ARR_SECT_PTR]], ptr %[[OFFLOADPTRS]], align 8
-// CHECK: %[[OFFLOADSIZES:.*]] = getelementptr inbounds [12 x i64], ptr %.offload_sizes, i32 0, i32 7
-// CHECK: store i64 %[[ARR_SECT_SIZE_SEL]], ptr %[[OFFLOADSIZES]], align 8
-
-// CHECK: %[[OFFLOADBASEPTRS:.*]] = getelementptr inbounds [12 x ptr], ptr %.offload_baseptrs, i32 0, i32 8
+// CHECK: %[[OFFLOADSIZES:.*]] = getelementptr inbounds [15 x i64], ptr %.offload_sizes, i32 0, i32 9
+// CHECK: store i64 %[[IS_NULL2]], ptr %[[OFFLOADSIZES]], align 8
+// CHECK: %[[OFFLOADBASEPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_baseptrs, i32 0, i32 10
 // CHECK: store ptr %[[SCALAR_ALLOCA]], ptr %[[OFFLOADBASEPTRS]], align 8
-// CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [12 x ptr], ptr %.offload_ptrs, i32 0, i32 8
+// CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_ptrs, i32 0, i32 10
 // CHECK: store ptr %[[SCALAR_ALLOCA]], ptr %[[OFFLOADPTRS]], align 8
-// CHECK: %[[OFFLOADSIZES:.*]] = getelementptr inbounds [12 x i64], ptr %.offload_sizes, i32 0, i32 8
+// CHECK: %[[OFFLOADSIZES:.*]] = getelementptr inbounds [15 x i64], ptr %.offload_sizes, i32 0, i32 10
 // CHECK: store i64 %[[SCALAR_DESC_SZ]], ptr %[[OFFLOADSIZES]], align 8
-// CHECK: %[[OFFLOADBASEPTRS:.*]] = getelementptr inbounds [12 x ptr], ptr %.offload_baseptrs, i32 0, i32 9
+// CHECK: %[[OFFLOADBASEPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_baseptrs, i32 0, i32 11
 // CHECK: store ptr %[[SCALAR_ALLOCA]], ptr %[[OFFLOADBASEPTRS]], align 8
-// CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [12 x ptr], ptr %.offload_ptrs, i32 0, i32 9
+// CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_ptrs, i32 0, i32 11
 // CHECK: store ptr %[[SCALAR_ALLOCA]], ptr %[[OFFLOADPTRS]], align 8
-// CHECK: %[[OFFLOADBASEPTRS:.*]] = getelementptr inbounds [12 x ptr], ptr %.offload_baseptrs, i32 0, i32 10
+// CHECK: %[[OFFLOADBASEPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_baseptrs, i32 0, i32 12
+// CHECK: store ptr %[[SCALAR_ALLOCA]], ptr %[[OFFLOADBASEPTRS]], align 8
+// CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_ptrs, i32 0, i32 12
+// CHECK: store ptr %[[SCALAR_BASE_OFF]], ptr %[[OFFLOADPTRS]], align 8
+// CHECK: %[[OFFLOADSIZES:.*]] = getelementptr inbounds [15 x i64], ptr %.offload_sizes, i32 0, i32 12
+// CHECK: store i64 %[[SCALAR_BASE_OFF_SZ4]], ptr %[[OFFLOADSIZES]], align 8
+// CHECK: %[[OFFLOADBASEPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_baseptrs, i32 0, i32 13
 // CHECK: store ptr %[[SCALAR_ALLOCA]], ptr %[[OFFLOADBASEPTRS]], align 8
-// CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [12 x ptr], ptr %.offload_ptrs, i32 0, i32 10
+// CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_ptrs, i32 0, i32 13
 // CHECK: store ptr %[[SCALAR_BASE]], ptr %[[OFFLOADPTRS]], align 8
-// CHECK: %[[OFFLOADBASEPTRS:.*]] = getelementptr inbounds [12 x ptr], ptr %.offload_baseptrs, i32 0, i32 11
+// CHECK: %[[OFFLOADBASEPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_baseptrs, i32 0, i32 14
 // CHECK: store ptr %[[SCALAR_BASE]], ptr %[[OFFLOADBASEPTRS]], align 8
-// CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [12 x ptr], ptr %.offload_ptrs, i32 0, i32 11
+// CHECK: %[[OFFLOADPTRS:.*]] = getelementptr inbounds [15 x ptr], ptr %.offload_ptrs, i32 0, i32 14
 // CHECK: store ptr %[[SCALAR_PTR_LOAD]], ptr %[[OFFLOADPTRS]], align 8
diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_fallback.c b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_fallback.c
new file mode 100644
index 0000000000000..4b67a3bc2aa7f
--- /dev/null
+++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_arrsec_fallback.c
@@ -0,0 +1,24 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// Test that when a use_device_addr lookup fails, the
+// list-item retains its original address by default.
+//
+// This is necessary because we must assume that the
+// list-item is device-accessible, even if it was not
+// previously mapped.
+
+// XFAIL: *
+
+#include <stdio.h>
+int h[10];
+int *ph = &h[0];
+
+void f1() {
+  printf("%p\n", &h[2]); // CHECK:      0x[[#%x,ADDR:]]
+#pragma omp target data use_device_addr(h[2])
+  printf("%p\n", &h[2]); // CHECK-NEXT: 0x{{0*}}[[#ADDR]]
+#pragma omp target data use_device_addr(ph[2])
+  printf("%p\n", &ph[2]); // CHECK-NEXT: 0x{{0*}}[[#ADDR]]
+}
+
+int main() { f1(); }
diff --git a/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_fallback.c b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_fallback.c
new file mode 100644
index 0000000000000..4495a46b6d204
--- /dev/null
+++ b/offload/test/mapping/use_device_addr/target_data_use_device_addr_var_fallback.c
@@ -0,0 +1,21 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// Test that when a use_device_addr lookup fails, the
+// list-item retains its original address by default.
+//
+// This is necessary because we must assume that the
+// list-item is device-accessible, even if it was not
+// previously mapped.
+
+// XFAIL: *
+
+#include <stdio.h>
+int x;
+
+void f1() {
+  printf("%p\n", &x); // CHECK:      0x[[#%x,ADDR:]]
+#pragma omp target data use_device_addr(x)
+  printf("%p\n", &x); // CHECK-NEXT: 0x{{0*}}[[#ADDR]]
+}
+
+int main() { f1(); }
diff --git a/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_var_fallback.c b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_var_fallback.c
new file mode 100644
index 0000000000000..e8fa3b69e9296
--- /dev/null
+++ b/offload/test/mapping/use_device_ptr/target_data_use_device_ptr_var_fallback.c
@@ -0,0 +1,32 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+
+// Test that when a use_device_ptr lookup fails, the
+// privatized pointer retains its original value by
+// default.
+//
+// This is necessary because we must assume that the
+// pointee is device-accessible, even if it was not
+// previously mapped.
+//
+// OpenMP 5.1, sec 2.14.2, target data construct, p 188, l26-31:
+// If a list item that appears in a use_device_ptr clause ... does not point to
+// a mapped object, it must contain a valid device address for the target
+// device, and the list item references are instead converted to references to a
+// local device pointer that refers to this device address.
+//
+// Note: OpenMP 6.1 will have a way to change the
+// fallback behavior: preserve or nullify.
+
+// XFAIL: *
+
+#include <stdio.h>
+int x;
+int *xp = &x;
+
+void f1() {
+  printf("%p\n", xp); // CHECK:      0x[[#%x,ADDR:]]
+#pragma omp target data use_device_ptr(xp)
+  printf("%p\n", xp); // CHECK-NEXT: 0x{{0*}}[[#ADDR]]
+}
+
+int main() { f1(); }
diff --git a/offload/test/offloading/fortran/dtype-member-overlap-map.f90 b/offload/test/offloading/fortran/dtype-member-overlap-map.f90
new file mode 100644
index 0000000000000..e45701441a0f7
--- /dev/null
+++ b/offload/test/offloading/fortran/dtype-member-overlap-map.f90
@@ -0,0 +1,56 @@
+! Basic offloading test checking the interaction of an overlapping
+! member map.
+! REQUIRES: flang, amdgpu
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+program main
+    implicit none
+    integer :: i
+
+    type dtype2
+        integer :: int
+        real :: float
+    end type dtype2
+
+    type dtype1
+        character (LEN=30) :: characters
+        type(dtype2) :: internal_dtype2
+    end type dtype1
+
+    type dtype
+        integer :: elements(10)
+        type(dtype1) :: internal_dtype
+        integer :: value
+    end type dtype
+
+    type (dtype) :: single_dtype
+
+    do i = 1, 10
+      single_dtype%elements(i) = 0
+    end do
+
+  !$omp target map(tofrom: single_dtype%internal_dtype, single_dtype%internal_dtype%internal_dtype2%int)
+    single_dtype%internal_dtype%internal_dtype2%int = 123
+    single_dtype%internal_dtype%characters(1:1) = "Z"
+  !$omp end target
+
+  !$omp target map(to: single_dtype) map(tofrom: single_dtype%internal_dtype%internal_dtype2, single_dtype%value)
+    single_dtype%value = 20
+    do i = 1, 10
+      single_dtype%elements(i) = i
+    end do
+    single_dtype%internal_dtype%internal_dtype2%float = 32.0
+  !$omp end target
+
+  print *, single_dtype%value
+  print *, single_dtype%internal_dtype%internal_dtype2%float
+  print *, single_dtype%elements
+  print *, single_dtype%internal_dtype%internal_dtype2%int
+  print *, single_dtype%internal_dtype%characters(1:1)
+end program main
+
+! CHECK: 20
+! CHECK: 32.
+! CHECK: 0 0 0 0 0 0 0 0 0 0
+! CHECK: 123
+! CHECK: Z
diff --git a/offload/test/offloading/fortran/target-custom-reduction-derivedtype.f90 b/offload/test/offloading/fortran/target-custom-reduction-derivedtype.f90
new file mode 100644
index 0000000000000..cc390cf0881f3
--- /dev/null
+++ b/offload/test/offloading/fortran/target-custom-reduction-derivedtype.f90
@@ -0,0 +1,88 @@
+! Basic offloading test with custom OpenMP reduction on derived type
+! REQUIRES: flang, amdgpu
+!
+! RUN: %libomptarget-compile-fortran-generic
+! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
+module maxtype_mod
+  implicit none
+
+  type maxtype
+     integer::sumval
+     integer::maxval
+  end type maxtype
+
+contains
+
+  subroutine initme(x,n)
+    type(maxtype) :: x,n
+    x%sumval=0
+    x%maxval=0
+  end subroutine initme
+
+  function mycombine(lhs, rhs)
+    type(maxtype) :: lhs, rhs
+    type(maxtype) :: mycombine
+    mycombine%sumval = lhs%sumval + rhs%sumval
+    mycombine%maxval = max(lhs%maxval, rhs%maxval)
+  end function mycombine
+
+end module maxtype_mod
+
+program main
+  use maxtype_mod
+  implicit none
+
+  integer :: n = 100
+  integer :: i
+  integer :: error = 0
+  type(maxtype) :: x(100)
+  type(maxtype) :: res
+  integer :: expected_sum, expected_max
+
+!$omp declare reduction(red_add_max:maxtype:omp_out=mycombine(omp_out,omp_in)) initializer(initme(omp_priv,omp_orig))
+
+  ! Initialize array with test data
+  do i = 1, n
+    x(i)%sumval = i
+    x(i)%maxval = i
+  end do
+
+  ! Initialize reduction variable
+  res%sumval = 0
+  res%maxval = 0
+
+  ! Perform reduction in target region
+  !$omp target parallel do map(to:x) reduction(red_add_max:res)
+  do i = 1, n
+    res = mycombine(res, x(i))
+  end do
+  !$omp end target parallel do
+
+  ! Compute expected values
+  expected_sum = 0
+  expected_max = 0
+  do i = 1, n
+    expected_sum = expected_sum + i
+    expected_max = max(expected_max, i)
+  end do
+
+  ! Check results
+  if (res%sumval /= expected_sum) then
+    error = 1
+  endif
+
+  if (res%maxval /= expected_max) then
+    error = 1
+  endif
+
+  if (error == 0) then
+    print *,"PASSED"
+  else
+    print *,"FAILED"
+  endif
+
+end program main
+
+! CHECK:  "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}}
+! CHECK:  PASSED
+
diff --git a/utils/bazel/MODULE.bazel b/utils/bazel/MODULE.bazel
index 1a8327c33d246..cf760ba24cb4c 100644
--- a/utils/bazel/MODULE.bazel
+++ b/utils/bazel/MODULE.bazel
@@ -15,6 +15,7 @@ bazel_dep(name = "rules_cc", version = "0.2.11")
 bazel_dep(name = "rules_foreign_cc", version = "0.15.1")
 bazel_dep(name = "rules_python", version = "1.6.3")
 bazel_dep(name = "rules_shell", version = "0.6.1")
+bazel_dep(name = "zstd", version = "1.5.7", repo_name = "llvm_zstd")
 
 llvm_repos_extension = use_extension(":extensions.bzl", "llvm_repos_extension")
 use_repo(
@@ -22,7 +23,6 @@ use_repo(
     "gmp",
     "llvm-raw",
     "llvm_zlib",
-    "llvm_zstd",
     "mpc",
     "mpfr",
     "nanobind",
diff --git a/utils/bazel/MODULE.bazel.lock b/utils/bazel/MODULE.bazel.lock
index 3b196231c6723..c923f3aaea68d 100644
--- a/utils/bazel/MODULE.bazel.lock
+++ b/utils/bazel/MODULE.bazel.lock
@@ -224,13 +224,15 @@
     "https://bcr.bazel.build/modules/zlib/1.2.12/MODULE.bazel": "3b1a8834ada2a883674be8cbd36ede1b6ec481477ada359cd2d3ddc562340b27",
     "https://bcr.bazel.build/modules/zlib/1.3.1.bcr.5/MODULE.bazel": "eec517b5bbe5492629466e11dae908d043364302283de25581e3eb944326c4ca",
     "https://bcr.bazel.build/modules/zlib/1.3.1.bcr.5/source.json": "22bc55c47af97246cfc093d0acf683a7869377de362b5d1c552c2c2e16b7a806",
-    "https://bcr.bazel.build/modules/zlib/1.3.1/MODULE.bazel": "751c9940dcfe869f5f7274e1295422a34623555916eb98c174c1e945594bf198"
+    "https://bcr.bazel.build/modules/zlib/1.3.1/MODULE.bazel": "751c9940dcfe869f5f7274e1295422a34623555916eb98c174c1e945594bf198",
+    "https://bcr.bazel.build/modules/zstd/1.5.7/MODULE.bazel": "f5780cdbd6f4c5bb985a20f839844316fe48fb5e463056f372dbc37cfabdf450",
+    "https://bcr.bazel.build/modules/zstd/1.5.7/source.json": "f72c48184b6528ffc908a5a2bcbf3070c6684f3db03da2182c8ca999ae5f5cfd"
   },
   "selectedYankedVersions": {},
   "moduleExtensions": {
     "//:extensions.bzl%llvm_repos_extension": {
       "general": {
-        "bzlTransitiveDigest": "ojj7cD2YU2vcH58jVPVj2juUYn5SvdSNj1pmWb8Xo/k=",
+        "bzlTransitiveDigest": "05R8ZuqDbhn1LOyXHQzta+x0dI9dEY6RIu21atUo+Kw=",
         "usagesDigest": "X0yUkkWyxQ2Y5oZVDkRSE/K4YkDWo1IjhHsL+1weKyU=",
         "recordedFileInputs": {},
         "recordedDirentsInputs": {},
@@ -314,17 +316,6 @@
               "build_file": "@@+llvm_repos_extension+llvm-raw//utils/bazel/third_party_build:pfm.BUILD"
             }
           },
-          "llvm_zstd": {
-            "repoRuleId": "@@bazel_tools//tools/build_defs/repo:http.bzl%http_archive",
-            "attributes": {
-              "build_file": "@@+llvm_repos_extension+llvm-raw//utils/bazel/third_party_build:zstd.BUILD",
-              "sha256": "7c42d56fac126929a6a85dbc73ff1db2411d04f104fae9bdea51305663a83fd0",
-              "strip_prefix": "zstd-1.5.2",
-              "urls": [
-                "https://github.com/facebook/zstd/releases/download/v1.5.2/zstd-1.5.2.tar.gz"
-              ]
-            }
-          },
           "pybind11": {
             "repoRuleId": "@@bazel_tools//tools/build_defs/repo:http.bzl%http_archive",
             "attributes": {
diff --git a/utils/bazel/extensions.bzl b/utils/bazel/extensions.bzl
index bb5ce1955f916..e57046530aa89 100644
--- a/utils/bazel/extensions.bzl
+++ b/utils/bazel/extensions.bzl
@@ -79,16 +79,6 @@ def _llvm_repos_extension_impl(module_ctx):
         build_file = "@llvm-raw//utils/bazel/third_party_build:pfm.BUILD",
     )
 
-    http_archive(
-        name = "llvm_zstd",
-        build_file = "@llvm-raw//utils/bazel/third_party_build:zstd.BUILD",
-        sha256 = "7c42d56fac126929a6a85dbc73ff1db2411d04f104fae9bdea51305663a83fd0",
-        strip_prefix = "zstd-1.5.2",
-        urls = [
-            "https://github.com/facebook/zstd/releases/download/v1.5.2/zstd-1.5.2.tar.gz",
-        ],
-    )
-
     http_archive(
         name = "pybind11",
         url = "https://github.com/pybind/pybind11/archive/v2.10.3.zip",
diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
index 020b2aa68a357..4ac3e75b9c1ce 100644
--- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
@@ -1563,7 +1563,6 @@ cc_library(
         ":basic",
         ":config",
         ":driver_options_inc_gen",
-        ":frontend",
         ":lex",
         ":options",
         ":parse",
@@ -1719,6 +1718,7 @@ cc_library(
         ":ast",
         ":basic",
         ":config",
+        ":driver",
         ":driver_options_inc_gen",
         ":edit",
         ":lex",
diff --git a/utils/bazel/llvm-project-overlay/lld/BUILD.bazel b/utils/bazel/llvm-project-overlay/lld/BUILD.bazel
index 162c4f955d150..8a03266da6327 100644
--- a/utils/bazel/llvm-project-overlay/lld/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/lld/BUILD.bazel
@@ -105,8 +105,8 @@ cc_library(
         "//llvm:TargetParser",
         "//llvm:TransformUtils",
         "//llvm:config",
+        "//third-party:zstd",
         "@llvm_zlib//:zlib",
-        "@llvm_zstd//:zstd",
     ],
 )
 
diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index f67e4ea29b51f..b59e6ea973f14 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -331,7 +331,7 @@ cc_library(
         # We unconditionally depend on the custom LLVM zstd wrapper. This will
         # be an empty library unless zstd is enabled, in which case it will
         # both provide the necessary dependencies and configuration defines.
-        "@llvm_zstd//:zstd",
+        "//third-party:zstd",
     ],
 )
 
diff --git a/utils/bazel/llvm-project-overlay/third-party/BUILD.bazel b/utils/bazel/llvm-project-overlay/third-party/BUILD.bazel
new file mode 100644
index 0000000000000..bf780b5f3c0bf
--- /dev/null
+++ b/utils/bazel/llvm-project-overlay/third-party/BUILD.bazel
@@ -0,0 +1,34 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+load("@bazel_skylib//rules:common_settings.bzl", "bool_flag")
+load(":cc_library_wrapper.bzl", "cc_library_wrapper")
+
+package(default_visibility = ["//visibility:public"])
+
+bool_flag(
+    name = "llvm_enable_zstd",  # Build setting that toggles zstd support.
+    build_setting_default = True,  # zstd is enabled unless the flag is set to false.
+)
+
+config_setting(
+    name = "llvm_zstd_enabled",  # Matches when :llvm_enable_zstd is true.
+    flag_values = {":llvm_enable_zstd": "true"},
+)
+
+cc_library_wrapper(
+    name = "zstd",  # Re-exports @llvm_zstd//:zstd; carries nothing when zstd is disabled.
+    defines = select({
+        ":llvm_zstd_enabled": [
+            "LLVM_ENABLE_ZSTD=1",
+            "ZSTD_MULTITHREAD",
+        ],
+        "//conditions:default": [],  # zstd disabled: add no defines.
+    }),
+    deps = select({
+        ":llvm_zstd_enabled": [
+            "@llvm_zstd//:zstd",
+        ],
+        "//conditions:default": [],  # zstd disabled: re-export nothing.
+    }),
+)
diff --git a/utils/bazel/llvm-project-overlay/third-party/cc_library_wrapper.bzl b/utils/bazel/llvm-project-overlay/third-party/cc_library_wrapper.bzl
new file mode 100644
index 0000000000000..b484a611571e3
--- /dev/null
+++ b/utils/bazel/llvm-project-overlay/third-party/cc_library_wrapper.bzl
@@ -0,0 +1,50 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+"""Re-export a cc_library with added LLVM specific settings.
+
+This re-exports the dependent libraries in a way that satisfies layering_check, e.g.:
+
+cc_library_wrapper(
+    name = "library_wrapper",
+    deps = [
+        "@example//:library",
+    ],
+    defines = [
+        "LLVM_ENABLE_EXAMPLE=1",
+    ],
+)
+"""
+
+load("@rules_cc//cc/common:cc_common.bzl", "cc_common")
+load("@rules_cc//cc/common:cc_info.bzl", "CcInfo")
+
+visibility("private")
+
+def _cc_library_wrapper_impl(ctx):
+    """Merges the CcInfo of each dep, plus one holding `defines`, into a single CcInfo."""
+    merged_inputs = []
+    for dep in ctx.attr.deps:
+        merged_inputs.append(dep[CcInfo])
+    if ctx.attr.defines:
+        extra_context = cc_common.create_compilation_context(defines = depset(ctx.attr.defines))
+        merged_inputs.append(CcInfo(compilation_context = extra_context))
+
+    return cc_common.merge_cc_infos(direct_cc_infos = merged_inputs)
+
+cc_library_wrapper = rule(
+    doc = "Re-export a cc_library with added LLVM specific settings.",
+    implementation = _cc_library_wrapper_impl,
+    provides = [CcInfo],  # Bazel errors out if the implementation does not return a CcInfo.
+    attrs = {
+        "defines": attr.string_list(
+            default = [],
+            doc = "Additional preprocessor definitions to add to all dependent targets.",
+        ),
+        "deps": attr.label_list(
+            providers = [CcInfo],
+            doc = "Dependencies to cc_library targets to re-export.",
+        ),
+    },
+)
diff --git a/utils/bazel/third_party_build/zstd.BUILD b/utils/bazel/third_party_build/zstd.BUILD
index 7d022d4226de1..89da165225ad7 100644
--- a/utils/bazel/third_party_build/zstd.BUILD
+++ b/utils/bazel/third_party_build/zstd.BUILD
@@ -1,7 +1,7 @@
 # This file is licensed under the Apache License v2.0 with LLVM Exceptions.
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-load("@bazel_skylib//rules:common_settings.bzl", "bool_flag")
+load("@rules_cc//cc:cc_library.bzl", "cc_library")
 
 package(
     default_visibility = ["//visibility:public"],
@@ -9,20 +9,10 @@ package(
     licenses = ["notice"],
 )
 
-bool_flag(
-    name = "llvm_enable_zstd",
-    build_setting_default = True,
-)
-
-config_setting(
-    name = "llvm_zstd_enabled",
-    flag_values = {":llvm_enable_zstd": "true"},
-)
-
 cc_library(
     name = "zstd",
     srcs = select({
-        ":llvm_zstd_enabled": glob([
+        "@llvm-project//third-party:llvm_zstd_enabled": glob([
             "lib/common/*.c",
             "lib/common/*.h",
             "lib/compress/*.c",
@@ -36,7 +26,7 @@ cc_library(
         "//conditions:default": [],
     }),
     hdrs = select({
-        ":llvm_zstd_enabled": [
+        "@llvm-project//third-party:llvm_zstd_enabled": [
             "lib/zdict.h",
             "lib/zstd.h",
             "lib/zstd_errors.h",
@@ -44,7 +34,7 @@ cc_library(
         "//conditions:default": [],
     }),
     defines = select({
-        ":llvm_zstd_enabled": [
+        "@llvm-project//third-party:llvm_zstd_enabled": [
             "LLVM_ENABLE_ZSTD=1",
             "ZSTD_MULTITHREAD",
         ],