From e69d2bf6031a534c824d1ce4191f5fc334ab4ae6 Mon Sep 17 00:00:00 2001 From: Michael Kruse Date: Mon, 17 Nov 2025 17:05:18 +0100 Subject: [PATCH 01/67] [OpenMP][omptest] Fix missing source extension The file extension was accidentally omitted from #164794. --- openmp/tools/omptest/test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openmp/tools/omptest/test/CMakeLists.txt b/openmp/tools/omptest/test/CMakeLists.txt index 1e07a1044f7d6..2b4aa78b0bc16 100644 --- a/openmp/tools/omptest/test/CMakeLists.txt +++ b/openmp/tools/omptest/test/CMakeLists.txt @@ -9,7 +9,7 @@ set(UNITTEST_SOURCES unittests/asserter-seq-test.cpp unittests/internal-event-eq-test.cpp unittests/internal-event-tostring-test.cpp - unittests/internal-util-test + unittests/internal-util-test.cpp unittests/main-test.cpp ) add_executable(omptest-unittests ${UNITTEST_SOURCES}) From a9633aac31cb1ec42153fb3ada815aa1572eb58f Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Mon, 17 Nov 2025 17:31:55 +0100 Subject: [PATCH 02/67] [libc++] Fix __hash_table::erase(iterator, iterator) to update the bucket list correctly when erasing the last bucket (#167865) Fixes #167820 --- libcxx/include/__hash_table | 2 ++ .../unord.map.modifiers/erase_range.pass.cpp | 22 +++++++++++++++++++ .../erase_range.pass.cpp | 22 +++++++++++++++++++ .../unord/unord.multiset/erase_range.pass.cpp | 22 +++++++++++++++++++ .../unord/unord.set/erase_range.pass.cpp | 22 +++++++++++++++++++ 5 files changed, 90 insertions(+) diff --git a/libcxx/include/__hash_table b/libcxx/include/__hash_table index e1897949a47e6..ef487fb06dd5e 100644 --- a/libcxx/include/__hash_table +++ b/libcxx/include/__hash_table @@ -1910,6 +1910,8 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::erase(const_iterator __first, const_it __bucket_list_[__next_chash] = __before_first; __chash = __next_chash; } + } else { // When __next is a nullptr we've fully erased the last bucket. Update the bucket list accordingly.
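// (Editorial sketch, not part of the diff: assuming the usual libc++ invariant that __bucket_list_[h] points at the node *preceding* bucket h's first element, the store below clears that entry once the trailing bucket is fully erased; without it, a later insert hashing into this bucket would walk a dangling pointer — the crash reported in #167820.)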
+ __bucket_list_[__chash] = nullptr; } } diff --git a/libcxx/test/std/containers/unord/unord.map/unord.map.modifiers/erase_range.pass.cpp b/libcxx/test/std/containers/unord/unord.map/unord.map.modifiers/erase_range.pass.cpp index 532413437f6be..81371638143c9 100644 --- a/libcxx/test/std/containers/unord/unord.map/unord.map.modifiers/erase_range.pass.cpp +++ b/libcxx/test/std/containers/unord/unord.map/unord.map.modifiers/erase_range.pass.cpp @@ -57,6 +57,28 @@ int main(int, char**) { assert(c.size() == 0); assert(k == c.end()); } + { // Make sure that we're properly updating the bucket list when we're erasing to the end + std::unordered_map<int, int> m; + m.insert(std::make_pair(1, 1)); + m.insert(std::make_pair(2, 2)); + + { + auto pair = m.equal_range(1); + assert(pair.first != pair.second); + m.erase(pair.first, pair.second); + } + + { + auto pair = m.equal_range(2); + assert(pair.first != pair.second); + m.erase(pair.first, pair.second); + } + + m.insert(std::make_pair(3, 3)); + assert(m.size() == 1); + assert(*m.begin() == std::make_pair(3, 3)); + assert(++m.begin() == m.end()); + } #if TEST_STD_VER >= 11 { typedef std::unordered_map<int, int, std::hash<int>, std::equal_to<int>, min_allocator<std::pair<const int, int>>> C; diff --git a/libcxx/test/std/containers/unord/unord.multimap/unord.multimap.modifiers/erase_range.pass.cpp b/libcxx/test/std/containers/unord/unord.multimap/unord.multimap.modifiers/erase_range.pass.cpp + { // Make sure that we're properly updating the bucket list when we're erasing to the end + std::unordered_multimap<int, int> m; + m.insert(std::make_pair(1, 1)); + m.insert(std::make_pair(2, 2)); + + { + auto pair = m.equal_range(1); + assert(pair.first != pair.second); + m.erase(pair.first, pair.second); + } + + { + auto pair = m.equal_range(2); + assert(pair.first != pair.second); + m.erase(pair.first, pair.second); + } + + m.insert(std::make_pair(3, 3)); + assert(m.size() == 1); + assert(*m.begin() == std::make_pair(3, 3)); + assert(++m.begin() == m.end()); + } #if TEST_STD_VER >= 11 { typedef std::unordered_multimap<int, int, std::hash<int>, std::equal_to<int>, min_allocator<std::pair<const int, int>>> C; diff --git a/libcxx/test/std/containers/unord/unord.multiset/erase_range.pass.cpp b/libcxx/test/std/containers/unord/unord.multiset/erase_range.pass.cpp + { // Make sure that we're properly updating the bucket list when we're erasing to the end + std::unordered_multiset<int> m; + m.insert(1); + m.insert(2); + + { + auto pair = m.equal_range(1); + assert(pair.first != pair.second); + m.erase(pair.first, pair.second); + } + + { + auto pair = m.equal_range(2); + assert(pair.first != pair.second); + m.erase(pair.first, pair.second); + } + + m.insert(3); + assert(m.size() == 1); + assert(*m.begin() == 3); + assert(++m.begin() == m.end()); + } #if TEST_STD_VER >= 11 { typedef std::unordered_multiset<int, std::hash<int>, std::equal_to<int>, min_allocator<int>> C; diff --git a/libcxx/test/std/containers/unord/unord.set/erase_range.pass.cpp b/libcxx/test/std/containers/unord/unord.set/erase_range.pass.cpp index 5fa6e4199f756..1f049a295b8c3 100644 --- a/libcxx/test/std/containers/unord/unord.set/erase_range.pass.cpp +++ b/libcxx/test/std/containers/unord/unord.set/erase_range.pass.cpp @@ -47,6 +47,28 @@ int main(int, char**) { assert(c.size() == 0); assert(k == c.end()); } + { // Make sure that we're properly updating the bucket list when we're erasing to the end + std::unordered_set<int> m; + m.insert(1); + m.insert(2); + + { + auto pair = m.equal_range(1); + assert(pair.first != pair.second); + m.erase(pair.first, pair.second); + } + + { + auto pair = m.equal_range(2); + assert(pair.first != pair.second); + m.erase(pair.first, pair.second); + } + + m.insert(3); + assert(m.size() == 1); + assert(*m.begin() == 3); + assert(++m.begin() == m.end()); + } #if TEST_STD_VER >= 11 { typedef std::unordered_set<int, std::hash<int>, std::equal_to<int>, min_allocator<int>> C; From 7659cd42578c59d1bef1313053d493171b9146a2 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Mon, 17 Nov 2025 16:36:42 +0000 Subject: [PATCH 03/67] [VectorUtils] Use PatternMatch in findScalarElement (NFC) (#168389) --- llvm/lib/Analysis/VectorUtils.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 530fa9518f40e..a3e9b039f9225 100644 ---
a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -317,9 +317,9 @@ Value *llvm::findScalarElement(Value *V, unsigned EltNo) { if (InsertElementInst *III = dyn_cast<InsertElementInst>(V)) { // If this is an insert to a variable element, we don't know what it is. - if (!isa<ConstantInt>(III->getOperand(2))) + uint64_t IIElt; + if (!match(III->getOperand(2), m_ConstantInt(IIElt))) return nullptr; - unsigned IIElt = cast<ConstantInt>(III->getOperand(2))->getZExtValue(); // If this is an insert to the element we are looking for, return the // inserted value. From 4dd27960706cf2681b72cc2cf7cd8ccbcf0f4f9d Mon Sep 17 00:00:00 2001 From: vangthao95 Date: Mon, 17 Nov 2025 08:45:49 -0800 Subject: [PATCH 04/67] [AMDGPU][GlobalISel] Add RegBankLegalize support for G_FMUL (#167847) --- .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 2 +- llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.ll | 165 ++++++++++++++++++ .../CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll | 2 + .../AMDGPU/GlobalISel/regbankselect-fmul.mir | 5 +- 4 files changed, 171 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 90114e44f1a48..b81a08de383d9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -935,7 +935,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, bool hasSALUFloat = ST->hasSALUFloatInsts(); - addRulesForGOpcs({G_FADD}, Standard) + addRulesForGOpcs({G_FADD, G_FMUL}, Standard) .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat) .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat) .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.ll new file mode 100644 index 0000000000000..84ac58f899717 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.ll @@ -0,0 +1,165 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-FAKE16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-TRUE16 %s + +define amdgpu_ps half @fmul_s16_uniform(half inreg %a, half inreg %b) { +; GFX11-FAKE16-LABEL: fmul_s16_uniform: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_mul_f16_e64 v0, s0, s1 +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: fmul_s16_uniform: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.l, s0, s1 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fmul_s16_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mul_f16 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog + %result = fmul half %a, %b + ret half %result +} + +define amdgpu_ps half @fmul_s16_div(half %a, half %b) { +; GFX11-FAKE16-LABEL: fmul_s16_div: +;
GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: fmul_s16_div: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-FAKE16-LABEL: fmul_s16_div: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX12-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: fmul_s16_div: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l +; GFX12-TRUE16-NEXT: ; return to shader part epilog + %result = fmul half %a, %b + ret half %result +} + +define amdgpu_ps float @fmul_s32_uniform(float inreg %a, float inreg %b) { +; GFX11-LABEL: fmul_s32_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mul_f32_e64 v0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fmul_s32_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mul_f32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog + %result = fmul float %a, %b + ret float %result +} + +define amdgpu_ps float @fmul_s32_div(float %a, float %b) { +; GCN-LABEL: fmul_s32_div: +; GCN: ; %bb.0: +; GCN-NEXT: v_mul_f32_e32 v0, v0, v1 +; GCN-NEXT: ; return to shader part epilog + %result = fmul float %a, %b + ret float %result +} + +define amdgpu_ps void @fmul_s64_uniform(double inreg %a, double inreg %b, ptr addrspace(1) %ptr) { +; GFX11-LABEL: fmul_s64_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mul_f64 v[2:3], s[0:1], s[2:3] +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fmul_s64_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mul_f64_e64 v[2:3], s[0:1], s[2:3] +; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-NEXT: s_endpgm + %result = fmul double %a, %b + store double %result, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_ps void @fmul_s64_div(double %a, double %b, ptr addrspace(1) %ptr) { +; GFX11-LABEL: fmul_s64_div: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: global_store_b64 v[4:5], v[0:1], off +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: fmul_s64_div: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mul_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: global_store_b64 v[4:5], v[0:1], off +; GFX12-NEXT: s_endpgm + %result = fmul double %a, %b + store double %result, ptr addrspace(1) %ptr + ret void +} + +define amdgpu_ps <2 x half> @fmul_v2s16_uniform(<2 x half> inreg %a, <2 x half> inreg %b) { +; GFX11-LABEL: fmul_v2s16_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_pk_mul_f16 v0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fmul_v2s16_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_lshr_b32 s2, s0, 16 +; GFX12-NEXT: s_lshr_b32 s3, s1, 16 +; GFX12-NEXT: s_mul_f16 s0, s0, s1 +; GFX12-NEXT: s_mul_f16 s1, s2, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog + %result = fmul <2 x half> %a, %b + ret <2 x half> %result +} + +define amdgpu_ps <2 x half> @fmul_v2s16_div(<2 x half> %a, <2 x half> %b) { +; GCN-LABEL: fmul_v2s16_div: +; GCN: ; %bb.0: +; GCN-NEXT: v_pk_mul_f16 v0, v0, v1 +; GCN-NEXT: ; return to shader part epilog + %result = fmul <2 x half> %a, %b + ret <2 x half> %result +} + +define amdgpu_ps <2 x float> @fmul_v2s32_uniform(<2 x float> 
inreg %a, <2 x float> inreg %b) { +; GFX11-LABEL: fmul_v2s32_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_mul_f32_e64 v0, s0, s2 +; GFX11-NEXT: v_mul_f32_e64 v1, s1, s3 +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fmul_v2s32_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_mul_f32 s0, s0, s2 +; GFX12-NEXT: s_mul_f32 s1, s1, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: ; return to shader part epilog + %result = fmul <2 x float> %a, %b + ret <2 x float> %result +} + +define amdgpu_ps <2 x float> @fmul_v2s32_div(<2 x float> %a, <2 x float> %b) { +; GCN-LABEL: fmul_v2s32_div: +; GCN: ; %bb.0: +; GCN-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_mul_f32 v1, v1, v3 +; GCN-NEXT: ; return to shader part epilog + %result = fmul <2 x float> %a, %b + ret <2 x float> %result +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll index e03aa18d3147f..1220c0e3b1ead 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll @@ -4,6 +4,8 @@ ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX10 %s +; TODO: Switch test to use -new-reg-bank-select after adding G_FNEG support. + define <2 x half> @v_fmul_v2f16(<2 x half> %a, <2 x half> %b) { ; GFX9-LABEL: v_fmul_v2f16: ; GFX9: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fmul.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fmul.mir index 5766c05426b2d..f289566a27c12 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fmul.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fmul.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s -# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s --- name: fmul_ss @@ -17,6 +17,7 @@ body: | ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32) ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[COPY1]](s32) ; CHECK-NEXT: [[FMUL:%[0-9]+]]:vgpr(s32) = G_FMUL [[COPY2]], [[COPY3]] + ; CHECK-NEXT: [[AMDGPU_READANYLANE:%[0-9]+]]:sgpr(s32) = G_AMDGPU_READANYLANE [[FMUL]] %0:_(s32) = COPY $sgpr0 %1:_(s32) = COPY $sgpr1 %2:_(s32) = G_FMUL %0, %1 From d163988dd2833f28fbca8c144265108d25ae7bd2 Mon Sep 17 00:00:00 2001 From: Durgadoss R Date: Mon, 17 Nov 2025 22:18:14 +0530 Subject: [PATCH 05/67] [MLIR][NVVM][NFC] Re-order mem_scope and shared_space attrs (#168348) The mbarrier Ops also require access to the `mem_scope` and `shared_space` attributes. Hence, this patch moves their definitions to the beginning of the file alongside the other attribute definitions. 
Signed-off-by: Durgadoss R --- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 53 +++++++++++---------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index d4ef5104d3c1f..456d816205b58 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -228,6 +228,33 @@ def NVVMMemorySpaceAttr : let assemblyFormat = "`<` $value `>`"; } +// Attrs describing the scope of the Memory Operation +def MemScopeKindCTA : I32EnumAttrCase<"CTA", 0, "cta">; +def MemScopeKindCluster : I32EnumAttrCase<"CLUSTER", 1, "cluster">; +def MemScopeKindGPU : I32EnumAttrCase<"GPU", 2, "gpu">; +def MemScopeKindSYS : I32EnumAttrCase<"SYS", 3, "sys">; + +def MemScopeKind : I32EnumAttr<"MemScopeKind", "NVVM Memory Scope kind", + [MemScopeKindCTA, MemScopeKindCluster, MemScopeKindGPU, MemScopeKindSYS]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::NVVM"; +} +def MemScopeKindAttr : EnumAttr<MemScopeKind, NVVM_Dialect, "mem_scope"> { + let assemblyFormat = "`<` $value `>`"; +} + +// Attrs to disambiguate the cta or cluster space within shared memory +def SharedSpaceCTA : I32EnumAttrCase<"shared_cta", 0, "cta">; +def SharedSpaceCluster : I32EnumAttrCase<"shared_cluster", 1, "cluster">; +def SharedSpace : I32EnumAttr<"SharedSpace", "Shared memory space", + [SharedSpaceCTA, SharedSpaceCluster]> { + let genSpecializedAttr = 0; + let cppNamespace = "::mlir::NVVM"; +} +def SharedSpaceAttr : EnumAttr<SharedSpace, NVVM_Dialect, "shared_space"> { + let assemblyFormat = "`<` $value `>`"; +} + //===----------------------------------------------------------------------===// // NVVM intrinsic operations //===----------------------------------------------------------------------===// @@ -1107,17 +1134,6 @@ def NVVM_FenceScClusterOp : NVVM_Op<"fence.sc.cluster"> { let assemblyFormat = "attr-dict"; } -def SharedSpaceCTA : I32EnumAttrCase<"shared_cta", 0, "cta">; -def SharedSpaceCluster : I32EnumAttrCase<"shared_cluster", 1, "cluster">; -def SharedSpace : I32EnumAttr<"SharedSpace", "Shared memory space", - [SharedSpaceCTA, SharedSpaceCluster]> { - let genSpecializedAttr = 0; - let cppNamespace = "::mlir::NVVM"; -} -def SharedSpaceAttr : EnumAttr<SharedSpace, NVVM_Dialect, "shared_space"> { - let assemblyFormat = "`<` $value `>`"; -} - def ProxyAlias : I32EnumAttrCase<"alias", 0, "alias">; def ProxyAsync : I32EnumAttrCase<"async", 1, "async">; def ProxyAsyncGlobal : I32EnumAttrCase<"async_global", 2, "async.global">; @@ -1158,21 +1174,6 @@ def NVVM_FenceProxyOp : NVVM_PTXBuilder_Op<"fence.proxy">, let hasVerifier = 1; } -// Attrs describing the scope of the Memory Operation -def MemScopeKindCTA : I32EnumAttrCase<"CTA", 0, "cta">; -def MemScopeKindCluster : I32EnumAttrCase<"CLUSTER", 1, "cluster">; -def MemScopeKindGPU : I32EnumAttrCase<"GPU", 2, "gpu">; -def MemScopeKindSYS : I32EnumAttrCase<"SYS", 3, "sys">; - -def MemScopeKind : I32EnumAttr<"MemScopeKind", "NVVM Memory Scope kind", - [MemScopeKindCTA, MemScopeKindCluster, MemScopeKindGPU, MemScopeKindSYS]> { - let genSpecializedAttr = 0; - let cppNamespace = "::mlir::NVVM"; -} -def MemScopeKindAttr : EnumAttr<MemScopeKind, NVVM_Dialect, "mem_scope"> { - let assemblyFormat = "`<` $value `>`"; -} - def NVVM_FenceProxyAcquireOp : NVVM_Op<"fence.proxy.acquire">, Arguments<(ins MemScopeKindAttr:$scope, LLVM_PointerGeneric:$addr, I32:$size, DefaultValuedAttr Date: Mon, 17 Nov 2025 08:50:14 -0800 Subject: [PATCH 06/67] Remove shadowing "size" field from classes that inherit from SyntheticSection (#166323) A field named 'size' is already available and perfectly usable via inheritance from
InputSection, and these variables shadow it for no good reason. The only interesting change here is in PaddingSection, because a parent's field cannot be initialized via a constructor initializer list, setting it needs to be done inside the constructor body. --- lld/ELF/SyntheticSections.cpp | 6 +++--- lld/ELF/SyntheticSections.h | 13 +------------ 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index 9a70c0d19c41d..19b08152ae081 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -2747,9 +2747,9 @@ RelroPaddingSection::RelroPaddingSection(Ctx &ctx) : SyntheticSection(ctx, ".relro_padding", SHT_NOBITS, SHF_ALLOC | SHF_WRITE, 1) {} -PaddingSection::PaddingSection(Ctx &ctx, uint64_t size, OutputSection *parent) - : SyntheticSection(ctx, ".padding", SHT_PROGBITS, SHF_ALLOC, 1), - size(size) { +PaddingSection::PaddingSection(Ctx &ctx, uint64_t amount, OutputSection *parent) + : SyntheticSection(ctx, ".padding", SHT_PROGBITS, SHF_ALLOC, 1) { + size = amount; this->parent = parent; } diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h index 38e68110e4bc0..66c866d7e8cde 100644 --- a/lld/ELF/SyntheticSections.h +++ b/lld/ELF/SyntheticSections.h @@ -78,8 +78,6 @@ class EhFrameSection final : public SyntheticSection { // allocating one for each EhInputSection. llvm::DenseMap offsetToCie; - uint64_t size = 0; - template void addRecords(EhInputSection *s); template void iterateFDEWithLSDAAux(EhInputSection &sec, @@ -127,7 +125,6 @@ class GotSection final : public SyntheticSection { protected: size_t numEntries = 0; uint32_t tlsIndexOff = -1; - uint64_t size = 0; struct AuthEntryInfo { size_t offset; bool isSymbolFunc; @@ -182,7 +179,6 @@ class BssSection final : public SyntheticSection { static bool classof(const SectionBase *s) { return isa(s) && cast(s)->bss; } - uint64_t size; }; class MipsGotSection final : public SyntheticSection { @@ -312,8 +308,6 @@ class MipsGotSection final : public SyntheticSection { // Number of "Header" entries. static const unsigned headerEntriesNum = 2; - uint64_t size = 0; - // Symbol and addend. 
using GotEntry = std::pair; @@ -407,8 +401,6 @@ class StringTableSection final : public SyntheticSection { private: const bool dynamic; - uint64_t size = 0; - llvm::DenseMap stringMap; SmallVector strings; }; @@ -475,7 +467,6 @@ template class DynamicSection final : public SyntheticSection { private: std::vector> computeContents(); - uint64_t size = 0; }; class RelocationBaseSection : public SyntheticSection { @@ -780,10 +771,8 @@ class RelroPaddingSection final : public SyntheticSection { }; class PaddingSection final : public SyntheticSection { - uint64_t size; - public: - PaddingSection(Ctx &ctx, uint64_t size, OutputSection *parent); + PaddingSection(Ctx &ctx, uint64_t amount, OutputSection *parent); size_t getSize() const override { return size; } void writeTo(uint8_t *buf) override; }; From 35ae5157c0ab98a90231ff655b1a47d3f8a20d2b Mon Sep 17 00:00:00 2001 From: Guray Ozen Date: Mon, 17 Nov 2025 17:53:51 +0100 Subject: [PATCH 07/67] [MLIR][NVVM][Docs] Explain memory spaces (#168059) --- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 34 +++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 456d816205b58..6e3a92b5bde42 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -79,6 +79,40 @@ def NVVM_Dialect : Dialect { sequence must be expressed directly, NVVM provides an `nvvm.inline_ptx` op to embed PTX inline as a last-resort escape hatch, with explicit operands and results. + + + **Memory Spaces:** The NVVM dialect introduces the following memory spaces, + each with distinct scopes and lifetimes: +``` + | Memory Space | Address Space | Scope | Lifetime | + |-------------------|---------------|----------------------|-------------------| + | `generic` | 0 | All threads | Context-dependent | + | `global` | 1 | All threads (device) | Application | + | `shared` | 3 | Thread block (CTA) | Kernel execution | + | `constant` | 4 | All threads (RO) | Application | + | `local` | 5 | Single thread | Kernel execution | + | `tensor` | 6 | Thread block (CTA) | Kernel execution | + | `shared_cluster` | 7 | Thread block cluster | Kernel execution | +``` + **Memory Space Details:** + - **generic**: Can point to any memory space; requires runtime resolution of + actual address space. Use when pointer origin is unknown at compile time. + Performance varies based on the underlying memory space. + - **global**: Accessible by all threads across all blocks; persists across + kernel launches. Highest latency but largest capacity (device memory). Best + for large data and inter-kernel communication. + - **shared**: Shared within a thread block (CTA); very fast on-chip memory for + cooperation between threads in the same block. Limited capacity. Ideal for + block-level collaboration, caching, and reducing global memory traffic. + - **constant**: Read-only memory cached per SM. Size typically limited to + 64KB. Best for read-only data and uniform values accessed by all threads. + - **local**: Private to each thread. Use for per-thread private data and + automatic variables that don't fit in registers. + - **tensor**: Special memory space for tensor core operations. Used by + `tcgen05` instructions on SM 100+ for tensor input/output operations. + - **shared_cluster**: Distributed shared memory across thread blocks within + a cluster (SM 90+). Enables collaboration beyond single-block scope with + fast access across cluster threads. 
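As a rough cross-reference (an illustrative sketch, not part of this patch; it assumes the standard NVPTX lowering of CUDA storage classes onto these address spaces): ```c++ // CUDA C++ view of the NVVM memory spaces (illustrative only). __constant__ float lut[64]; // constant, addrspace(4) __device__ float scratch[1 << 20]; // global, addrspace(1) __global__ void kernel(float *p) { // p is generic, addrspace(0) __shared__ float tile[256]; // shared, addrspace(3) float tmp = p[threadIdx.x]; // per-thread local, addrspace(5) tile[threadIdx.x] = tmp; __syncthreads(); p[threadIdx.x] = tile[threadIdx.x] + lut[threadIdx.x % 64] + scratch[threadIdx.x]; } ```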
}]; let name = "nvvm"; From c66f1fdfb74802204afc425317062017d2487194 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 17 Nov 2025 09:00:15 -0800 Subject: [PATCH 08/67] [MC] Use MCRegister::id() to avoid implicit casts. NFC (#168233) --- llvm/include/llvm/MC/MCRegisterInfo.h | 6 +++--- llvm/lib/MC/MCInst.cpp | 2 +- llvm/lib/MC/MCRegisterInfo.cpp | 25 +++++++++++++------------ 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/llvm/include/llvm/MC/MCRegisterInfo.h b/llvm/include/llvm/MC/MCRegisterInfo.h index 6e36e580358e7..f4897b6a406fb 100644 --- a/llvm/include/llvm/MC/MCRegisterInfo.h +++ b/llvm/include/llvm/MC/MCRegisterInfo.h @@ -438,7 +438,7 @@ class LLVM_ABI MCRegisterInfo { /// number. Returns -1 if there is no equivalent value. The second /// parameter allows targets to use different numberings for EH info and /// debugging info. - virtual int64_t getDwarfRegNum(MCRegister RegNum, bool isEH) const; + virtual int64_t getDwarfRegNum(MCRegister Reg, bool isEH) const; /// Map a dwarf register back to a target register. Returns std::nullopt if /// there is no mapping. @@ -450,11 +450,11 @@ class LLVM_ABI MCRegisterInfo { /// Map a target register to an equivalent SEH register /// number. Returns LLVM register number if there is no equivalent value. - int getSEHRegNum(MCRegister RegNum) const; + int getSEHRegNum(MCRegister Reg) const; /// Map a target register to an equivalent CodeView register /// number. - int getCodeViewRegNum(MCRegister RegNum) const; + int getCodeViewRegNum(MCRegister Reg) const; regclass_iterator regclass_begin() const { return Classes; } regclass_iterator regclass_end() const { return Classes+NumClasses; } diff --git a/llvm/lib/MC/MCInst.cpp b/llvm/lib/MC/MCInst.cpp index 46a6a18e15963..61eeb5e5a5c71 100644 --- a/llvm/lib/MC/MCInst.cpp +++ b/llvm/lib/MC/MCInst.cpp @@ -29,7 +29,7 @@ void MCOperand::print(raw_ostream &OS, const MCContext *Ctx) const { if (Ctx && Ctx->getRegisterInfo()) OS << Ctx->getRegisterInfo()->getName(getReg()); else - OS << getReg(); + OS << getReg().id(); } else if (isImm()) OS << "Imm:" << getImm(); else if (isSFPImm()) diff --git a/llvm/lib/MC/MCRegisterInfo.cpp b/llvm/lib/MC/MCRegisterInfo.cpp index 7fd92bf974b95..77fb7332619cd 100644 --- a/llvm/lib/MC/MCRegisterInfo.cpp +++ b/llvm/lib/MC/MCRegisterInfo.cpp @@ -89,7 +89,7 @@ ArrayRef MCRegisterInfo::getCachedAliasesOf(MCRegister R) const { return Aliases; for (MCRegAliasIteratorImpl It(R, this); It.isValid(); ++It) - Aliases.push_back(*It); + Aliases.push_back((*It).id()); sort(Aliases); Aliases.erase(unique(Aliases), Aliases.end()); @@ -141,15 +141,15 @@ unsigned MCRegisterInfo::getSubRegIndex(MCRegister Reg, return 0; } -int64_t MCRegisterInfo::getDwarfRegNum(MCRegister RegNum, bool isEH) const { +int64_t MCRegisterInfo::getDwarfRegNum(MCRegister Reg, bool isEH) const { const DwarfLLVMRegPair *M = isEH ? EHL2DwarfRegs : L2DwarfRegs; unsigned Size = isEH ? 
EHL2DwarfRegsSize : L2DwarfRegsSize; if (!M) return -1; - DwarfLLVMRegPair Key = { RegNum, 0 }; + DwarfLLVMRegPair Key = {Reg.id(), 0}; const DwarfLLVMRegPair *I = std::lower_bound(M, M+Size, Key); - if (I == M+Size || I->FromReg != RegNum) + if (I == M + Size || I->FromReg != Reg) return -1; // Consumers need to be able to detect -1 and -2, but at various points // the numbers move between unsigned and signed representations, as well as @@ -191,20 +191,21 @@ int64_t MCRegisterInfo::getDwarfRegNumFromDwarfEHRegNum(uint64_t RegNum) const { return RegNum; } -int MCRegisterInfo::getSEHRegNum(MCRegister RegNum) const { - const DenseMap<MCRegister, int>::const_iterator I = L2SEHRegs.find(RegNum); - if (I == L2SEHRegs.end()) return (int)RegNum; +int MCRegisterInfo::getSEHRegNum(MCRegister Reg) const { + const DenseMap<MCRegister, int>::const_iterator I = L2SEHRegs.find(Reg); + if (I == L2SEHRegs.end()) + return (int)Reg.id(); return I->second; } -int MCRegisterInfo::getCodeViewRegNum(MCRegister RegNum) const { +int MCRegisterInfo::getCodeViewRegNum(MCRegister Reg) const { if (L2CVRegs.empty()) report_fatal_error("target does not implement codeview register mapping"); - const DenseMap<MCRegister, int>::const_iterator I = L2CVRegs.find(RegNum); + const DenseMap<MCRegister, int>::const_iterator I = L2CVRegs.find(Reg); if (I == L2CVRegs.end()) - report_fatal_error("unknown codeview register " + (RegNum < getNumRegs() - ? getName(RegNum) - : Twine(RegNum))); + report_fatal_error("unknown codeview register " + (Reg.id() < getNumRegs() ? getName(Reg) : Twine(Reg.id()))); return I->second; } From 39e7712ac520ccfc43383b3e9d6ea8cf2958b8e3 Mon Sep 17 00:00:00 2001 From: Dharuni R Acharya <125176188+DharuniRAcharya@users.noreply.github.com> Date: Mon, 17 Nov 2025 22:30:40 +0530 Subject: [PATCH 09/67] [LLVM-Tablegen] Pretty Printing Arguments in LLVM Intrinsics (#162629) This patch adds LLVM infrastructure to support pretty printing of the intrinsic arguments. The motivation is to improve the readability of LLVM intrinsics and facilitate easy modifications and debugging of LLVM IR. This feature adds a property `ArgInfo<ArgIndex<idx>, [ArgName<"argName">, ImmArgPrinter<"functionName">]>` to the intrinsic arguments to print self-explanatory inline comments for the arguments. The addition of pretty print support can provide a simple, low-overhead feature that enhances the usability of LLVM intrinsics without disrupting existing workflows.
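For orientation, a printer named by `ImmArgPrinter` is a plain callback taking the output stream and the immediate's `Constant` (a hedged sketch mirroring the signature used later in this patch; `printDummyMode` is a hypothetical example, as in the TableGen test below): ```c++ #include "llvm/IR/Constants.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; // Emits the human-readable value for a hypothetical two-valued "mode" // immediate; combined with ArgName<"mode">, the printed IR gains a // comment such as "/* mode=round */ i32 0". static void printDummyMode(raw_ostream &OS, const Constant *ImmArgVal) { if (const auto *CI = dyn_cast<ConstantInt>(ImmArgVal)) OS << (CI->getZExtValue() == 0 ? "round" : "truncate"); } ```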
Link to the RFC, where this feature was discussed: https://discourse.llvm.org/t/rfc-pretty-printing-immediate-arguments-in-llvm-intrinsics/88536 --------- Signed-off-by: Dharuni R Acharya Co-authored-by: Rahul Joshi --- llvm/include/llvm/IR/Intrinsics.h | 9 ++ llvm/include/llvm/IR/Intrinsics.td | 19 ++++ llvm/include/llvm/IR/IntrinsicsNVVM.td | 16 ++- llvm/include/llvm/IR/NVVMIntrinsicUtils.h | 48 ++++++++ llvm/lib/IR/AsmWriter.cpp | 41 +++++-- llvm/lib/IR/Intrinsics.cpp | 11 ++ .../NVPTX/tcgen05-mma-tensor-formatted.ll | 50 +++++++++ llvm/test/TableGen/intrinsic-arginfo.td | 71 ++++++++++++ .../TableGen/Basic/CodeGenIntrinsics.cpp | 36 ++++++ llvm/utils/TableGen/Basic/CodeGenIntrinsics.h | 16 +++ .../utils/TableGen/Basic/IntrinsicEmitter.cpp | 105 +++++++++++++++--- 11 files changed, 395 insertions(+), 27 deletions(-) diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h index 9577d0141f168..c91fc254ebe11 100644 --- a/llvm/include/llvm/IR/Intrinsics.h +++ b/llvm/include/llvm/IR/Intrinsics.h @@ -30,6 +30,8 @@ class LLVMContext; class Module; class AttributeList; class AttributeSet; +class raw_ostream; +class Constant; /// This namespace contains an enum with a value for every intrinsic/builtin /// function known by LLVM. The enum values are returned by @@ -81,6 +83,9 @@ namespace Intrinsic { /// Returns true if the intrinsic can be overloaded. LLVM_ABI bool isOverloaded(ID id); + /// Returns true if the intrinsic has pretty printed immediate arguments. + LLVM_ABI bool hasPrettyPrintedArgs(ID id); + /// isTargetIntrinsic - Returns true if IID is an intrinsic specific to a /// certain target. If it is a generic intrinsic false is returned. LLVM_ABI bool isTargetIntrinsic(ID IID); @@ -284,6 +289,10 @@ namespace Intrinsic { /// N. LLVM_ABI Intrinsic::ID getDeinterleaveIntrinsicID(unsigned Factor); + /// Print the argument info for the arguments with ArgInfo. + LLVM_ABI void printImmArg(ID IID, unsigned ArgIdx, raw_ostream &OS, + const Constant *ImmArgVal); + } // namespace Intrinsic } // namespace llvm diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 07aa2faffa7c5..27f404a1be65c 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -142,6 +142,25 @@ class Range<AttrIndex idx, int lower, int upper> : IntrinsicProperty { int Upper = upper; } +// ArgProperty - Base class for argument properties that can be specified in ArgInfo. +class ArgProperty; + +// ArgName - Specifies the name of an argument for pretty-printing. +class ArgName<string name> : ArgProperty { string Name = name; } + +// ImmArgPrinter - Specifies a custom printer function for immediate arguments. +class ImmArgPrinter<string funcname> : ArgProperty { string FuncName = funcname; } + +// ArgInfo - The specified argument has properties defined by a list of ArgProperty objects. +class ArgInfo<AttrIndex idx, list<ArgProperty> arg_properties> : IntrinsicProperty { + int ArgNo = idx.Value; + list<ArgProperty> Properties = arg_properties; +} + def IntrNoReturn : IntrinsicProperty; // Applied by default.
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 21badc2692037..1b485dc8ccd1e 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -2955,7 +2955,14 @@ foreach sp = [0, 1] in { defvar nargs = !size(args); defvar scale_d_imm = ArgIndex; defvar scale_d_imm_range = [ImmArg, Range]; - defvar intrinsic_properties = !listconcat( + + // Check if this is the specific llvm.nvvm.tcgen05.mma.tensor intrinsic. + defvar is_target_intrinsic = !and(!eq(sp, 0), + !eq(space, "tensor"), + !eq(scale_d, 0), + !eq(ashift, 0)); + + defvar base_properties = !listconcat( mma.common_intr_props, !if(!eq(scale_d, 1), scale_d_imm_range, []), [Range, 0, !if(!eq(scale_d, 1), 2, 4)>, // kind @@ -2965,6 +2972,13 @@ foreach sp = [0, 1] in { ] ); + defvar intrinsic_properties = !if(is_target_intrinsic, + !listconcat(base_properties, + [ArgInfo, [ArgName<"kind">, ImmArgPrinter<"printTcgen05MMAKind">]>, + ArgInfo, [ArgName<"cta_group">]>, + ArgInfo, [ArgName<"collector">, ImmArgPrinter<"printTcgen05CollectorUsageOp">]>]), + base_properties); + def mma.record_name: DefaultAttrsIntrinsicFlags<[], args, flags, intrinsic_properties, mma.intr_name>; diff --git a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h index d55100e5e709d..d383769043605 100644 --- a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h +++ b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h @@ -18,8 +18,11 @@ #include #include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsNVPTX.h" +#include "llvm/Support/raw_ostream.h" namespace llvm { namespace nvvm { @@ -659,6 +662,51 @@ inline APFloat::roundingMode GetFMARoundingMode(Intrinsic::ID IntrinsicID) { llvm_unreachable("Invalid FP instrinsic rounding mode for NVVM fma"); } +inline void printTcgen05MMAKind(raw_ostream &OS, const Constant *ImmArgVal) { + if (const auto *CI = dyn_cast(ImmArgVal)) { + uint64_t Val = CI->getZExtValue(); + switch (static_cast(Val)) { + case Tcgen05MMAKind::F16: + OS << "f16"; + return; + case Tcgen05MMAKind::TF32: + OS << "tf32"; + return; + case Tcgen05MMAKind::F8F6F4: + OS << "f8f6f4"; + return; + case Tcgen05MMAKind::I8: + OS << "i8"; + return; + } + } + llvm_unreachable( + "printTcgen05MMAKind called with invalid value for immediate argument"); +} + +inline void printTcgen05CollectorUsageOp(raw_ostream &OS, + const Constant *ImmArgVal) { + if (const auto *CI = dyn_cast(ImmArgVal)) { + uint64_t Val = CI->getZExtValue(); + switch (static_cast(Val)) { + case Tcgen05CollectorUsageOp::DISCARD: + OS << "discard"; + return; + case Tcgen05CollectorUsageOp::LASTUSE: + OS << "lastuse"; + return; + case Tcgen05CollectorUsageOp::FILL: + OS << "fill"; + return; + case Tcgen05CollectorUsageOp::USE: + OS << "use"; + return; + } + } + llvm_unreachable("printTcgen05CollectorUsageOp called with invalid value for " + "immediate argument"); +} + } // namespace nvvm } // namespace llvm #endif // LLVM_IR_NVVMINTRINSICUTILS_H diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 4d4ffe93a8067..94a1aa3087377 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -53,6 +53,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" @@ -4576,12 +4577,38 @@ void 
AssemblyWriter::printInstruction(const Instruction &I) { Out << ' '; writeOperand(Operand, false); Out << '('; + bool HasPrettyPrintedArgs = + isa(CI) && + Intrinsic::hasPrettyPrintedArgs(CI->getIntrinsicID()); + ListSeparator LS; - for (unsigned op = 0, Eop = CI->arg_size(); op < Eop; ++op) { - Out << LS; - writeParamOperand(CI->getArgOperand(op), PAL.getParamAttrs(op)); + Function *CalledFunc = CI->getCalledFunction(); + auto PrintArgComment = [&](unsigned ArgNo) { + const auto *ConstArg = dyn_cast(CI->getArgOperand(ArgNo)); + if (!ConstArg) + return; + std::string ArgComment; + raw_string_ostream ArgCommentStream(ArgComment); + Intrinsic::ID IID = CalledFunc->getIntrinsicID(); + Intrinsic::printImmArg(IID, ArgNo, ArgCommentStream, ConstArg); + if (ArgComment.empty()) + return; + Out << "/* " << ArgComment << " */ "; + }; + if (HasPrettyPrintedArgs) { + for (unsigned ArgNo = 0, NumArgs = CI->arg_size(); ArgNo < NumArgs; + ++ArgNo) { + Out << LS; + PrintArgComment(ArgNo); + writeParamOperand(CI->getArgOperand(ArgNo), PAL.getParamAttrs(ArgNo)); + } + } else { + for (unsigned ArgNo = 0, NumArgs = CI->arg_size(); ArgNo < NumArgs; + ++ArgNo) { + Out << LS; + writeParamOperand(CI->getArgOperand(ArgNo), PAL.getParamAttrs(ArgNo)); + } } - // Emit an ellipsis if this is a musttail call in a vararg function. This // is only to aid readability, musttail calls forward varargs by default. if (CI->isMustTailCall() && CI->getParent() && @@ -5005,12 +5032,10 @@ void AssemblyWriter::printUseLists(const Function *F) { //===----------------------------------------------------------------------===// void Function::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW, - bool ShouldPreserveUseListOrder, - bool IsForDebug) const { + bool ShouldPreserveUseListOrder, bool IsForDebug) const { SlotTracker SlotTable(this->getParent()); formatted_raw_ostream OS(ROS); - AssemblyWriter W(OS, SlotTable, this->getParent(), AAW, - IsForDebug, + AssemblyWriter W(OS, SlotTable, this->getParent(), AAW, IsForDebug, ShouldPreserveUseListOrder); W.printFunction(this); } diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp index 526800e217399..859689b9cf168 100644 --- a/llvm/lib/IR/Intrinsics.cpp +++ b/llvm/lib/IR/Intrinsics.cpp @@ -32,6 +32,7 @@ #include "llvm/IR/IntrinsicsX86.h" #include "llvm/IR/IntrinsicsXCore.h" #include "llvm/IR/Module.h" +#include "llvm/IR/NVVMIntrinsicUtils.h" #include "llvm/IR/Type.h" using namespace llvm; @@ -601,6 +602,12 @@ bool Intrinsic::isOverloaded(ID id) { #undef GET_INTRINSIC_OVERLOAD_TABLE } +bool Intrinsic::hasPrettyPrintedArgs(ID id){ +#define GET_INTRINSIC_PRETTY_PRINT_TABLE +#include "llvm/IR/IntrinsicImpl.inc" +#undef GET_INTRINSIC_PRETTY_PRINT_TABLE +} + /// Table of per-target intrinsic name tables. 
#define GET_INTRINSIC_TARGET_DATA #include "llvm/IR/IntrinsicImpl.inc" @@ -1142,3 +1149,7 @@ Intrinsic::ID Intrinsic::getDeinterleaveIntrinsicID(unsigned Factor) { assert(Factor >= 2 && Factor <= 8 && "Unexpected factor"); return InterleaveIntrinsics[Factor - 2].Deinterleave; } + +#define GET_INTRINSIC_PRETTY_PRINT_ARGUMENTS +#include "llvm/IR/IntrinsicImpl.inc" +#undef GET_INTRINSIC_PRETTY_PRINT_ARGUMENTS diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-mma-tensor-formatted.ll b/llvm/test/CodeGen/NVPTX/tcgen05-mma-tensor-formatted.ll new file mode 100644 index 0000000000000..479de53dd90f2 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-mma-tensor-formatted.ll @@ -0,0 +1,50 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; NOTE: This sample test demonstrates the pretty print feature for NVPTX intrinsics +; RUN: llvm-as < %s | llvm-dis | FileCheck %s + +target triple = "nvptx64-nvidia-cuda" + +define void @tcgen05_mma_fp16_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d) { + ; CHECK-LABEL: define void @tcgen05_mma_fp16_cta1( + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=discard */ i32 0) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 0) + + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=lastuse */ i32 1) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 1) + + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=fill */ i32 2) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 2) + + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f16 */ i32 0, /* cta_group= */ i32 1, /* collector=use */ i32 3) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 0, i32 1, i32 3) + + ret void +} + +define void @tcgen05_mma_f8f6f4_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d) { + ; CHECK-LABEL: define void @tcgen05_mma_f8f6f4_cta2( + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=discard */ i32 0) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 0) + + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=lastuse */ i32 1) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 
%enable_inp_d, i32 2, i32 2, i32 1) + + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=fill */ i32 2) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 2) + + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, /* kind=f8f6f4 */ i32 2, /* cta_group= */ i32 2, /* collector=use */ i32 3) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 %b, i32 %idesc, i1 %enable_inp_d, i32 2, i32 2, i32 3) + + ret void +} + +; This test verifies that printImmArg is safe to call on all constant arguments, but only prints comments for arguments that have pretty printing configured. +define void @test_mixed_constants_edge_case(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor) { + ; CHECK-LABEL: define void @test_mixed_constants_edge_case( + ; CHECK: call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 42, i32 100, i1 true, /* kind=i8 */ i32 3, /* cta_group= */ i32 1, /* collector=discard */ i32 0) + call void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6) %dtmem, ptr addrspace(6) %atensor, i64 42, i32 100, i1 true, i32 3, i32 1, i32 0) + + ret void +} + +declare void @llvm.nvvm.tcgen05.mma.tensor(ptr addrspace(6), ptr addrspace(6), i64, i32, i1, i32, i32, i32) diff --git a/llvm/test/TableGen/intrinsic-arginfo.td b/llvm/test/TableGen/intrinsic-arginfo.td new file mode 100644 index 0000000000000..eab1f5e032bc3 --- /dev/null +++ b/llvm/test/TableGen/intrinsic-arginfo.td @@ -0,0 +1,71 @@ +// RUN: llvm-tblgen -gen-intrinsic-impl -I %p/../../include %s | FileCheck %s + +// Test ArgInfo property for pretty-printing intrinsic arguments. +// This test verifies that TableGen generates the correct pretty-printing code +// for intrinsics that use the ArgInfo property. + +include "llvm/IR/Intrinsics.td" + +// Simple intrinsic with two arguments that have ArgInfo. +def int_dummy_foo_bar : DefaultAttrsIntrinsic< + [llvm_i32_ty], + [llvm_i32_ty, // data + llvm_i32_ty, // mode + llvm_i32_ty], // stride + [IntrNoMem, + ImmArg<ArgIndex<1>>, + ArgInfo<ArgIndex<1>, [ArgName<"mode">, ImmArgPrinter<"printDummyMode">]>, + ArgInfo<ArgIndex<2>, [ArgName<"stride">]>]>; + +// A custom floating point add with rounding and sat mode.
+def int_my_fadd_f32 : DefaultAttrsIntrinsic< + [llvm_float_ty], + [llvm_float_ty, // a + llvm_float_ty, // b + llvm_i32_ty, // rounding_mode + llvm_i1_ty], // saturation_mode + [IntrNoMem, + ImmArg<ArgIndex<2>>, + ImmArg<ArgIndex<3>>, + ArgInfo<ArgIndex<2>, [ArgName<"rounding_mode">, ImmArgPrinter<"printRoundingMode">]>, + ArgInfo<ArgIndex<3>, [ArgName<"saturation_mode">]>]>; + +// CHECK: #ifdef GET_INTRINSIC_PRETTY_PRINT_TABLE +// CHECK-NEXT: static constexpr uint8_t PPTable[] = { + +// CHECK: #endif // GET_INTRINSIC_PRETTY_PRINT_TABLE + +// CHECK: #ifdef GET_INTRINSIC_PRETTY_PRINT_ARGUMENTS +// CHECK: void Intrinsic::printImmArg(ID IID, unsigned ArgIdx, raw_ostream &OS, const Constant *ImmArgVal) { + +// CHECK: case dummy_foo_bar: +// CHECK-NEXT: switch (ArgIdx) { + +// CHECK-NEXT: case 1: +// CHECK-NEXT: OS << "mode="; +// CHECK-NEXT: printDummyMode(OS, ImmArgVal); +// CHECK-NEXT: return; + +// CHECK-NEXT: case 2: +// CHECK-NEXT: OS << "stride="; +// CHECK-NEXT: return; + +// CHECK-NEXT: } +// CHECK-NEXT: break; + +// CHECK: case my_fadd_f32: +// CHECK-NEXT: switch (ArgIdx) { + +// CHECK-NEXT: case 2: +// CHECK-NEXT: OS << "rounding_mode="; +// CHECK-NEXT: printRoundingMode(OS, ImmArgVal); +// CHECK-NEXT: return; + +// CHECK-NEXT: case 3: +// CHECK-NEXT: OS << "saturation_mode="; +// CHECK-NEXT: return; + +// CHECK-NEXT: } +// CHECK-NEXT: break; + +// CHECK: #endif // GET_INTRINSIC_PRETTY_PRINT_ARGUMENTS diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp index ff894853b9771..228969ab37f85 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp @@ -449,6 +449,29 @@ void CodeGenIntrinsic::setProperty(const Record *R) { int64_t Lower = R->getValueAsInt("Lower"); int64_t Upper = R->getValueAsInt("Upper"); addArgAttribute(ArgNo, Range, Lower, Upper); + } else if (R->isSubClassOf("ArgInfo")) { + unsigned ArgNo = R->getValueAsInt("ArgNo"); + if (ArgNo < 1) + PrintFatalError(R->getLoc(), + "ArgInfo requires ArgNo >= 1 (0 is return value)"); + const ListInit *Properties = R->getValueAsListInit("Properties"); + StringRef ArgName; + StringRef FuncName; + + for (const Init *PropInit : Properties->getElements()) { + if (const auto *PropDef = dyn_cast<DefInit>(PropInit)) { + const Record *PropRec = PropDef->getDef(); + + if (PropRec->isSubClassOf("ArgName")) + ArgName = PropRec->getValueAsString("Name"); + else if (PropRec->isSubClassOf("ImmArgPrinter")) + FuncName = PropRec->getValueAsString("FuncName"); + else + PrintFatalError(PropRec->getLoc(), + "Unknown ArgProperty type: " + PropRec->getName()); + } + } + addPrettyPrintFunction(ArgNo - 1, ArgName, FuncName); } else { llvm_unreachable("Unknown property!"); } @@ -476,3 +499,16 @@ void CodeGenIntrinsic::addArgAttribute(unsigned Idx, ArgAttrKind AK, uint64_t V, ArgumentAttributes.resize(Idx + 1); ArgumentAttributes[Idx].emplace_back(AK, V, V2); } + +void CodeGenIntrinsic::addPrettyPrintFunction(unsigned ArgIdx, + StringRef ArgName, + StringRef FuncName) { + auto It = llvm::find_if(PrettyPrintFunctions, [ArgIdx](const auto &Info) { + return Info.ArgIdx == ArgIdx; + }); + if (It != PrettyPrintFunctions.end()) + PrintFatalError(TheDef->getLoc(), "ArgInfo for argument " + Twine(ArgIdx) + " is already defined as '" + It->FuncName + "'"); + PrettyPrintFunctions.emplace_back(ArgIdx, ArgName, FuncName); +} diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h index 15e803c4feba1..6ac6f734326d8 100644 ---
a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h @@ -152,6 +152,22 @@ struct CodeGenIntrinsic { void addArgAttribute(unsigned Idx, ArgAttrKind AK, uint64_t V = 0, uint64_t V2 = 0); + /// Structure to store pretty print and argument information. + struct PrettyPrintArgInfo { + unsigned ArgIdx; + StringRef ArgName; + StringRef FuncName; + + PrettyPrintArgInfo(unsigned Idx, StringRef Name, StringRef Func) + : ArgIdx(Idx), ArgName(Name), FuncName(Func) {} + }; + + /// Vector that stores ArgInfo (ArgIndex, ArgName, FunctionName). + SmallVector<PrettyPrintArgInfo> PrettyPrintFunctions; + + void addPrettyPrintFunction(unsigned ArgIdx, StringRef ArgName, + StringRef FuncName); + bool hasProperty(enum SDNP Prop) const { return Properties & (1 << Prop); } /// Goes through all IntrProperties that have IsDefault value set and sets diff --git a/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp index 452d2b08f25c3..3ac23185ef91c 100644 --- a/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp +++ b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp @@ -60,8 +60,16 @@ class IntrinsicEmitter { raw_ostream &OS); void EmitIntrinsicToOverloadTable(const CodeGenIntrinsicTable &Ints, raw_ostream &OS); + void EmitIntrinsicToPrettyPrintTable(const CodeGenIntrinsicTable &Ints, + raw_ostream &OS); + void EmitIntrinsicBitTable( + const CodeGenIntrinsicTable &Ints, raw_ostream &OS, StringRef Guard, + StringRef TableName, StringRef Comment, + function_ref<bool(const CodeGenIntrinsic &)> GetProperty); void EmitGenerator(const CodeGenIntrinsicTable &Ints, raw_ostream &OS); void EmitAttributes(const CodeGenIntrinsicTable &Ints, raw_ostream &OS); + void EmitPrettyPrintArguments(const CodeGenIntrinsicTable &Ints, + raw_ostream &OS); void EmitIntrinsicToBuiltinMap(const CodeGenIntrinsicTable &Ints, bool IsClang, raw_ostream &OS); }; @@ -109,6 +117,12 @@ void IntrinsicEmitter::run(raw_ostream &OS, bool Enums) { // Emit the intrinsic parameter attributes. EmitAttributes(Ints, OS); + // Emit the intrinsic ID -> pretty print table. + EmitIntrinsicToPrettyPrintTable(Ints, OS); + + // Emit Pretty Print attribute. + EmitPrettyPrintArguments(Ints, OS); + // Emit code to translate Clang builtins into LLVM intrinsics. EmitIntrinsicToBuiltinMap(Ints, true, OS); @@ -240,6 +254,29 @@ static constexpr IntrinsicTargetInfo TargetInfos[] = { )"; } +/// Helper function to emit a bit table for intrinsic properties. +/// This is used for both overload and pretty print bit tables. +void IntrinsicEmitter::EmitIntrinsicBitTable( + const CodeGenIntrinsicTable &Ints, raw_ostream &OS, StringRef Guard, + StringRef TableName, StringRef Comment, + function_ref<bool(const CodeGenIntrinsic &)> GetProperty) { + OS << formatv("// {}\n", Comment); + OS << formatv("#ifdef {}\n", Guard); + OS << formatv("static constexpr uint8_t {}[] = {{\n", TableName); + OS << " 0\n "; + for (auto [I, Int] : enumerate(Ints)) { + // Add one to the index so we emit a null bit for the invalid #0 intrinsic. + size_t Idx = I + 1; + if (Idx % 8 == 0) + OS << ",\n 0"; + if (GetProperty(Int)) + OS << " | (1<<" << Idx % 8 << ')'; + } + OS << "\n};\n\n"; + OS << formatv("return ({}[id/8] & (1 << (id%8))) != 0;\n", TableName); + OS << formatv("#endif // {}\n\n", Guard); +} void IntrinsicEmitter::EmitIntrinsicToNameTable( const CodeGenIntrinsicTable &Ints, raw_ostream &OS) { // Built up a table of the intrinsic names.
@@ -276,24 +313,10 @@ static constexpr unsigned IntrinsicNameOffsetTable[] = { void IntrinsicEmitter::EmitIntrinsicToOverloadTable( const CodeGenIntrinsicTable &Ints, raw_ostream &OS) { - OS << R"(// Intrinsic ID to overload bitset. -#ifdef GET_INTRINSIC_OVERLOAD_TABLE -static constexpr uint8_t OTable[] = { - 0 - )"; - for (auto [I, Int] : enumerate(Ints)) { - // Add one to the index so we emit a null bit for the invalid #0 intrinsic. - size_t Idx = I + 1; - - if (Idx % 8 == 0) - OS << ",\n 0"; - if (Int.isOverloaded) - OS << " | (1<<" << Idx % 8 << ')'; - } - OS << "\n};\n\n"; - // OTable contains a true bit at the position if the intrinsic is overloaded. - OS << "return (OTable[id/8] & (1 << (id%8))) != 0;\n"; - OS << "#endif\n\n"; + EmitIntrinsicBitTable( + Ints, OS, "GET_INTRINSIC_OVERLOAD_TABLE", "OTable", + "Intrinsic ID to overload bitset.", + [](const CodeGenIntrinsic &Int) { return Int.isOverloaded; }); } using TypeSigTy = SmallVector; @@ -809,6 +832,52 @@ AttributeSet Intrinsic::getFnAttributes(LLVMContext &C, ID id) {{ NoFunctionAttrsID); } +void IntrinsicEmitter::EmitIntrinsicToPrettyPrintTable( + const CodeGenIntrinsicTable &Ints, raw_ostream &OS) { + EmitIntrinsicBitTable(Ints, OS, "GET_INTRINSIC_PRETTY_PRINT_TABLE", "PPTable", + "Intrinsic ID to pretty print bitset.", + [](const CodeGenIntrinsic &Int) { + return !Int.PrettyPrintFunctions.empty(); + }); +} + +void IntrinsicEmitter::EmitPrettyPrintArguments( + const CodeGenIntrinsicTable &Ints, raw_ostream &OS) { + OS << R"( +#ifdef GET_INTRINSIC_PRETTY_PRINT_ARGUMENTS +void Intrinsic::printImmArg(ID IID, unsigned ArgIdx, raw_ostream &OS, const Constant *ImmArgVal) { + using namespace Intrinsic; + switch (IID) { +)"; + + for (const CodeGenIntrinsic &Int : Ints) { + if (Int.PrettyPrintFunctions.empty()) + continue; + + OS << " case " << Int.EnumName << ":\n"; + OS << " switch (ArgIdx) {\n"; + for (const auto [ArgIdx, ArgName, FuncName] : Int.PrettyPrintFunctions) { + OS << " case " << ArgIdx << ":\n"; + OS << " OS << \"" << ArgName << "=\";\n"; + if (!FuncName.empty()) { + OS << " "; + if (!Int.TargetPrefix.empty()) + OS << Int.TargetPrefix << "::"; + OS << FuncName << "(OS, ImmArgVal);\n"; + } + OS << " return;\n"; + } + OS << " }\n"; + OS << " break;\n"; + } + OS << R"( default: + break; + } +} +#endif // GET_INTRINSIC_PRETTY_PRINT_ARGUMENTS +)"; +} + void IntrinsicEmitter::EmitIntrinsicToBuiltinMap( const CodeGenIntrinsicTable &Ints, bool IsClang, raw_ostream &OS) { StringRef CompilerName = IsClang ? "Clang" : "MS"; From 6f5c8fe1c1d24604d3328b82f5a1ed348e59326f Mon Sep 17 00:00:00 2001 From: Govind Malasani <145235389+gmalasan@users.noreply.github.com> Date: Mon, 17 Nov 2025 12:01:44 -0500 Subject: [PATCH 10/67] [MLIR][SparseTensor] Dense Outer Loop Ordering Strategy (#160168) This PR builds upon the infrastructure set up for Sparse Tensor Loop Ordering Heuristics (#154656) by adding a preference to have dense loops outer and sparse loops inner. As always I'd love to get feedback and know if there's any other direction to go with this work that might be better. 
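For intuition (an illustrative sketch, not code from this patch: a hand-written CSR kernel with hypothetical pos/crd/vals arrays), dense-outer ordering keeps the compressed iteration innermost, so each dense step walks exactly one stored row instead of restarting the sparse iterator: ```c++ // y[i] += A[i][j] * x[j], with A stored in CSR form. void spmv_dense_outer(int m, const int *pos, const int *crd, const double *vals, const double *x, double *y) { for (int i = 0; i < m; ++i) // dense level -> outer loop for (int k = pos[i]; k < pos[i + 1]; ++k) // compressed level -> inner loop y[i] += vals[k] * x[crd[k]]; } ```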
--- .../Dialect/SparseTensor/Transforms/Passes.h | 5 +- .../Dialect/SparseTensor/Transforms/Passes.td | 4 +- .../Transforms/Utils/IterationGraphSorter.cpp | 72 ++++++++++++++++++- 3 files changed, 77 insertions(+), 4 deletions(-) diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h index af64370a62dd7..419ecda80e9a5 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.h @@ -58,9 +58,10 @@ enum class SparseEmitStrategy { namespace sparse_tensor { /// Defines a strategy for loop ordering during sparse code generation. +/// See Passes.td for strategy descriptions. enum class LoopOrderingStrategy : unsigned { - kDefault, ///< Default strategy (eagerly selects last loop in topological - ///< sort). + kDefault, + kDenseOuter, }; } // namespace sparse_tensor diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td index 75e77d67db1b3..0b8562e484f51 100644 --- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td @@ -85,7 +85,9 @@ def SparseReinterpretMap : Pass<"sparse-reinterpret-map", "ModuleOp"> { "mlir::sparse_tensor::LoopOrderingStrategy::kDefault", "Set the loop ordering strategy for sparse code generation", [{llvm::cl::values( clEnumValN(mlir::sparse_tensor::LoopOrderingStrategy::kDefault, "default", - "Default strategy (eagerly selects last loop in topological sort)"))}]>, + "Default strategy (eagerly selects last loop in topological sort)"), + clEnumValN(mlir::sparse_tensor::LoopOrderingStrategy::kDenseOuter, "dense-outer", + "Prefer dense, then compressed, then singleton dimensions outermost"))}]>, ]; } diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.cpp index ffa8b402e0b6b..99048034b4f0c 100644 --- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/IterationGraphSorter.cpp @@ -80,6 +80,53 @@ inline static bool includesDenseOutput(SortMask mask) { return includesAny(mask, SortMask::kIncludeDenseOutput); } +/// Returns a sparsity rank for loop ordering: lower values indicate +/// dimensions that should be placed in outer loops. +/// 0 = Dense, 1 = Compressed, 2 = Singleton, 3 = Other/Unknown. +static unsigned getLoopSparsityRank(unsigned loop, ArrayRef<Value> allTensors, + ArrayRef<AffineMap> allMaps) { + // Start with highest rank. + unsigned minRank = 3; + + for (auto [tensor, map] : llvm::zip(allTensors, allMaps)) { + // Check if this loop accesses this tensor. + bool loopAccessesTensor = false; + unsigned tensorDim = 0; + for (AffineExpr expr : map.getResults()) { + if (auto dimExpr = dyn_cast<AffineDimExpr>(expr)) { + if (dimExpr.getPosition() == loop) { + loopAccessesTensor = true; + break; + } + } + tensorDim++; + } + + if (loopAccessesTensor) { + const auto enc = getSparseTensorEncoding(tensor.getType()); + if (!enc) { + // Dense tensor - lowest rank. + return 0; + } else { + // Sparse tensor - check the level type for this dimension. + auto lvlTypes = enc.getLvlTypes(); + if (tensorDim < lvlTypes.size()) { + auto lvlType = lvlTypes[tensorDim]; + if (isDenseLT(lvlType)) { + return 0; // Dense level. + } else if (isCompressedLT(lvlType)) { + minRank = std::min(minRank, 1u); // Compressed level.
+ } else if (isSingletonLT(lvlType)) { + minRank = std::min(minRank, 2u); // Singleton level. + } + } + } + } + } + + return minRank; +} + AffineMap IterationGraphSorter::topoSort() { // The sorted result will put the first Reduction iterator to the // latest possible position. @@ -107,10 +154,33 @@ AffineMap IterationGraphSorter::topoSort() { case sparse_tensor::LoopOrderingStrategy::kDefault: src = it.back(); break; + case sparse_tensor::LoopOrderingStrategy::kDenseOuter: { + // Prefer dense, then compressed, then singleton dimensions outermost. + // Create combined tensor and map lists for analysis. + SmallVector allTensors = ins; + allTensors.push_back(out); + SmallVector allMaps = loop2InsLvl; + allMaps.push_back(loop2OutLvl); + + // Find loop with minimum (lowest) sparsity rank. + unsigned minLoop = it[0]; + unsigned minRank = getLoopSparsityRank(minLoop, allTensors, allMaps); + + for (auto candidateLoop : it) { + unsigned rank = getLoopSparsityRank(candidateLoop, allTensors, allMaps); + if (rank < minRank || (rank == minRank && candidateLoop < minLoop)) { + minLoop = candidateLoop; + minRank = rank; + } + } + src = minLoop; + break; + } } loopOrder.push_back(src); - it.pop_back(); + // Remove the selected loop from the worklist. + it.erase(std::find(it.begin(), it.end(), src)); // Update in-degree, and push 0-degree node into worklist. for (unsigned dst = 0; dst < numLoops; dst++) { if (itGraph[src][dst] && --inDegree[dst] == 0) { From 0c8464330a510e0c3b629883ed1acd81da17da5d Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Mon, 17 Nov 2025 09:09:46 -0800 Subject: [PATCH 11/67] [CIR] Upstream handling for BaseToDerived casts (#167769) Upstream handling for BaseToDerived casts, adding the cir.base_class_addr operation and lowering to LLVM IR. --- clang/include/clang/CIR/Dialect/IR/CIROps.td | 56 ++++++++++- clang/lib/CIR/CodeGen/CIRGenBuilder.h | 13 +++ clang/lib/CIR/CodeGen/CIRGenClass.cpp | 19 ++++ clang/lib/CIR/CodeGen/CIRGenExpr.cpp | 18 +++- clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 15 ++- clang/lib/CIR/CodeGen/CIRGenFunction.h | 5 + .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 35 +++++++ clang/test/CIR/CodeGen/base-to-derived.cpp | 97 +++++++++++++++++++ 8 files changed, 254 insertions(+), 4 deletions(-) create mode 100644 clang/test/CIR/CodeGen/base-to-derived.cpp diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index 2124b1dc62a81..7b987ea49bf97 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -3386,6 +3386,10 @@ def CIR_BaseClassAddrOp : CIR_Op<"base_class_addr"> { cannot be known by the operation, and that information affects how the operation is lowered. + The validity of the relationship of derived and base cannot yet be verified. + If the target class is not a valid base class for the object, the behavior + is undefined. + Example: ```c++ struct Base { }; @@ -3399,8 +3403,6 @@ def CIR_BaseClassAddrOp : CIR_Op<"base_class_addr"> { ``` }]; - // The validity of the relationship of derived and base cannot yet be - // verified, currently not worth adding a verifier. 
   let arguments = (ins Arg:$derived_addr, IndexAttr:$offset, UnitAttr:$assume_not_null);
@@ -3414,6 +3416,56 @@ def CIR_BaseClassAddrOp : CIR_Op<"base_class_addr"> {
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// DerivedClassAddrOp
+//===----------------------------------------------------------------------===//
+
+def CIR_DerivedClassAddrOp : CIR_Op<"derived_class_addr"> {
+  let summary = "Get the derived class address for a class/struct";
+  let description = [{
+    The `cir.derived_class_addr` operation gets the address of a particular
+    derived class given a non-virtual base class pointer. The offset in bytes
+    of the base class must be passed in, similar to `cir.base_class_addr`, but
+    going in the other direction. This means lowering to a negative offset.
+
+    The operation contains a flag for whether or not the operand may be
+    nullptr. That depends on the context and cannot be known by the operation,
+    and that information affects how the operation is lowered.
+
+    The validity of the relationship of derived and base cannot yet be
+    verified. If the target class is not a valid derived class for the object,
+    the behavior is undefined.
+
+    Example:
+    ```c++
+    class A {};
+    class B : public A {};
+
+    B *getAsB(A *a) {
+      return static_cast<B *>(a);
+    }
+    ```
+
+    leads to
+    ```mlir
+    %2 = cir.load %0 : !cir.ptr<!cir.ptr<!rec_A>>, !cir.ptr<!rec_A>
+    %3 = cir.derived_class_addr %2 : !cir.ptr<!rec_A> [0] -> !cir.ptr<!rec_B>
+    ```
+  }];
+
+  let arguments = (ins
+    Arg:$base_addr,
+    IndexAttr:$offset, UnitAttr:$assume_not_null);
+
+  let results = (outs Res:$derived_addr);
+
+  let assemblyFormat = [{
+    $base_addr `:` qualified(type($base_addr))
+    (`nonnull` $assume_not_null^)?
+    ` ` `[` $offset `]` `->` qualified(type($derived_addr)) attr-dict
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // ComplexCreateOp
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
index a391d7e70ace7..5ab1d0e05cf8a 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -405,6 +405,19 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
     return Address(baseAddr, destType, addr.getAlignment());
   }
 
+  Address createDerivedClassAddr(mlir::Location loc, Address addr,
+                                 mlir::Type destType, unsigned offset,
+                                 bool assumeNotNull) {
+    if (destType == addr.getElementType())
+      return addr;
+
+    cir::PointerType ptrTy = getPointerTo(destType);
+    auto derivedAddr =
+        cir::DerivedClassAddrOp::create(*this, loc, ptrTy, addr.getPointer(),
+                                        mlir::APInt(64, offset), assumeNotNull);
+    return Address(derivedAddr, destType, addr.getAlignment());
+  }
+
   mlir::Value createVTTAddrPoint(mlir::Location loc, mlir::Type retTy,
                                  mlir::Value addr, uint64_t offset) {
     return cir::VTTAddrPointOp::create(*this, loc, retTy,
diff --git a/clang/lib/CIR/CodeGen/CIRGenClass.cpp b/clang/lib/CIR/CodeGen/CIRGenClass.cpp
index a8296782ebc40..89c4696b9da94 100644
--- a/clang/lib/CIR/CodeGen/CIRGenClass.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenClass.cpp
@@ -1110,6 +1110,25 @@ mlir::Value CIRGenFunction::getVTTParameter(GlobalDecl gd, bool forVirtualBase,
   }
 }
 
+Address CIRGenFunction::getAddressOfDerivedClass(
+    mlir::Location loc, Address baseAddr, const CXXRecordDecl *derived,
+    llvm::iterator_range<CastExpr::path_const_iterator> path,
+    bool nullCheckValue) {
+  assert(!path.empty() && "Base path should not be empty!");
+
+  QualType derivedTy = getContext().getCanonicalTagType(derived);
+
mlir::Type derivedValueTy = convertType(derivedTy); + CharUnits nonVirtualOffset = + cgm.computeNonVirtualBaseClassOffset(derived, path); + + // Note that in OG, no offset (nonVirtualOffset.getQuantity() == 0) means it + // just gives the address back. In CIR a `cir.derived_class` is created and + // made into a nop later on during lowering. + return builder.createDerivedClassAddr(loc, baseAddr, derivedValueTy, + nonVirtualOffset.getQuantity(), + /*assumeNotNull=*/!nullCheckValue); +} + Address CIRGenFunction::getAddressOfBaseClass( Address value, const CXXRecordDecl *derived, llvm::iterator_range path, diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index d35bb0af0de14..681a801cd7d81 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -1301,7 +1301,6 @@ LValue CIRGenFunction::emitCastLValue(const CastExpr *e) { case CK_NonAtomicToAtomic: case CK_AtomicToNonAtomic: case CK_ToUnion: - case CK_BaseToDerived: case CK_ObjCObjectLValueCast: case CK_VectorSplat: case CK_ConstructorConversion: @@ -1336,6 +1335,7 @@ LValue CIRGenFunction::emitCastLValue(const CastExpr *e) { lv.getAddress().getAlignment()), e->getType(), lv.getBaseInfo()); } + case CK_LValueBitCast: { // This must be a reinterpret_cast (or c-style equivalent). const auto *ce = cast(e); @@ -1387,6 +1387,22 @@ LValue CIRGenFunction::emitCastLValue(const CastExpr *e) { return makeAddrLValue(baseAddr, e->getType(), lv.getBaseInfo()); } + case CK_BaseToDerived: { + const auto *derivedClassDecl = e->getType()->castAsCXXRecordDecl(); + LValue lv = emitLValue(e->getSubExpr()); + + // Perform the base-to-derived conversion + Address derived = getAddressOfDerivedClass( + getLoc(e->getSourceRange()), lv.getAddress(), derivedClassDecl, + e->path(), /*NullCheckValue=*/false); + // C++11 [expr.static.cast]p2: Behavior is undefined if a downcast is + // performed and the object is not of the derived type. + assert(!cir::MissingFeatures::sanitizers()); + + assert(!cir::MissingFeatures::opTBAA()); + return makeAddrLValue(derived, e->getType(), lv.getBaseInfo()); + } + case CK_ZeroToOCLOpaqueType: llvm_unreachable("NULL to OpenCL opaque type lvalue cast is not valid"); } diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index ce95607bd468d..3b0977d213325 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -1972,6 +1972,20 @@ mlir::Value ScalarExprEmitter::VisitCastExpr(CastExpr *ce) { return builder.createIntToPtr(middleVal, destCIRTy); } + case CK_BaseToDerived: { + const CXXRecordDecl *derivedClassDecl = destTy->getPointeeCXXRecordDecl(); + assert(derivedClassDecl && "BaseToDerived arg isn't a C++ object pointer!"); + Address base = cgf.emitPointerWithAlignment(subExpr); + Address derived = cgf.getAddressOfDerivedClass( + cgf.getLoc(ce->getSourceRange()), base, derivedClassDecl, ce->path(), + cgf.shouldNullCheckClassCastValue(ce)); + + // C++11 [expr.static.cast]p11: Behavior is undefined if a downcast is + // performed and the object is not of the derived type. 
+ assert(!cir::MissingFeatures::sanitizers()); + + return cgf.getAsNaturalPointerTo(derived, ce->getType()->getPointeeType()); + } case CK_UncheckedDerivedToBase: case CK_DerivedToBase: { // The EmitPointerWithAlignment path does this fine; just discard @@ -1979,7 +1993,6 @@ mlir::Value ScalarExprEmitter::VisitCastExpr(CastExpr *ce) { return cgf.getAsNaturalPointerTo(cgf.emitPointerWithAlignment(ce), ce->getType()->getPointeeType()); } - case CK_Dynamic: { Address v = cgf.emitPointerWithAlignment(subExpr); const auto *dce = cast(ce); diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index 2dddf26981105..b22bf2d87fc10 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -823,6 +823,11 @@ class CIRGenFunction : public CIRGenTypeCache { llvm::iterator_range path, bool nullCheckValue, SourceLocation loc); + Address getAddressOfDerivedClass( + mlir::Location loc, Address baseAddr, const CXXRecordDecl *derived, + llvm::iterator_range path, + bool nullCheckValue); + /// Return the VTT parameter that should be passed to a base /// constructor/destructor with virtual bases. /// FIXME: VTTs are Itanium ABI-specific, so the definition should move diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index d88a4ad76f27b..92434d730eb31 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -1360,6 +1360,41 @@ mlir::LogicalResult CIRToLLVMBaseClassAddrOpLowering::matchAndRewrite( return mlir::success(); } +mlir::LogicalResult CIRToLLVMDerivedClassAddrOpLowering::matchAndRewrite( + cir::DerivedClassAddrOp derivedClassOp, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + const mlir::Type resultType = + getTypeConverter()->convertType(derivedClassOp.getType()); + mlir::Value baseAddr = adaptor.getBaseAddr(); + // The offset is set in the operation as an unsigned value, but it must be + // applied as a negative offset. 
+  int64_t offsetVal = -(adaptor.getOffset().getZExtValue());
+  if (offsetVal == 0) {
+    // If the offset is zero, we can just return the base address.
+    rewriter.replaceOp(derivedClassOp, baseAddr);
+    return mlir::success();
+  }
+  llvm::SmallVector<mlir::LLVM::GEPArg> offset = {offsetVal};
+  mlir::Type byteType = mlir::IntegerType::get(resultType.getContext(), 8,
+                                               mlir::IntegerType::Signless);
+  if (derivedClassOp.getAssumeNotNull()) {
+    rewriter.replaceOpWithNewOp<mlir::LLVM::GEPOp>(
+        derivedClassOp, resultType, byteType, baseAddr, offset,
+        mlir::LLVM::GEPNoWrapFlags::inbounds);
+  } else {
+    mlir::Location loc = derivedClassOp.getLoc();
+    mlir::Value isNull = mlir::LLVM::ICmpOp::create(
+        rewriter, loc, mlir::LLVM::ICmpPredicate::eq, baseAddr,
+        mlir::LLVM::ZeroOp::create(rewriter, loc, baseAddr.getType()));
+    mlir::Value adjusted =
+        mlir::LLVM::GEPOp::create(rewriter, loc, resultType, byteType, baseAddr,
+                                  offset, mlir::LLVM::GEPNoWrapFlags::inbounds);
+    rewriter.replaceOpWithNewOp<mlir::LLVM::SelectOp>(derivedClassOp, isNull,
+                                                      baseAddr, adjusted);
+  }
+  return mlir::success();
+}
+
 mlir::LogicalResult CIRToLLVMATanOpLowering::matchAndRewrite(
     cir::ATanOp op, OpAdaptor adaptor,
     mlir::ConversionPatternRewriter &rewriter) const {
diff --git a/clang/test/CIR/CodeGen/base-to-derived.cpp b/clang/test/CIR/CodeGen/base-to-derived.cpp
new file mode 100644
index 0000000000000..af9aa0ffd19c1
--- /dev/null
+++ b/clang/test/CIR/CodeGen/base-to-derived.cpp
@@ -0,0 +1,97 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-android21 -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-android21 -fclangir -emit-llvm %s -o %t-cir.ll
+// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-android21 -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s
+
+class A {
+  int a;
+};
+
+class B {
+  int b;
+public:
+  A *getAsA();
+};
+
+class X : public A, public B {
+  int x;
+};
+
+X *castAtoX(A *a) {
+  return static_cast<X *>(a);
+}
+
+// CIR: cir.func {{.*}} @_Z8castAtoXP1A(%[[ARG0:.*]]: !cir.ptr<!rec_A> {{.*}})
+// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.ptr<!rec_A>, !cir.ptr<!cir.ptr<!rec_A>>, ["a", init]
+// CIR: cir.store %[[ARG0]], %[[A_ADDR]] : !cir.ptr<!rec_A>, !cir.ptr<!cir.ptr<!rec_A>>
+// CIR: %[[A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.ptr<!rec_A>>, !cir.ptr<!rec_A>
+// CIR: %[[X:.*]] = cir.derived_class_addr %[[A]] : !cir.ptr<!rec_A> [0] -> !cir.ptr<!rec_X>
+
+// Note: Because the offset is 0, a null check is not needed.
+
+// LLVM: define {{.*}} ptr @_Z8castAtoXP1A(ptr %[[ARG0:.*]])
+// LLVM: %[[A_ADDR:.*]] = alloca ptr
+// LLVM: store ptr %[[ARG0]], ptr %[[A_ADDR]]
+// LLVM: %[[X:.*]] = load ptr, ptr %[[A_ADDR]]
+
+// OGCG: define {{.*}} ptr @_Z8castAtoXP1A(ptr {{.*}} %[[ARG0:.*]])
+// OGCG: %[[A_ADDR:.*]] = alloca ptr
+// OGCG: store ptr %[[ARG0]], ptr %[[A_ADDR]]
+// OGCG: %[[X:.*]] = load ptr, ptr %[[A_ADDR]]
+
+X *castBtoX(B *b) {
+  return static_cast<X *>(b);
+}
+
+// CIR: cir.func {{.*}} @_Z8castBtoXP1B(%[[ARG0:.*]]: !cir.ptr<!rec_B> {{.*}})
+// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.ptr<!rec_B>, !cir.ptr<!cir.ptr<!rec_B>>, ["b", init]
+// CIR: cir.store %[[ARG0]], %[[B_ADDR]] : !cir.ptr<!rec_B>, !cir.ptr<!cir.ptr<!rec_B>>
+// CIR: %[[B:.*]] = cir.load{{.*}} %[[B_ADDR]] : !cir.ptr<!cir.ptr<!rec_B>>, !cir.ptr<!rec_B>
+// CIR: %[[X:.*]] = cir.derived_class_addr %[[B]] : !cir.ptr<!rec_B> [4] -> !cir.ptr<!rec_X>
+
+// LLVM: define {{.*}} ptr @_Z8castBtoXP1B(ptr %[[ARG0:.*]])
+// LLVM: %[[B_ADDR:.*]] = alloca ptr, i64 1, align 8
+// LLVM: store ptr %[[ARG0]], ptr %[[B_ADDR]], align 8
+// LLVM: %[[B:.*]] = load ptr, ptr %[[B_ADDR]], align 8
+// LLVM: %[[IS_NULL:.*]] = icmp eq ptr %[[B]], null
+// LLVM: %[[B_NON_NULL:.*]] = getelementptr inbounds i8, ptr %[[B]], i32 -4
+// LLVM: %[[X:.*]] = select i1 %[[IS_NULL]], ptr %[[B]], ptr %[[B_NON_NULL]]
+
+// OGCG: define {{.*}} ptr @_Z8castBtoXP1B(ptr {{.*}} %[[ARG0:.*]])
+// OGCG: entry:
+// OGCG: %[[B_ADDR:.*]] = alloca ptr
+// OGCG: store ptr %[[ARG0]], ptr %[[B_ADDR]]
+// OGCG: %[[B:.*]] = load ptr, ptr %[[B_ADDR]]
+// OGCG: %[[IS_NULL:.*]] = icmp eq ptr %[[B]], null
+// OGCG: br i1 %[[IS_NULL]], label %[[LABEL_NULL:.*]], label %[[LABEL_NOTNULL:.*]]
+// OGCG: [[LABEL_NOTNULL]]:
+// OGCG: %[[B_NON_NULL:.*]] = getelementptr inbounds i8, ptr %[[B]], i64 -4
+// OGCG: br label %[[LABEL_END:.*]]
+// OGCG: [[LABEL_NULL]]:
+// OGCG: br label %[[LABEL_END:.*]]
+// OGCG: [[LABEL_END]]:
+// OGCG: %[[X:.*]] = phi ptr [ %[[B_NON_NULL]], %[[LABEL_NOTNULL]] ], [ null, %[[LABEL_NULL]] ]
+
+X &castBReftoXRef(B &b) {
+  return static_cast<X &>(b);
+}
+
+// CIR: cir.func {{.*}} @_Z14castBReftoXRefR1B(%[[ARG0:.*]]: !cir.ptr<!rec_B> {{.*}})
+// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.ptr<!rec_B>, !cir.ptr<!cir.ptr<!rec_B>>, ["b", init, const]
+// CIR: cir.store %[[ARG0]], %[[B_ADDR]] : !cir.ptr<!rec_B>, !cir.ptr<!cir.ptr<!rec_B>>
+// CIR: %[[B:.*]] = cir.load{{.*}} %[[B_ADDR]] : !cir.ptr<!cir.ptr<!rec_B>>, !cir.ptr<!rec_B>
+// CIR: %[[X:.*]] = cir.derived_class_addr %[[B]] : !cir.ptr<!rec_B> nonnull [4] -> !cir.ptr<!rec_X>
+
+// LLVM: define {{.*}} ptr @_Z14castBReftoXRefR1B(ptr %[[ARG0:.*]])
+// LLVM: %[[B_ADDR:.*]] = alloca ptr
+// LLVM: store ptr %[[ARG0]], ptr %[[B_ADDR]]
+// LLVM: %[[B:.*]] = load ptr, ptr %[[B_ADDR]]
+// LLVM: %[[X:.*]] = getelementptr inbounds i8, ptr %[[B]], i32 -4
+
+// OGCG: define {{.*}} ptr @_Z14castBReftoXRefR1B(ptr {{.*}} %[[ARG0:.*]])
+// OGCG: %[[B_ADDR:.*]] = alloca ptr
+// OGCG: store ptr %[[ARG0]], ptr %[[B_ADDR]]
+// OGCG: %[[B:.*]] = load ptr, ptr %[[B_ADDR]]
+// OGCG: %[[X:.*]] = getelementptr inbounds i8, ptr %[[B]], i64 -4

From 72b02c7b376f211a6fffd5524e5db4c006ec6704 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 17 Nov 2025 09:13:55 -0800
Subject: [PATCH 12/67] [AMDGPU] Fix layering violations in AMDGPUMCExpr.cpp.
 NFC (#168242)

AMDGPUMCExpr lives in the MC layer, so it should not depend on Function.h
or GCNSubtarget.h. Move the function that needed GCNSubtarget to the one
file that called it.
---
 llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp   | 30 +++++++++++++++++--
 .../AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp      | 26 ----------------
 .../Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h |  5 ----
 3 files changed, 28 insertions(+), 33 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 29f8f9bc8b54c..8bfdbb7c5c310 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -358,6 +358,32 @@ bool AMDGPUAsmPrinter::doInitialization(Module &M) {
   return AsmPrinter::doInitialization(M);
 }
 
+/// Mimics GCNSubtarget::computeOccupancy for MCExpr.
+///
+/// Remove dependency on GCNSubtarget and depend only on the necessary values
+/// for said occupancy computation. Should match the computeOccupancy
+/// implementation without passing \p STM on.
+const AMDGPUMCExpr *createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
+                                    const MCExpr *NumVGPRs,
+                                    unsigned DynamicVGPRBlockSize,
+                                    const GCNSubtarget &STM, MCContext &Ctx) {
+  unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM);
+  unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM, DynamicVGPRBlockSize);
+  unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM);
+  unsigned Generation = STM.getGeneration();
+
+  auto CreateExpr = [&Ctx](unsigned Value) {
+    return MCConstantExpr::create(Value, Ctx);
+  };
+
+  return AMDGPUMCExpr::create(AMDGPUMCExpr::AGVK_Occupancy,
+                              {CreateExpr(MaxWaves), CreateExpr(Granule),
+                               CreateExpr(TargetTotalNumVGPRs),
+                               CreateExpr(Generation), CreateExpr(InitOcc),
+                               NumSGPRs, NumVGPRs},
+                              Ctx);
+}
+
 void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
   if (F.isDeclaration() ||
       !AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
     return;
@@ -459,7 +485,7 @@ void AMDGPUAsmPrinter::validateMCResourceInfo(Function &F) {
           MaxWaves, MFI.getDynamicVGPRBlockSize())});
   uint64_t NumSGPRsForWavesPerEU = std::max(
       {NumSgpr, (uint64_t)1, (uint64_t)STM.getMinNumSGPRs(MaxWaves)});
-  const MCExpr *OccupancyExpr = AMDGPUMCExpr::createOccupancy(
+  const MCExpr *OccupancyExpr = createOccupancy(
       STM.getOccupancyWithWorkGroupSizes(*MF).second,
       MCConstantExpr::create(NumSGPRsForWavesPerEU, OutContext),
       MCConstantExpr::create(NumVGPRsForWavesPerEU, OutContext),
@@ -1270,7 +1296,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
       amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT,
       amdhsa::COMPUTE_PGM_RSRC3_GFX125_NAMED_BAR_CNT_SHIFT);
 
-  ProgInfo.Occupancy = AMDGPUMCExpr::createOccupancy(
+  ProgInfo.Occupancy = createOccupancy(
       STM.computeOccupancy(F, ProgInfo.LDSSize).second,
       ProgInfo.NumSGPRsForWavesPerEU, ProgInfo.NumVGPRsForWavesPerEU,
       MFI->getDynamicVGPRBlockSize(), STM, Ctx);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
index c27be0250e386..093c85ecabab0 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.cpp
@@ -7,9 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUMCExpr.h"
-#include "GCNSubtarget.h"
 #include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/IR/Function.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
@@ -317,30 +315,6 @@ const AMDGPUMCExpr *AMDGPUMCExpr::createTotalNumVGPR(const MCExpr *NumAGPR,
   return create(AGVK_TotalNumVGPRs, {NumAGPR, NumVGPR}, Ctx);
 }
 
-/// Mimics GCNSubtarget::computeOccupancy for MCExpr.
-///
-/// Remove dependency on GCNSubtarget and depend only only the necessary values
-/// for said occupancy computation. Should match computeOccupancy implementation
-/// without passing \p STM on.
-const AMDGPUMCExpr *AMDGPUMCExpr::createOccupancy(
-    unsigned InitOcc, const MCExpr *NumSGPRs, const MCExpr *NumVGPRs,
-    unsigned DynamicVGPRBlockSize, const GCNSubtarget &STM, MCContext &Ctx) {
-  unsigned MaxWaves = IsaInfo::getMaxWavesPerEU(&STM);
-  unsigned Granule = IsaInfo::getVGPRAllocGranule(&STM, DynamicVGPRBlockSize);
-  unsigned TargetTotalNumVGPRs = IsaInfo::getTotalNumVGPRs(&STM);
-  unsigned Generation = STM.getGeneration();
-
-  auto CreateExpr = [&Ctx](unsigned Value) {
-    return MCConstantExpr::create(Value, Ctx);
-  };
-
-  return create(AGVK_Occupancy,
-                {CreateExpr(MaxWaves), CreateExpr(Granule),
-                 CreateExpr(TargetTotalNumVGPRs), CreateExpr(Generation),
-                 CreateExpr(InitOcc), NumSGPRs, NumVGPRs},
-                Ctx);
-}
-
 const AMDGPUMCExpr *AMDGPUMCExpr::createLit(LitModifier Lit, int64_t Value,
                                             MCContext &Ctx) {
   assert(Lit == LitModifier::Lit || Lit == LitModifier::Lit64);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
index 246a3f88ebce4..bf7b40b1851da 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCExpr.h
@@ -98,11 +98,6 @@ class AMDGPUMCExpr : public MCTargetExpr {
     return create(VariantKind::AGVK_AlignTo, {Value, Align}, Ctx);
   }
 
-  static const AMDGPUMCExpr *
-  createOccupancy(unsigned InitOcc, const MCExpr *NumSGPRs,
-                  const MCExpr *NumVGPRs, unsigned DynamicVGPRBlockSize,
-                  const GCNSubtarget &STM, MCContext &Ctx);
-
   static const AMDGPUMCExpr *createLit(LitModifier Lit, int64_t Value,
                                        MCContext &Ctx);

From fb2b1387fb73a390d5d3e033277ed328c20553c3 Mon Sep 17 00:00:00 2001
From: Andy Kaylor
Date: Mon, 17 Nov 2025 09:14:19 -0800
Subject: [PATCH 13/67] [CIR] Handle __asm labels on function declarations
 (#168149)

This updates the CIR direct callee builder code to handle the case of
calls to functions that were declared with an assembly label using
`__asm`. The implementation doesn't actually have any explicit handling
of the AsmLabelAttr. It is handled by the name mangler.

See https://reviews.llvm.org/D137073 and https://reviews.llvm.org/D134362
for details on how this was implemented in classic codegen.

The test here is copied from https://reviews.llvm.org/D134362, because the
original test there requires a target that isn't yet supported in CIR.
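The shape of the pattern being handled is worth spelling out (a distilled
sketch of the test added below, not additional behavior; the declaration
mirrors glibc-style library headers):

```cpp
// The asm label replaces the symbol name, so the mangled name is
// "__vprintfieee128" rather than "vprintf". The always_inline, gnu_inline
// body of the builtin is therefore emitted as "__vprintfieee128.inline",
// and calls to vprintf must resolve to that same name.
extern "C" int vprintf(const char *__restrict __fmt, __builtin_va_list __ap)
    __asm("__vprintfieee128");
```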
--- clang/lib/CIR/CodeGen/CIRGenExpr.cpp | 6 +- .../CIR/CodeGen/asm-label-inline-builtins.c | 58 +++++++++++++++++++ 2 files changed, 59 insertions(+), 5 deletions(-) create mode 100644 clang/test/CIR/CodeGen/asm-label-inline-builtins.c diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index 681a801cd7d81..8607558c1cf7d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -1798,11 +1798,7 @@ CIRGenCallee CIRGenFunction::emitDirectCallee(const GlobalDecl &gd) { const auto *fd = cast(gd.getDecl()); if (unsigned builtinID = fd->getBuiltinID()) { - if (fd->getAttr()) { - cgm.errorNYI("AsmLabelAttr"); - } - - StringRef ident = fd->getName(); + StringRef ident = cgm.getMangledName(gd); std::string fdInlineName = (ident + ".inline").str(); bool isPredefinedLibFunction = diff --git a/clang/test/CIR/CodeGen/asm-label-inline-builtins.c b/clang/test/CIR/CodeGen/asm-label-inline-builtins.c new file mode 100644 index 0000000000000..24c9a32e7c41d --- /dev/null +++ b/clang/test/CIR/CodeGen/asm-label-inline-builtins.c @@ -0,0 +1,58 @@ +// RUN: %clang_cc1 -triple x86_64 -fclangir -emit-cir -disable-llvm-passes -o %t-cir.cir %s +// RUN: FileCheck --input-file=%t-cir.cir %s --check-prefix=CIR +// RUN: %clang_cc1 -triple x86_64 -fclangir -emit-llvm -disable-llvm-passes -o %t-cir.ll %s +// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM +// RUN: %clang_cc1 -triple x86_64 -emit-llvm -disable-llvm-passes -o %t.ll %s +// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG + + +// Verifies that clang-generated *.inline carry the same name at call and callee +// site, in spite of asm labels. + +typedef struct _IO_FILE FILE; +extern FILE *stdout; +extern int vprintf (const char *__restrict __format, __builtin_va_list __arg); +extern int __vfprintf_chk (FILE *__restrict __stream, int __flag, + const char *__restrict __format, __builtin_va_list __ap); +extern int __vprintf_chk (int __flag, const char *__restrict __format, + __builtin_va_list __ap); + +extern __typeof (vprintf) vprintf __asm ("__vprintfieee128"); +extern __typeof (__vfprintf_chk) __vfprintf_chk __asm ("__vfprintf_chkieee128"); +extern __typeof (__vprintf_chk) __vprintf_chk __asm ("__vprintf_chkieee128"); + +extern __inline __attribute__ ((__always_inline__)) __attribute__ ((__gnu_inline__)) __attribute__ ((__artificial__)) int +vprintf (const char *__restrict __fmt, __builtin_va_list __ap) +{ + return __vfprintf_chk (stdout, 2 - 1, __fmt, __ap); +} + +void test(const char *fmt, __builtin_va_list ap) { + vprintf(fmt, ap); +} + +// CIR: cir.func internal private @__vprintfieee128.inline({{.*}}) -> !s32i inline(always) +// CIR: cir.call @__vfprintf_chkieee128(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) +// +// CIR: cir.func {{.*}} @test({{.*}}) +// CIR: cir.call @__vprintfieee128.inline(%{{.*}}, %{{.*}}) + + +// LLVM: define internal i32 @__vprintfieee128.inline({{.*}}) #[[ALWAYS_INLINE_ATTR:.*]] { +// LLVM: call i32 @__vfprintf_chkieee128(ptr %{{.*}}, i32 1, ptr %{{.*}}, ptr %{{.*}}) +// +// LLVM: define {{.*}} void @test{{.*}} +// LLVM: call i32 @__vprintfieee128.inline(ptr %{{.*}}, ptr %{{.*}}) +// +// LLVM: attributes #[[ALWAYS_INLINE_ATTR]] = { alwaysinline } + +// Note: OGCG emits these in the opposite order, but the content is the same. 
+ + +// OGCG: define {{.*}} void @test{{.*}} +// OGCG: call i32 @__vprintfieee128.inline(ptr noundef %{{.*}}, ptr noundef %{{.*}}) +// +// OGCG: define internal i32 @__vprintfieee128.inline({{.*}}) #[[ALWAYS_INLINE_ATTR:.*]] { +// OGCG: call i32 @__vfprintf_chkieee128(ptr noundef %{{.*}}, i32 noundef 1, ptr noundef %{{.*}}, ptr noundef %{{.*}}) +// +// OGCG: attributes #[[ALWAYS_INLINE_ATTR]] = { alwaysinline {{.*}} } From 8c674f04aa57766bbc7fac97c1e42526b22a95a4 Mon Sep 17 00:00:00 2001 From: Akash Banerjee Date: Mon, 17 Nov 2025 17:18:12 +0000 Subject: [PATCH 14/67] [OpenMP][Flang] Change the OmpDefaultMapperName suffix (#168399) This PR fixes a Fortran syntax violation in the OpenMP default mapper naming convention. The suffix .omp.default.mapper contains dots which are invalid in Fortran identifiers, causing failures when mappers are written to and read from module files. The fix changes the suffix to _omp_default_mapper which uses underscores instead of dots, complying with Fortran syntax rules. Key changes: - Changed OmpDefaultMapperName constant from .omp.default.mapper to _omp_default_mapper - Added GetUltimate() calls in mapper symbol resolution to properly handle symbols across module boundaries - Added new test case verifying default mappers work correctly when defined in a module and used in consuming programs This fixes #168336. --- flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 3 +- flang/lib/Lower/OpenMP/OpenMP.cpp | 4 +- flang/test/Lower/OpenMP/declare-mapper.f90 | 43 +++++++++++++++++-- flang/test/Lower/OpenMP/derived-type-map.f90 | 2 +- flang/test/Lower/OpenMP/map-mapper.f90 | 4 +- flang/test/Lower/OpenMP/target.f90 | 2 +- .../Parser/OpenMP/declare-mapper-unparse.f90 | 2 +- .../OpenMP/openmp6-directive-spellings.f90 | 2 +- .../OpenMP/declare-mapper-symbols.f90 | 2 +- .../llvm/Frontend/OpenMP/OMPConstants.h | 2 +- 10 files changed, 51 insertions(+), 15 deletions(-) diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index e018a2d937435..4a392381287d5 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -1278,7 +1278,8 @@ void ClauseProcessor::processMapObjects( std::string mapperIdName = typeSpec->name().ToString() + llvm::omp::OmpDefaultMapperName; if (auto *sym = converter.getCurrentScope().FindSymbol(mapperIdName)) { - mapperIdName = converter.mangleName(mapperIdName, sym->owner()); + mapperIdName = + converter.mangleName(mapperIdName, sym->GetUltimate().owner()); } else { mapperIdName = converter.mangleName(mapperIdName, *typeSpec->GetScope()); } diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index f822fe3c8dd71..c6487349c4056 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -2612,8 +2612,8 @@ genTargetOp(lower::AbstractConverter &converter, lower::SymMap &symTable, typeSpec->name().ToString() + llvm::omp::OmpDefaultMapperName; if (auto *mapperSym = converter.getCurrentScope().FindSymbol(mapperIdName)) - mapperIdName = - converter.mangleName(mapperIdName, mapperSym->owner()); + mapperIdName = converter.mangleName( + mapperIdName, mapperSym->GetUltimate().owner()); else mapperIdName = converter.mangleName(mapperIdName, *typeSpec->GetScope()); diff --git a/flang/test/Lower/OpenMP/declare-mapper.f90 b/flang/test/Lower/OpenMP/declare-mapper.f90 index 0266365cf03c0..a24d6bd0bf946 100644 --- a/flang/test/Lower/OpenMP/declare-mapper.f90 +++ b/flang/test/Lower/OpenMP/declare-mapper.f90 @@ -9,6 +9,8 @@ 
! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 %t/omp-declare-mapper-6.f90 -o - | FileCheck %t/omp-declare-mapper-6.f90 ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -module-dir %t %t/omp-declare-mapper-7.mod.f90 -o - >/dev/null ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -J %t %t/omp-declare-mapper-7.use.f90 -o - | FileCheck %t/omp-declare-mapper-7.use.f90 +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -module-dir %t %t/omp-declare-mapper-8.mod.f90 -o - >/dev/null +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 -J %t %t/omp-declare-mapper-8.use.f90 -o - | FileCheck %t/omp-declare-mapper-8.use.f90 !--- omp-declare-mapper-1.f90 subroutine declare_mapper_1 @@ -26,7 +28,7 @@ subroutine declare_mapper_1 end type type(my_type2) :: t real :: x, y(nvals) - !CHECK:omp.declare_mapper @[[MY_TYPE_MAPPER:_QQFdeclare_mapper_1my_type\.omp\.default\.mapper]] : [[MY_TYPE:!fir\.type<_QFdeclare_mapper_1Tmy_type\{num_vals:i32,values:!fir\.box>>\}>]] { + !CHECK:omp.declare_mapper @[[MY_TYPE_MAPPER:_QQFdeclare_mapper_1my_type_omp_default_mapper]] : [[MY_TYPE:!fir\.type<_QFdeclare_mapper_1Tmy_type\{num_vals:i32,values:!fir\.box>>\}>]] { !CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<[[MY_TYPE]]>): !CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFdeclare_mapper_1Evar"} : (!fir.ref<[[MY_TYPE]]>) -> (!fir.ref<[[MY_TYPE]]>, !fir.ref<[[MY_TYPE]]>) !CHECK: %[[VAL_2:.*]] = hlfir.designate %[[VAL_1]]#0{"values"} {fortran_attrs = #fir.var_attrs} : (!fir.ref<[[MY_TYPE]]>) -> !fir.ref>>> @@ -153,7 +155,7 @@ subroutine declare_mapper_4 integer :: num end type - !CHECK: omp.declare_mapper @[[MY_TYPE_MAPPER:_QQFdeclare_mapper_4my_type.omp.default.mapper]] : [[MY_TYPE:!fir\.type<_QFdeclare_mapper_4Tmy_type\{num:i32\}>]] + !CHECK: omp.declare_mapper @[[MY_TYPE_MAPPER:_QQFdeclare_mapper_4my_type_omp_default_mapper]] : [[MY_TYPE:!fir\.type<_QFdeclare_mapper_4Tmy_type\{num:i32\}>]] !$omp declare mapper (my_type :: var) map (var%num) type(my_type) :: a @@ -185,9 +187,9 @@ program declare_mapper_5 end type !CHECK: omp.declare_mapper @[[INNER_MAPPER_NAMED:_QQFFuse_innermy_mapper]] : [[MY_TYPE:!fir\.type<_QFTmytype\{x:i32,y:i32\}>]] - !CHECK: omp.declare_mapper @[[INNER_MAPPER_DEFAULT:_QQFFuse_innermytype.omp.default.mapper]] : [[MY_TYPE]] + !CHECK: omp.declare_mapper @[[INNER_MAPPER_DEFAULT:_QQFFuse_innermytype_omp_default_mapper]] : [[MY_TYPE]] !CHECK: omp.declare_mapper @[[OUTER_MAPPER_NAMED:_QQFmy_mapper]] : [[MY_TYPE]] - !CHECK: omp.declare_mapper @[[OUTER_MAPPER_DEFAULT:_QQFmytype.omp.default.mapper]] : [[MY_TYPE]] + !CHECK: omp.declare_mapper @[[OUTER_MAPPER_DEFAULT:_QQFmytype_omp_default_mapper]] : [[MY_TYPE]] !$omp declare mapper(mytype :: var) map(tofrom: var%x) !$omp declare mapper(my_mapper : mytype :: var) map(tofrom: var%y) @@ -325,3 +327,36 @@ program use_module_mapper a%x = 42 !$omp end target end program use_module_mapper + +!--- omp-declare-mapper-8.mod.f90 +! Module with a default DECLARE MAPPER to be compiled separately. +module default_mapper_mod + implicit none + type :: dtype + integer :: x + end type dtype + !$omp declare mapper(dtype :: v) map(tofrom: v%x) +end module default_mapper_mod + +!--- omp-declare-mapper-8.use.f90 +! Consumer program that USEs the module and relies on the default mapper. +! CHECK: omp.declare_mapper @{{.*dtype_omp_default_mapper}} : !fir.type<_QMdefault_mapper_modTdtype{x:i32}> +! CHECK: %{{.*}} = omp.map.info {{.*}} map_clauses(tofrom) {{.*}} mapper(@{{.*dtype_omp_default_mapper}}) {{.*}} {name = "a"} +! 
CHECK: %{{.*}} = omp.map.info {{.*}} map_clauses(tofrom) {{.*}} mapper(@{{.*dtype_omp_default_mapper}}) {{.*}} {name = "a"} +! CHECK: %{{.*}} = omp.map.info {{.*}} map_clauses(implicit, tofrom) {{.*}} mapper(@{{.*dtype_omp_default_mapper}}) {{.*}} {name = "a"} +program use_module_default_mapper + use default_mapper_mod + implicit none + type(dtype) :: a + !$omp target map(a) + a%x = 7 + !$omp end target + + !$omp target map(mapper(default) : a) + a%x = 8 + !$omp end target + + !$omp target + a%x = 8 + !$omp end target +end program use_module_default_mapper diff --git a/flang/test/Lower/OpenMP/derived-type-map.f90 b/flang/test/Lower/OpenMP/derived-type-map.f90 index 228e86d9e4dfd..921dd5663f8c5 100644 --- a/flang/test/Lower/OpenMP/derived-type-map.f90 +++ b/flang/test/Lower/OpenMP/derived-type-map.f90 @@ -1,6 +1,6 @@ !RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s -!CHECK: omp.declare_mapper @[[MAPPER1:_QQFmaptype_derived_implicit_allocatablescalar_and_array.omp.default.mapper]] : !fir.type<_QFmaptype_derived_implicit_allocatableTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}> { +!CHECK: omp.declare_mapper @[[MAPPER1:_QQFmaptype_derived_implicit_allocatablescalar_and_array_omp_default_mapper]] : !fir.type<_QFmaptype_derived_implicit_allocatableTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}> { !CHECK: %[[ALLOCA:.*]] = fir.alloca !fir.type<_QFmaptype_derived_implicitTscalar_and_array{real:f32,array:!fir.array<10xi32>,int:i32}> {bindc_name = "scalar_arr", uniq_name = "_QFmaptype_derived_implicitEscalar_arr"} !CHECK: %[[DECLARE:.*]]:2 = hlfir.declare %[[ALLOCA]] {uniq_name = "_QFmaptype_derived_implicitEscalar_arr"} : (!fir.ref,int:i32}>>) -> (!fir.ref,int:i32}>>, !fir.ref,int:i32}>>) diff --git a/flang/test/Lower/OpenMP/map-mapper.f90 b/flang/test/Lower/OpenMP/map-mapper.f90 index 91564bfc7bc46..8934fbb5d6edf 100644 --- a/flang/test/Lower/OpenMP/map-mapper.f90 +++ b/flang/test/Lower/OpenMP/map-mapper.f90 @@ -8,7 +8,7 @@ program p !$omp declare mapper(xx : t1 :: nn) map(to: nn, nn%x) !$omp declare mapper(t1 :: nn) map(from: nn) - !CHECK-LABEL: omp.declare_mapper @_QQFt1.omp.default.mapper : !fir.type<_QFTt1{x:!fir.array<256xi32>}> + !CHECK-LABEL: omp.declare_mapper @_QQFt1_omp_default_mapper : !fir.type<_QFTt1{x:!fir.array<256xi32>}> !CHECK-LABEL: omp.declare_mapper @_QQFxx : !fir.type<_QFTt1{x:!fir.array<256xi32>}> type(t1) :: a, b @@ -20,7 +20,7 @@ program p end do !$omp end target - !CHECK: %[[MAP_B:.*]] = omp.map.info var_ptr(%{{.*}} : {{.*}}, {{.*}}) map_clauses(tofrom) capture(ByRef) mapper(@_QQFt1.omp.default.mapper) -> {{.*}} {name = "b"} + !CHECK: %[[MAP_B:.*]] = omp.map.info var_ptr(%{{.*}} : {{.*}}, {{.*}}) map_clauses(tofrom) capture(ByRef) mapper(@_QQFt1_omp_default_mapper) -> {{.*}} {name = "b"} !CHECK: omp.target map_entries(%[[MAP_B]] -> %{{.*}}, %{{.*}} -> %{{.*}} : {{.*}}, {{.*}}) { !$omp target map(mapper(default) : b) do i = 1, n diff --git a/flang/test/Lower/OpenMP/target.f90 b/flang/test/Lower/OpenMP/target.f90 index 94907ba3ae74a..7cd642bcf23cf 100644 --- a/flang/test/Lower/OpenMP/target.f90 +++ b/flang/test/Lower/OpenMP/target.f90 @@ -529,7 +529,7 @@ subroutine omp_target_device_ptr use iso_c_binding, only : c_ptr, c_loc type(c_ptr) :: a integer, target :: b - !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(tofrom) capture(ByRef) mapper(@[[CPTR_DEFAULT:_QQM__fortran_builtinsc_ptr\.omp\.default\.mapper]]) -> {{.*}} {name = "a"} + !CHECK: %[[MAP:.*]] = omp.map.info var_ptr({{.*}}) map_clauses(tofrom) capture(ByRef) 
mapper(@[[CPTR_DEFAULT:_QQM__fortran_builtinsc_ptr_omp_default_mapper]]) -> {{.*}} {name = "a"} !CHECK: omp.target_data map_entries(%[[MAP]]{{.*}}) use_device_ptr({{.*}} -> %[[VAL_1:.*]] : !fir.ref>) !$omp target data map(tofrom: a) use_device_ptr(a) !CHECK: {{.*}} = fir.coordinate_of %[[VAL_1:.*]], __address : (!fir.ref>) -> !fir.ref diff --git a/flang/test/Parser/OpenMP/declare-mapper-unparse.f90 b/flang/test/Parser/OpenMP/declare-mapper-unparse.f90 index b53bf5ce10557..9da6674c3a58d 100644 --- a/flang/test/Parser/OpenMP/declare-mapper-unparse.f90 +++ b/flang/test/Parser/OpenMP/declare-mapper-unparse.f90 @@ -29,7 +29,7 @@ program main !PARSE-TREE: OpenMPDeclareMapperConstruct !PARSE-TREE: OmpMapperSpecifier -!PARSE-TREE: string = 'ty.omp.default.mapper' +!PARSE-TREE: string = 'ty_omp_default_mapper' !PARSE-TREE: TypeSpec -> DerivedTypeSpec !PARSE-TREE: Name = 'ty' !PARSE-TREE: Name = 'mapped' diff --git a/flang/test/Parser/OpenMP/openmp6-directive-spellings.f90 b/flang/test/Parser/OpenMP/openmp6-directive-spellings.f90 index 50a38c6494aa6..7a627913f9555 100644 --- a/flang/test/Parser/OpenMP/openmp6-directive-spellings.f90 +++ b/flang/test/Parser/OpenMP/openmp6-directive-spellings.f90 @@ -57,7 +57,7 @@ subroutine f01 !PARSE-TREE: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclareMapperConstruct -> OmpDirectiveSpecification !PARSE-TREE: | OmpDirectiveName -> llvm::omp::Directive = declare mapper !PARSE-TREE: | OmpArgumentList -> OmpArgument -> OmpMapperSpecifier -!PARSE-TREE: | | string = 't.omp.default.mapper' +!PARSE-TREE: | | string = 't_omp_default_mapper' !PARSE-TREE: | | TypeSpec -> DerivedTypeSpec !PARSE-TREE: | | | Name = 't' !PARSE-TREE: | | Name = 'v' diff --git a/flang/test/Semantics/OpenMP/declare-mapper-symbols.f90 b/flang/test/Semantics/OpenMP/declare-mapper-symbols.f90 index 5d77540aa6453..9a1b86758357f 100644 --- a/flang/test/Semantics/OpenMP/declare-mapper-symbols.f90 +++ b/flang/test/Semantics/OpenMP/declare-mapper-symbols.f90 @@ -13,7 +13,7 @@ program main !! Note, symbols come out in their respective scope, but not in declaration order. !CHECK: mymapper: MapperDetails !CHECK: ty: DerivedType components: x -!CHECK: ty.omp.default.mapper: MapperDetails +!CHECK: ty_omp_default_mapper: MapperDetails !CHECK: DerivedType scope: ty !CHECK: OtherConstruct scope: !CHECK: mapped (OmpMapToFrom) {{.*}} ObjectEntity type: TYPE(ty) diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h index 1ac9ac040468c..58fd8a490c04a 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h @@ -201,7 +201,7 @@ enum class OMPDynGroupprivateFallbackType : uint64_t { }; // Default OpenMP mapper name suffix. -inline constexpr const char *OmpDefaultMapperName = ".omp.default.mapper"; +inline constexpr const char *OmpDefaultMapperName = "_omp_default_mapper"; /// Values for bit flags used to specify the mapping type for /// offloading. From 2b22e9b13330d47ae22cb0aa8016ddbb567bf94f Mon Sep 17 00:00:00 2001 From: Steven Perron Date: Mon, 17 Nov 2025 12:21:46 -0500 Subject: [PATCH 15/67] [SPIRV] Use a worklist in the post-legalizer (#165027) This commit refactors the SPIRV post-legalizer to use a worklist to process new instructions. Previously, the post-legalizer would iterate through all instructions and try to assign types. This could fail if a new instruction depended on another new instruction that had not been processed yet. 
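As a minimal sketch of the fixpoint pattern the new code follows (helper
names here are invented; the real pass operates on `MachineInstr`
worklists):

```cpp
// Retry type deduction until a full pass over the worklist makes no
// progress; an instruction whose operands only become typed in a later
// round is simply revisited then.
bool Changed;
do {
  Changed = false;
  llvm::SmallVector<MachineInstr *> Next;
  for (MachineInstr *I : Worklist) {
    if (tryDeduceType(I)) // hypothetical helper; succeeds once operands are typed
      Changed = true;
    else
      Next.push_back(I); // keep for the next round
  }
  Worklist = std::move(Next);
} while (Changed && !Worklist.empty());
```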
The new implementation adds all new instructions that require a SPIR-V type to a worklist. It then iteratively processes the worklist until it is empty. This ensures that all dependencies are met before an instruction is processed. This change makes the post-legalizer more robust and fixes potential ordering issues with newly generated instructions. Existing tests cover existing functionality. More tests will be added as the legalizer is modified. Part of #153091 --- llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp | 370 +++++++++++++++---- 1 file changed, 302 insertions(+), 68 deletions(-) diff --git a/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp index d17528dd882bf..751ae0fe34d33 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPostLegalizer.cpp @@ -17,7 +17,8 @@ #include "SPIRV.h" #include "SPIRVSubtarget.h" #include "SPIRVUtils.h" -#include "llvm/IR/Attributes.h" +#include "llvm/IR/IntrinsicsSPIRV.h" +#include "llvm/Support/Debug.h" #include #define DEBUG_TYPE "spirv-postlegalizer" @@ -43,79 +44,314 @@ extern void processInstr(MachineInstr &MI, MachineIRBuilder &MIB, SPIRVType *KnownResType); } // namespace llvm -static bool mayBeInserted(unsigned Opcode) { - switch (Opcode) { - case TargetOpcode::G_SMAX: - case TargetOpcode::G_UMAX: - case TargetOpcode::G_SMIN: - case TargetOpcode::G_UMIN: - case TargetOpcode::G_FMINNUM: - case TargetOpcode::G_FMINIMUM: - case TargetOpcode::G_FMAXNUM: - case TargetOpcode::G_FMAXIMUM: - return true; +static SPIRVType *deduceIntTypeFromResult(Register ResVReg, + MachineIRBuilder &MIB, + SPIRVGlobalRegistry *GR) { + const LLT &Ty = MIB.getMRI()->getType(ResVReg); + return GR->getOrCreateSPIRVIntegerType(Ty.getScalarSizeInBits(), MIB); +} + +static bool deduceAndAssignTypeForGUnmerge(MachineInstr *I, MachineFunction &MF, + SPIRVGlobalRegistry *GR) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + Register SrcReg = I->getOperand(I->getNumOperands() - 1).getReg(); + SPIRVType *ScalarType = nullptr; + if (SPIRVType *DefType = GR->getSPIRVTypeForVReg(SrcReg)) { + assert(DefType->getOpcode() == SPIRV::OpTypeVector); + ScalarType = GR->getSPIRVTypeForVReg(DefType->getOperand(1).getReg()); + } + + if (!ScalarType) { + // If we could not deduce the type from the source, try to deduce it from + // the uses of the results. + for (unsigned i = 0; i < I->getNumDefs() && !ScalarType; ++i) { + for (const auto &Use : + MRI.use_nodbg_instructions(I->getOperand(i).getReg())) { + assert(Use.getOpcode() == TargetOpcode::G_BUILD_VECTOR && + "Expected use of G_UNMERGE_VALUES to be a G_BUILD_VECTOR"); + if (auto *VecType = + GR->getSPIRVTypeForVReg(Use.getOperand(0).getReg())) { + ScalarType = GR->getScalarOrVectorComponentType(VecType); + break; + } + } + } + } + + if (!ScalarType) + return false; + + for (unsigned i = 0; i < I->getNumDefs(); ++i) { + Register DefReg = I->getOperand(i).getReg(); + if (GR->getSPIRVTypeForVReg(DefReg)) + continue; + + LLT DefLLT = MRI.getType(DefReg); + SPIRVType *ResType = + DefLLT.isVector() + ? 
GR->getOrCreateSPIRVVectorType( + ScalarType, DefLLT.getNumElements(), *I, + *MF.getSubtarget().getInstrInfo()) + : ScalarType; + setRegClassType(DefReg, ResType, GR, &MRI, MF); + } + return true; +} + +static SPIRVType *deduceTypeFromSingleOperand(MachineInstr *I, + MachineIRBuilder &MIB, + SPIRVGlobalRegistry *GR, + unsigned OpIdx) { + Register OpReg = I->getOperand(OpIdx).getReg(); + if (SPIRVType *OpType = GR->getSPIRVTypeForVReg(OpReg)) { + if (SPIRVType *CompType = GR->getScalarOrVectorComponentType(OpType)) { + Register ResVReg = I->getOperand(0).getReg(); + const LLT &ResLLT = MIB.getMRI()->getType(ResVReg); + if (ResLLT.isVector()) + return GR->getOrCreateSPIRVVectorType(CompType, ResLLT.getNumElements(), + MIB, false); + return CompType; + } + } + return nullptr; +} + +static SPIRVType *deduceTypeFromOperandRange(MachineInstr *I, + MachineIRBuilder &MIB, + SPIRVGlobalRegistry *GR, + unsigned StartOp, unsigned EndOp) { + SPIRVType *ResType = nullptr; + for (unsigned i = StartOp; i < EndOp; ++i) { + if (SPIRVType *Type = deduceTypeFromSingleOperand(I, MIB, GR, i)) { +#ifdef EXPENSIVE_CHECKS + assert(!ResType || Type == ResType && "Conflicting type from operands."); + ResType = Type; +#else + return Type; +#endif + } + } + return ResType; +} + +static SPIRVType *deduceTypeForResultRegister(MachineInstr *Use, + Register UseRegister, + SPIRVGlobalRegistry *GR, + MachineIRBuilder &MIB) { + for (const MachineOperand &MO : Use->defs()) { + if (!MO.isReg()) + continue; + if (SPIRVType *OpType = GR->getSPIRVTypeForVReg(MO.getReg())) { + if (SPIRVType *CompType = GR->getScalarOrVectorComponentType(OpType)) { + const LLT &ResLLT = MIB.getMRI()->getType(UseRegister); + if (ResLLT.isVector()) + return GR->getOrCreateSPIRVVectorType( + CompType, ResLLT.getNumElements(), MIB, false); + return CompType; + } + } + } + return nullptr; +} + +static SPIRVType *deduceTypeFromUses(Register Reg, MachineFunction &MF, + SPIRVGlobalRegistry *GR, + MachineIRBuilder &MIB) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + for (MachineInstr &Use : MRI.use_nodbg_instructions(Reg)) { + SPIRVType *ResType = nullptr; + switch (Use.getOpcode()) { + case TargetOpcode::G_BUILD_VECTOR: + case TargetOpcode::G_EXTRACT_VECTOR_ELT: + case TargetOpcode::G_UNMERGE_VALUES: + LLVM_DEBUG(dbgs() << "Looking at use " << Use << "\n"); + ResType = deduceTypeForResultRegister(&Use, Reg, GR, MIB); + break; + } + if (ResType) + return ResType; + } + return nullptr; +} + +static SPIRVType *deduceResultTypeFromOperands(MachineInstr *I, + SPIRVGlobalRegistry *GR, + MachineIRBuilder &MIB) { + Register ResVReg = I->getOperand(0).getReg(); + switch (I->getOpcode()) { + case TargetOpcode::G_CONSTANT: + case TargetOpcode::G_ANYEXT: + return deduceIntTypeFromResult(ResVReg, MIB, GR); + case TargetOpcode::G_BUILD_VECTOR: + return deduceTypeFromOperandRange(I, MIB, GR, 1, I->getNumOperands()); + case TargetOpcode::G_SHUFFLE_VECTOR: + return deduceTypeFromOperandRange(I, MIB, GR, 1, 3); default: - return isTypeFoldingSupported(Opcode); + if (I->getNumDefs() == 1 && I->getNumOperands() > 1 && + I->getOperand(1).isReg()) + return deduceTypeFromSingleOperand(I, MIB, GR, 1); + return nullptr; } } -static void processNewInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, - MachineIRBuilder MIB) { +static bool deduceAndAssignSpirvType(MachineInstr *I, MachineFunction &MF, + SPIRVGlobalRegistry *GR, + MachineIRBuilder &MIB) { + LLVM_DEBUG(dbgs() << "\nProcessing instruction: " << *I); MachineRegisterInfo &MRI = MF.getRegInfo(); + Register ResVReg = 
I->getOperand(0).getReg(); + + // G_UNMERGE_VALUES is handled separately because it has multiple definitions, + // unlike the other instructions which have a single result register. The main + // deduction logic is designed for the single-definition case. + if (I->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) + return deduceAndAssignTypeForGUnmerge(I, MF, GR); + + LLVM_DEBUG(dbgs() << "Inferring type from operands\n"); + SPIRVType *ResType = deduceResultTypeFromOperands(I, GR, MIB); + if (!ResType) { + LLVM_DEBUG(dbgs() << "Inferring type from uses\n"); + ResType = deduceTypeFromUses(ResVReg, MF, GR, MIB); + } + + if (!ResType) + return false; + + LLVM_DEBUG(dbgs() << "Assigned type to " << *I << ": " << *ResType); + GR->assignSPIRVTypeToVReg(ResType, ResVReg, MF); + if (!MRI.getRegClassOrNull(ResVReg)) { + LLVM_DEBUG(dbgs() << "Updating the register class.\n"); + setRegClassType(ResVReg, ResType, GR, &MRI, *GR->CurMF, true); + } + return true; +} + +static bool requiresSpirvType(MachineInstr &I, SPIRVGlobalRegistry *GR, + MachineRegisterInfo &MRI) { + LLVM_DEBUG(dbgs() << "Checking if instruction requires a SPIR-V type: " + << I;); + if (I.getNumDefs() == 0) { + LLVM_DEBUG(dbgs() << "Instruction does not have a definition.\n"); + return false; + } + + if (!I.isPreISelOpcode()) { + LLVM_DEBUG(dbgs() << "Instruction is not a generic instruction.\n"); + return false; + } + + Register ResultRegister = I.defs().begin()->getReg(); + if (GR->getSPIRVTypeForVReg(ResultRegister)) { + LLVM_DEBUG(dbgs() << "Instruction already has a SPIR-V type.\n"); + if (!MRI.getRegClassOrNull(ResultRegister)) { + LLVM_DEBUG(dbgs() << "Updating the register class.\n"); + setRegClassType(ResultRegister, GR->getSPIRVTypeForVReg(ResultRegister), + GR, &MRI, *GR->CurMF, true); + } + return false; + } + + return true; +} + +static void registerSpirvTypeForNewInstructions(MachineFunction &MF, + SPIRVGlobalRegistry *GR) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + SmallVector Worklist; for (MachineBasicBlock &MBB : MF) { for (MachineInstr &I : MBB) { - const unsigned Opcode = I.getOpcode(); - if (Opcode == TargetOpcode::G_UNMERGE_VALUES) { - unsigned ArgI = I.getNumOperands() - 1; - Register SrcReg = I.getOperand(ArgI).isReg() - ? I.getOperand(ArgI).getReg() - : Register(0); - SPIRVType *DefType = - SrcReg.isValid() ? GR->getSPIRVTypeForVReg(SrcReg) : nullptr; - if (!DefType || DefType->getOpcode() != SPIRV::OpTypeVector) - report_fatal_error( - "cannot select G_UNMERGE_VALUES with a non-vector argument"); - SPIRVType *ScalarType = - GR->getSPIRVTypeForVReg(DefType->getOperand(1).getReg()); - for (unsigned i = 0; i < I.getNumDefs(); ++i) { - Register ResVReg = I.getOperand(i).getReg(); - SPIRVType *ResType = GR->getSPIRVTypeForVReg(ResVReg); - if (!ResType) { - // There was no "assign type" actions, let's fix this now - ResType = ScalarType; - setRegClassType(ResVReg, ResType, GR, &MRI, *GR->CurMF, true); - } - } - } else if (mayBeInserted(Opcode) && I.getNumDefs() == 1 && - I.getNumOperands() > 1 && I.getOperand(1).isReg()) { - // Legalizer may have added a new instructions and introduced new - // registers, we must decorate them as if they were introduced in a - // non-automatic way - Register ResVReg = I.getOperand(0).getReg(); - // Check if the register defined by the instruction is newly generated - // or already processed - // Check if we have type defined for operands of the new instruction - bool IsKnownReg = MRI.getRegClassOrNull(ResVReg); - SPIRVType *ResVType = GR->getSPIRVTypeForVReg( - IsKnownReg ? 
ResVReg : I.getOperand(1).getReg()); - if (!ResVType) - continue; - // Set type & class - if (!IsKnownReg) - setRegClassType(ResVReg, ResVType, GR, &MRI, *GR->CurMF, true); - // If this is a simple operation that is to be reduced by TableGen - // definition we must apply some of pre-legalizer rules here - if (isTypeFoldingSupported(Opcode)) { - processInstr(I, MIB, MRI, GR, GR->getSPIRVTypeForVReg(ResVReg)); - if (IsKnownReg && MRI.hasOneUse(ResVReg)) { - MachineInstr &UseMI = *MRI.use_instr_begin(ResVReg); - if (UseMI.getOpcode() == SPIRV::ASSIGN_TYPE) - continue; - } - insertAssignInstr(ResVReg, nullptr, ResVType, GR, MIB, MRI); + if (requiresSpirvType(I, GR, MRI)) { + Worklist.push_back(&I); + } + } + } + + if (Worklist.empty()) { + LLVM_DEBUG(dbgs() << "Initial worklist is empty.\n"); + return; + } + + LLVM_DEBUG(dbgs() << "Initial worklist:\n"; + for (auto *I : Worklist) { I->dump(); }); + + bool Changed; + do { + Changed = false; + SmallVector NextWorklist; + + for (MachineInstr *I : Worklist) { + MachineIRBuilder MIB(*I); + if (deduceAndAssignSpirvType(I, MF, GR, MIB)) { + Changed = true; + } else { + NextWorklist.push_back(I); + } + } + Worklist = std::move(NextWorklist); + LLVM_DEBUG(dbgs() << "Worklist size: " << Worklist.size() << "\n"); + } while (Changed); + + if (Worklist.empty()) + return; + + for (auto *I : Worklist) { + MachineIRBuilder MIB(*I); + Register ResVReg = I->getOperand(0).getReg(); + const LLT &ResLLT = MRI.getType(ResVReg); + SPIRVType *ResType = nullptr; + if (ResLLT.isVector()) { + SPIRVType *CompType = GR->getOrCreateSPIRVIntegerType( + ResLLT.getElementType().getSizeInBits(), MIB); + ResType = GR->getOrCreateSPIRVVectorType( + CompType, ResLLT.getNumElements(), MIB, false); + } else { + ResType = GR->getOrCreateSPIRVIntegerType(ResLLT.getSizeInBits(), MIB); + } + LLVM_DEBUG(dbgs() << "Could not determine type for " << *I + << ", defaulting to " << *ResType << "\n"); + setRegClassType(ResVReg, ResType, GR, &MRI, MF, true); + } +} + +static void ensureAssignTypeForTypeFolding(MachineFunction &MF, + SPIRVGlobalRegistry *GR) { + LLVM_DEBUG(dbgs() << "Entering ensureAssignTypeForTypeFolding for function " + << MF.getName() << "\n"); + MachineRegisterInfo &MRI = MF.getRegInfo(); + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (!isTypeFoldingSupported(MI.getOpcode())) + continue; + if (MI.getNumOperands() == 1 || !MI.getOperand(1).isReg()) + continue; + + LLVM_DEBUG(dbgs() << "Processing instruction: " << MI); + + // Check uses of MI to see if it already has an use in SPIRV::ASSIGN_TYPE + bool HasAssignType = false; + Register ResultRegister = MI.defs().begin()->getReg(); + // All uses of Result register + for (MachineInstr &UseInstr : + MRI.use_nodbg_instructions(ResultRegister)) { + if (UseInstr.getOpcode() == SPIRV::ASSIGN_TYPE) { + HasAssignType = true; + LLVM_DEBUG(dbgs() << " Instruction already has an ASSIGN_TYPE use: " + << UseInstr); + break; } } + + if (!HasAssignType) { + Register ResultRegister = MI.defs().begin()->getReg(); + SPIRVType *ResultType = GR->getSPIRVTypeForVReg(ResultRegister); + LLVM_DEBUG( + dbgs() << " Adding ASSIGN_TYPE for ResultRegister: " + << printReg(ResultRegister, MRI.getTargetRegisterInfo()) + << " with type: " << *ResultType); + MachineIRBuilder MIB(MI); + insertAssignInstr(ResultRegister, nullptr, ResultType, GR, MIB, MRI); + } } } } @@ -155,10 +391,8 @@ bool SPIRVPostLegalizer::runOnMachineFunction(MachineFunction &MF) { const SPIRVSubtarget &ST = MF.getSubtarget(); SPIRVGlobalRegistry *GR = 
ST.getSPIRVGlobalRegistry(); GR->setCurrentFunc(MF); - MachineIRBuilder MIB(MF); - - processNewInstrs(MF, GR, MIB); - + registerSpirvTypeForNewInstructions(MF, GR); + ensureAssignTypeForTypeFolding(MF, GR); return true; } From be6296ea8faccec5d2fbaa2625112e26a5deeb85 Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Mon, 17 Nov 2025 18:32:32 +0100 Subject: [PATCH 16/67] [RISCV] Fold Zba-expanded (mul (shr exact X, C1), C2) (#168019) --- llvm/include/llvm/CodeGen/SDPatternMatch.h | 5 ++ llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 11 ++-- llvm/test/CodeGen/RISCV/rv64zba.ll | 71 +++++++++++++++++++++ 3 files changed, 82 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h index 511cb56f73dcb..557dbf8c7ca39 100644 --- a/llvm/include/llvm/CodeGen/SDPatternMatch.h +++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h @@ -903,6 +903,11 @@ template inline BinaryOpc_match m_Srl(const LHS &L, const RHS &R) { return BinaryOpc_match(ISD::SRL, L, R); } +template +inline auto m_ExactSr(const LHS &L, const RHS &R) { + return m_AnyOf(BinaryOpc_match(ISD::SRA, L, R, SDNodeFlags::Exact), + BinaryOpc_match(ISD::SRL, L, R, SDNodeFlags::Exact)); +} template inline BinaryOpc_match m_Rotl(const LHS &L, const RHS &R) { diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index f313d3f1347d4..fb298ee35d6c2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -16798,9 +16798,7 @@ static SDValue expandMulToAddOrSubOfShl(SDNode *N, SelectionDAG &DAG, // because X is exact (Y >> M + 2). uint64_t ShAmt = Log2_64(MulAmtLowBit) + 2; using namespace SDPatternMatch; - return sd_match(X, m_AnyOf(m_Sra(m_Value(), m_SpecificInt(ShAmt)), - m_Srl(m_Value(), m_SpecificInt(ShAmt)))) && - X->getFlags().hasExact(); + return sd_match(X, m_ExactSr(m_Value(), m_SpecificInt(ShAmt))); }; if (isPowerOf2_64(MulAmt - MulAmtLowBit) && !(CanSub && PreferSub())) { Op = ISD::ADD; @@ -16825,10 +16823,13 @@ static SDValue getShlAddShlAdd(SDNode *N, SelectionDAG &DAG, unsigned ShX, SDLoc DL(N); EVT VT = N->getValueType(0); SDValue X = N->getOperand(0); - // Put the shift first if we can fold a zext into the shift forming a slli.uw. + // Put the shift first if we can fold: + // a. a zext into the shift forming a slli.uw + // b. 
an exact shift right forming one shorter shift or no shift at all using namespace SDPatternMatch; if (Shift != 0 && - sd_match(X, m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) { + sd_match(X, m_AnyOf(m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))), + m_ExactSr(m_Value(), m_ConstInt())))) { X = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(Shift, DL, VT)); Shift = 0; } diff --git a/llvm/test/CodeGen/RISCV/rv64zba.ll b/llvm/test/CodeGen/RISCV/rv64zba.ll index 4ab4ff84dac57..fb26b8b16a290 100644 --- a/llvm/test/CodeGen/RISCV/rv64zba.ll +++ b/llvm/test/CodeGen/RISCV/rv64zba.ll @@ -5016,3 +5016,74 @@ define ptr @shl_add_knownbits(ptr %p, i64 %i) { %r = getelementptr i8, ptr %p, i64 %shr ret ptr %r } + +define i64 @exactashr1mul6(i64 %a) { +; RV64I-LABEL: exactashr1mul6: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 1 +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: exactashr1mul6: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: sh1add a0, a0, a0 +; RV64ZBA-NEXT: ret +; +; RV64XANDESPERF-LABEL: exactashr1mul6: +; RV64XANDESPERF: # %bb.0: +; RV64XANDESPERF-NEXT: nds.lea.h a0, a0, a0 +; RV64XANDESPERF-NEXT: ret + %c = ashr exact i64 %a, 1 + %d = mul i64 %c, 6 + ret i64 %d +} + +define i64 @exactlshr3mul22(i64 %a) { +; RV64I-LABEL: exactlshr3mul22: +; RV64I: # %bb.0: +; RV64I-NEXT: srli a0, a0, 3 +; RV64I-NEXT: li a1, 22 +; RV64I-NEXT: mul a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: exactlshr3mul22: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: srli a0, a0, 2 +; RV64ZBA-NEXT: sh2add a1, a0, a0 +; RV64ZBA-NEXT: sh1add a0, a1, a0 +; RV64ZBA-NEXT: ret +; +; RV64XANDESPERF-LABEL: exactlshr3mul22: +; RV64XANDESPERF: # %bb.0: +; RV64XANDESPERF-NEXT: srli a0, a0, 2 +; RV64XANDESPERF-NEXT: nds.lea.w a1, a0, a0 +; RV64XANDESPERF-NEXT: nds.lea.h a0, a0, a1 +; RV64XANDESPERF-NEXT: ret + %c = lshr exact i64 %a, 3 + %d = mul i64 %c, 22 + ret i64 %d +} + +define i64 @exactashr1mul36(i64 %a) { +; RV64I-LABEL: exactashr1mul36: +; RV64I: # %bb.0: +; RV64I-NEXT: slli a1, a0, 1 +; RV64I-NEXT: slli a0, a0, 4 +; RV64I-NEXT: add a0, a0, a1 +; RV64I-NEXT: ret +; +; RV64ZBA-LABEL: exactashr1mul36: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: slli a0, a0, 1 +; RV64ZBA-NEXT: sh3add a0, a0, a0 +; RV64ZBA-NEXT: ret +; +; RV64XANDESPERF-LABEL: exactashr1mul36: +; RV64XANDESPERF: # %bb.0: +; RV64XANDESPERF-NEXT: slli a0, a0, 1 +; RV64XANDESPERF-NEXT: nds.lea.d a0, a0, a0 +; RV64XANDESPERF-NEXT: ret + %c = ashr exact i64 %a, 1 + %d = mul i64 %c, 36 + ret i64 %d +} From a9a4515b0a442ea58826047b7efb9aa2bfe48749 Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Mon, 17 Nov 2025 09:38:02 -0800 Subject: [PATCH 17/67] [lld][MachO] Read cstring order for non deduped sections (#161879) https://github.com/llvm/llvm-project/pull/140307 added support for cstring hashes in the orderfile to layout cstrings in a specific order, but only when `--deduplicate-strings` is used. This PR supports cstring ordering when `--no-deduplicate-strings` is used. 1. Create `cStringPriorities`, separate from `priorities`, to hold only priorities for cstring pieces. This allows us to lookup by hash directly, instead of first converting to a string. It also fixes a contrived bug where we want to order a symbol named `CSTR;12345` rather than a cstring. 2. Rather than calling `buildCStringPriorities()` which always constructs and returns a vector, we use `forEachStringPiece()` to efficiently iterate over cstring pieces without creating a new vector if no cstring is ordered. 3. 
Create `SymbolPriorityEntry::{get,set}Priority()` helper functions to simplify code. --- lld/MachO/SectionPriorities.cpp | 130 +++++++++++++++------------- lld/MachO/SectionPriorities.h | 28 ++++-- lld/MachO/SyntheticSections.cpp | 62 +++++++------ lld/test/MachO/order-file-cstring.s | 22 ++--- 4 files changed, 129 insertions(+), 113 deletions(-) diff --git a/lld/MachO/SectionPriorities.cpp b/lld/MachO/SectionPriorities.cpp index cf657aad5d145..b652d1ee8325f 100644 --- a/lld/MachO/SectionPriorities.cpp +++ b/lld/MachO/SectionPriorities.cpp @@ -27,6 +27,7 @@ #include "llvm/Support/Path.h" #include "llvm/Support/TimeProfiler.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/xxhash.h" #include @@ -246,33 +247,45 @@ DenseMap CallGraphSort::run() { return orderMap; } -std::optional -macho::PriorityBuilder::getSymbolOrCStringPriority(const StringRef key, - InputFile *f) { +void macho::PriorityBuilder::SymbolPriorityEntry::setPriority( + int priority, StringRef objectFile) { + if (!objectFile.empty()) + objectFiles.try_emplace(objectFile, priority); + else + anyObjectFile = std::min(anyObjectFile, priority); +} - auto it = priorities.find(key); - if (it == priorities.end()) - return std::nullopt; - const SymbolPriorityEntry &entry = it->second; +int macho::PriorityBuilder::SymbolPriorityEntry::getPriority( + const InputFile *f) const { if (!f) - return entry.anyObjectFile; + return anyObjectFile; // We don't use toString(InputFile *) here because it returns the full path // for object files, and we only want the basename. - StringRef filename; - if (f->archiveName.empty()) - filename = path::filename(f->getName()); - else - filename = saver().save(path::filename(f->archiveName) + "(" + - path::filename(f->getName()) + ")"); - return std::min(entry.objectFiles.lookup(filename), entry.anyObjectFile); + StringRef basename = path::filename(f->getName()); + StringRef filename = + f->archiveName.empty() + ? 
basename + : saver().save(path::filename(f->archiveName) + "(" + basename + ")"); + return std::min(objectFiles.lookup(filename), anyObjectFile); } std::optional -macho::PriorityBuilder::getSymbolPriority(const Defined *sym) { +macho::PriorityBuilder::getCStringPriority(uint32_t hash, + const InputFile *f) const { + auto it = cStringPriorities.find(hash); + if (it == cStringPriorities.end()) + return std::nullopt; + return it->second.getPriority(f); +} + +std::optional +macho::PriorityBuilder::getSymbolPriority(const Defined *sym) const { if (sym->isAbsolute()) return std::nullopt; - return getSymbolOrCStringPriority(utils::getRootSymbol(sym->getName()), - sym->isec()->getFile()); + auto it = priorities.find(utils::getRootSymbol(sym->getName())); + if (it == priorities.end()) + return std::nullopt; + return it->second.getPriority(sym->isec()->getFile()); } void macho::PriorityBuilder::extractCallGraphProfile() { @@ -307,7 +320,7 @@ void macho::PriorityBuilder::parseOrderFile(StringRef path) { int prio = std::numeric_limits::min(); MemoryBufferRef mbref = *buffer; for (StringRef line : args::getLines(mbref)) { - StringRef objectFile, symbolOrCStrHash; + StringRef objectFile; line = line.take_until([](char c) { return c == '#'; }); // ignore comments line = line.ltrim(); @@ -338,22 +351,16 @@ void macho::PriorityBuilder::parseOrderFile(StringRef path) { } // The rest of the line is either or - // CStringEntryPrefix + // cStringEntryPrefix line = line.trim(); - if (line.starts_with(CStringEntryPrefix)) { - StringRef possibleHash = line.drop_front(CStringEntryPrefix.size()); + if (line.consume_front(cStringEntryPrefix)) { uint32_t hash = 0; - if (to_integer(possibleHash, hash)) - symbolOrCStrHash = possibleHash; - } else - symbolOrCStrHash = utils::getRootSymbol(line); - - if (!symbolOrCStrHash.empty()) { - SymbolPriorityEntry &entry = priorities[symbolOrCStrHash]; - if (!objectFile.empty()) - entry.objectFiles.insert(std::make_pair(objectFile, prio)); - else - entry.anyObjectFile = std::min(entry.anyObjectFile, prio); + if (to_integer(line, hash)) + cStringPriorities[hash].setPriority(prio, objectFile); + } else { + StringRef symbol = utils::getRootSymbol(line); + if (!symbol.empty()) + priorities[symbol].setPriority(prio, objectFile); } ++prio; @@ -405,40 +412,39 @@ macho::PriorityBuilder::buildInputSectionPriorities() { return sectionPriorities; } -std::vector macho::PriorityBuilder::buildCStringPriorities( - ArrayRef inputs) { - // Split the input strings into hold and cold sets. - // Order hot set based on -order_file_cstring for performance improvement; - // TODO: Order cold set of cstrings for compression via BP. - std::vector> - hotStringPrioritiesAndStringPieces; - std::vector coldStringPieces; - std::vector orderedStringPieces; - +void macho::PriorityBuilder::forEachStringPiece( + ArrayRef inputs, + std::function f, + bool forceInputOrder, bool computeHash) const { + std::vector> orderedPieces; + std::vector> unorderedPieces; for (CStringInputSection *isec : inputs) { for (const auto &[stringPieceIdx, piece] : llvm::enumerate(isec->pieces)) { if (!piece.live) continue; - - std::optional priority = getSymbolOrCStringPriority( - std::to_string(piece.hash), isec->getFile()); - if (!priority) - coldStringPieces.emplace_back(isec, stringPieceIdx); + // Process pieces in input order if we have no cstrings in our orderfile + if (forceInputOrder || cStringPriorities.empty()) { + f(*isec, piece, stringPieceIdx); + continue; + } + uint32_t hash = + computeHash + ? 
(xxh3_64bits(isec->getStringRef(stringPieceIdx)) & 0x7fffffff) + : piece.hash; + if (auto priority = getCStringPriority(hash, isec->getFile())) + orderedPieces.emplace_back(*priority, isec, stringPieceIdx); else - hotStringPrioritiesAndStringPieces.emplace_back( - *priority, std::make_pair(isec, stringPieceIdx)); + unorderedPieces.emplace_back(isec, stringPieceIdx); } } - - // Order hot set for perf - llvm::stable_sort(hotStringPrioritiesAndStringPieces); - for (auto &[priority, stringPiecePair] : hotStringPrioritiesAndStringPieces) - orderedStringPieces.push_back(stringPiecePair); - - // TODO: Order cold set for compression - - orderedStringPieces.insert(orderedStringPieces.end(), - coldStringPieces.begin(), coldStringPieces.end()); - - return orderedStringPieces; + if (orderedPieces.empty() && unorderedPieces.empty()) + return; + llvm::stable_sort(orderedPieces, [](const auto &left, const auto &right) { + return std::get<0>(left) < std::get<0>(right); + }); + for (auto &[priority, isec, pieceIdx] : orderedPieces) + f(*isec, isec->pieces[pieceIdx], pieceIdx); + // TODO: Add option to order the remaining cstrings for compression + for (auto &[isec, pieceIdx] : unorderedPieces) + f(*isec, isec->pieces[pieceIdx], pieceIdx); } diff --git a/lld/MachO/SectionPriorities.h b/lld/MachO/SectionPriorities.h index cc4e30fffc600..24d2dbc47e498 100644 --- a/lld/MachO/SectionPriorities.h +++ b/lld/MachO/SectionPriorities.h @@ -16,7 +16,6 @@ namespace lld::macho { using SectionPair = std::pair; -using StringPiecePair = std::pair; class PriorityBuilder { public: @@ -29,7 +28,7 @@ class PriorityBuilder { // // An order file has one entry per line, in the following format: // - // ::[ | CStringEntryPrefix ] + // ::[ | cStringEntryPrefix ] // // and are optional. // If not specified, then that entry tries to match either, @@ -42,7 +41,7 @@ class PriorityBuilder { // lowest-ordered entry (the one nearest to the front of the list.) // // or 2) any cstring literal with the given hash, if the entry has the - // CStringEntryPrefix prefix defined below in the file. is the + // cStringEntryPrefix prefix defined below in the file. is the // hash of cstring literal content. // // Cstring literals are not symbolized, we can't identify them by name @@ -54,6 +53,16 @@ class PriorityBuilder { // The file can also have line comments that start with '#'. void parseOrderFile(StringRef path); + /// Call \p f for each string piece in \p inputs. If there are any cstring + /// literals in the orderfile (and \p forceInputOrder is false) then string + /// pieces are ordered by the orderfile. \p computeHash must be set when + /// \p deduplicateLiterals is false because then the string piece hash is not + /// set. + void forEachStringPiece( + ArrayRef inputs, + std::function f, + bool forceInputOrder = false, bool computeHash = false) const; + // Returns layout priorities for some or all input sections. Sections are laid // out in decreasing order; that is, a higher priority section will be closer // to the beginning of its output section. @@ -66,8 +75,6 @@ class PriorityBuilder { // Each section gets assigned the priority of the highest-priority symbol it // contains. llvm::DenseMap buildInputSectionPriorities(); - std::vector - buildCStringPriorities(ArrayRef); private: // The symbol with the smallest priority should be ordered first in the output @@ -78,13 +85,16 @@ class PriorityBuilder { int anyObjectFile = 0; // The priority given to a matching symbol from a particular object file. 
llvm::DenseMap objectFiles; + void setPriority(int priority, StringRef objectFile); + int getPriority(const InputFile *f) const; }; - const llvm::StringRef CStringEntryPrefix = "CSTR;"; + const llvm::StringRef cStringEntryPrefix = "CSTR;"; - std::optional getSymbolPriority(const Defined *sym); - std::optional getSymbolOrCStringPriority(const StringRef key, - InputFile *f); + std::optional getSymbolPriority(const Defined *sym) const; + std::optional getCStringPriority(uint32_t hash, + const InputFile *f) const; llvm::DenseMap priorities; + llvm::DenseMap cStringPriorities; llvm::MapVector callGraphProfile; }; diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp index 187cccbe90dbc..fecc51f912b08 100644 --- a/lld/MachO/SyntheticSections.cpp +++ b/lld/MachO/SyntheticSections.cpp @@ -1721,26 +1721,24 @@ void CStringSection::writeTo(uint8_t *buf) const { // and don't need this alignment. They will be emitted at some arbitrary address // `A`, but ld64 will treat them as being 16-byte aligned with an offset of // `16 % A`. -static Align getStringPieceAlignment(const CStringInputSection *isec, +static Align getStringPieceAlignment(const CStringInputSection &isec, const StringPiece &piece) { - return llvm::Align(1ULL << llvm::countr_zero(isec->align | piece.inSecOff)); + return llvm::Align(1ULL << llvm::countr_zero(isec.align | piece.inSecOff)); } void CStringSection::finalizeContents() { size = 0; - // TODO: Call buildCStringPriorities() to support cstring ordering when - // deduplication is off, although this may negatively impact build - // performance. - for (CStringInputSection *isec : inputs) { - for (const auto &[i, piece] : llvm::enumerate(isec->pieces)) { - if (!piece.live) - continue; - piece.outSecOff = alignTo(size, getStringPieceAlignment(isec, piece)); - StringRef string = isec->getStringRef(i); - size = piece.outSecOff + string.size() + 1; // account for null terminator - } + priorityBuilder.forEachStringPiece( + inputs, + [&](CStringInputSection &isec, StringPiece &piece, size_t pieceIdx) { + piece.outSecOff = alignTo(size, getStringPieceAlignment(isec, piece)); + StringRef string = isec.getStringRef(pieceIdx); + size = + piece.outSecOff + string.size() + 1; // account for null terminator + }, + /*forceInputOrder=*/false, /*computeHash=*/true); + for (CStringInputSection *isec : inputs) isec->isFinal = true; - } } void DeduplicatedCStringSection::finalizeContents() { @@ -1748,20 +1746,19 @@ void DeduplicatedCStringSection::finalizeContents() { DenseMap strToAlignment; // Used for tail merging only std::vector deduplicatedStrs; - for (const CStringInputSection *isec : inputs) { - for (const auto &[i, piece] : llvm::enumerate(isec->pieces)) { - if (!piece.live) - continue; - auto s = isec->getCachedHashStringRef(i); - assert(isec->align != 0); - auto align = getStringPieceAlignment(isec, piece); - auto [it, wasInserted] = strToAlignment.try_emplace(s, align); - if (config->tailMergeStrings && wasInserted) - deduplicatedStrs.push_back(s); - if (!wasInserted && it->second < align) - it->second = align; - } - } + priorityBuilder.forEachStringPiece( + inputs, + [&](CStringInputSection &isec, StringPiece &piece, size_t pieceIdx) { + auto s = isec.getCachedHashStringRef(pieceIdx); + assert(isec.align != 0); + auto align = getStringPieceAlignment(isec, piece); + auto [it, wasInserted] = strToAlignment.try_emplace(s, align); + if (config->tailMergeStrings && wasInserted) + deduplicatedStrs.push_back(s); + if (!wasInserted && it->second < align) + it->second = align; 
+ }, + /*forceInputOrder=*/true); // Like lexigraphical sort, except we read strings in reverse and take the // longest string first @@ -1801,9 +1798,10 @@ void DeduplicatedCStringSection::finalizeContents() { // Sort the strings for performance and compression size win, and then // assign an offset for each string and save it to the corresponding // StringPieces for easy access. - for (auto &[isec, i] : priorityBuilder.buildCStringPriorities(inputs)) { - auto &piece = isec->pieces[i]; - auto s = isec->getCachedHashStringRef(i); + priorityBuilder.forEachStringPiece(inputs, [&](CStringInputSection &isec, + StringPiece &piece, + size_t pieceIdx) { + auto s = isec.getCachedHashStringRef(pieceIdx); // Any string can be tail merged with itself with an offset of zero uint64_t tailMergeOffset = 0; auto mergeIt = @@ -1829,7 +1827,7 @@ void DeduplicatedCStringSection::finalizeContents() { stringOffsetMap[tailMergedString] = piece.outSecOff; assert(isAligned(strToAlignment.at(tailMergedString), piece.outSecOff)); } - } + }); for (CStringInputSection *isec : inputs) isec->isFinal = true; } diff --git a/lld/test/MachO/order-file-cstring.s b/lld/test/MachO/order-file-cstring.s index 3c6d2a377dc38..d6734308fffdf 100644 --- a/lld/test/MachO/order-file-cstring.s +++ b/lld/test/MachO/order-file-cstring.s @@ -4,32 +4,34 @@ # RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t/test.s -o %t/test.o # RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t/more-cstrings.s -o %t/more-cstrings.o -# RUN: %lld --deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-0 %t/test.o %t/more-cstrings.o +# RUN: %lld -arch arm64 -lSystem -e _main -o %t/test-0 %t/test.o %t/more-cstrings.o # RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-0 | FileCheck %s --check-prefix=ORIGIN_SYM # RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-0 | FileCheck %s --check-prefix=ORIGIN_SEC -# RUN: %lld --deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-1 %t/test.o %t/more-cstrings.o -order_file %t/ord-1 +# RUN: %lld -arch arm64 -lSystem -e _main -o %t/test-1 %t/test.o %t/more-cstrings.o -order_file %t/ord-1 # RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-1 | FileCheck %s --check-prefix=ONE_SYM # RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-1 | FileCheck %s --check-prefix=ONE_SEC +# RUN: %lld --no-deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-1-dup %t/test.o %t/more-cstrings.o -order_file %t/ord-1 +# RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-1-dup | FileCheck %s --check-prefix=ONE_SYM +# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-1-dup | FileCheck %s --check-prefix=ONE_SEC -# RUN: %lld --deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-2 %t/test.o %t/more-cstrings.o -order_file %t/ord-2 +# RUN: %lld -arch arm64 -lSystem -e _main -o %t/test-2 %t/test.o %t/more-cstrings.o -order_file %t/ord-2 # RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-2 | FileCheck %s --check-prefix=TWO_SYM # RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-2 | FileCheck %s --check-prefix=TWO_SEC -# RUN: %lld --deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-3 %t/test.o %t/more-cstrings.o -order_file %t/ord-3 +# RUN: %lld -arch arm64 -lSystem -e _main -o %t/test-3 %t/test.o %t/more-cstrings.o -order_file %t/ord-3 # RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-3 | FileCheck %s --check-prefix=THREE_SYM # RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-3 | FileCheck %s 
--check-prefix=THREE_SEC -# RUN: %lld --deduplicate-strings -arch arm64 -lSystem -e _main -o %t/test-4 %t/test.o %t/more-cstrings.o -order_file %t/ord-4 +# RUN: %lld -arch arm64 -lSystem -e _main -o %t/test-4 %t/test.o %t/more-cstrings.o -order_file %t/ord-4 # RUN: llvm-nm --numeric-sort --format=just-symbols %t/test-4 | FileCheck %s --check-prefix=FOUR_SYM # RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/test-4 | FileCheck %s --check-prefix=FOUR_SEC # RUN: llvm-readobj --string-dump=__cstring %t/test-4 | FileCheck %s --check-prefix=FOUR_SEC_ESCAPE - # We expect: -# 1) Covered cstring symbols are reordered -# 2) the rest of the cstring symbols remain original relative order within the cstring section +# 1) Covered cstring symbols to be reordered +# 2) the rest of the cstring symbols remain in the original relative order within the cstring section # ORIGIN_SYM: _local_foo1 # ORIGIN_SYM: _globl_foo2 @@ -58,8 +60,8 @@ CSTR;1496286555 #foo3 CSTR;1343999025 -# ONE_SYM: _globl_foo2 -# ONE_SYM: _local_foo2 +# ONE_SYM-DAG: _globl_foo2 +# ONE_SYM-DAG: _local_foo2 # ONE_SYM: _bar # ONE_SYM: _bar2 # ONE_SYM: _globl_foo3 From eb879ac50b27d4651d8650b7d769cf651d0a89bd Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Mon, 17 Nov 2025 09:45:14 -0800 Subject: [PATCH 18/67] [CI] Make premerge upload/write comments (#166609) This only does this for Linux currently as the issue-write workflow currently does not support writing out multiple comments. This gets the ball rolling as the failures that most people see are common to both platforms. Ensuring we have coverage on Windows for comments will be done in a future patch. --- .ci/premerge_advisor_explain.py | 3 ++- .github/workflows/premerge.yaml | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.ci/premerge_advisor_explain.py b/.ci/premerge_advisor_explain.py index e1bc59f389b36..269f75cace266 100644 --- a/.ci/premerge_advisor_explain.py +++ b/.ci/premerge_advisor_explain.py @@ -39,6 +39,7 @@ def get_comment( ) -> dict[str, str]: repo = github.Github(github_token).get_repo("llvm/llvm-project") pr = repo.get_issue(pr_number).as_pull_request() + body = COMMENT_TAG.format(platform=platform.system()) + "\n" + body comment = {"body": body} comment_id = get_comment_id(platform.system(), pr) if comment_id: @@ -128,7 +129,7 @@ def main( ), ) ] - with open("comment", "w") as comment_file_handle: + with open("comments", "w") as comment_file_handle: json.dump(comments, comment_file_handle) else: print(advisor_response.reason) diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml index 02a6f3b868d85..daf88b5b22125 100644 --- a/.github/workflows/premerge.yaml +++ b/.github/workflows/premerge.yaml @@ -119,6 +119,14 @@ jobs: path: artifacts/ retention-days: 5 include-hidden-files: 'true' + - name: Upload Comment + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + if: ${{ always() && !startsWith(matrix.runs-on, 'depot-ubuntu-24.04-arm') }} + continue-on-error: true + with: + name: workflow-args + path: | + comments premerge-checks-windows: name: Build and Test Windows From a770d2b439ec246002cd77ce33e52f6efa577849 Mon Sep 17 00:00:00 2001 From: Jeremy Furtek Date: Mon, 17 Nov 2025 11:46:39 -0600 Subject: [PATCH 19/67] Add 'exact' flag to arith.shrui/shrsi/divsi/divui operations (#165923) This MR adds support for the `exact` flag to the `arith.shrui/shrsi/divsi/divui` operations. The semantics are identical to those of the LLVM dialect and the LLVM language reference. 
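For illustration (not part of this diff), the new flag uses the same trailing-keyword syntax as the existing overflow flags; assuming `i64` operands:

```mlir
// Sketch of the syntax this MR adds; `exact` mirrors LLVM's semantics:
// the result is poison when the dropped bits/remainder are non-zero.
%q = arith.divsi %a, %b exact : i64  // poison if %a is not a multiple of %b
%s = arith.shrui %a, %c exact : i64  // poison if a set bit is shifted out
```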
This MR also modifies the mechanism for converting `arith` dialect **attributes** to corresponding **properties** in the `LLVM` dialect. (As a specific example, the integer overflow flags `nsw/nuw` are **properties** in the `LLVM` dialect, as opposed to attributes.) Previously, attribute converter classes were required to have a specific method to support integer overflow flags: ```C++ template class AttrConvertPassThrough { public: ... LLVM::IntegerOverflowFlags getOverflowFlags() const { return LLVM::IntegerOverflowFlags::none; } }; ``` This method was required, even for `arith` source operations that did not use integer overflow flags (e.g. `AttrConvertFastMathToLLVM`). This MR modifies the interface required by `arith` dialect attribute converters to instead provide a (possibly NULL) properties attribute: ```C++ template class AttrConvertPassThrough { public: ... Attribute getPropAttr() const { return {}; } }; ``` For `arith` operations with attributes that map to `LLVM` dialect **properties**, the attribute converter can create a `DictionaryAttr` containing target properties and return that attribute from the attribute converter's `getPropAttr()` method. The `arith` attribute conversion framework will set the `propertiesAttr` of an `OperationState`, and the target operation's `setPropertiesFromAttr()` method will be invoked to set the properties when the target operation is created. The `AttrConvertOverflowToLLVM` class in this MR uses the new approach. --- .../ArithCommon/AttrToLLVMConverter.h | 32 ++++++----- .../mlir/Conversion/LLVMCommon/Pattern.h | 20 ++++--- .../Conversion/LLVMCommon/VectorPattern.h | 34 ++++++------ .../include/mlir/Dialect/Arith/IR/ArithOps.td | 54 +++++++++++++++---- .../Conversion/ArithToLLVM/ArithToLLVM.cpp | 1 + .../ComplexToLLVM/ComplexToLLVM.cpp | 3 +- mlir/lib/Conversion/LLVMCommon/Pattern.cpp | 21 +++----- .../Conversion/LLVMCommon/VectorPattern.cpp | 21 ++++---- .../Dialect/Arith/IR/ArithCanonicalization.td | 4 +- .../Conversion/ArithToLLVM/arith-to-llvm.mlir | 16 ++++++ mlir/test/Dialect/Arith/canonicalize.mlir | 13 +++++ mlir/test/Dialect/Arith/ops.mlir | 24 +++++++++ 12 files changed, 167 insertions(+), 76 deletions(-) diff --git a/mlir/include/mlir/Conversion/ArithCommon/AttrToLLVMConverter.h b/mlir/include/mlir/Conversion/ArithCommon/AttrToLLVMConverter.h index 7ffc861331760..7020e24517d09 100644 --- a/mlir/include/mlir/Conversion/ArithCommon/AttrToLLVMConverter.h +++ b/mlir/include/mlir/Conversion/ArithCommon/AttrToLLVMConverter.h @@ -65,11 +65,8 @@ class AttrConvertFastMathToLLVM { convertArithFastMathAttrToLLVM(arithFMFAttr)); } } - ArrayRef getAttrs() const { return convertedAttr.getAttrs(); } - LLVM::IntegerOverflowFlags getOverflowFlags() const { - return LLVM::IntegerOverflowFlags::none; - } + Attribute getPropAttr() const { return {}; } private: NamedAttrList convertedAttr; @@ -82,23 +79,36 @@ template class AttrConvertOverflowToLLVM { public: AttrConvertOverflowToLLVM(SourceOp srcOp) { + using IntegerOverflowFlagsAttr = LLVM::IntegerOverflowFlagsAttr; + // Copy the source attributes. convertedAttr = NamedAttrList{srcOp->getAttrs()}; // Get the name of the arith overflow attribute. StringRef arithAttrName = SourceOp::getIntegerOverflowAttrName(); - // Remove the source overflow attribute. + // Remove the source overflow attribute from the set that will be present + // in the target. 
if (auto arithAttr = dyn_cast_if_present( convertedAttr.erase(arithAttrName))) { - overflowFlags = convertArithOverflowFlagsToLLVM(arithAttr.getValue()); + auto llvmFlag = convertArithOverflowFlagsToLLVM(arithAttr.getValue()); + // Create a dictionary attribute holding the overflow flags property. + // (In the LLVM dialect, the overflow flags are a property, not an + // attribute.) + MLIRContext *ctx = srcOp.getOperation()->getContext(); + Builder b(ctx); + auto llvmFlagAttr = IntegerOverflowFlagsAttr::get(ctx, llvmFlag); + StringRef llvmAttrName = TargetOp::getOverflowFlagsAttrName(); + NamedAttribute attr{llvmAttrName, llvmFlagAttr}; + // Set the properties attribute of the operation state so that the + // property can be updated when the operation is created. + propertiesAttr = b.getDictionaryAttr(ArrayRef(attr)); } } - ArrayRef getAttrs() const { return convertedAttr.getAttrs(); } - LLVM::IntegerOverflowFlags getOverflowFlags() const { return overflowFlags; } + Attribute getPropAttr() const { return propertiesAttr; } private: NamedAttrList convertedAttr; - LLVM::IntegerOverflowFlags overflowFlags = LLVM::IntegerOverflowFlags::none; + DictionaryAttr propertiesAttr; }; template @@ -129,9 +139,7 @@ class AttrConverterConstrainedFPToLLVM { } ArrayRef getAttrs() const { return convertedAttr.getAttrs(); } - LLVM::IntegerOverflowFlags getOverflowFlags() const { - return LLVM::IntegerOverflowFlags::none; - } + Attribute getPropAttr() const { return {}; } private: NamedAttrList convertedAttr; diff --git a/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h b/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h index c292e3727f46c..f8e0ccc093f8b 100644 --- a/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h +++ b/mlir/include/mlir/Conversion/LLVMCommon/Pattern.h @@ -19,16 +19,14 @@ class CallOpInterface; namespace LLVM { namespace detail { -/// Handle generically setting flags as native properties on LLVM operations. -void setNativeProperties(Operation *op, IntegerOverflowFlags overflowFlags); - /// Replaces the given operation "op" with a new operation of type "targetOp" /// and given operands. -LogicalResult oneToOneRewrite( - Operation *op, StringRef targetOp, ValueRange operands, - ArrayRef targetAttrs, - const LLVMTypeConverter &typeConverter, ConversionPatternRewriter &rewriter, - IntegerOverflowFlags overflowFlags = IntegerOverflowFlags::none); +LogicalResult oneToOneRewrite(Operation *op, StringRef targetOp, + ValueRange operands, + ArrayRef targetAttrs, + Attribute propertiesAttr, + const LLVMTypeConverter &typeConverter, + ConversionPatternRewriter &rewriter); /// Replaces the given operation "op" with a call to an LLVM intrinsic with the /// specified name "intrinsic" and operands. 
@@ -307,9 +305,9 @@ class OneToOneConvertToLLVMPattern : public ConvertOpToLLVMPattern { LogicalResult matchAndRewrite(SourceOp op, typename SourceOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { - return LLVM::detail::oneToOneRewrite(op, TargetOp::getOperationName(), - adaptor.getOperands(), op->getAttrs(), - *this->getTypeConverter(), rewriter); + return LLVM::detail::oneToOneRewrite( + op, TargetOp::getOperationName(), adaptor.getOperands(), op->getAttrs(), + /*propertiesAttr=*/Attribute{}, *this->getTypeConverter(), rewriter); } }; diff --git a/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h b/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h index e7ab63abfeaa1..47b8381eefda8 100644 --- a/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h +++ b/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h @@ -54,25 +54,26 @@ LogicalResult handleMultidimensionalVectors( std::function createOperand, ConversionPatternRewriter &rewriter); -LogicalResult vectorOneToOneRewrite( - Operation *op, StringRef targetOp, ValueRange operands, - ArrayRef targetAttrs, - const LLVMTypeConverter &typeConverter, ConversionPatternRewriter &rewriter, - IntegerOverflowFlags overflowFlags = IntegerOverflowFlags::none); +LogicalResult vectorOneToOneRewrite(Operation *op, StringRef targetOp, + ValueRange operands, + ArrayRef targetAttrs, + Attribute propertiesAttr, + const LLVMTypeConverter &typeConverter, + ConversionPatternRewriter &rewriter); } // namespace detail } // namespace LLVM // Default attribute conversion class, which passes all source attributes -// through to the target op, unmodified. +// through to the target op, unmodified. The attribute to set properties of the +// target operation will be nullptr (i.e. any properties that exist in will have +// default values). template class AttrConvertPassThrough { public: AttrConvertPassThrough(SourceOp srcOp) : srcAttrs(srcOp->getAttrs()) {} ArrayRef getAttrs() const { return srcAttrs; } - LLVM::IntegerOverflowFlags getOverflowFlags() const { - return LLVM::IntegerOverflowFlags::none; - } + Attribute getPropAttr() const { return {}; } private: ArrayRef srcAttrs; @@ -80,10 +81,13 @@ class AttrConvertPassThrough { /// Basic lowering implementation to rewrite Ops with just one result to the /// LLVM Dialect. This supports higher-dimensional vector types. -/// The AttrConvert template template parameter should be a template class -/// with SourceOp and TargetOp type parameters, a constructor that takes -/// a SourceOp instance, and a getAttrs() method that returns -/// ArrayRef. 
+/// The AttrConvert template template parameter should: +// - be a template class with SourceOp and TargetOp type parameters +// - have a constructor that takes a SourceOp instance +// - a getAttrs() method that returns ArrayRef containing +// attributes that the target operation will have +// - a getPropAttr() method that returns either a NULL attribute or a +// DictionaryAttribute with properties that exist for the target operation template typename AttrConvert = AttrConvertPassThrough, @@ -137,8 +141,8 @@ class VectorConvertToLLVMPattern : public ConvertOpToLLVMPattern { return LLVM::detail::vectorOneToOneRewrite( op, TargetOp::getOperationName(), adaptor.getOperands(), - attrConvert.getAttrs(), *this->getTypeConverter(), rewriter, - attrConvert.getOverflowFlags()); + attrConvert.getAttrs(), attrConvert.getPropAttr(), + *this->getTypeConverter(), rewriter); } }; } // namespace mlir diff --git a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td index a38cf41a3e09b..77d780425c3c3 100644 --- a/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td +++ b/mlir/include/mlir/Dialect/Arith/IR/ArithOps.td @@ -158,6 +158,18 @@ class Arith_IntBinaryOpWithOverflowFlags traits = [ attr-dict `:` type($result) }]; } +class Arith_IntBinaryOpWithExactFlag traits = []> : + Arith_BinaryOp]>, + Arguments<(ins SignlessIntegerOrIndexLike:$lhs, + SignlessIntegerOrIndexLike:$rhs, + UnitAttr:$isExact)>, + Results<(outs SignlessIntegerOrIndexLike:$result)> { + + let assemblyFormat = [{ $lhs `,` $rhs (`exact` $isExact^)? + attr-dict `:` type($result) }]; +} + //===----------------------------------------------------------------------===// // ConstantOp //===----------------------------------------------------------------------===// @@ -482,7 +494,8 @@ def Arith_MulUIExtendedOp : Arith_Op<"mului_extended", [Pure, Commutative, // DivUIOp //===----------------------------------------------------------------------===// -def Arith_DivUIOp : Arith_IntBinaryOp<"divui", [ConditionallySpeculatable]> { +def Arith_DivUIOp : Arith_IntBinaryOpWithExactFlag<"divui", + [ConditionallySpeculatable]> { let summary = "unsigned integer division operation"; let description = [{ Unsigned integer division. Rounds towards zero. Treats the leading bit as @@ -493,12 +506,18 @@ def Arith_DivUIOp : Arith_IntBinaryOp<"divui", [ConditionallySpeculatable]> { `tensor` values, the behavior is undefined if _any_ elements are divided by zero. + If the `exact` attribute is present, the result value is poison if `lhs` is + not a multiple of `rhs`. + Example: ```mlir // Scalar unsigned integer division. %a = arith.divui %b, %c : i64 + // Scalar unsigned integer division where %b is known to be a multiple of %c. + %a = arith.divui %b, %c exact : i64 + // SIMD vector element-wise division. %f = arith.divui %g, %h : vector<4xi32> @@ -519,7 +538,8 @@ def Arith_DivUIOp : Arith_IntBinaryOp<"divui", [ConditionallySpeculatable]> { // DivSIOp //===----------------------------------------------------------------------===// -def Arith_DivSIOp : Arith_IntBinaryOp<"divsi", [ConditionallySpeculatable]> { +def Arith_DivSIOp : Arith_IntBinaryOpWithExactFlag<"divsi", + [ConditionallySpeculatable]> { let summary = "signed integer division operation"; let description = [{ Signed integer division. Rounds towards zero. 
Treats the leading bit as @@ -530,12 +550,18 @@ def Arith_DivSIOp : Arith_IntBinaryOp<"divsi", [ConditionallySpeculatable]> { behavior is undefined if _any_ of its elements are divided by zero or has a signed division overflow. + If the `exact` attribute is present, the result value is poison if `lhs` is + not a multiple of `rhs`. + Example: ```mlir // Scalar signed integer division. %a = arith.divsi %b, %c : i64 + // Scalar signed integer division where %b is known to be a multiple of %c. + %a = arith.divsi %b, %c exact : i64 + // SIMD vector element-wise division. %f = arith.divsi %g, %h : vector<4xi32> @@ -821,7 +847,7 @@ def Arith_ShLIOp : Arith_IntBinaryOpWithOverflowFlags<"shli"> { // ShRUIOp //===----------------------------------------------------------------------===// -def Arith_ShRUIOp : Arith_TotalIntBinaryOp<"shrui"> { +def Arith_ShRUIOp : Arith_IntBinaryOpWithExactFlag<"shrui", [Pure]> { let summary = "unsigned integer right-shift"; let description = [{ The `shrui` operation shifts an integer value of the first operand to the right @@ -830,12 +856,17 @@ def Arith_ShRUIOp : Arith_TotalIntBinaryOp<"shrui"> { filled with zeros. If the value of the second operand is greater or equal than the bitwidth of the first operand, then the operation returns poison. + If the `exact` attribute is present, the result value of shrui is a poison + value if any of the bits shifted out are non-zero. + Example: ```mlir - %1 = arith.constant 160 : i8 // %1 is 0b10100000 + %1 = arith.constant 160 : i8 // %1 is 0b10100000 %2 = arith.constant 3 : i8 - %3 = arith.shrui %1, %2 : (i8, i8) -> i8 // %3 is 0b00010100 + %3 = arith.constant 6 : i8 + %4 = arith.shrui %1, %2 exact : i8 // %4 is 0b00010100 + %5 = arith.shrui %1, %3 : i8 // %3 is 0b00000010 ``` }]; let hasFolder = 1; @@ -845,7 +876,7 @@ def Arith_ShRUIOp : Arith_TotalIntBinaryOp<"shrui"> { // ShRSIOp //===----------------------------------------------------------------------===// -def Arith_ShRSIOp : Arith_TotalIntBinaryOp<"shrsi"> { +def Arith_ShRSIOp : Arith_IntBinaryOpWithExactFlag<"shrsi", [Pure]> { let summary = "signed integer right-shift"; let description = [{ The `shrsi` operation shifts an integer value of the first operand to the right @@ -856,14 +887,17 @@ def Arith_ShRSIOp : Arith_TotalIntBinaryOp<"shrsi"> { operand is greater or equal than bitwidth of the first operand, then the operation returns poison. + If the `exact` attribute is present, the result value of shrsi is a poison + value if any of the bits shifted out are non-zero. 
+ Example: ```mlir - %1 = arith.constant 160 : i8 // %1 is 0b10100000 + %1 = arith.constant 160 : i8 // %1 is 0b10100000 %2 = arith.constant 3 : i8 - %3 = arith.shrsi %1, %2 : (i8, i8) -> i8 // %3 is 0b11110100 - %4 = arith.constant 96 : i8 // %4 is 0b01100000 - %5 = arith.shrsi %4, %2 : (i8, i8) -> i8 // %5 is 0b00001100 + %3 = arith.shrsi %1, %2 exact : i8 // %3 is 0b11110100 + %4 = arith.constant 98 : i8 // %4 is 0b01100010 + %5 = arith.shrsi %4, %2 : i8 // %5 is 0b00001100 ``` }]; let hasFolder = 1; diff --git a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp index f2bacc3399144..cc3e8468f298b 100644 --- a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp +++ b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp @@ -281,6 +281,7 @@ ConstantOpLowering::matchAndRewrite(arith::ConstantOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { return LLVM::detail::oneToOneRewrite(op, LLVM::ConstantOp::getOperationName(), adaptor.getOperands(), op->getAttrs(), + /*propAttr=*/Attribute{}, *getTypeConverter(), rewriter); } diff --git a/mlir/lib/Conversion/ComplexToLLVM/ComplexToLLVM.cpp b/mlir/lib/Conversion/ComplexToLLVM/ComplexToLLVM.cpp index 86d02e6c6209f..6a0c21185983e 100644 --- a/mlir/lib/Conversion/ComplexToLLVM/ComplexToLLVM.cpp +++ b/mlir/lib/Conversion/ComplexToLLVM/ComplexToLLVM.cpp @@ -96,7 +96,8 @@ struct ConstantOpLowering : public ConvertOpToLLVMPattern { ConversionPatternRewriter &rewriter) const override { return LLVM::detail::oneToOneRewrite( op, LLVM::ConstantOp::getOperationName(), adaptor.getOperands(), - op->getAttrs(), *getTypeConverter(), rewriter); + op->getAttrs(), /*propAttr=*/Attribute{}, *getTypeConverter(), + rewriter); } }; diff --git a/mlir/lib/Conversion/LLVMCommon/Pattern.cpp b/mlir/lib/Conversion/LLVMCommon/Pattern.cpp index 48a03198fd465..f28a6ccb42455 100644 --- a/mlir/lib/Conversion/LLVMCommon/Pattern.cpp +++ b/mlir/lib/Conversion/LLVMCommon/Pattern.cpp @@ -296,19 +296,13 @@ LogicalResult ConvertToLLVMPattern::copyUnrankedDescriptors( // Detail methods //===----------------------------------------------------------------------===// -void LLVM::detail::setNativeProperties(Operation *op, - IntegerOverflowFlags overflowFlags) { - if (auto iface = dyn_cast(op)) - iface.setOverflowFlags(overflowFlags); -} - /// Replaces the given operation "op" with a new operation of type "targetOp" /// and given operands. LogicalResult LLVM::detail::oneToOneRewrite( Operation *op, StringRef targetOp, ValueRange operands, - ArrayRef targetAttrs, - const LLVMTypeConverter &typeConverter, ConversionPatternRewriter &rewriter, - IntegerOverflowFlags overflowFlags) { + ArrayRef targetAttrs, Attribute propertiesAttr, + const LLVMTypeConverter &typeConverter, + ConversionPatternRewriter &rewriter) { unsigned numResults = op->getNumResults(); SmallVector resultTypes; @@ -320,11 +314,10 @@ LogicalResult LLVM::detail::oneToOneRewrite( } // Create the operation through state since we don't know its C++ type. - Operation *newOp = - rewriter.create(op->getLoc(), rewriter.getStringAttr(targetOp), operands, - resultTypes, targetAttrs); - - setNativeProperties(newOp, overflowFlags); + OperationState state(op->getLoc(), rewriter.getStringAttr(targetOp), operands, + resultTypes, targetAttrs); + state.propertiesAttr = propertiesAttr; + Operation *newOp = rewriter.create(state); // If the operation produced 0 or 1 result, return them immediately. 
if (numResults == 0) diff --git a/mlir/lib/Conversion/LLVMCommon/VectorPattern.cpp b/mlir/lib/Conversion/LLVMCommon/VectorPattern.cpp index e7dd0b506e12d..24b01259f0499 100644 --- a/mlir/lib/Conversion/LLVMCommon/VectorPattern.cpp +++ b/mlir/lib/Conversion/LLVMCommon/VectorPattern.cpp @@ -105,9 +105,9 @@ LogicalResult LLVM::detail::handleMultidimensionalVectors( LogicalResult LLVM::detail::vectorOneToOneRewrite( Operation *op, StringRef targetOp, ValueRange operands, - ArrayRef targetAttrs, - const LLVMTypeConverter &typeConverter, ConversionPatternRewriter &rewriter, - IntegerOverflowFlags overflowFlags) { + ArrayRef targetAttrs, Attribute propertiesAttr, + const LLVMTypeConverter &typeConverter, + ConversionPatternRewriter &rewriter) { assert(!operands.empty()); // Cannot convert ops if their operands are not of LLVM type. @@ -116,15 +116,14 @@ LogicalResult LLVM::detail::vectorOneToOneRewrite( auto llvmNDVectorTy = operands[0].getType(); if (!isa(llvmNDVectorTy)) - return oneToOneRewrite(op, targetOp, operands, targetAttrs, typeConverter, - rewriter, overflowFlags); - - auto callback = [op, targetOp, targetAttrs, overflowFlags, + return oneToOneRewrite(op, targetOp, operands, targetAttrs, propertiesAttr, + typeConverter, rewriter); + auto callback = [op, targetOp, targetAttrs, propertiesAttr, &rewriter](Type llvm1DVectorTy, ValueRange operands) { - Operation *newOp = - rewriter.create(op->getLoc(), rewriter.getStringAttr(targetOp), - operands, llvm1DVectorTy, targetAttrs); - LLVM::detail::setNativeProperties(newOp, overflowFlags); + OperationState state(op->getLoc(), rewriter.getStringAttr(targetOp), + operands, llvm1DVectorTy, targetAttrs); + state.propertiesAttr = propertiesAttr; + Operation *newOp = rewriter.create(state); return newOp->getResult(0); }; diff --git a/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td b/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td index de3efc9fe3506..e256915933a71 100644 --- a/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td +++ b/mlir/lib/Dialect/Arith/IR/ArithCanonicalization.td @@ -389,8 +389,8 @@ def TruncIExtUIToExtUI : // trunci(shrsi(x, c)) -> trunci(shrui(x, c)) def TruncIShrSIToTrunciShrUI : Pat<(Arith_TruncIOp:$tr - (Arith_ShRSIOp $x, (ConstantLikeMatcher TypedAttrInterface:$c0)), $overflow), - (Arith_TruncIOp (Arith_ShRUIOp $x, (Arith_ConstantOp (cast<"TypedAttr"> $c0))), $overflow), + (Arith_ShRSIOp $x, (ConstantLikeMatcher TypedAttrInterface:$c0), $exact), $overflow), + (Arith_TruncIOp (Arith_ShRUIOp $x, (Arith_ConstantOp (cast<"TypedAttr"> $c0)), $exact), $overflow), [(TruncationMatchesShiftAmount $x, $tr, $c0)]>; //===----------------------------------------------------------------------===// diff --git a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir index 5f1ec66234df2..6fdc1104d2609 100644 --- a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir +++ b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir @@ -738,6 +738,22 @@ func.func @ops_supporting_overflow(%arg0: i64, %arg1: i64) { // ----- +// CHECK-LABEL: @ops_supporting_exact +func.func @ops_supporting_exact(i32, i32) { +^bb0(%arg0: i32, %arg1: i32): +// CHECK: = llvm.ashr exact %arg0, %arg1 : i32 + %0 = arith.shrsi %arg0, %arg1 exact : i32 +// CHECK: = llvm.lshr exact %arg0, %arg1 : i32 + %1 = arith.shrui %arg0, %arg1 exact : i32 +// CHECK: = llvm.sdiv exact %arg0, %arg1 : i32 + %2 = arith.divsi %arg0, %arg1 exact : i32 +// CHECK: = llvm.udiv exact %arg0, %arg1 : i32 + %3 = arith.divui %arg0, %arg1 exact : i32 + 
return +} + +// ----- + // CHECK-LABEL: func @memref_bitcast // CHECK-SAME: (%[[ARG:.*]]: memref) // CHECK: %[[V1:.*]] = builtin.unrealized_conversion_cast %[[ARG]] : memref to !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir index 2fe0995c9d4df..3ad1530248809 100644 --- a/mlir/test/Dialect/Arith/canonicalize.mlir +++ b/mlir/test/Dialect/Arith/canonicalize.mlir @@ -2958,6 +2958,19 @@ func.func @truncIShrSIToTrunciShrUI(%a: i64) -> i32 { return %hi : i32 } +// CHECK-LABEL: @truncIShrSIExactToTrunciShrUIExact +// CHECK-SAME: (%[[A:.+]]: i64) +// CHECK-NEXT: %[[C32:.+]] = arith.constant 32 : i64 +// CHECK-NEXT: %[[SHR:.+]] = arith.shrui %[[A]], %[[C32]] exact : i64 +// CHECK-NEXT: %[[TRU:.+]] = arith.trunci %[[SHR]] : i64 to i32 +// CHECK-NEXT: return %[[TRU]] : i32 +func.func @truncIShrSIExactToTrunciShrUIExact(%a: i64) -> i32 { + %c32 = arith.constant 32: i64 + %sh = arith.shrsi %a, %c32 exact : i64 + %hi = arith.trunci %sh: i64 to i32 + return %hi : i32 +} + // CHECK-LABEL: @truncIShrSIToTrunciShrUIBadShiftAmt1 // CHECK: arith.shrsi func.func @truncIShrSIToTrunciShrUIBadShiftAmt1(%a: i64) -> i32 { diff --git a/mlir/test/Dialect/Arith/ops.mlir b/mlir/test/Dialect/Arith/ops.mlir index 1e656e84da836..58eadfda17060 100644 --- a/mlir/test/Dialect/Arith/ops.mlir +++ b/mlir/test/Dialect/Arith/ops.mlir @@ -151,6 +151,12 @@ func.func @test_divui(%arg0 : i64, %arg1 : i64) -> i64 { return %0 : i64 } +// CHECK-LABEL: test_divui_exact +func.func @test_divui_exact(%arg0 : i64, %arg1 : i64) -> i64 { + %0 = arith.divui %arg0, %arg1 exact : i64 + return %0 : i64 +} + // CHECK-LABEL: test_divui_tensor func.func @test_divui_tensor(%arg0 : tensor<8x8xi64>, %arg1 : tensor<8x8xi64>) -> tensor<8x8xi64> { %0 = arith.divui %arg0, %arg1 : tensor<8x8xi64> @@ -175,6 +181,12 @@ func.func @test_divsi(%arg0 : i64, %arg1 : i64) -> i64 { return %0 : i64 } +// CHECK-LABEL: test_divsi_exact +func.func @test_divsi_exact(%arg0 : i64, %arg1 : i64) -> i64 { + %0 = arith.divsi %arg0, %arg1 exact : i64 + return %0 : i64 +} + // CHECK-LABEL: test_divsi_tensor func.func @test_divsi_tensor(%arg0 : tensor<8x8xi64>, %arg1 : tensor<8x8xi64>) -> tensor<8x8xi64> { %0 = arith.divsi %arg0, %arg1 : tensor<8x8xi64> @@ -391,6 +403,12 @@ func.func @test_shrui(%arg0 : i64, %arg1 : i64) -> i64 { return %0 : i64 } +// CHECK-LABEL: test_shrui_exact +func.func @test_shrui_exact(%arg0 : i64, %arg1 : i64) -> i64 { + %0 = arith.shrui %arg0, %arg1 exact : i64 + return %0 : i64 +} + // CHECK-LABEL: test_shrui_tensor func.func @test_shrui_tensor(%arg0 : tensor<8x8xi64>, %arg1 : tensor<8x8xi64>) -> tensor<8x8xi64> { %0 = arith.shrui %arg0, %arg1 : tensor<8x8xi64> @@ -415,6 +433,12 @@ func.func @test_shrsi(%arg0 : i64, %arg1 : i64) -> i64 { return %0 : i64 } +// CHECK-LABEL: test_shrsi_exact +func.func @test_shrsi_exact(%arg0 : i64, %arg1 : i64) -> i64 { + %0 = arith.shrsi %arg0, %arg1 exact : i64 + return %0 : i64 +} + // CHECK-LABEL: test_shrsi_tensor func.func @test_shrsi_tensor(%arg0 : tensor<8x8xi64>, %arg1 : tensor<8x8xi64>) -> tensor<8x8xi64> { %0 = arith.shrsi %arg0, %arg1 : tensor<8x8xi64> From 9349a10f93308a196499d2c80a222476c78f1065 Mon Sep 17 00:00:00 2001 From: Jeremy Furtek Date: Mon, 17 Nov 2025 11:46:56 -0600 Subject: [PATCH 20/67] Fix side effects for LLVM integer operations (udiv, sdiv) incorrectly marked as Pure (#166648) This MR modifies side effect traits of some integer arithmetic operations in the LLVM dialect. 
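As a sketch of the problem (illustrative IR, not taken from the patch), hoisting such an op out of a loop can introduce a division the program never executed:

```mlir
// With `Pure`, loop-invariant code motion was free to hoist the sdiv above
// the loop, turning a conditionally executed division into an unconditional
// one; this is UB when %denom can be 0 (or %num == INT_MIN with %denom == -1).
scf.for %i = %lb to %ub step %step {
  %v = llvm.sdiv %num, %denom : i32
}
```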
Prior to this MR, the LLVM dialect `sdiv` and `udiv` operations were marked as
`Pure` through `tblgen` inheritance of the `LLVM_ArithmeticOpBase` class. The
`Pure` trait allowed incorrect hoisting of `sdiv`/`udiv` operations by the
`loop-invariant-code-motion` pass.

This MR modifies the `sdiv` and `udiv` LLVM operations to have traits and code
motion behavior identical to their counterparts in the `arith` dialect, which
were established by the commit/review below.

https://github.com/llvm/llvm-project/commit/ed39825be48805b174d3177f1d8d41ed84784d18
https://reviews.llvm.org/D137814
---
 mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td  | 33 ++--
 mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp   | 28 ++++
 .../loop-invariant-code-motion.mlir          | 145 ++++++++++++++++++
 mlir/unittests/Dialect/LLVMIR/CMakeLists.txt | 1 +
 4 files changed, 192 insertions(+), 15 deletions(-)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
index e425e16a4b1a6..971710fa3ee13 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -39,7 +39,7 @@ class LLVM_TerminatorOp traits = []> :
 class LLVM_ArithmeticOpBase traits = []> :
     LLVM_Op,
+    !listconcat([SameOperandsAndResultType, NoMemoryEffect], traits)>,
    LLVM_Builder<"$res = builder.Create" # instName # "($lhs, $rhs);"> {
  dag commonArgs = (ins LLVM_ScalarOrVectorOf:$lhs,
      LLVM_ScalarOrVectorOf:$rhs);
@@ -116,7 +116,8 @@ class LLVM_IntArithmeticOpWithDisjointFlag traits = []> :
    LLVM_ArithmeticOpBase], traits)> {
+    !listconcat([DeclareOpInterfaceMethods, Pure],
+                traits)> {
  dag fmfArg = (
    ins DefaultValuedAttr:$fastmathFlags);
  let arguments = !con(commonArgs, fmfArg);
@@ -149,24 +150,26 @@ class LLVM_UnaryFloatArithmeticOp;
-def LLVM_SubOp : LLVM_IntArithmeticOpWithOverflowFlag<"sub", "Sub", []>;
+              [Commutative, Pure]>;
+def LLVM_SubOp : LLVM_IntArithmeticOpWithOverflowFlag<"sub", "Sub", [Pure]>;
 def LLVM_MulOp : LLVM_IntArithmeticOpWithOverflowFlag<"mul", "Mul",
-              [Commutative]>;
-def LLVM_UDivOp : LLVM_IntArithmeticOpWithExactFlag<"udiv", "UDiv">;
-def LLVM_SDivOp : LLVM_IntArithmeticOpWithExactFlag<"sdiv", "SDiv">;
-def LLVM_URemOp : LLVM_IntArithmeticOp<"urem", "URem">;
-def LLVM_SRemOp : LLVM_IntArithmeticOp<"srem", "SRem">;
-def LLVM_AndOp : LLVM_IntArithmeticOp<"and", "And">;
-def LLVM_OrOp : LLVM_IntArithmeticOpWithDisjointFlag<"or", "Or"> {
+              [Commutative, Pure]>;
+def LLVM_UDivOp : LLVM_IntArithmeticOpWithExactFlag<"udiv", "UDiv",
+    [DeclareOpInterfaceMethods]>;
+def LLVM_SDivOp : LLVM_IntArithmeticOpWithExactFlag<"sdiv", "SDiv",
+    [DeclareOpInterfaceMethods]>;
+def LLVM_URemOp : LLVM_IntArithmeticOp<"urem", "URem", [Pure]>;
+def LLVM_SRemOp : LLVM_IntArithmeticOp<"srem", "SRem", [Pure]>;
+def LLVM_AndOp : LLVM_IntArithmeticOp<"and", "And", [Pure]>;
+def LLVM_OrOp : LLVM_IntArithmeticOpWithDisjointFlag<"or", "Or", [Pure]> {
   let hasFolder = 1;
 }
-def LLVM_XOrOp : LLVM_IntArithmeticOp<"xor", "Xor">;
-def LLVM_ShlOp : LLVM_IntArithmeticOpWithOverflowFlag<"shl", "Shl", []> {
+def LLVM_XOrOp : LLVM_IntArithmeticOp<"xor", "Xor", [Pure]>;
+def LLVM_ShlOp : LLVM_IntArithmeticOpWithOverflowFlag<"shl", "Shl", [Pure]> {
   let hasFolder = 1;
 }
-def LLVM_LShrOp : LLVM_IntArithmeticOpWithExactFlag<"lshr", "LShr">;
-def LLVM_AShrOp : LLVM_IntArithmeticOpWithExactFlag<"ashr", "AShr">;
+def LLVM_LShrOp : LLVM_IntArithmeticOpWithExactFlag<"lshr", "LShr", [Pure]>;
+def LLVM_AShrOp : LLVM_IntArithmeticOpWithExactFlag<"ashr", "AShr", [Pure]>;

 // Base class for compare operations.
A compare operation takes two operands // of the same type and returns a boolean result. If the operands are diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index 1bf4a1c508843..5b819485b1be4 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -4223,6 +4223,34 @@ LogicalResult InlineAsmOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// UDivOp +//===----------------------------------------------------------------------===// +Speculation::Speculatability UDivOp::getSpeculatability() { + // X / 0 => UB + Value divisor = getRhs(); + if (matchPattern(divisor, m_IntRangeWithoutZeroU())) + return Speculation::Speculatable; + + return Speculation::NotSpeculatable; +} + +//===----------------------------------------------------------------------===// +// SDivOp +//===----------------------------------------------------------------------===// +Speculation::Speculatability SDivOp::getSpeculatability() { + // This function conservatively assumes that all signed division by -1 are + // not speculatable. + // X / 0 => UB + // INT_MIN / -1 => UB + Value divisor = getRhs(); + if (matchPattern(divisor, m_IntRangeWithoutZeroS()) && + matchPattern(divisor, m_IntRangeWithoutNegOneS())) + return Speculation::Speculatable; + + return Speculation::NotSpeculatable; +} + //===----------------------------------------------------------------------===// // LLVMDialect initialization, type parsing, and registration. //===----------------------------------------------------------------------===// diff --git a/mlir/test/Transforms/loop-invariant-code-motion.mlir b/mlir/test/Transforms/loop-invariant-code-motion.mlir index c1604e226a334..31a4f64dd7de0 100644 --- a/mlir/test/Transforms/loop-invariant-code-motion.mlir +++ b/mlir/test/Transforms/loop-invariant-code-motion.mlir @@ -880,6 +880,18 @@ func.func @no_speculate_divui( return } +func.func @no_speculate_udiv( +// CHECK-LABEL: @no_speculate_udiv( + %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { + scf.for %i = %lb to %ub step %step { +// CHECK: scf.for +// CHECK: llvm.udiv + %val = llvm.udiv %num, %denom : i32 + } + + return +} + func.func @no_speculate_divsi( // CHECK-LABEL: @no_speculate_divsi( %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { @@ -892,6 +904,18 @@ func.func @no_speculate_divsi( return } +func.func @no_speculate_sdiv( +// CHECK-LABEL: @no_speculate_sdiv( + %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { + scf.for %i = %lb to %ub step %step { +// CHECK: scf.for +// CHECK: llvm.sdiv + %val = llvm.sdiv %num, %denom : i32 + } + + return +} + func.func @no_speculate_ceildivui( // CHECK-LABEL: @no_speculate_ceildivui( %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { @@ -928,6 +952,18 @@ func.func @no_speculate_divui_const(%num: i32, %lb: index, %ub: index, %step: in return } +func.func @no_speculate_udiv_const(%num: i32, %lb: index, %ub: index, %step: index) { +// CHECK-LABEL: @no_speculate_udiv_const( + %c0 = arith.constant 0 : i32 + scf.for %i = %lb to %ub step %step { +// CHECK: scf.for +// CHECK: llvm.udiv + %val = llvm.udiv %num, %c0 : i32 + } + + return +} + func.func @speculate_divui_const( // CHECK-LABEL: @speculate_divui_const( %num: i32, %lb: index, %ub: index, %step: index) { @@ -941,6 +977,19 @@ func.func @speculate_divui_const( return } +func.func @speculate_udiv_const( +// CHECK-LABEL: 
@speculate_udiv_const( + %num: i32, %lb: index, %ub: index, %step: index) { + %c5 = llvm.mlir.constant(5 : i32) : i32 +// CHECK: llvm.udiv +// CHECK: scf.for + scf.for %i = %lb to %ub step %step { + %val = llvm.udiv %num, %c5 : i32 + } + + return +} + func.func @no_speculate_ceildivui_const(%num: i32, %lb: index, %ub: index, %step: index) { // CHECK-LABEL: @no_speculate_ceildivui_const( %c0 = arith.constant 0 : i32 @@ -979,6 +1028,19 @@ func.func @no_speculate_divsi_const0( return } +func.func @no_speculate_sdiv_const0( +// CHECK-LABEL: @no_speculate_sdiv_const0( + %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { + %c0 = arith.constant 0 : i32 + scf.for %i = %lb to %ub step %step { +// CHECK: scf.for +// CHECK: llvm.sdiv + %val = llvm.sdiv %num, %c0 : i32 + } + + return +} + func.func @no_speculate_divsi_const_minus1( // CHECK-LABEL: @no_speculate_divsi_const_minus1( %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { @@ -992,6 +1054,19 @@ func.func @no_speculate_divsi_const_minus1( return } +func.func @no_speculate_sdiv_const_minus1( +// CHECK-LABEL: @no_speculate_sdiv_const_minus1( + %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { + %cm1 = arith.constant -1 : i32 + scf.for %i = %lb to %ub step %step { +// CHECK: scf.for +// CHECK: llvm.sdiv + %val = llvm.sdiv %num, %cm1 : i32 + } + + return +} + func.func @speculate_divsi_const( // CHECK-LABEL: @speculate_divsi_const( %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { @@ -1005,6 +1080,19 @@ func.func @speculate_divsi_const( return } +func.func @speculate_sdiv_const( +// CHECK-LABEL: @speculate_sdiv_const( + %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { + %c5 = arith.constant 5 : i32 + scf.for %i = %lb to %ub step %step { +// CHECK: llvm.sdiv +// CHECK: scf.for + %val = llvm.sdiv %num, %c5 : i32 + } + + return +} + func.func @no_speculate_ceildivsi_const0( // CHECK-LABEL: @no_speculate_ceildivsi_const0( %num: i32, %denom: i32, %lb: index, %ub: index, %step: index) { @@ -1057,6 +1145,19 @@ func.func @no_speculate_divui_range( return } +func.func @no_speculate_udiv_range( +// CHECK-LABEL: @no_speculate_udiv_range( + %num: i8, %lb: index, %ub: index, %step: index) { + %denom = test.with_bounds {smax = 127 : i8, smin = -128 : i8, umax = 255 : i8, umin = 0 : i8} : i8 + scf.for %i = %lb to %ub step %step { +// CHECK: scf.for +// CHECK: llvm.udiv + %val = llvm.udiv %num, %denom : i8 + } + + return +} + func.func @no_speculate_divsi_range( // CHECK-LABEL: @no_speculate_divsi_range( %num: i8, %lb: index, %ub: index, %step: index) { @@ -1072,6 +1173,21 @@ func.func @no_speculate_divsi_range( return } +func.func @no_speculate_sdiv_range( +// CHECK-LABEL: @no_speculate_sdiv_range( + %num: i8, %lb: index, %ub: index, %step: index) { + %denom0 = test.with_bounds {smax = -1: i8, smin = -128 : i8, umax = 255 : i8, umin = 0 : i8} : i8 + %denom1 = test.with_bounds {smax = 127 : i8, smin = 0 : i8, umax = 255 : i8, umin = 0 : i8} : i8 + scf.for %i = %lb to %ub step %step { +// CHECK: scf.for +// CHECK-COUNT-2: llvm.sdiv + %val0 = llvm.sdiv %num, %denom0 : i8 + %val1 = llvm.sdiv %num, %denom1 : i8 + } + + return +} + func.func @no_speculate_ceildivui_range( // CHECK-LABEL: @no_speculate_ceildivui_range( %num: i8, %lb: index, %ub: index, %step: index) { @@ -1113,6 +1229,19 @@ func.func @speculate_divui_range( return } +func.func @speculate_udiv_range( +// CHECK-LABEL: @speculate_udiv_range( + %num: i8, %lb: index, %ub: index, %step: index) { + %denom = test.with_bounds {smax = 127 : i8, 
smin = -128 : i8, umax = 255 : i8, umin = 1 : i8} : i8
+ scf.for %i = %lb to %ub step %step {
+// CHECK: llvm.udiv
+// CHECK: scf.for
+ %val = llvm.udiv %num, %denom : i8
+ }
+
+ return
+}
+
 func.func @speculate_divsi_range(
// CHECK-LABEL: @speculate_divsi_range(
 %num: i8, %lb: index, %ub: index, %step: index) {
@@ -1129,6 +1258,22 @@ func.func @speculate_divsi_range(
 return
}
+func.func @speculate_sdiv_range(
+// CHECK-LABEL: @speculate_sdiv_range(
+ %num: i8, %lb: index, %ub: index, %step: index) {
+ %denom0 = test.with_bounds {smax = 127 : i8, smin = 1 : i8, umax = 255 : i8, umin = 0 : i8} : i8
+ %denom1 = test.with_bounds {smax = -2 : i8, smin = -128 : i8, umax = 255 : i8, umin = 0 : i8} : i8
+ scf.for %i = %lb to %ub step %step {
+// CHECK-COUNT-2: llvm.sdiv
+// CHECK: scf.for
+ %val0 = llvm.sdiv %num, %denom0 : i8
+ %val1 = llvm.sdiv %num, %denom1 : i8
+
+ }
+
+ return
+}
+
 func.func @speculate_ceildivui_range(
// CHECK-LABEL: @speculate_ceildivui_range(
 %num: i8, %lb: index, %ub: index, %step: index) {
diff --git a/mlir/unittests/Dialect/LLVMIR/CMakeLists.txt b/mlir/unittests/Dialect/LLVMIR/CMakeLists.txt
index 7cc130d02ad74..568126fd342cc 100644
--- a/mlir/unittests/Dialect/LLVMIR/CMakeLists.txt
+++ b/mlir/unittests/Dialect/LLVMIR/CMakeLists.txt
@@ -4,4 +4,5 @@ add_mlir_unittest(MLIRLLVMIRTests
 mlir_target_link_libraries(MLIRLLVMIRTests
 PRIVATE
 MLIRLLVMDialect
+ MLIRInferIntRangeInterface
 )

From a7579fda53b55ae7d7d064d08e58b1269420095d Mon Sep 17 00:00:00 2001
From: Lei Huang
Date: Mon, 17 Nov 2025 12:51:04 -0500
Subject: [PATCH 21/67] [PowerPC][AIX] Remove flag for no semantic interposition (#168109)

Remove the flag specifying "no semantic interposition" since this is the
default on AIX.
---
 llvm/cmake/modules/HandleLLVMOptions.cmake | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index 22ecf4dcee368..fdd3509f03f59 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -450,13 +450,16 @@ if( LLVM_ENABLE_PIC )
 # Enable interprocedural optimizations for non-inline functions which would
 # otherwise be disabled due to GCC -fPIC's default.
 # Note: GCC<10.3 has a bug on SystemZ.
- #
+ # Note: Default on AIX is "no semantic interposition".
 # Note: Clang allows IPO for -fPIC so this optimization is less effective.
 # Clang 13 has a bug related to -fsanitize-coverage
 # -fno-semantic-interposition (https://reviews.llvm.org/D117183).
- if ((CMAKE_COMPILER_IS_GNUCXX AND
- NOT (LLVM_NATIVE_ARCH STREQUAL "SystemZ" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 10.3))
- OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 14))
+ if ((NOT ("${CMAKE_SYSTEM_NAME}" MATCHES "AIX"))
+ AND ((CMAKE_COMPILER_IS_GNUCXX AND
+ NOT (LLVM_NATIVE_ARCH STREQUAL "SystemZ"
+ AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 10.3))
+ OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang"
+ AND CMAKE_CXX_COMPILER_VERSION GREATER_EQUAL 14)))
 add_flag_if_supported("-fno-semantic-interposition" FNO_SEMANTIC_INTERPOSITION)
 endif()
 endif()

From 72059bebb3a9427dc70723a37e4c38adfa44553a Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Mon, 17 Nov 2025 09:53:16 -0800
Subject: [PATCH 22/67] [compiler-rt][Profile] Make Darwin test work with internal shell

This test was using subshells and then passing the results to diff. Write
out the results to files before passing them to diff, as the internal shell
does not support subshells.
---
 .../Darwin/instrprof-debug-info-correlate.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/compiler-rt/test/profile/Darwin/instrprof-debug-info-correlate.c b/compiler-rt/test/profile/Darwin/instrprof-debug-info-correlate.c
index 46d25a4e386dc..1e9bd11d3f49c 100644
--- a/compiler-rt/test/profile/Darwin/instrprof-debug-info-correlate.c
+++ b/compiler-rt/test/profile/Darwin/instrprof-debug-info-correlate.c
@@ -7,7 +7,9 @@
 // RUN: env LLVM_PROFILE_FILE=%t.profraw %run %t.normal
 // RUN: llvm-profdata merge -o %t.normal.profdata %t.profraw
-// RUN: diff <(llvm-profdata show --all-functions --counts %t.normal.profdata) <(llvm-profdata show --all-functions --counts %t.profdata)
+// RUN: llvm-profdata show --all-functions --counts %t.normal.profdata > %t.normal.functions
+// RUN: llvm-profdata show --all-functions --counts %t.profdata > %t.functions
+// RUN: diff %t.normal.functions %t.functions
 // RUN: %clang_pgogen -o %t.cov -g -mllvm --profile-correlate=debug-info -mllvm -pgo-function-entry-coverage -mllvm --disable-vp=true %S/../Inputs/instrprof-debug-info-correlate-main.cpp %S/../Inputs/instrprof-debug-info-correlate-foo.cpp
 // RUN: env LLVM_PROFILE_FILE=%t.cov.proflite %run %t.cov
@@ -17,7 +19,9 @@
 // RUN: env LLVM_PROFILE_FILE=%t.cov.profraw %run %t.cov.normal
 // RUN: llvm-profdata merge -o %t.cov.normal.profdata %t.cov.profraw
-// RUN: diff <(llvm-profdata show --all-functions --counts %t.cov.normal.profdata) <(llvm-profdata show --all-functions --counts %t.cov.profdata)
+// RUN: llvm-profdata show --all-functions --counts %t.cov.normal.profdata > %t.cov.normal.functions
+// RUN: llvm-profdata show --all-functions --counts %t.cov.profdata > %t.cov.functions
+// RUN: diff %t.cov.normal.functions %t.cov.functions

 // Test debug info correlate with online merging.

@@ -30,11 +34,15 @@
 // RUN: env LLVM_PROFILE_FILE=%t.profdir/%m.proflite %run %t
 // RUN: llvm-profdata merge -o %t.profdata --debug-info=%t.dSYM %t.profdir/
-// RUN: diff <(llvm-profdata show --all-functions --counts %t.normal.profdata) <(llvm-profdata show --all-functions --counts %t.profdata)
+// RUN: llvm-profdata show --all-functions --counts %t.normal.profdata > %t.normal.functions
+// RUN: llvm-profdata show --all-functions --counts %t.profdata > %t.functions
+// RUN: diff %t.normal.functions %t.functions

 // RUN: rm -rf %t.profdir && mkdir %t.profdir
 // RUN: env LLVM_PROFILE_FILE=%t.profdir/%m.cov.proflite %run %t.cov
 // RUN: env LLVM_PROFILE_FILE=%t.profdir/%m.cov.proflite %run %t.cov
 // RUN: llvm-profdata merge -o %t.cov.profdata --debug-info=%t.cov.dSYM %t.profdir/
-// RUN: diff <(llvm-profdata show --all-functions --counts %t.cov.normal.profdata) <(llvm-profdata show --all-functions --counts %t.cov.profdata)
+// RUN: llvm-profdata show --all-functions --counts %t.cov.normal.profdata > %t.cov.normal.functions
+// RUN: llvm-profdata show --all-functions --counts %t.cov.profdata > %t.cov.functions
+// RUN: diff %t.cov.normal.functions %t.cov.functions

From 05bd742ad790f207f5c94c4bf327d3e87b8819dc Mon Sep 17 00:00:00 2001
From: Andres-Salamanca
Date: Mon, 17 Nov 2025 13:01:04 -0500
Subject: [PATCH 23/67] [CIR] Upstream the initial BlockAddressOp implementation (#168151)

This PR adds initial support for codegen of `BlockAddressOp`, which is
emitted when the GNU "labels as values" extension is used. The operation is
used together with `IndirectBrOp`, which will be implemented in a future PR.
Lowering will be added in a later PR.
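
For readers unfamiliar with the extension, here is a minimal C sketch of
"labels as values"; the function and label names are invented for
illustration and are not taken from this patch:

```c
/* GNU "labels as values": &&label yields a void* for a local label, and
 * goto *expr jumps to it. The &&label expression is what BlockAddressOp
 * models; the computed goto is what IndirectBrOp will model in a later PR.
 * This is a GNU extension (GCC/Clang), not ISO C. */
#include <stdio.h>

static int run(int start) {
  void *targets[] = {&&state_a, &&state_b, &&done}; /* label addresses */
  int steps = 0;
  goto *targets[start]; /* computed goto */

state_a:
  ++steps;
  goto *targets[1];
state_b:
  ++steps;
  goto *targets[2];
done:
  return steps;
}

int main(void) {
  printf("%d\n", run(0)); /* prints 2: visits state_a, then state_b */
  return 0;
}
```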
---
 .../include/clang/CIR/Dialect/IR/CIRAttrs.td | 25 ++++++
 clang/include/clang/CIR/Dialect/IR/CIROps.td | 34 +++++++++
 clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 9 +++
 clang/lib/CIR/Dialect/IR/CIRDialect.cpp | 29 ++++++-
 .../lib/CIR/Dialect/Transforms/GotoSolver.cpp | 17 ++++-
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 6 ++
 clang/test/CIR/CodeGen/label-values.c | 76 +++++++++++++++++++
 clang/test/CIR/IR/block-adress.cir | 34 +++++++++
 clang/test/CIR/IR/invalid-block-address.cir | 21 +++++
 clang/test/CIR/Transforms/goto_solver.cir | 62 +++++++++++++++
 clang/tools/cir-opt/cir-opt.cpp | 4 +
 11 files changed, 312 insertions(+), 5 deletions(-)
 create mode 100644 clang/test/CIR/CodeGen/label-values.c
 create mode 100644 clang/test/CIR/IR/block-adress.cir
 create mode 100644 clang/test/CIR/IR/invalid-block-address.cir
 create mode 100644 clang/test/CIR/Transforms/goto_solver.cir

diff --git a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
index 1e0fb038b19d8..47ff9389e8028 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
@@ -1026,4 +1026,29 @@ def CIR_UnwindAttr : CIR_UnitAttr<"Unwind", "unwind"> {
 let storageType = [{ CatchUnwind }];
}

+//===----------------------------------------------------------------------===//
+// CIR_BlockAddrInfoAttr
+//===----------------------------------------------------------------------===//
+
+def CIR_BlockAddrInfoAttr : CIR_Attr<"BlockAddrInfo", "block_addr_info"> {
+ let summary = "Block Address attribute";
+ let description = [{
+ This attribute is used to represent the address of a basic block
+ within a function. It combines the symbol reference to a function
+ with the name of a label inside that function.
+ }];
+ let parameters = (ins "mlir::FlatSymbolRefAttr":$func,
+ "mlir::StringAttr":$label);
+
+ let assemblyFormat = "`<` $func `,` $label `>`";
+ let builders = [
+ AttrBuilder<(ins "llvm::StringRef":$func_name,
+ "llvm::StringRef":$label_name
+ ), [{
+ return $_get($_ctxt, mlir::FlatSymbolRefAttr::get($_ctxt, func_name),
+ mlir::StringAttr::get($_ctxt, label_name));
+ }]>
+ ];
+}
+
 #endif // CLANG_CIR_DIALECT_IR_CIRATTRS_TD
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index 7b987ea49bf97..e612d6a0ba886 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -4897,4 +4897,38 @@ def CIR_AtomicClearOp : CIR_Op<"atomic.clear"> {
 }];
}

+//===----------------------------------------------------------------------===//
+// BlockAddressOp
+//===----------------------------------------------------------------------===//
+
+def CIR_BlockAddressOp : CIR_Op<"block_address", [Pure]> {
+ let summary = "Get the address of a cir.label within a function";
+ let description = [{
+ The `cir.block_address` operation takes a function name and a label and
+ produces a pointer value that represents the address of that cir.label
+ within the specified function.
+
+ This operation models GCC's "labels as values" extension (`&&label`), which
+ allows taking the address of a local label and using it as a computed
+ jump target (e.g., with `goto *addr;`). 
+
+ Example:
+ ```mlir
+ %1 = cir.alloca !cir.ptr, !cir.ptr>, ["ptr", init]
+ {alignment = 8 : i64}
+ %addr = cir.block_address <@c, "label1"> : !cir.ptr
+ cir.store align(8) %addr, %1 : !cir.ptr, !cir.ptr>
+ cir.br ^bb1
+ ^bb1:
+ cir.label "label1"
+ ```
+ }];
+
+ let arguments = (ins CIR_BlockAddrInfoAttr:$block_addr_info);
+ let results = (outs CIR_VoidPtrType:$addr);
+ let assemblyFormat = [{
+ $block_addr_info `:` qualified(type($addr)) attr-dict
+ }];
+}
+
 #endif // CLANG_CIR_DIALECT_IR_CIROPS_TD
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index 3b0977d213325..f777562ba6309 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -168,6 +168,15 @@ class ScalarExprEmitter : public StmtVisitor {
 return emitLoadOfLValue(e);
 }

+ mlir::Value VisitAddrLabelExpr(const AddrLabelExpr *e) {
+ auto func = cast(cgf.curFn);
+ auto blockInfoAttr = cir::BlockAddrInfoAttr::get(
+ &cgf.getMLIRContext(), func.getSymName(), e->getLabel()->getName());
+ return cir::BlockAddressOp::create(builder, cgf.getLoc(e->getSourceRange()),
+ cgf.convertType(e->getType()),
+ blockInfoAttr);
+ }
+
 mlir::Value VisitIntegerLiteral(const IntegerLiteral *e) {
 mlir::Type type = cgf.convertType(e->getType());
 return cir::ConstantOp::create(builder, cgf.getLoc(e->getExprLoc()),
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index 9ac5efe0e41c7..22aada882defc 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -1912,22 +1912,45 @@ mlir::LogicalResult cir::FuncOp::verify() {

 llvm::SmallSet labels;
 llvm::SmallSet gotos;
-
+ llvm::SmallSet blockAddresses;
+ bool invalidBlockAddress = false;
 getOperation()->walk([&](mlir::Operation *op) {
 if (auto lab = dyn_cast(op)) {
 labels.insert(lab.getLabel());
 } else if (auto goTo = dyn_cast(op)) {
 gotos.insert(goTo.getLabel());
+ } else if (auto blkAdd = dyn_cast(op)) {
+ if (blkAdd.getBlockAddrInfoAttr().getFunc().getAttr() != getSymName()) {
+ // Stop the walk early, no need to continue
+ invalidBlockAddress = true;
+ return mlir::WalkResult::interrupt();
+ }
+ blockAddresses.insert(blkAdd.getBlockAddrInfoAttr().getLabel());
 }
+ return mlir::WalkResult::advance();
 });

+ if (invalidBlockAddress)
+ return emitOpError() << "blockaddress references a different function";
+
+ llvm::SmallSet mismatched;
 if (!labels.empty() || !gotos.empty()) {
- llvm::SmallSet mismatched =
- llvm::set_difference(gotos, labels);

 if (!mismatched.empty())
 return emitOpError() << "goto/label mismatch";
 }
+
+ mismatched.clear();
+
+ if (!labels.empty() || !blockAddresses.empty()) {
+ mismatched = llvm::set_difference(blockAddresses, labels);
+
+ if (!mismatched.empty())
+ return emitOpError()
+ << "expects an existing label target in the referenced function";
+ }
+
 return success();
}
diff --git a/clang/lib/CIR/Dialect/Transforms/GotoSolver.cpp b/clang/lib/CIR/Dialect/Transforms/GotoSolver.cpp
index 00972b6976295..d590ccce1f540 100644
--- a/clang/lib/CIR/Dialect/Transforms/GotoSolver.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/GotoSolver.cpp
@@ -8,6 +8,7 @@
 #include "PassDetail.h"
 #include "clang/CIR/Dialect/IR/CIRDialect.h"
 #include "clang/CIR/Dialect/Passes.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/Support/TimeProfiler.h"
 #include
@@ -30,17 +31,29 @@ static void process(cir::FuncOp func) {
 mlir::OpBuilder rewriter(func.getContext());
 llvm::StringMap 
labels; llvm::SmallVector gotos; + llvm::SmallSet blockAddrLabel; func.getBody().walk([&](mlir::Operation *op) { if (auto lab = dyn_cast(op)) { - // Will construct a string copy inplace. Safely erase the label labels.try_emplace(lab.getLabel(), lab->getBlock()); - lab.erase(); } else if (auto goTo = dyn_cast(op)) { gotos.push_back(goTo); + } else if (auto blockAddr = dyn_cast(op)) { + blockAddrLabel.insert(blockAddr.getBlockAddrInfo().getLabel()); } }); + for (auto &lab : labels) { + StringRef labelName = lab.getKey(); + Block *block = lab.getValue(); + if (!blockAddrLabel.contains(labelName)) { + // erase the LabelOp inside the block if safe + if (auto lab = dyn_cast(&block->front())) { + lab.erase(); + } + } + } + for (auto goTo : gotos) { mlir::OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(goTo); diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index 92434d730eb31..d43a462a25092 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -3837,6 +3837,12 @@ mlir::LogicalResult CIRToLLVMVAArgOpLowering::matchAndRewrite( return mlir::success(); } +mlir::LogicalResult CIRToLLVMBlockAddressOpLowering::matchAndRewrite( + cir::BlockAddressOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + return mlir::failure(); +} + std::unique_ptr createConvertCIRToLLVMPass() { return std::make_unique(); } diff --git a/clang/test/CIR/CodeGen/label-values.c b/clang/test/CIR/CodeGen/label-values.c new file mode 100644 index 0000000000000..41178e3f62f20 --- /dev/null +++ b/clang/test/CIR/CodeGen/label-values.c @@ -0,0 +1,76 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR + +void A(void) { + void *ptr = &&LABEL_A; +LABEL_A: + return; +} +// CIR: cir.func dso_local @A +// CIR: [[PTR:%.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["ptr", init] {alignment = 8 : i64} +// CIR: [[BLOCK:%.*]] = cir.block_address <@A, "LABEL_A"> : !cir.ptr +// CIR: cir.store align(8) [[BLOCK]], [[PTR]] : !cir.ptr, !cir.ptr> +// CIR: cir.br ^bb1 +// CIR: ^bb1: // pred: ^bb0 +// CIR: cir.label "LABEL_A" +// CIR: cir.return + +void B(void) { +LABEL_B: + void *ptr = &&LABEL_B; +} + +// CIR: cir.func dso_local @B() +// CIR: [[PTR:%.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["ptr", init] {alignment = 8 : i64} +// CIR: cir.br ^bb1 +// CIR: ^bb1: +// CIR: cir.label "LABEL_B" +// CIR: [[BLOCK:%.*]] = cir.block_address <@B, "LABEL_B"> : !cir.ptr +// CIR: cir.store align(8) [[BLOCK]], [[PTR]] : !cir.ptr, !cir.ptr> +// CIR: cir.return + +void C(int x) { + void *ptr = (x == 0) ? 
&&LABEL_A : &&LABEL_B; +LABEL_A: + return; +LABEL_B: + return; +} + +// CIR: cir.func dso_local @C +// CIR: [[BLOCK1:%.*]] = cir.block_address <@C, "LABEL_A"> : !cir.ptr +// CIR: [[BLOCK2:%.*]] = cir.block_address <@C, "LABEL_B"> : !cir.ptr +// CIR: [[COND:%.*]] = cir.select if [[CMP:%.*]] then [[BLOCK1]] else [[BLOCK2]] : (!cir.bool, !cir.ptr, !cir.ptr) -> !cir.ptr +// CIR: cir.store align(8) [[COND]], [[PTR:%.*]] : !cir.ptr, !cir.ptr> +// CIR: cir.br ^bb1 +// CIR: ^bb1: // pred: ^bb0 +// CIR: cir.label "LABEL_A" +// CIR: cir.br ^bb2 +// CIR: ^bb2: // 2 preds: ^bb1, ^bb3 +// CIR: cir.return +// CIR: ^bb3: // no predecessors +// CIR: cir.label "LABEL_B" +// CIR: cir.br ^bb2 + +void D(void) { + void *ptr = &&LABEL_A; + void *ptr2 = &&LABEL_A; +LABEL_A: + void *ptr3 = &&LABEL_A; + return; +} + +// CIR: cir.func dso_local @D +// CIR: %[[PTR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["ptr", init] +// CIR: %[[PTR2:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["ptr2", init] +// CIR: %[[PTR3:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["ptr3", init] +// CIR: %[[BLK1:.*]] = cir.block_address <@D, "LABEL_A"> : !cir.ptr +// CIR: cir.store align(8) %[[BLK1]], %[[PTR]] : !cir.ptr, !cir.ptr> +// CIR: %[[BLK2:.*]] = cir.block_address <@D, "LABEL_A"> : !cir.ptr +// CIR: cir.store align(8) %[[BLK2]], %[[PTR2]] : !cir.ptr, !cir.ptr> +// CIR: cir.br ^bb1 +// CIR: ^bb1: // pred: ^bb0 +// CIR: cir.label "LABEL_A" +// CIR: %[[BLK3:.*]] = cir.block_address <@D, "LABEL_A"> : !cir.ptr +// CIR: cir.store align(8) %[[BLK3]], %[[PTR3]] : !cir.ptr, !cir.ptr> +// CIR: cir.return diff --git a/clang/test/CIR/IR/block-adress.cir b/clang/test/CIR/IR/block-adress.cir new file mode 100644 index 0000000000000..9d6840819c2d4 --- /dev/null +++ b/clang/test/CIR/IR/block-adress.cir @@ -0,0 +1,34 @@ +// RUN: cir-opt %s --verify-roundtrip | FileCheck %s + +!void = !cir.void + +module { + cir.func @block_address(){ + %0 = cir.block_address <@block_address, "label"> : !cir.ptr + cir.br ^bb1 + ^bb1: + cir.label "label" + cir.return + } +// CHECK: cir.func @block_address +// CHECK: %0 = cir.block_address <@block_address, "label"> : !cir.ptr +// CHECK: cir.br ^bb1 +// CHECK: ^bb1: +// CHECK: cir.label "label" +// CHECK: cir.return + +cir.func @block_address_inside_scope() -> () { + cir.scope{ + %0 = cir.block_address <@block_address_inside_scope, "label"> : !cir.ptr + } + cir.br ^bb1 +^bb1: + cir.label "label" + cir.return +} +// CHECK: cir.func @block_address_inside_scope +// CHECK: cir.scope +// CHECK: %0 = cir.block_address <@block_address_inside_scope, "label"> : !cir.ptr +// CHECK: cir.label "label" +// CHECK: cir.return +} diff --git a/clang/test/CIR/IR/invalid-block-address.cir b/clang/test/CIR/IR/invalid-block-address.cir new file mode 100644 index 0000000000000..4519485c28803 --- /dev/null +++ b/clang/test/CIR/IR/invalid-block-address.cir @@ -0,0 +1,21 @@ +// RUN: cir-opt %s -verify-diagnostics -split-input-file + +!void = !cir.void + +// expected-error@+1 {{expects an existing label target in the referenced function}} +cir.func @bad_block_address() -> () { + %0 = cir.block_address <@bad_block_address, "label"> : !cir.ptr + cir.br ^bb1 + ^bb1: + cir.label "wrong_label" + cir.return +} + +// expected-error@+1 {{blockaddress references a different function}} +cir.func @bad_block_func() -> () { + %0 = cir.block_address <@mismatch_func, "label"> : !cir.ptr + cir.br ^bb1 + ^bb1: + cir.label "label" + cir.return +} diff --git a/clang/test/CIR/Transforms/goto_solver.cir b/clang/test/CIR/Transforms/goto_solver.cir new file mode 100644 index 
0000000000000..6ae019b44a39e
--- /dev/null
+++ b/clang/test/CIR/Transforms/goto_solver.cir
@@ -0,0 +1,62 @@
+// RUN: cir-opt %s -cir-goto-solver --verify-roundtrip -o - | FileCheck %s
+
+!void = !cir.void
+
+cir.func @a(){
+ %0 = cir.alloca !cir.ptr, !cir.ptr>, ["ptr", init] {alignment = 8 : i64}
+ %1 = cir.block_address <@a, "label1"> : !cir.ptr
+ cir.store align(8) %1, %0 : !cir.ptr, !cir.ptr>
+ cir.br ^bb1
+^bb1:
+ cir.label "label1"
+ cir.br ^bb2
+^bb2:
+ // This label is not referenced by any BlockAddressOp, so it should be removed
+ cir.label "label2"
+ cir.return
+}
+
// CHECK: cir.func @a()
// CHECK: %1 = cir.block_address <@a, "label1"> : !cir.ptr
// CHECK: ^bb1:
// CHECK: cir.label "label1"
// CHECK: cir.br ^bb2
// CHECK: ^bb2:
// CHECK-NOT: cir.label "label2"

+cir.func @b(){
+ %0 = cir.alloca !cir.ptr, !cir.ptr>, ["ptr", init] {alignment = 8 : i64}
+ %1 = cir.block_address <@b, "label1"> : !cir.ptr
+ cir.store align(8) %1, %0 : !cir.ptr, !cir.ptr>
+ cir.goto "label2"
+^bb1:
+ cir.label "label1"
+ cir.br ^bb2
+^bb2:
+ // This label is not referenced by any BlockAddressOp, so it should be removed
+ cir.label "label2"
+ cir.return
+}
+
// CHECK: cir.func @b() {
// CHECK: %1 = cir.block_address <@b, "label1"> : !cir.ptr
// CHECK: cir.store align(8) %1, {{.*}} : !cir.ptr, !cir.ptr>
// CHECK: cir.br ^bb2
// CHECK: ^bb1:
// CHECK: cir.label "label1"
// CHECK: cir.br ^bb2
// CHECK: ^bb2:
// CHECK-NOT: cir.label "label2"

+cir.func @c() {
+ cir.label "label1"
+ %0 = cir.alloca !cir.ptr, !cir.ptr>, ["ptr", init] {alignment = 8 : i64}
+ %1 = cir.block_address <@c, "label1"> : !cir.ptr
+ cir.store align(8) %1, %0 : !cir.ptr, !cir.ptr>
+ cir.return
+}
+
// CHECK: cir.func @c
// CHECK: cir.label "label1"
// CHECK: %1 = cir.block_address <@c, "label1"> : !cir.ptr
// CHECK: cir.store align(8) %1, {{.*}} : !cir.ptr, !cir.ptr>
diff --git a/clang/tools/cir-opt/cir-opt.cpp b/clang/tools/cir-opt/cir-opt.cpp
index c4d29a2117c75..ee42015bb38e9 100644
--- a/clang/tools/cir-opt/cir-opt.cpp
+++ b/clang/tools/cir-opt/cir-opt.cpp
@@ -58,6 +58,10 @@ int main(int argc, char **argv) {
 return mlir::createHoistAllocasPass();
 });

+ ::mlir::registerPass([]() -> std::unique_ptr<::mlir::Pass> {
+ return mlir::createGotoSolverPass();
+ });
+
 mlir::registerTransformsPasses();

 return mlir::asMainReturnCode(MlirOptMain(

From 1425d75c7116c33b084f49eda1c12b299b342315 Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Mon, 17 Nov 2025 10:06:45 -0800
Subject: [PATCH 24/67] [X86] Delete Profile Guided Prefetch Passes (#167317)

As the PGPF effort has been turned down, there is currently no way to
generate the profiles these passes consume. Current efforts are also
focused on inserting prefetches in PLO optimizers, which have a more
accurate view of the final code layout.
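
For context, these passes consumed an AFDO profile and attached prefetch
instructions to profiled memory operands at the MachineInstr level. A rough
source-level analogue, shown here only as a hedged sketch (the 64-element
prefetch distance is an arbitrary assumption, not something taken from a
profile), is Clang/GCC's __builtin_prefetch:

```c
/* Illustrative only: hand-written software prefetching in C, approximating
 * the effect the deleted profile-driven passes achieved at the MI level.
 * rw=0 requests a read prefetch; locality=0 (non-temporal) typically
 * lowers to prefetchnta on x86. */
#include <stddef.h>

long sum(const int *arr, size_t n) {
  long total = 0;
  for (size_t i = 0; i < n; ++i) {
    if (i + 64 < n)
      __builtin_prefetch(&arr[i + 64], /*rw=*/0, /*locality=*/0);
    total += arr[i];
  }
  return total;
}
```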
--- llvm/lib/Target/X86/CMakeLists.txt | 2 - llvm/lib/Target/X86/X86.h | 7 - llvm/lib/Target/X86/X86DiscriminateMemOps.cpp | 184 ------------- llvm/lib/Target/X86/X86InsertPrefetch.cpp | 259 ------------------ llvm/lib/Target/X86/X86TargetMachine.cpp | 2 - llvm/test/CodeGen/X86/O0-pipeline.ll | 2 - .../X86/discriminate-mem-ops-missing-info.ll | 55 ---- .../X86/discriminate-mem-ops-skip-pfetch.ll | 68 ----- llvm/test/CodeGen/X86/discriminate-mem-ops.ll | 55 ---- .../CodeGen/X86/insert-prefetch-inline.afdo | 4 - .../CodeGen/X86/insert-prefetch-inline.ll | 76 ----- .../X86/insert-prefetch-invalid-instr.afdo | 2 - .../X86/insert-prefetch-invalid-instr.ll | 41 --- .../CodeGen/X86/insert-prefetch-other.afdo | 3 - llvm/test/CodeGen/X86/insert-prefetch.afdo | 3 - llvm/test/CodeGen/X86/insert-prefetch.ll | 101 ------- llvm/test/CodeGen/X86/opt-pipeline.ll | 2 - 17 files changed, 866 deletions(-) delete mode 100644 llvm/lib/Target/X86/X86DiscriminateMemOps.cpp delete mode 100644 llvm/lib/Target/X86/X86InsertPrefetch.cpp delete mode 100644 llvm/test/CodeGen/X86/discriminate-mem-ops-missing-info.ll delete mode 100644 llvm/test/CodeGen/X86/discriminate-mem-ops-skip-pfetch.ll delete mode 100644 llvm/test/CodeGen/X86/discriminate-mem-ops.ll delete mode 100644 llvm/test/CodeGen/X86/insert-prefetch-inline.afdo delete mode 100644 llvm/test/CodeGen/X86/insert-prefetch-inline.ll delete mode 100644 llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.afdo delete mode 100644 llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll delete mode 100644 llvm/test/CodeGen/X86/insert-prefetch-other.afdo delete mode 100644 llvm/test/CodeGen/X86/insert-prefetch.afdo delete mode 100644 llvm/test/CodeGen/X86/insert-prefetch.ll diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt index f9bd233cf8ecf..434a6d2c3553f 100644 --- a/llvm/lib/Target/X86/CMakeLists.txt +++ b/llvm/lib/Target/X86/CMakeLists.txt @@ -31,7 +31,6 @@ set(sources X86CmovConversion.cpp X86CodeGenPassBuilder.cpp X86DomainReassignment.cpp - X86DiscriminateMemOps.cpp X86LowerTileCopy.cpp X86LowerAMXType.cpp X86LowerAMXIntrinsics.cpp @@ -57,7 +56,6 @@ set(sources X86IndirectBranchTracking.cpp X86IndirectThunks.cpp X86InterleavedAccess.cpp - X86InsertPrefetch.cpp X86InstCombineIntrinsic.cpp X86InstrFMA3Info.cpp X86InstrFoldTables.cpp diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 03706aaaab237..97848bec7127e 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -166,13 +166,6 @@ FunctionPass *createX86IndirectThunksPass(); /// This pass replaces ret instructions with jmp's to __x86_return thunk. FunctionPass *createX86ReturnThunksPass(); -/// This pass ensures instructions featuring a memory operand -/// have distinctive (with respect to each other) -FunctionPass *createX86DiscriminateMemOpsPass(); - -/// This pass applies profiling information to insert cache prefetches. -FunctionPass *createX86InsertPrefetchPass(); - /// This pass insert wait instruction after X87 instructions which could raise /// fp exceptions when strict-fp enabled. 
FunctionPass *createX86InsertX87waitPass(); diff --git a/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp b/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp deleted file mode 100644 index bd151a450394a..0000000000000 --- a/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp +++ /dev/null @@ -1,184 +0,0 @@ -//===- X86DiscriminateMemOps.cpp - Unique IDs for Mem Ops -----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// This pass aids profile-driven cache prefetch insertion by ensuring all -/// instructions that have a memory operand are distinguishible from each other. -/// -//===----------------------------------------------------------------------===// - -#include "X86.h" -#include "X86Subtarget.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/IR/DebugInfoMetadata.h" -#include "llvm/ProfileData/SampleProf.h" -#include "llvm/ProfileData/SampleProfReader.h" -#include "llvm/Support/Debug.h" -#include -using namespace llvm; - -#define DEBUG_TYPE "x86-discriminate-memops" - -static cl::opt EnableDiscriminateMemops( - DEBUG_TYPE, cl::init(false), - cl::desc("Generate unique debug info for each instruction with a memory " - "operand. Should be enabled for profile-driven cache prefetching, " - "both in the build of the binary being profiled, as well as in " - "the build of the binary consuming the profile."), - cl::Hidden); - -static cl::opt BypassPrefetchInstructions( - "x86-bypass-prefetch-instructions", cl::init(true), - cl::desc("When discriminating instructions with memory operands, ignore " - "prefetch instructions. This ensures the other memory operand " - "instructions have the same identifiers after inserting " - "prefetches, allowing for successive insertions."), - cl::Hidden); - -namespace { - -using Location = std::pair; - -Location diToLocation(const DILocation *Loc) { - return std::make_pair(Loc->getFilename(), Loc->getLine()); -} - -/// Ensure each instruction having a memory operand has a distinct pair. -void updateDebugInfo(MachineInstr *MI, const DILocation *Loc) { - DebugLoc DL(Loc); - MI->setDebugLoc(DL); -} - -class X86DiscriminateMemOps : public MachineFunctionPass { - bool runOnMachineFunction(MachineFunction &MF) override; - StringRef getPassName() const override { - return "X86 Discriminate Memory Operands"; - } - -public: - static char ID; - - /// Default construct and initialize the pass. - X86DiscriminateMemOps(); -}; - -bool IsPrefetchOpcode(unsigned Opcode) { - return Opcode == X86::PREFETCHNTA || Opcode == X86::PREFETCHT0 || - Opcode == X86::PREFETCHT1 || Opcode == X86::PREFETCHT2 || - Opcode == X86::PREFETCHIT0 || Opcode == X86::PREFETCHIT1 || - Opcode == X86::PREFETCHRST2; -} -} // end anonymous namespace - -//===----------------------------------------------------------------------===// -// Implementation -//===----------------------------------------------------------------------===// - -char X86DiscriminateMemOps::ID = 0; - -/// Default construct and initialize the pass. 
-X86DiscriminateMemOps::X86DiscriminateMemOps() : MachineFunctionPass(ID) {} - -bool X86DiscriminateMemOps::runOnMachineFunction(MachineFunction &MF) { - if (!EnableDiscriminateMemops) - return false; - - DISubprogram *FDI = MF.getFunction().getSubprogram(); - if (!FDI || !FDI->getUnit()->getDebugInfoForProfiling()) - return false; - - // Have a default DILocation, if we find instructions with memops that don't - // have any debug info. - const DILocation *ReferenceDI = - DILocation::get(FDI->getContext(), FDI->getLine(), 0, FDI); - assert(ReferenceDI && "ReferenceDI should not be nullptr"); - DenseMap MemOpDiscriminators; - MemOpDiscriminators[diToLocation(ReferenceDI)] = 0; - - // Figure out the largest discriminator issued for each Location. When we - // issue new discriminators, we can thus avoid issuing discriminators - // belonging to instructions that don't have memops. This isn't a requirement - // for the goals of this pass, however, it avoids unnecessary ambiguity. - for (auto &MBB : MF) { - for (auto &MI : MBB) { - const auto &DI = MI.getDebugLoc(); - if (!DI) - continue; - if (BypassPrefetchInstructions && IsPrefetchOpcode(MI.getDesc().Opcode)) - continue; - Location Loc = diToLocation(DI); - unsigned &Disc = MemOpDiscriminators[Loc]; - Disc = std::max(Disc, DI->getBaseDiscriminator()); - } - } - - // Keep track of the discriminators seen at each Location. If an instruction's - // DebugInfo has a Location and discriminator we've already seen, replace its - // discriminator with a new one, to guarantee uniqueness. - DenseMap> Seen; - - bool Changed = false; - for (auto &MBB : MF) { - for (auto &MI : MBB) { - if (X86II::getMemoryOperandNo(MI.getDesc().TSFlags) < 0) - continue; - if (BypassPrefetchInstructions && IsPrefetchOpcode(MI.getDesc().Opcode)) - continue; - const DILocation *DI = MI.getDebugLoc(); - bool HasDebug = DI; - if (!HasDebug) { - DI = ReferenceDI; - } - Location L = diToLocation(DI); - DenseSet &Set = Seen[L]; - const std::pair::iterator, bool> TryInsert = - Set.insert(DI->getBaseDiscriminator()); - if (!TryInsert.second || !HasDebug) { - unsigned BF, DF, CI = 0; - DILocation::decodeDiscriminator(DI->getDiscriminator(), BF, DF, CI); - std::optional EncodedDiscriminator = - DILocation::encodeDiscriminator(MemOpDiscriminators[L] + 1, DF, CI); - - if (!EncodedDiscriminator) { - // FIXME(mtrofin): The assumption is that this scenario is infrequent/OK - // not to support. If evidence points otherwise, we can explore synthesizeing - // unique DIs by adding fake line numbers, or by constructing 64 bit - // discriminators. - LLVM_DEBUG(dbgs() << "Unable to create a unique discriminator " - "for instruction with memory operand in: " - << DI->getFilename() << " Line: " << DI->getLine() - << " Column: " << DI->getColumn() - << ". This is likely due to a large macro expansion. \n"); - continue; - } - // Since we were able to encode, bump the MemOpDiscriminators. - ++MemOpDiscriminators[L]; - DI = DI->cloneWithDiscriminator(*EncodedDiscriminator); - assert(DI && "DI should not be nullptr"); - updateDebugInfo(&MI, DI); - Changed = true; - std::pair::iterator, bool> MustInsert = - Set.insert(DI->getBaseDiscriminator()); - (void)MustInsert; // Silence warning in release build. - assert(MustInsert.second && "New discriminator shouldn't be present in set"); - } - - // Bump the reference DI to avoid cramming discriminators on line 0. - // FIXME(mtrofin): pin ReferenceDI on blocks or first instruction with DI - // in a block. 
It's more consistent than just relying on the last memop - // instruction we happened to see. - ReferenceDI = DI; - } - } - return Changed; -} - -FunctionPass *llvm::createX86DiscriminateMemOpsPass() { - return new X86DiscriminateMemOps(); -} diff --git a/llvm/lib/Target/X86/X86InsertPrefetch.cpp b/llvm/lib/Target/X86/X86InsertPrefetch.cpp deleted file mode 100644 index 953b755a0ca4c..0000000000000 --- a/llvm/lib/Target/X86/X86InsertPrefetch.cpp +++ /dev/null @@ -1,259 +0,0 @@ -//===------- X86InsertPrefetch.cpp - Insert cache prefetch hints ----------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This pass applies cache prefetch instructions based on a profile. The pass -// assumes DiscriminateMemOps ran immediately before, to ensure debug info -// matches the one used at profile generation time. The profile is encoded in -// afdo format (text or binary). It contains prefetch hints recommendations. -// Each recommendation is made in terms of debug info locations, a type (i.e. -// nta, t{0|1|2}) and a delta. The debug info identifies an instruction with a -// memory operand (see X86DiscriminateMemOps). The prefetch will be made for -// a location at that memory operand + the delta specified in the -// recommendation. -// -//===----------------------------------------------------------------------===// - -#include "X86.h" -#include "X86Subtarget.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/IR/DebugInfoMetadata.h" -#include "llvm/IR/Module.h" -#include "llvm/ProfileData/SampleProf.h" -#include "llvm/ProfileData/SampleProfReader.h" -#include "llvm/Support/VirtualFileSystem.h" -#include "llvm/Transforms/IPO/SampleProfile.h" -using namespace llvm; -using namespace sampleprof; - -static cl::opt - PrefetchHintsFile("prefetch-hints-file", - cl::desc("Path to the prefetch hints profile. See also " - "-x86-discriminate-memops"), - cl::Hidden); -namespace { - -class X86InsertPrefetch : public MachineFunctionPass { - void getAnalysisUsage(AnalysisUsage &AU) const override; - bool doInitialization(Module &) override; - - bool runOnMachineFunction(MachineFunction &MF) override; - struct PrefetchInfo { - unsigned InstructionID; - int64_t Delta; - }; - typedef SmallVectorImpl Prefetches; - bool findPrefetchInfo(const FunctionSamples *Samples, const MachineInstr &MI, - Prefetches &prefetches) const; - -public: - static char ID; - X86InsertPrefetch(const std::string &PrefetchHintsFilename); - StringRef getPassName() const override { - return "X86 Insert Cache Prefetches"; - } - -private: - std::string Filename; - std::unique_ptr Reader; -}; - -using PrefetchHints = SampleRecord::CallTargetMap; - -// Return any prefetching hints for the specified MachineInstruction. The hints -// are returned as pairs (name, delta). -ErrorOr -getPrefetchHints(const FunctionSamples *TopSamples, const MachineInstr &MI) { - if (const auto &Loc = MI.getDebugLoc()) - if (const auto *Samples = TopSamples->findFunctionSamples(Loc)) - return Samples->findCallTargetMapAt(FunctionSamples::getOffset(Loc), - Loc->getBaseDiscriminator()); - return std::error_code(); -} - -// The prefetch instruction can't take memory operands involving vector -// registers. 
-bool IsMemOpCompatibleWithPrefetch(const MachineInstr &MI, int Op) { - Register BaseReg = MI.getOperand(Op + X86::AddrBaseReg).getReg(); - Register IndexReg = MI.getOperand(Op + X86::AddrIndexReg).getReg(); - return (BaseReg == 0 || - X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) || - X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg)) && - (IndexReg == 0 || - X86MCRegisterClasses[X86::GR64RegClassID].contains(IndexReg) || - X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg)); -} - -} // end anonymous namespace - -//===----------------------------------------------------------------------===// -// Implementation -//===----------------------------------------------------------------------===// - -char X86InsertPrefetch::ID = 0; - -X86InsertPrefetch::X86InsertPrefetch(const std::string &PrefetchHintsFilename) - : MachineFunctionPass(ID), Filename(PrefetchHintsFilename) {} - -/// Return true if the provided MachineInstruction has cache prefetch hints. In -/// that case, the prefetch hints are stored, in order, in the Prefetches -/// vector. -bool X86InsertPrefetch::findPrefetchInfo(const FunctionSamples *TopSamples, - const MachineInstr &MI, - Prefetches &Prefetches) const { - assert(Prefetches.empty() && - "Expected caller passed empty PrefetchInfo vector."); - - // There is no point to match prefetch hints if the profile is using MD5. - if (FunctionSamples::UseMD5) - return false; - - static constexpr std::pair HintTypes[] = { - {"_nta_", X86::PREFETCHNTA}, - {"_t0_", X86::PREFETCHT0}, - {"_t1_", X86::PREFETCHT1}, - {"_t2_", X86::PREFETCHT2}, - }; - static const char *SerializedPrefetchPrefix = "__prefetch"; - - auto T = getPrefetchHints(TopSamples, MI); - if (!T) - return false; - int16_t max_index = -1; - // Convert serialized prefetch hints into PrefetchInfo objects, and populate - // the Prefetches vector. - for (const auto &S_V : *T) { - StringRef Name = S_V.first.stringRef(); - if (Name.consume_front(SerializedPrefetchPrefix)) { - int64_t D = static_cast(S_V.second); - unsigned IID = 0; - for (const auto &HintType : HintTypes) { - if (Name.consume_front(HintType.first)) { - IID = HintType.second; - break; - } - } - if (IID == 0) - return false; - uint8_t index = 0; - Name.consumeInteger(10, index); - - if (index >= Prefetches.size()) - Prefetches.resize(index + 1); - Prefetches[index] = {IID, D}; - max_index = std::max(max_index, static_cast(index)); - } - } - assert(max_index + 1 >= 0 && - "Possible overflow: max_index + 1 should be positive."); - assert(static_cast(max_index + 1) == Prefetches.size() && - "The number of prefetch hints received should match the number of " - "PrefetchInfo objects returned"); - return !Prefetches.empty(); -} - -bool X86InsertPrefetch::doInitialization(Module &M) { - if (Filename.empty()) - return false; - - LLVMContext &Ctx = M.getContext(); - // TODO: Propagate virtual file system into LLVM targets. 
- auto FS = vfs::getRealFileSystem(); - ErrorOr> ReaderOrErr = - SampleProfileReader::create(Filename, Ctx, *FS); - if (std::error_code EC = ReaderOrErr.getError()) { - std::string Msg = "Could not open profile: " + EC.message(); - Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg, - DiagnosticSeverity::DS_Warning)); - return false; - } - Reader = std::move(ReaderOrErr.get()); - Reader->read(); - return true; -} - -void X86InsertPrefetch::getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); -} - -bool X86InsertPrefetch::runOnMachineFunction(MachineFunction &MF) { - if (!Reader) - return false; - const FunctionSamples *Samples = Reader->getSamplesFor(MF.getFunction()); - if (!Samples) - return false; - - bool Changed = false; - - const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); - SmallVector Prefetches; - for (auto &MBB : MF) { - for (auto MI = MBB.instr_begin(); MI != MBB.instr_end();) { - auto Current = MI; - ++MI; - - int Offset = X86II::getMemoryOperandNo(Current->getDesc().TSFlags); - if (Offset < 0) - continue; - unsigned Bias = X86II::getOperandBias(Current->getDesc()); - int MemOpOffset = Offset + Bias; - // FIXME(mtrofin): ORE message when the recommendation cannot be taken. - if (!IsMemOpCompatibleWithPrefetch(*Current, MemOpOffset)) - continue; - Prefetches.clear(); - if (!findPrefetchInfo(Samples, *Current, Prefetches)) - continue; - assert(!Prefetches.empty() && - "The Prefetches vector should contain at least a value if " - "findPrefetchInfo returned true."); - for (auto &PrefInfo : Prefetches) { - unsigned PFetchInstrID = PrefInfo.InstructionID; - int64_t Delta = PrefInfo.Delta; - const MCInstrDesc &Desc = TII->get(PFetchInstrID); - MachineInstr *PFetch = - MF.CreateMachineInstr(Desc, Current->getDebugLoc(), true); - MachineInstrBuilder MIB(MF, PFetch); - - static_assert(X86::AddrBaseReg == 0 && X86::AddrScaleAmt == 1 && - X86::AddrIndexReg == 2 && X86::AddrDisp == 3 && - X86::AddrSegmentReg == 4, - "Unexpected change in X86 operand offset order."); - - // This assumes X86::AddBaseReg = 0, {...}ScaleAmt = 1, etc. - // FIXME(mtrofin): consider adding a: - // MachineInstrBuilder::set(unsigned offset, op). - MIB.addReg(Current->getOperand(MemOpOffset + X86::AddrBaseReg).getReg()) - .addImm( - Current->getOperand(MemOpOffset + X86::AddrScaleAmt).getImm()) - .addReg( - Current->getOperand(MemOpOffset + X86::AddrIndexReg).getReg()) - .addImm(Current->getOperand(MemOpOffset + X86::AddrDisp).getImm() + - Delta) - .addReg(Current->getOperand(MemOpOffset + X86::AddrSegmentReg) - .getReg()); - - if (!Current->memoperands_empty()) { - MachineMemOperand *CurrentOp = *(Current->memoperands_begin()); - MIB.addMemOperand(MF.getMachineMemOperand( - CurrentOp, CurrentOp->getOffset() + Delta, CurrentOp->getSize())); - } - - // Insert before Current. This is because Current may clobber some of - // the registers used to describe the input memory operand. 
- MBB.insert(Current, PFetch); - Changed = true; - } - } - } - return Changed; -} - -FunctionPass *llvm::createX86InsertPrefetchPass() { - return new X86InsertPrefetch(PrefetchHintsFile); -} diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 543220b2fd3b9..713df63479987 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -563,8 +563,6 @@ void X86PassConfig::addPreEmitPass() { addPass(createX86FixupVectorConstants()); } addPass(createX86CompressEVEXPass()); - addPass(createX86DiscriminateMemOpsPass()); - addPass(createX86InsertPrefetchPass()); addPass(createX86InsertX87waitPass()); } diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll index 0fbfb42d2a4dd..78a02b11b17bb 100644 --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -68,8 +68,6 @@ ; CHECK-NEXT: X86 Indirect Branch Tracking ; CHECK-NEXT: X86 vzeroupper inserter ; CHECK-NEXT: Compressing EVEX instrs when possible -; CHECK-NEXT: X86 Discriminate Memory Operands -; CHECK-NEXT: X86 Insert Cache Prefetches ; CHECK-NEXT: X86 insert wait instruction ; CHECK-NEXT: Contiguously Lay Out Funclets ; CHECK-NEXT: Remove Loads Into Fake Uses diff --git a/llvm/test/CodeGen/X86/discriminate-mem-ops-missing-info.ll b/llvm/test/CodeGen/X86/discriminate-mem-ops-missing-info.ll deleted file mode 100644 index 6bbf3eb307da3..0000000000000 --- a/llvm/test/CodeGen/X86/discriminate-mem-ops-missing-info.ll +++ /dev/null @@ -1,55 +0,0 @@ -; RUN: llc -x86-discriminate-memops < %s | FileCheck %s -; -; original source, compiled with -O3 -gmlt -fdebug-info-for-profiling: -; int sum(int* arr, int pos1, int pos2) { -; return arr[pos1] + arr[pos2]; -; } -; -; ModuleID = 'test.cc' -source_filename = "test.cc" -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -declare void @llvm.prefetch(ptr, i32, i32, i32) -; Function Attrs: norecurse nounwind readonly uwtable -define i32 @sum(ptr %arr, i32 %pos1, i32 %pos2) !dbg !7 { -entry: - %idxprom = sext i32 %pos1 to i64 - %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %idxprom - %0 = load i32, ptr %arrayidx, align 4 - %idxprom1 = sext i32 %pos2 to i64 - %arrayidx2 = getelementptr inbounds i32, ptr %arr, i64 %idxprom1 - %1 = load i32, ptr %arrayidx2, align 4 - %add = add nsw i32 %1, %0, !dbg !15 - ret i32 %add -} - -attributes #0 = { "target-cpu"="x86-64" } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true) -!1 = !DIFile(filename: "test.cc", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 7.0.0 (trunk 322155) (llvm/trunk 322159)"} -!7 = distinct !DISubprogram(name: "sum", linkageName: "sum", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0) -!8 = !DISubroutineType(types: !2) -!9 = !DILocation(line: 2, column: 10, scope: !7) -!10 = !{!11, !11, i64 0} -!11 = !{!"int", !12, i64 0} -!12 = !{!"omnipotent char", !13, i64 0} -!13 = !{!"Simple C++ TBAA"} -!15 = !DILocation(line: 2, column: 20, scope: !7) - - -;CHECK-LABEL: sum: -;CHECK: # %bb.0: -;CHECK: .loc 1 1 0 {{.*}} 
discriminator 2 -;CHECK-NEXT: movl (%rdi,%rax,4), %eax -;CHECK-NEXT: .loc 1 2 20 -;CHECK-NEXT: addl (%rdi,%rcx,4), %eax diff --git a/llvm/test/CodeGen/X86/discriminate-mem-ops-skip-pfetch.ll b/llvm/test/CodeGen/X86/discriminate-mem-ops-skip-pfetch.ll deleted file mode 100644 index ca412c590b2e3..0000000000000 --- a/llvm/test/CodeGen/X86/discriminate-mem-ops-skip-pfetch.ll +++ /dev/null @@ -1,68 +0,0 @@ -; RUN: llc -x86-discriminate-memops < %s | FileCheck %s -; RUN: llc -x86-discriminate-memops -x86-bypass-prefetch-instructions=0 < %s | FileCheck %s -check-prefix=NOBYPASS -; -; original source, compiled with -O3 -gmlt -fdebug-info-for-profiling: -; int sum(int* arr, int pos1, int pos2) { -; return arr[pos1] + arr[pos2]; -; } -; -; ModuleID = 'test.cc' -source_filename = "test.cc" -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -declare void @llvm.prefetch(ptr, i32, i32, i32) -; Function Attrs: norecurse nounwind readonly uwtable -define i32 @sum(ptr %arr, i32 %pos1, i32 %pos2) !dbg !7 { -entry: - %idxprom = sext i32 %pos1 to i64, !dbg !9 - %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %idxprom, !dbg !9 - %0 = load i32, ptr %arrayidx, align 4, !dbg !9, !tbaa !10 - %idxprom1 = sext i32 %pos2 to i64, !dbg !14 - %arrayidx2 = getelementptr inbounds i32, ptr %arr, i64 %idxprom1, !dbg !14 - call void @llvm.prefetch(ptr %arrayidx2, i32 0, i32 3, i32 1) - %1 = load i32, ptr %arrayidx2, align 4, !dbg !14, !tbaa !10 - %add = add nsw i32 %1, %0, !dbg !15 - ret i32 %add, !dbg !16 -} - -attributes #0 = { "target-cpu"="x86-64" } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true) -!1 = !DIFile(filename: "test.cc", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 7.0.0 (trunk 322155) (llvm/trunk 322159)"} -!7 = distinct !DISubprogram(name: "sum", linkageName: "sum", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0) -!8 = !DISubroutineType(types: !2) -!9 = !DILocation(line: 2, column: 10, scope: !7) -!10 = !{!11, !11, i64 0} -!11 = !{!"int", !12, i64 0} -!12 = !{!"omnipotent char", !13, i64 0} -!13 = !{!"Simple C++ TBAA"} -!14 = !DILocation(line: 2, column: 22, scope: !7) -!15 = !DILocation(line: 2, column: 20, scope: !7) -!16 = !DILocation(line: 2, column: 3, scope: !7) - -;CHECK-LABEL: sum: -;CHECK: # %bb.0: -;CHECK: prefetcht0 (%rdi,%rax,4) -;CHECK-NEXT: movl (%rdi,%rax,4), %eax -;CHECK-NEXT: .loc 1 2 20 discriminator 2 # test.cc:2:20 -;CHECK-NEXT: addl (%rdi,%rcx,4), %eax -;CHECK-NEXT: .loc 1 2 3 # test.cc:2:3 - -;NOBYPASS-LABEL: sum: -;NOBYPASS: # %bb.0: -;NOBYPASS: prefetcht0 (%rdi,%rax,4) -;NOBYPASS-NEXT: .loc 1 2 22 -;NOBYPASS-NEXT: movl (%rdi,%rax,4), %eax -;NOBYPASS-NEXT: .loc 1 2 20 {{.*}} discriminator 2 # test.cc:2:20 -;NOBYPASS-NEXT: addl (%rdi,%rcx,4), %eax -;NOBYPASS-NEXT: .loc 1 2 3 # test.cc:2:3 diff --git a/llvm/test/CodeGen/X86/discriminate-mem-ops.ll b/llvm/test/CodeGen/X86/discriminate-mem-ops.ll deleted file mode 100644 index a8421d9506a87..0000000000000 --- a/llvm/test/CodeGen/X86/discriminate-mem-ops.ll +++ /dev/null @@ -1,55 +0,0 @@ -; RUN: llc -x86-discriminate-memops < %s | FileCheck %s -; 
-; original source, compiled with -O3 -gmlt -fdebug-info-for-profiling: -; int sum(int* arr, int pos1, int pos2) { -; return arr[pos1] + arr[pos2]; -; } -; -; ModuleID = 'test.cc' -source_filename = "test.cc" -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; Function Attrs: norecurse nounwind readonly uwtable -define i32 @sum(ptr %arr, i32 %pos1, i32 %pos2) !dbg !7 { -entry: - %idxprom = sext i32 %pos1 to i64, !dbg !9 - %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %idxprom, !dbg !9 - %0 = load i32, ptr %arrayidx, align 4, !dbg !9, !tbaa !10 - %idxprom1 = sext i32 %pos2 to i64, !dbg !14 - %arrayidx2 = getelementptr inbounds i32, ptr %arr, i64 %idxprom1, !dbg !14 - %1 = load i32, ptr %arrayidx2, align 4, !dbg !14, !tbaa !10 - %add = add nsw i32 %1, %0, !dbg !15 - ret i32 %add, !dbg !16 -} - -attributes #0 = { "target-cpu"="x86-64" } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true) -!1 = !DIFile(filename: "test.cc", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 7.0.0 (trunk 322155) (llvm/trunk 322159)"} -!7 = distinct !DISubprogram(name: "sum", linkageName: "sum", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0) -!8 = !DISubroutineType(types: !2) -!9 = !DILocation(line: 2, column: 10, scope: !7) -!10 = !{!11, !11, i64 0} -!11 = !{!"int", !12, i64 0} -!12 = !{!"omnipotent char", !13, i64 0} -!13 = !{!"Simple C++ TBAA"} -!14 = !DILocation(line: 2, column: 22, scope: !7) -!15 = !DILocation(line: 2, column: 20, scope: !7) -!16 = !DILocation(line: 2, column: 3, scope: !7) - -;CHECK-LABEL: sum: -;CHECK: # %bb.0: -;CHECK: movl (%rdi,%rax,4), %eax -;CHECK-NEXT: .loc 1 2 20 discriminator 2 # test.cc:2:20 -;CHECK-NEXT: addl (%rdi,%rcx,4), %eax -;CHECK-NEXT: .loc 1 2 3 # test.cc:2:3 diff --git a/llvm/test/CodeGen/X86/insert-prefetch-inline.afdo b/llvm/test/CodeGen/X86/insert-prefetch-inline.afdo deleted file mode 100644 index 935b707ff1072..0000000000000 --- a/llvm/test/CodeGen/X86/insert-prefetch-inline.afdo +++ /dev/null @@ -1,4 +0,0 @@ -caller:0:0 - 2: sum:0 - 3: 0 __prefetch_nta_0:23456 - 3.1: 0 __prefetch_nta_0:8764 __prefetch_nta_1:64 \ No newline at end of file diff --git a/llvm/test/CodeGen/X86/insert-prefetch-inline.ll b/llvm/test/CodeGen/X86/insert-prefetch-inline.ll deleted file mode 100644 index 05f542799c08b..0000000000000 --- a/llvm/test/CodeGen/X86/insert-prefetch-inline.ll +++ /dev/null @@ -1,76 +0,0 @@ -; RUN: llc < %s -x86-discriminate-memops -prefetch-hints-file=%S/insert-prefetch-inline.afdo | FileCheck %s -; -; Verify we can insert prefetch instructions in code belonging to inlined -; functions. 
-; -; ModuleID = 'test.cc' - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; Function Attrs: norecurse nounwind readonly uwtable -define dso_local i32 @sum(ptr nocapture readonly %arr, i32 %pos1, i32 %pos2) local_unnamed_addr #0 !dbg !7 { -entry: - %idxprom = sext i32 %pos1 to i64, !dbg !10 - %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %idxprom, !dbg !10 - %0 = load i32, ptr %arrayidx, align 4, !dbg !10, !tbaa !11 - %idxprom1 = sext i32 %pos2 to i64, !dbg !15 - %arrayidx2 = getelementptr inbounds i32, ptr %arr, i64 %idxprom1, !dbg !15 - %1 = load i32, ptr %arrayidx2, align 4, !dbg !15, !tbaa !11 - %add = add nsw i32 %1, %0, !dbg !16 - ret i32 %add, !dbg !17 -} - -; "caller" inlines "sum". The associated .afdo file references instructions -; in "caller" that came from "sum"'s inlining. -; -; Function Attrs: norecurse nounwind readonly uwtable -define dso_local i32 @caller(ptr nocapture readonly %arr) local_unnamed_addr #0 !dbg !18 { -entry: - %0 = load i32, ptr %arr, align 4, !dbg !19, !tbaa !11 - %arrayidx2.i = getelementptr inbounds i32, ptr %arr, i64 2, !dbg !21 - %1 = load i32, ptr %arrayidx2.i, align 4, !dbg !21, !tbaa !11 - %add.i = add nsw i32 %1, %0, !dbg !22 - ret i32 %add.i, !dbg !23 -} - -attributes #0 = { "target-cpu"="x86-64" } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 7.0.0 (trunk 324940) (llvm/trunk 324941)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true) -!1 = !DIFile(filename: "test.cc", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 7.0.0 (trunk 324940) (llvm/trunk 324941)"} -!7 = distinct !DISubprogram(name: "sum", linkageName: "sum", scope: !8, file: !8, line: 3, type: !9, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0) -!8 = !DIFile(filename: "./test.h", directory: "/tmp") -!9 = !DISubroutineType(types: !2) -!10 = !DILocation(line: 6, column: 10, scope: !7) -!11 = !{!12, !12, i64 0} -!12 = !{!"int", !13, i64 0} -!13 = !{!"omnipotent char", !14, i64 0} -!14 = !{!"Simple C++ TBAA"} -!15 = !DILocation(line: 6, column: 22, scope: !7) -!16 = !DILocation(line: 6, column: 20, scope: !7) -!17 = !DILocation(line: 6, column: 3, scope: !7) -!18 = distinct !DISubprogram(name: "caller", linkageName: "caller", scope: !1, file: !1, line: 4, type: !9, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: true, unit: !0) -!19 = !DILocation(line: 6, column: 10, scope: !7, inlinedAt: !20) -!20 = distinct !DILocation(line: 6, column: 10, scope: !18) -!21 = !DILocation(line: 6, column: 22, scope: !7, inlinedAt: !20) -!22 = !DILocation(line: 6, column: 20, scope: !7, inlinedAt: !20) -!23 = !DILocation(line: 6, column: 3, scope: !18) - -; CHECK-LABEL: caller: -; CHECK-LABEL: # %bb.0: -; CHECK-NEXT: .loc 1 6 22 prologue_end -; CHECK-NEXT: prefetchnta 23464(%rdi) -; CHECK-NEXT: movl 8(%rdi), %eax -; CHECK-NEXT: .loc 1 6 20 is_stmt 0 discriminator 2 -; CHECK-NEXT: prefetchnta 8764(%rdi) -; CHECK-NEXT: prefetchnta 64(%rdi) -; CHECK-NEXT: addl (%rdi), %eax diff --git a/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.afdo b/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.afdo deleted file mode 100644 index 
6385a498b8f92..0000000000000 --- a/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.afdo +++ /dev/null @@ -1,2 +0,0 @@ -main:0:0 - 6: 0 __prefetch_nta_0:42 \ No newline at end of file diff --git a/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll b/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll deleted file mode 100644 index f8e25028cfdee..0000000000000 --- a/llvm/test/CodeGen/X86/insert-prefetch-invalid-instr.ll +++ /dev/null @@ -1,41 +0,0 @@ -; RUN: llc < %s -x86-discriminate-memops -prefetch-hints-file=%S/insert-prefetch-invalid-instr.afdo | FileCheck %s -; ModuleID = 'prefetch.cc' -source_filename = "prefetch.cc" -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -; Function Attrs: norecurse nounwind uwtable -define dso_local i32 @main() local_unnamed_addr #0 !dbg !7 { -entry: - tail call void @llvm.prefetch(ptr inttoptr (i64 291 to ptr), i32 0, i32 0, i32 1), !dbg !9 - ret i32 291, !dbg !11 -} - -; Function Attrs: inaccessiblemem_or_argmemonly nounwind -declare void @llvm.prefetch(ptr nocapture readonly, i32, i32, i32) #1 - -attributes #0 = {"target-cpu"="x86-64" "target-features"="+sse4.2,+ssse3"} -attributes #1 = { inaccessiblemem_or_argmemonly nounwind } -attributes #2 = { argmemonly nounwind } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true) -!1 = !DIFile(filename: "prefetch.cc", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 7.0.0 (trunk 327078) (llvm/trunk 327086)"} -!7 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 8, type: !8, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !0) -!8 = !DISubroutineType(types: !2) -!9 = !DILocation(line: 12, column: 3, scope: !7) -!10 = !DILocation(line: 14, column: 3, scope: !7) -!11 = !DILocation(line: 15, column: 3, scope: !7) - -;CHECK-LABEL: main: -;CHECK: # %bb.0: -;CHECK: prefetchnta 291 -;CHECK-NOT: prefetchnta 42(%rax,%ymm0) diff --git a/llvm/test/CodeGen/X86/insert-prefetch-other.afdo b/llvm/test/CodeGen/X86/insert-prefetch-other.afdo deleted file mode 100644 index 783da34f7f84c..0000000000000 --- a/llvm/test/CodeGen/X86/insert-prefetch-other.afdo +++ /dev/null @@ -1,3 +0,0 @@ -sum:0:0 - 1: 0 __prefetch_t0_1:0 __prefetch_t2_0:42 - 1.1: 0 __prefetch_t1_0:18446744073709551615 diff --git a/llvm/test/CodeGen/X86/insert-prefetch.afdo b/llvm/test/CodeGen/X86/insert-prefetch.afdo deleted file mode 100644 index 96487e85eaaf2..0000000000000 --- a/llvm/test/CodeGen/X86/insert-prefetch.afdo +++ /dev/null @@ -1,3 +0,0 @@ -sum:0:0 - 1: 0 __prefetch_nta_1:0 __prefetch_nta_0:42 - 1.1: 0 __prefetch_nta_0:18446744073709551615 diff --git a/llvm/test/CodeGen/X86/insert-prefetch.ll b/llvm/test/CodeGen/X86/insert-prefetch.ll deleted file mode 100644 index 971a6193862d0..0000000000000 --- a/llvm/test/CodeGen/X86/insert-prefetch.ll +++ /dev/null @@ -1,101 +0,0 @@ -; RUN: llc < %s -x86-discriminate-memops -prefetch-hints-file=%S/insert-prefetch.afdo | FileCheck %s -; RUN: llc < %s -x86-discriminate-memops -prefetch-hints-file=%S/insert-prefetch-other.afdo | FileCheck %s -check-prefix=OTHERS -; -; original source, compiled with -O3 -gmlt -fdebug-info-for-profiling: -; int sum(int* arr, 
int pos1, int pos2) { -; return arr[pos1] + arr[pos2]; -; } -; -; NOTE: debug line numbers were adjusted such that the function would start -; at line 15 (an arbitrary number). The sample profile file format uses -; offsets from the start of the symbol instead of file-relative line numbers. -; The .afdo file reflects that - the instructions are offset '1'. -; -; ModuleID = 'test.cc' -source_filename = "test.cc" -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -define i32 @sum(ptr %arr, i32 %pos1, i32 %pos2) !dbg !35 !prof !37 { -entry: - %idxprom = sext i32 %pos1 to i64, !dbg !38 - %arrayidx = getelementptr inbounds i32, ptr %arr, i64 %idxprom, !dbg !38 - %0 = load i32, ptr %arrayidx, align 4, !dbg !38, !tbaa !39 - %idxprom1 = sext i32 %pos2 to i64, !dbg !43 - %arrayidx2 = getelementptr inbounds i32, ptr %arr, i64 %idxprom1, !dbg !43 - %1 = load i32, ptr %arrayidx2, align 4, !dbg !43, !tbaa !39 - %add = add nsw i32 %1, %0, !dbg !44 - ret i32 %add, !dbg !45 -} - -attributes #0 = { "target-cpu"="x86-64" } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5, !6} -!llvm.ident = !{!33} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, debugInfoForProfiling: true) -!1 = !DIFile(filename: "test.cc", directory: "/tmp") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{i32 1, !"ProfileSummary", !7} -!7 = !{!8, !9, !10, !11, !12, !13, !14, !15} -!8 = !{!"ProfileFormat", !"SampleProfile"} -!9 = !{!"TotalCount", i64 0} -!10 = !{!"MaxCount", i64 0} -!11 = !{!"MaxInternalCount", i64 0} -!12 = !{!"MaxFunctionCount", i64 0} -!13 = !{!"NumCounts", i64 2} -!14 = !{!"NumFunctions", i64 1} -!15 = !{!"DetailedSummary", !16} -!16 = !{!17, !18, !19, !20, !21, !22, !22, !23, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32} -!17 = !{i32 10000, i64 0, i32 0} -!18 = !{i32 100000, i64 0, i32 0} -!19 = !{i32 200000, i64 0, i32 0} -!20 = !{i32 300000, i64 0, i32 0} -!21 = !{i32 400000, i64 0, i32 0} -!22 = !{i32 500000, i64 0, i32 0} -!23 = !{i32 600000, i64 0, i32 0} -!24 = !{i32 700000, i64 0, i32 0} -!25 = !{i32 800000, i64 0, i32 0} -!26 = !{i32 900000, i64 0, i32 0} -!27 = !{i32 950000, i64 0, i32 0} -!28 = !{i32 990000, i64 0, i32 0} -!29 = !{i32 999000, i64 0, i32 0} -!30 = !{i32 999900, i64 0, i32 0} -!31 = !{i32 999990, i64 0, i32 0} -!32 = !{i32 999999, i64 0, i32 0} -!33 = !{!"clang version 7.0.0 (trunk 322593) (llvm/trunk 322526)"} -!35 = distinct !DISubprogram(name: "sum", linkageName: "sum", scope: !1, file: !1, line: 15, type: !36, isLocal: false, isDefinition: true, scopeLine: 15, flags: DIFlagPrototyped, isOptimized: true, unit: !0) -!36 = !DISubroutineType(types: !2) -!37 = !{!"function_entry_count", i64 -1} -!38 = !DILocation(line: 16, column: 10, scope: !35) -!39 = !{!40, !40, i64 0} -!40 = !{!"int", !41, i64 0} -!41 = !{!"omnipotent char", !42, i64 0} -!42 = !{!"Simple C++ TBAA"} -!43 = !DILocation(line: 16, column: 22, scope: !35) -!44 = !DILocation(line: 16, column: 20, scope: !35) -!45 = !DILocation(line: 16, column: 3, scope: !35) - -;CHECK-LABEL: sum: -;CHECK: # %bb.0: -;CHECK: prefetchnta 42(%rdi,%rax,4) -;CHECK-NEXT: prefetchnta (%rdi,%rax,4) -;CHECK-NEXT: movl (%rdi,%rax,4), %eax -;CHECK-NEXT: .loc 1 16 20 discriminator 2 # test.cc:16:20 -;CHECK-NEXT: prefetchnta -1(%rdi,%rcx,4) -;CHECK-NEXT: addl (%rdi,%rcx,4), %eax -;CHECK-NEXT: .loc 1 
16 3 # test.cc:16:3 - -;OTHERS-LABEL: sum: -;OTHERS: # %bb.0: -;OTHERS: prefetcht2 42(%rdi,%rax,4) -;OTHERS-NEXT: prefetcht0 (%rdi,%rax,4) -;OTHERS-NEXT: movl (%rdi,%rax,4), %eax -;OTHERS-NEXT: .loc 1 16 20 discriminator 2 # test.cc:16:20 -;OTHERS-NEXT: prefetcht1 -1(%rdi,%rcx,4) -;OTHERS-NEXT: addl (%rdi,%rcx,4), %eax -;OTHERS-NEXT: .loc 1 16 3 # test.cc:16:3 diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll index 81390e59d0d0a..276232e27c000 100644 --- a/llvm/test/CodeGen/X86/opt-pipeline.ll +++ b/llvm/test/CodeGen/X86/opt-pipeline.ll @@ -208,8 +208,6 @@ ; CHECK-NEXT: X86 Fixup Inst Tuning ; CHECK-NEXT: X86 Fixup Vector Constants ; CHECK-NEXT: Compressing EVEX instrs when possible -; CHECK-NEXT: X86 Discriminate Memory Operands -; CHECK-NEXT: X86 Insert Cache Prefetches ; CHECK-NEXT: X86 insert wait instruction ; CHECK-NEXT: Contiguously Lay Out Funclets ; CHECK-NEXT: Remove Loads Into Fake Uses From 472e4ab0b02d3dec001f885beb535c9d727d1ea2 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Mon, 17 Nov 2025 10:07:48 -0800 Subject: [PATCH 25/67] [MLGO] Fully Remove MLRegalloc Experimental Features (#168252) 20a22a45e96bc94c3a8295cccc9031bd87552725 was supposed to fully remove these, but left around the functionality to actually compute them and a unittest that ensured they worked. These are not development features in the sense of features used in development mode, but experimental features that have been superseded by MIR2Vec. --- llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp | 137 -------- llvm/unittests/CodeGen/CMakeLists.txt | 1 - .../CodeGen/MLRegAllocDevelopmentFeatures.cpp | 293 ------------------ 3 files changed, 431 deletions(-) delete mode 100644 llvm/unittests/CodeGen/MLRegAllocDevelopmentFeatures.cpp diff --git a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp index 32b6c46303828..34531dd7ab17f 100644 --- a/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp +++ b/llvm/lib/CodeGen/MLRegAllocEvictAdvisor.cpp @@ -133,10 +133,6 @@ INITIALIZE_PASS(RegAllocScoring, "regallocscoringpass", // Common ML Advisor declarations // =================================== namespace { -// The model can only accept a specified number of opcodes and will error it if -// fed an opcode it hasn't seen before. This constant sets the current cutoff. -static const int OpcodeValueCutoff = 17716; - // Most features are as described above, so we'll reuse this vector in defining // them. static const std::vector PerLiveRangeShape{1, NumberOfInterferences}; @@ -948,139 +944,6 @@ void MLEvictAdvisor::extractFeatures( #undef SET } -void llvm::extractInstructionFeatures( - SmallVectorImpl &LRPosInfo, MLModelRunner *RegallocRunner, - function_ref GetOpcode, - function_ref GetMBBFreq, - function_ref GetMBBReference, - const int InstructionsIndex, const int InstructionsMappingIndex, - const int MBBFreqIndex, const int MBBMappingIndex, - const SlotIndex LastIndex) { - // This function extracts instruction based features relevant to the eviction - // problem currently being solved. This function ends up extracting two - // tensors. - // 1 - A vector of size max instruction count. It contains the opcodes of the - // instructions spanned by all the intervals in the current instance of the - // eviction problem. - // 2 - A binary mapping matrix of size (LR count * max - // instruction count) which maps where the LRs are live to the actual opcodes - // for which they are live. 
- // 3 - A vector of size max supported MBB count storing MBB frequencies, - // encompassing all of the MBBs covered by the eviction problem. - // 4 - A vector of size max instruction count of indices to members of the MBB - // frequency vector, mapping each instruction to its associated MBB. - - // Start off by sorting the segments based on the beginning slot index. - std::sort( - LRPosInfo.begin(), LRPosInfo.end(), - [](LRStartEndInfo A, LRStartEndInfo B) { return A.Begin < B.Begin; }); - size_t InstructionIndex = 0; - size_t CurrentSegmentIndex = 0; - SlotIndex CurrentIndex = LRPosInfo[0].Begin; - std::map VisitedMBBs; - size_t CurrentMBBIndex = 0; - // This loop processes all the segments sequentially by starting at the - // beginning slot index of the first segment, iterating through all the slot - // indices before the end slot index of that segment (while checking for - // overlaps with segments that start at greater slot indices). After hitting - // that end index, the current segment being processed gets bumped until they - // are all processed or the max instruction count is hit, where everything is - // just truncated. - while (true) { - // If the index that we are currently at is within the current segment and - // we haven't hit the max instruction count, continue processing the current - // segment. - while (CurrentIndex <= LRPosInfo[CurrentSegmentIndex].End && - InstructionIndex < ModelMaxSupportedInstructionCount) { - int CurrentOpcode = GetOpcode(CurrentIndex); - // If the current machine instruction is null, skip it - if (CurrentOpcode == -1) { - // If we're currently at the last index in the SlotIndex analysis, - // we can't go any further, so return from the function - if (CurrentIndex >= LastIndex) { - return; - } - CurrentIndex = CurrentIndex.getNextIndex(); - continue; - } - MachineBasicBlock *CurrentMBBReference = GetMBBReference(CurrentIndex); - if (VisitedMBBs.count(CurrentMBBReference) == 0) { - VisitedMBBs[CurrentMBBReference] = CurrentMBBIndex; - ++CurrentMBBIndex; - } - extractMBBFrequency(CurrentIndex, InstructionIndex, VisitedMBBs, - GetMBBFreq, CurrentMBBReference, RegallocRunner, - MBBFreqIndex, MBBMappingIndex); - // Current code assumes we're not going to get any disjointed segments - assert(LRPosInfo[CurrentSegmentIndex].Begin <= CurrentIndex); - RegallocRunner->getTensor(InstructionsIndex)[InstructionIndex] = - CurrentOpcode < OpcodeValueCutoff ? CurrentOpcode : 0; - // set value in the binary mapping matrix for the current instruction - auto CurrentSegmentPosition = LRPosInfo[CurrentSegmentIndex].Pos; - RegallocRunner->getTensor( - InstructionsMappingIndex)[CurrentSegmentPosition * - ModelMaxSupportedInstructionCount + - InstructionIndex] = 1; - // All of the segments are sorted based on the beginning slot index, but - // this doesn't mean that the beginning slot index of the next segment is - // after the end segment of the one being currently processed. This while - // loop checks for overlapping segments and modifies the portion of the - // column in the mapping matrix for the currently processed instruction - // for the LR it is checking. Also make sure that the beginning of the - // current segment we're checking for overlap in is less than the current - // index, otherwise we're done checking overlaps. 
- size_t OverlapCheckCurrentSegment = CurrentSegmentIndex + 1; - while (OverlapCheckCurrentSegment < LRPosInfo.size() && - LRPosInfo[OverlapCheckCurrentSegment].Begin <= CurrentIndex) { - auto OverlapCurrentSegmentPosition = - LRPosInfo[OverlapCheckCurrentSegment].Pos; - if (LRPosInfo[OverlapCheckCurrentSegment].End >= CurrentIndex) { - RegallocRunner->getTensor( - InstructionsMappingIndex)[OverlapCurrentSegmentPosition * - ModelMaxSupportedInstructionCount + - InstructionIndex] = 1; - } - ++OverlapCheckCurrentSegment; - } - ++InstructionIndex; - if (CurrentIndex >= LastIndex) { - return; - } - CurrentIndex = CurrentIndex.getNextIndex(); - } - // if we've just finished processing through the last segment or if we've - // hit the maximum number of instructions, break out of the loop. - if (CurrentSegmentIndex == LRPosInfo.size() - 1 || - InstructionIndex >= ModelMaxSupportedInstructionCount) { - break; - } - // If the segments are not overlapping, we need to move to the beginning - // index of the next segment to avoid having instructions not attached to - // any register. - if (LRPosInfo[CurrentSegmentIndex + 1].Begin > - LRPosInfo[CurrentSegmentIndex].End) { - CurrentIndex = LRPosInfo[CurrentSegmentIndex + 1].Begin; - } - ++CurrentSegmentIndex; - } -} - -void llvm::extractMBBFrequency( - const SlotIndex CurrentIndex, const size_t CurrentInstructionIndex, - std::map &VisitedMBBs, - function_ref GetMBBFreq, - MachineBasicBlock *CurrentMBBReference, MLModelRunner *RegallocRunner, - const int MBBFreqIndex, const int MBBMappingIndex) { - size_t CurrentMBBIndex = VisitedMBBs[CurrentMBBReference]; - float CurrentMBBFreq = GetMBBFreq(CurrentIndex); - if (CurrentMBBIndex < ModelMaxSupportedMBBCount) { - RegallocRunner->getTensor(MBBFreqIndex)[CurrentMBBIndex] = - CurrentMBBFreq; - RegallocRunner->getTensor( - MBBMappingIndex)[CurrentInstructionIndex] = CurrentMBBIndex; - } -} - // Development mode-specific implementations #ifdef LLVM_HAVE_TFLITE diff --git a/llvm/unittests/CodeGen/CMakeLists.txt b/llvm/unittests/CodeGen/CMakeLists.txt index 4d07462babefa..80d10138d7bfe 100644 --- a/llvm/unittests/CodeGen/CMakeLists.txt +++ b/llvm/unittests/CodeGen/CMakeLists.txt @@ -49,7 +49,6 @@ add_llvm_unittest(CodeGenTests TypeTraitsTest.cpp TargetOptionsTest.cpp TestAsmPrinter.cpp - MLRegAllocDevelopmentFeatures.cpp X86MCInstLowerTest.cpp ) diff --git a/llvm/unittests/CodeGen/MLRegAllocDevelopmentFeatures.cpp b/llvm/unittests/CodeGen/MLRegAllocDevelopmentFeatures.cpp deleted file mode 100644 index 00c2c3abf8533..0000000000000 --- a/llvm/unittests/CodeGen/MLRegAllocDevelopmentFeatures.cpp +++ /dev/null @@ -1,293 +0,0 @@ -//===- MLRegAllocDevelopmentFeatures.cpp - test dev MLRegAlloc features ---===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "../../lib/CodeGen/MLRegAllocEvictAdvisor.h" -#include "llvm/Analysis/NoInferenceModelRunner.h" -#include "llvm/CodeGen/CodeGenTargetMachineImpl.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/SlotIndexes.h" -#include "llvm/CodeGen/TargetFrameLowering.h" -#include "llvm/CodeGen/TargetInstrInfo.h" -#include "llvm/CodeGen/TargetLowering.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/MC/TargetRegistry.h" -#include "llvm/Support/Allocator.h" -#include "llvm/Support/CodeGen.h" -#include "llvm/Support/TargetSelect.h" -#include "llvm/Target/TargetOptions.h" -#include "llvm/TargetParser/Triple.h" -#include "gmock/gmock.h" -#include "gtest/gtest.h" - -#include -#include - -using namespace llvm; -using testing::ContainerEq; -using testing::Test; - -namespace { - -#include "MFCommon.inc" - -struct LRPosInfoIndexes { - size_t StartIndex; - size_t EndIndex; - size_t PhysReg; -}; - -class RegAllocDevelopmentFeaturesTest : public ::Test { -protected: - SmallVector - setupOverlapProblem(const SmallVectorImpl &Segments, - simple_ilist &IndexList) { - SmallVector PositionsToReturn; - PositionsToReturn.reserve(Segments.size()); - for (auto CurrentPosIndexInfo : Segments) { - LRStartEndInfo CurrentPosInfo = {}; - CurrentPosInfo.Pos = CurrentPosIndexInfo.PhysReg; - PositionsToReturn.push_back(CurrentPosInfo); - } - size_t CurrentSegmentIndex = 0; - size_t CurrentIndex = 0; - while (CurrentSegmentIndex < Segments.size()) { - auto *CurrentLEMem = static_cast( - Allocator.Allocate(sizeof(IndexListEntry), alignof(IndexListEntry))); - auto *CurrentListEntry = - new (CurrentLEMem) IndexListEntry(nullptr, CurrentIndex); - IndexList.push_back(*CurrentListEntry); - for (size_t CurrentPosInfoIndex = 0; - CurrentPosInfoIndex < Segments.size(); ++CurrentPosInfoIndex) { - if ((CurrentIndex / SlotIndex::InstrDist) == - Segments[CurrentPosInfoIndex].StartIndex) { - PositionsToReturn[CurrentPosInfoIndex].Begin = - SlotIndex(CurrentListEntry, 0); - } else if ((CurrentIndex / SlotIndex::InstrDist) == - Segments[CurrentPosInfoIndex].EndIndex) { - PositionsToReturn[CurrentPosInfoIndex].End = - SlotIndex(CurrentListEntry, 0); - ++CurrentSegmentIndex; - } - } - CurrentIndex += SlotIndex::InstrDist; - } - return PositionsToReturn; - } - - NoInferenceModelRunner setupModelRunner() { - const std::vector Inputs{ - TensorSpec::createSpec("instructions", InstructionsShape), - TensorSpec::createSpec("instructions_mapping", - InstructionsMappingShape), - TensorSpec::createSpec("mbb_frequencies", MBBFrequencyShape), - TensorSpec::createSpec("mbb_mapping", InstructionsShape)}; - LLVMContext Ctx; - return NoInferenceModelRunner(Ctx, Inputs); - } - - std::vector - getExpectedMappingMatrix(SmallVectorImpl &OverlapSetup) { - std::vector ExpectedMappingMatrix( - NumberOfInterferences * ModelMaxSupportedInstructionCount, 0); - for (auto NewSegment : OverlapSetup) { - for (size_t CurrentIndex = NewSegment.StartIndex; - CurrentIndex <= NewSegment.EndIndex; ++CurrentIndex) { - ExpectedMappingMatrix[NewSegment.PhysReg * - ModelMaxSupportedInstructionCount + - CurrentIndex] = 1; - } - } - return ExpectedMappingMatrix; - } - - void runOverlapTest(SmallVectorImpl &OverlapSetup) { - simple_ilist IndexList; - auto OverlapProblem = 
setupOverlapProblem(OverlapSetup, IndexList); - NoInferenceModelRunner ModelRunner = setupModelRunner(); - size_t MaxIndex = 0; - for (size_t CurrentOverlap = 0; CurrentOverlap < OverlapSetup.size(); - ++CurrentOverlap) { - if (OverlapSetup[CurrentOverlap].EndIndex > - OverlapSetup[MaxIndex].EndIndex) { - MaxIndex = CurrentOverlap; - } - } - SlotIndex LastIndex = OverlapProblem[MaxIndex].End; - extractInstructionFeatures( - OverlapProblem, &ModelRunner, - [](SlotIndex InputSlot) -> int { return 0; }, - [](SlotIndex InputSlot) -> float { return 0.0f; }, - [](SlotIndex InputSlot) -> MachineBasicBlock * { return nullptr; }, 0, - 1, 2, 3, LastIndex); - std::vector MappingMatrix( - ModelRunner.getTensor(1), - ModelRunner.getTensor(1) + - NumberOfInterferences * ModelMaxSupportedInstructionCount); - ASSERT_THAT(MappingMatrix, - ContainerEq(getExpectedMappingMatrix(OverlapSetup))); - IndexList.clear(); - } - - BumpPtrAllocator Allocator; -}; - -// meta tests to ensure that test setup works correctly - -TEST_F(RegAllocDevelopmentFeaturesTest, - MetaOverlapInstructionDistancesAreCorrect) { - SmallVector OverlapSetup; - OverlapSetup.push_back({0, 5, 0}); - OverlapSetup.push_back({5, 10, 0}); - simple_ilist IndexList; - auto OverlapProblem = setupOverlapProblem(OverlapSetup, IndexList); - ASSERT_EQ(OverlapProblem[0].End.distance(OverlapProblem[1].End), - 5 * SlotIndex::InstrDist); - ASSERT_EQ(OverlapProblem[0].End.distance(OverlapProblem[1].Begin), 0); -} - -TEST_F(RegAllocDevelopmentFeaturesTest, MetaSlotIndicesAreValid) { - SmallVector OverlapSetup; - OverlapSetup.push_back({0, 10, 0}); - simple_ilist IndexList; - auto OverlapProblem = setupOverlapProblem(OverlapSetup, IndexList); - ASSERT_TRUE(OverlapProblem[0].Begin.isValid()); - ASSERT_TRUE(OverlapProblem[0].End.isValid()); -} - -// Testing of feature extraction for per-instruction features - -TEST_F(RegAllocDevelopmentFeaturesTest, InstructionOpcodesAreCorrect) { - SmallVector OverlapSetup; - OverlapSetup.push_back({0, ModelMaxSupportedInstructionCount - 1, 0}); - simple_ilist IndexList; - auto OverlapProblem = setupOverlapProblem(OverlapSetup, IndexList); - NoInferenceModelRunner ModelRunner = setupModelRunner(); - SlotIndex LastIndex = OverlapProblem[0].End; - SlotIndex FirstIndex = OverlapProblem[0].Begin; - extractInstructionFeatures( - OverlapProblem, &ModelRunner, - [FirstIndex](SlotIndex InputSlot) -> int { - return FirstIndex.distance(InputSlot) / SlotIndex::InstrDist; - }, - [](SlotIndex InputSlot) -> float { return 0.0f; }, - [](SlotIndex InputSlot) -> MachineBasicBlock * { return nullptr; }, 0, 1, - 2, 3, LastIndex); - for (size_t CurrentInstructionIndex = 0; - CurrentInstructionIndex < ModelMaxSupportedInstructionCount; - ++CurrentInstructionIndex) { - ASSERT_EQ( - (size_t)ModelRunner.getTensor(0)[CurrentInstructionIndex], - CurrentInstructionIndex); - } -} - -TEST_F(RegAllocDevelopmentFeaturesTest, FullOverlap) { - SmallVector OverlapSetup; - OverlapSetup.push_back({0, ModelMaxSupportedInstructionCount - 1, 0}); - OverlapSetup.push_back({0, ModelMaxSupportedInstructionCount - 1, 1}); - runOverlapTest(OverlapSetup); -} - -TEST_F(RegAllocDevelopmentFeaturesTest, PartialOverlap) { - SmallVector OverlapSetup; - OverlapSetup.push_back({0, 20, 0}); - OverlapSetup.push_back({15, 30, 1}); - runOverlapTest(OverlapSetup); -} - -TEST_F(RegAllocDevelopmentFeaturesTest, PartialOverlapOpposite) { - SmallVector OverlapSetup; - OverlapSetup.push_back({15, 30, 1}); - OverlapSetup.push_back({0, 20, 0}); - runOverlapTest(OverlapSetup); -} - 
-TEST_F(RegAllocDevelopmentFeaturesTest, InternalOverlap) { - SmallVector OverlapSetup; - OverlapSetup.push_back({0, 30, 0}); - OverlapSetup.push_back({10, 20, 1}); - runOverlapTest(OverlapSetup); -} - -TEST_F(RegAllocDevelopmentFeaturesTest, TripleInternalOverlap) { - SmallVector OverlapSetup; - OverlapSetup.push_back({0, 30, 0}); - OverlapSetup.push_back({10, 25, 1}); - OverlapSetup.push_back({15, 20, 2}); - runOverlapTest(OverlapSetup); -} - -TEST_F(RegAllocDevelopmentFeaturesTest, InternalMultiOverlap) { - SmallVector OverlapSetup; - OverlapSetup.push_back({0, 45, 0}); - OverlapSetup.push_back({30, 40, 1}); - OverlapSetup.push_back({35, 60, 2}); - runOverlapTest(OverlapSetup); -} - -TEST_F(RegAllocDevelopmentFeaturesTest, SingleMBBTest) { - NoInferenceModelRunner ModelRunner = setupModelRunner(); - SlotIndex CurrentIndex; - // set index to 1 so we can ensure that the mapping actually get set - std::map VisitedMBBs = {{nullptr, 1}}; - extractMBBFrequency( - CurrentIndex, 0, VisitedMBBs, - [](SlotIndex InputSlot) -> float { return 1.0f; }, nullptr, &ModelRunner, - 2, 3); - ASSERT_FLOAT_EQ(ModelRunner.getTensor(2)[1], 1.0f); - ASSERT_EQ(ModelRunner.getTensor(3)[0], 1); -} - -TEST_F(RegAllocDevelopmentFeaturesTest, MBBFullTruncated) { - SmallVector OverlapSetup; - OverlapSetup.push_back({0, ModelMaxSupportedInstructionCount - 1, 0}); - simple_ilist IndexList; - auto OverlapProblem = setupOverlapProblem(OverlapSetup, IndexList); - NoInferenceModelRunner ModelRunner = setupModelRunner(); - SlotIndex LastIndex = OverlapProblem[0].End; - SlotIndex FirstIndex = OverlapProblem[0].Begin; - - LLVMContext Ctx; - Module Mod("Module", Ctx); - auto MF = createMachineFunction(Ctx, Mod); - std::array - MBBsForTest; - for (size_t I = 0; I < ModelMaxSupportedInstructionCount; ++I) { - MBBsForTest[I] = MF->CreateMachineBasicBlock(); - } - - extractInstructionFeatures( - OverlapProblem, &ModelRunner, - [](SlotIndex InputSlot) -> int { return 0; }, - [FirstIndex](SlotIndex InputSlot) -> float { - return static_cast(FirstIndex.distance(InputSlot) / - SlotIndex::InstrDist); - }, - [FirstIndex, MBBsForTest](SlotIndex InputSlot) -> MachineBasicBlock * { - return MBBsForTest[FirstIndex.distance(InputSlot) / - SlotIndex::InstrDist]; - }, - 0, 1, 2, 3, LastIndex); - for (size_t MBBIndex = 0; MBBIndex < ModelMaxSupportedMBBCount; ++MBBIndex) { - ASSERT_FLOAT_EQ(ModelRunner.getTensor(2)[MBBIndex], - static_cast(MBBIndex)); - ASSERT_EQ(ModelRunner.getTensor(3)[MBBIndex], - static_cast(MBBIndex)); - } - // the rest of the mapping values should be zero (truncated to 100 MBBs) - for (size_t MBBIndex = ModelMaxSupportedMBBCount; - MBBIndex < ModelMaxSupportedInstructionCount; ++MBBIndex) { - ASSERT_EQ(ModelRunner.getTensor(3)[MBBIndex], - static_cast(0)); - } -} - -} // end namespace From 4be0ab659e6a65436c4e3629706318acd0c1cdc9 Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Mon, 17 Nov 2025 12:23:02 -0600 Subject: [PATCH 26/67] [flang][OpenMP] Undeprecate accidentally deprecated TARGET LOOP (#167495) --- flang/lib/Semantics/resolve-directives.cpp | 7 +++---- .../test/Semantics/OpenMP/target-loop-still-there.f90 | 10 ++++++++++ 2 files changed, 13 insertions(+), 4 deletions(-) create mode 100644 flang/test/Semantics/OpenMP/target-loop-still-there.f90 diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 68d007bc2de7e..c4d103613b587 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -2038,8 +2038,7 @@ 
bool OmpAttributeVisitor::Pre(const parser::OpenMPLoopConstruct &x) { if (beginName.v == llvm::omp::OMPD_master_taskloop || beginName.v == llvm::omp::OMPD_master_taskloop_simd || beginName.v == llvm::omp::OMPD_parallel_master_taskloop || - beginName.v == llvm::omp::OMPD_parallel_master_taskloop_simd || - beginName.v == llvm::omp::Directive::OMPD_target_loop) { + beginName.v == llvm::omp::OMPD_parallel_master_taskloop_simd) { unsigned version{context_.langOptions().OpenMPVersion}; IssueNonConformanceWarning(beginName.v, beginName.source, version); } @@ -3622,8 +3621,8 @@ void OmpAttributeVisitor::IssueNonConformanceWarning(llvm::omp::Directive D, case llvm::omp::OMPD_allocate: setAlternativeStr("ALLOCATORS"); break; - case llvm::omp::OMPD_target_loop: - default:; + default: + break; } context_.Warn(common::UsageWarning::OpenMPUsage, source, "%s"_warn_en_US, warnStrOS.str()); diff --git a/flang/test/Semantics/OpenMP/target-loop-still-there.f90 b/flang/test/Semantics/OpenMP/target-loop-still-there.f90 new file mode 100644 index 0000000000000..2d3b1820e23d4 --- /dev/null +++ b/flang/test/Semantics/OpenMP/target-loop-still-there.f90 @@ -0,0 +1,10 @@ +!RUN: %flang_fc1 -fsyntax-only -fopenmp -fopenmp-version=60 -Werror %s | FileCheck --allow-empty %s + +!CHECK-NOT: deprecated +subroutine f00 + implicit none + integer :: i + !$omp target loop + do i = 1, 10 + end do +end From c4be17a8877ba406bcda63c5398bc09ebb32598a Mon Sep 17 00:00:00 2001 From: Andrew Haberlandt Date: Mon, 17 Nov 2025 10:28:26 -0800 Subject: [PATCH 27/67] [compiler-rt] [libFuzzer] Fix merge-posix.test file size test (#168137) This test uses `ulimit -f 1` to test what libFuzzer does when trying to create a file > **_1KB_**. However, none of the input files used by this test are actually >= 1KB, so there's no reason to expect this test to pass. This test appears to be passing by accident since the "control file" happens to be > 1KB, but this is not always the case depending upon the length of the path where the test is run from. This modifies the test to ensure that one of the input files is actually >1KB. --- compiler-rt/test/fuzzer/merge-posix.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/test/fuzzer/merge-posix.test b/compiler-rt/test/fuzzer/merge-posix.test index 2721668fb9706..5e342142216f8 100644 --- a/compiler-rt/test/fuzzer/merge-posix.test +++ b/compiler-rt/test/fuzzer/merge-posix.test @@ -14,7 +14,7 @@ RUN: echo ....U. > %tmp/T2/2 RUN: echo ...Z.. > %tmp/T2/3 RUN: echo ...Z.. > %tmp/T2/4 RUN: echo ....E. > %tmp/T2/5 -RUN: echo .....R > %tmp/T2/6 +RUN: %python -c "print('.....R' + 'X' * 1024, end='')" > %tmp/T2/6 # Check that we can report an error if file size exceeded RUN: (ulimit -f 1; not %run %t-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 2>&1 | FileCheck %s --check-prefix=SIGXFSZ) From cc304e5a5cf43d454d597eb9108f0bc7e6605722 Mon Sep 17 00:00:00 2001 From: Ivan Kosarev Date: Mon, 17 Nov 2025 18:39:41 +0000 Subject: [PATCH 28/67] [TableGen] Strip directories from filename prefixes. (#168355) Fixes https://github.com/llvm/llvm-project/pull/167700 to support builds where TableGen's output file is specified as a full path rather than just a filename.
--- llvm/lib/TableGen/Main.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llvm/lib/TableGen/Main.cpp b/llvm/lib/TableGen/Main.cpp index c3869c3fb9a5a..3330b70cdc2e1 100644 --- a/llvm/lib/TableGen/Main.cpp +++ b/llvm/lib/TableGen/Main.cpp @@ -167,12 +167,11 @@ int llvm::TableGenMain(const char *argv0, // Write output to memory. Timer.startBackendTimer("Backend overall"); - SmallString<128> FilenamePrefix(OutputFilename); - sys::path::replace_extension(FilenamePrefix, ""); TableGenOutputFiles OutFiles; unsigned status = 0; // ApplyCallback will return true if it did not apply any callback. In that // case, attempt to apply the MainFn. + StringRef FilenamePrefix(sys::path::stem(OutputFilename)); if (TableGen::Emitter::ApplyCallback(Records, OutFiles, FilenamePrefix)) status = MainFn ? MainFn(OutFiles, Records) : 1; Timer.stopBackendTimer(); @@ -195,7 +194,7 @@ int llvm::TableGenMain(const char *argv0, SmallString<128> Filename(OutputFilename); // TODO: Format using the split-file convention when writing to stdout? if (Filename != "-") { - Filename = FilenamePrefix; + sys::path::replace_extension(Filename, ""); Filename.append(Suffix); } if (int Ret = WriteOutput(Parser, argv0, Filename, Content)) From aa4de7b4ef510427b5997e525feb642fc0c51053 Mon Sep 17 00:00:00 2001 From: Keith Smiley Date: Mon, 17 Nov 2025 10:41:05 -0800 Subject: [PATCH 29/67] [bazel] Add support for multiple tblgen outputs (#168158) Required after https://github.com/llvm/llvm-project/pull/167700. This adds yet another format for `tbl_outs` where you pass a list of opts and a list of outputs (where previously you could only have one output). In that case all outputs must be produced, but the first is used for the `-o` arg, since tblgen generates the other names based on that single argument.
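For illustration, a minimal sketch of the two tuple shapes `tbl_outs` now accepts, assuming the `gentbl_cc_library` macro from mlir/tblgen.bzl; the `FooGen` target and Foo.td file are hypothetical and not taken from this patch:

load("//mlir:tblgen.bzl", "gentbl_cc_library")

gentbl_cc_library(
    name = "FooGen",  # hypothetical target, for illustration only
    tbl_outs = [
        # Opts list with a single output: the file is handed to tblgen via -o.
        (["-gen-instr-info"], "FooGenInstrInfo.inc"),
        # Opts list with several outputs: only the first is passed to -o;
        # tblgen derives the sibling names from it, and every listed file
        # must actually be produced for the build to succeed.
        (
            ["-gen-register-info"],
            [
                "FooGenRegisterInfo.inc",
                "FooGenRegisterInfoEnums.inc",
            ],
        ),
    ],
    tblgen = "//llvm:llvm-tblgen",
    td_file = "Foo.td",
)

The entries in the BUILD diff below follow these same two shapes, with -gen-register-info being the multi-output case.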
--- .../llvm-project-overlay/llvm/BUILD.bazel | 1506 +++++++++++++---- .../llvm-project-overlay/mlir/tblgen.bzl | 23 +- 2 files changed, 1187 insertions(+), 342 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index 67c397e34b8c7..a94442af376e5 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -2193,92 +2193,249 @@ llvm_target_lib_list = [lib for lib in [ { "name": "AArch64", "short_name": "AArch64", - "tbl_outs": { - "lib/Target/AArch64/AArch64GenRegisterBank.inc": ["-gen-register-bank"], - "lib/Target/AArch64/AArch64GenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/AArch64/AArch64GenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/AArch64/AArch64GenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/AArch64/AArch64GenMCPseudoLowering.inc": ["-gen-pseudo-lowering"], - "lib/Target/AArch64/AArch64GenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/AArch64/AArch64GenAsmWriter1.inc": [ - "-gen-asm-writer", - "-asmwriternum=1", - ], - "lib/Target/AArch64/AArch64GenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/AArch64/AArch64GenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/AArch64/AArch64GenFastISel.inc": ["-gen-fast-isel"], - "lib/Target/AArch64/AArch64GenGlobalISel.inc": ["-gen-global-isel"], - "lib/Target/AArch64/AArch64GenO0PreLegalizeGICombiner.inc": [ - "-gen-global-isel-combiner", - "-combiners=AArch64O0PreLegalizerCombiner", - ], - "lib/Target/AArch64/AArch64GenPreLegalizeGICombiner.inc": [ - "-gen-global-isel-combiner", - "-combiners=AArch64PreLegalizerCombiner", - ], - "lib/Target/AArch64/AArch64GenPostLegalizeGICombiner.inc": [ - "-gen-global-isel-combiner", - "-combiners=AArch64PostLegalizerCombiner", - ], - "lib/Target/AArch64/AArch64GenPostLegalizeGILowering.inc": [ - "-gen-global-isel-combiner", - "-combiners=AArch64PostLegalizerLowering", - ], - "lib/Target/AArch64/AArch64GenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/AArch64/AArch64GenSDNodeInfo.inc": ["-gen-sd-node-info"], - "lib/Target/AArch64/AArch64GenSubtargetInfo.inc": ["-gen-subtarget"], - "lib/Target/AArch64/AArch64GenDisassemblerTables.inc": [ - "-gen-disassembler", - ], - "lib/Target/AArch64/AArch64GenSystemOperands.inc": ["-gen-searchable-tables"], - "lib/Target/AArch64/AArch64GenExegesis.inc": ["-gen-exegesis"], - }, + "tbl_outs": [ + ( + ["-gen-register-bank"], + "lib/Target/AArch64/AArch64GenRegisterBank.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/AArch64/AArch64GenRegisterInfo.inc", + "lib/Target/AArch64/AArch64GenRegisterInfoEnums.inc", + "lib/Target/AArch64/AArch64GenRegisterInfoMCDesc.inc", + "lib/Target/AArch64/AArch64GenRegisterInfoHeader.inc", + "lib/Target/AArch64/AArch64GenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-instr-info"], + "lib/Target/AArch64/AArch64GenInstrInfo.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/AArch64/AArch64GenMCCodeEmitter.inc", + ), + ( + ["-gen-pseudo-lowering"], + "lib/Target/AArch64/AArch64GenMCPseudoLowering.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/AArch64/AArch64GenAsmWriter.inc", + ), + ( + [ + "-gen-asm-writer", + "-asmwriternum=1", + ], + "lib/Target/AArch64/AArch64GenAsmWriter1.inc", + ), + ( + ["-gen-asm-matcher"], + "lib/Target/AArch64/AArch64GenAsmMatcher.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/AArch64/AArch64GenDAGISel.inc", + ), + ( + ["-gen-fast-isel"], + "lib/Target/AArch64/AArch64GenFastISel.inc", + ), + ( + ["-gen-global-isel"], + 
"lib/Target/AArch64/AArch64GenGlobalISel.inc", + ), + ( + [ + "-gen-global-isel-combiner", + "-combiners=AArch64O0PreLegalizerCombiner", + ], + "lib/Target/AArch64/AArch64GenO0PreLegalizeGICombiner.inc", + ), + ( + [ + "-gen-global-isel-combiner", + "-combiners=AArch64PreLegalizerCombiner", + ], + "lib/Target/AArch64/AArch64GenPreLegalizeGICombiner.inc", + ), + ( + [ + "-gen-global-isel-combiner", + "-combiners=AArch64PostLegalizerCombiner", + ], + "lib/Target/AArch64/AArch64GenPostLegalizeGICombiner.inc", + ), + ( + [ + "-gen-global-isel-combiner", + "-combiners=AArch64PostLegalizerLowering", + ], + "lib/Target/AArch64/AArch64GenPostLegalizeGILowering.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/AArch64/AArch64GenCallingConv.inc", + ), + ( + ["-gen-sd-node-info"], + "lib/Target/AArch64/AArch64GenSDNodeInfo.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/AArch64/AArch64GenSubtargetInfo.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/AArch64/AArch64GenDisassemblerTables.inc", + ), + ( + ["-gen-searchable-tables"], + "lib/Target/AArch64/AArch64GenSystemOperands.inc", + ), + ( + ["-gen-exegesis"], + "lib/Target/AArch64/AArch64GenExegesis.inc", + ), + ], }, { "name": "ARM", "short_name": "ARM", - "tbl_outs": { - "lib/Target/ARM/ARMGenRegisterBank.inc": ["-gen-register-bank"], - "lib/Target/ARM/ARMGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/ARM/ARMGenSystemRegister.inc": ["-gen-searchable-tables"], - "lib/Target/ARM/ARMGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/ARM/ARMGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/ARM/ARMGenMCPseudoLowering.inc": ["-gen-pseudo-lowering"], - "lib/Target/ARM/ARMGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/ARM/ARMGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/ARM/ARMGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/ARM/ARMGenFastISel.inc": ["-gen-fast-isel"], - "lib/Target/ARM/ARMGenGlobalISel.inc": ["-gen-global-isel"], - "lib/Target/ARM/ARMGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/ARM/ARMGenSubtargetInfo.inc": ["-gen-subtarget"], - "lib/Target/ARM/ARMGenDisassemblerTables.inc": [ - "-gen-disassembler", - "-ignore-non-decodable-operands", - ], - }, + "tbl_outs": [ + ( + ["-gen-register-bank"], + "lib/Target/ARM/ARMGenRegisterBank.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/ARM/ARMGenRegisterInfo.inc", + "lib/Target/ARM/ARMGenRegisterInfoEnums.inc", + "lib/Target/ARM/ARMGenRegisterInfoMCDesc.inc", + "lib/Target/ARM/ARMGenRegisterInfoHeader.inc", + "lib/Target/ARM/ARMGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-searchable-tables"], + "lib/Target/ARM/ARMGenSystemRegister.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/ARM/ARMGenInstrInfo.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/ARM/ARMGenMCCodeEmitter.inc", + ), + ( + ["-gen-pseudo-lowering"], + "lib/Target/ARM/ARMGenMCPseudoLowering.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/ARM/ARMGenAsmWriter.inc", + ), + ( + ["-gen-asm-matcher"], + "lib/Target/ARM/ARMGenAsmMatcher.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/ARM/ARMGenDAGISel.inc", + ), + ( + ["-gen-fast-isel"], + "lib/Target/ARM/ARMGenFastISel.inc", + ), + ( + ["-gen-global-isel"], + "lib/Target/ARM/ARMGenGlobalISel.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/ARM/ARMGenCallingConv.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/ARM/ARMGenSubtargetInfo.inc", + ), + ( + [ + "-gen-disassembler", + "-ignore-non-decodable-operands", + ], + "lib/Target/ARM/ARMGenDisassemblerTables.inc", + ), + ], }, { "name": "AMDGPU", 
"short_name": "AMDGPU", - "tbl_outs": { - "lib/Target/AMDGPU/AMDGPUGenRegisterBank.inc": ["-gen-register-bank"], - "lib/Target/AMDGPU/AMDGPUGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/AMDGPU/AMDGPUGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/AMDGPU/AMDGPUGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/AMDGPU/AMDGPUGenMCPseudoLowering.inc": ["-gen-pseudo-lowering"], - "lib/Target/AMDGPU/AMDGPUGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/AMDGPU/AMDGPUGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/AMDGPU/AMDGPUGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/AMDGPU/AMDGPUGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/AMDGPU/AMDGPUGenSubtargetInfo.inc": ["-gen-subtarget"], - "lib/Target/AMDGPU/AMDGPUGenDisassemblerTables.inc": [ - "-gen-disassembler", - "--specialize-decoders-per-bitwidth", - "-ignore-non-decodable-operands", - "-ignore-fully-defined-operands", - ], - "lib/Target/AMDGPU/AMDGPUGenSearchableTables.inc": ["-gen-searchable-tables"], - }, + "tbl_outs": [ + ( + ["-gen-register-bank"], + "lib/Target/AMDGPU/AMDGPUGenRegisterBank.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/AMDGPU/AMDGPUGenRegisterInfo.inc", + "lib/Target/AMDGPU/AMDGPUGenRegisterInfoEnums.inc", + "lib/Target/AMDGPU/AMDGPUGenRegisterInfoMCDesc.inc", + "lib/Target/AMDGPU/AMDGPUGenRegisterInfoHeader.inc", + "lib/Target/AMDGPU/AMDGPUGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-instr-info"], + "lib/Target/AMDGPU/AMDGPUGenInstrInfo.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/AMDGPU/AMDGPUGenMCCodeEmitter.inc", + ), + ( + ["-gen-pseudo-lowering"], + "lib/Target/AMDGPU/AMDGPUGenMCPseudoLowering.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/AMDGPU/AMDGPUGenAsmWriter.inc", + ), + ( + ["-gen-asm-matcher"], + "lib/Target/AMDGPU/AMDGPUGenAsmMatcher.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/AMDGPU/AMDGPUGenDAGISel.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/AMDGPU/AMDGPUGenCallingConv.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/AMDGPU/AMDGPUGenSubtargetInfo.inc", + ), + ( + [ + "-gen-disassembler", + "--specialize-decoders-per-bitwidth", + "-ignore-non-decodable-operands", + "-ignore-fully-defined-operands", + ], + "lib/Target/AMDGPU/AMDGPUGenDisassemblerTables.inc", + ), + ( + ["-gen-searchable-tables"], + "lib/Target/AMDGPU/AMDGPUGenSearchableTables.inc", + ), + ], "tbl_deps": [ ":InstCombineTableGen", ":amdgpu_isel_target_gen", @@ -2288,184 +2445,567 @@ llvm_target_lib_list = [lib for lib in [ { "name": "AVR", "short_name": "AVR", - "tbl_outs": { - "lib/Target/AVR/AVRGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/AVR/AVRGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/AVR/AVRGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/AVR/AVRGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/AVR/AVRGenDisassemblerTables.inc": [ - "-gen-disassembler", - ], - "lib/Target/AVR/AVRGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/AVR/AVRGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/AVR/AVRGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/AVR/AVRGenSDNodeInfo.inc": ["-gen-sd-node-info"], - "lib/Target/AVR/AVRGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-matcher"], + "lib/Target/AVR/AVRGenAsmMatcher.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/AVR/AVRGenAsmWriter.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/AVR/AVRGenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/AVR/AVRGenDAGISel.inc", + ), + ( + ["-gen-disassembler"], + 
"lib/Target/AVR/AVRGenDisassemblerTables.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/AVR/AVRGenMCCodeEmitter.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/AVR/AVRGenInstrInfo.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/AVR/AVRGenRegisterInfo.inc", + "lib/Target/AVR/AVRGenRegisterInfoEnums.inc", + "lib/Target/AVR/AVRGenRegisterInfoMCDesc.inc", + "lib/Target/AVR/AVRGenRegisterInfoHeader.inc", + "lib/Target/AVR/AVRGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-sd-node-info"], + "lib/Target/AVR/AVRGenSDNodeInfo.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/AVR/AVRGenSubtargetInfo.inc", + ), + ], }, { "name": "BPF", "short_name": "BPF", - "tbl_outs": { - "lib/Target/BPF/BPFGenRegisterBank.inc": ["-gen-register-bank"], - "lib/Target/BPF/BPFGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/BPF/BPFGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/BPF/BPFGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/BPF/BPFGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/BPF/BPFGenGlobalISel.inc": ["-gen-global-isel"], - "lib/Target/BPF/BPFGenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/BPF/BPFGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/BPF/BPFGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/BPF/BPFGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/BPF/BPFGenSubtargetInfo.inc": ["-gen-subtarget"], - "lib/Target/BPF/BPFGenSDNodeInfo.inc": ["-gen-sd-node-info"], - }, + "tbl_outs": [ + ( + ["-gen-register-bank"], + "lib/Target/BPF/BPFGenRegisterBank.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/BPF/BPFGenAsmWriter.inc", + ), + ( + ["-gen-asm-matcher"], + "lib/Target/BPF/BPFGenAsmMatcher.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/BPF/BPFGenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/BPF/BPFGenDAGISel.inc", + ), + ( + ["-gen-global-isel"], + "lib/Target/BPF/BPFGenGlobalISel.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/BPF/BPFGenDisassemblerTables.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/BPF/BPFGenMCCodeEmitter.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/BPF/BPFGenInstrInfo.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/BPF/BPFGenRegisterInfo.inc", + "lib/Target/BPF/BPFGenRegisterInfoEnums.inc", + "lib/Target/BPF/BPFGenRegisterInfoMCDesc.inc", + "lib/Target/BPF/BPFGenRegisterInfoHeader.inc", + "lib/Target/BPF/BPFGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-subtarget"], + "lib/Target/BPF/BPFGenSubtargetInfo.inc", + ), + ( + ["-gen-sd-node-info"], + "lib/Target/BPF/BPFGenSDNodeInfo.inc", + ), + ], }, { "name": "Hexagon", "short_name": "Hexagon", - "tbl_outs": { - "lib/Target/Hexagon/HexagonGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/Hexagon/HexagonGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/Hexagon/HexagonGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/Hexagon/HexagonGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/Hexagon/HexagonGenDFAPacketizer.inc": ["-gen-dfa-packetizer"], - "lib/Target/Hexagon/HexagonGenDisassemblerTables.inc": [ - "-gen-disassembler", - ], - "lib/Target/Hexagon/HexagonGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/Hexagon/HexagonGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/Hexagon/HexagonGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/Hexagon/HexagonGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-matcher"], + "lib/Target/Hexagon/HexagonGenAsmMatcher.inc", + ), + ( + ["-gen-asm-writer"], + 
"lib/Target/Hexagon/HexagonGenAsmWriter.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/Hexagon/HexagonGenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/Hexagon/HexagonGenDAGISel.inc", + ), + ( + ["-gen-dfa-packetizer"], + "lib/Target/Hexagon/HexagonGenDFAPacketizer.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/Hexagon/HexagonGenDisassemblerTables.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/Hexagon/HexagonGenInstrInfo.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/Hexagon/HexagonGenMCCodeEmitter.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/Hexagon/HexagonGenRegisterInfo.inc", + "lib/Target/Hexagon/HexagonGenRegisterInfoEnums.inc", + "lib/Target/Hexagon/HexagonGenRegisterInfoMCDesc.inc", + "lib/Target/Hexagon/HexagonGenRegisterInfoHeader.inc", + "lib/Target/Hexagon/HexagonGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-subtarget"], + "lib/Target/Hexagon/HexagonGenSubtargetInfo.inc", + ), + ], }, { "name": "Lanai", "short_name": "Lanai", - "tbl_outs": { - "lib/Target/Lanai/LanaiGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/Lanai/LanaiGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/Lanai/LanaiGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/Lanai/LanaiGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/Lanai/LanaiGenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/Lanai/LanaiGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/Lanai/LanaiGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/Lanai/LanaiGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/Lanai/LanaiGenSDNodeInfo.inc": ["-gen-sd-node-info"], - "lib/Target/Lanai/LanaiGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-matcher"], + "lib/Target/Lanai/LanaiGenAsmMatcher.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/Lanai/LanaiGenAsmWriter.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/Lanai/LanaiGenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/Lanai/LanaiGenDAGISel.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/Lanai/LanaiGenDisassemblerTables.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/Lanai/LanaiGenMCCodeEmitter.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/Lanai/LanaiGenInstrInfo.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/Lanai/LanaiGenRegisterInfo.inc", + "lib/Target/Lanai/LanaiGenRegisterInfoEnums.inc", + "lib/Target/Lanai/LanaiGenRegisterInfoMCDesc.inc", + "lib/Target/Lanai/LanaiGenRegisterInfoHeader.inc", + "lib/Target/Lanai/LanaiGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-sd-node-info"], + "lib/Target/Lanai/LanaiGenSDNodeInfo.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/Lanai/LanaiGenSubtargetInfo.inc", + ), + ], }, { "name": "LoongArch", "short_name": "LoongArch", - "tbl_outs": { - "lib/Target/LoongArch/LoongArchGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/LoongArch/LoongArchGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/LoongArch/LoongArchGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/LoongArch/LoongArchGenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/LoongArch/LoongArchGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/LoongArch/LoongArchGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/LoongArch/LoongArchGenMCPseudoLowering.inc": ["-gen-pseudo-lowering"], - "lib/Target/LoongArch/LoongArchGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/LoongArch/LoongArchGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-matcher"], + 
"lib/Target/LoongArch/LoongArchGenAsmMatcher.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/LoongArch/LoongArchGenAsmWriter.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/LoongArch/LoongArchGenDAGISel.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/LoongArch/LoongArchGenDisassemblerTables.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/LoongArch/LoongArchGenMCCodeEmitter.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/LoongArch/LoongArchGenInstrInfo.inc", + ), + ( + ["-gen-pseudo-lowering"], + "lib/Target/LoongArch/LoongArchGenMCPseudoLowering.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/LoongArch/LoongArchGenRegisterInfo.inc", + "lib/Target/LoongArch/LoongArchGenRegisterInfoEnums.inc", + "lib/Target/LoongArch/LoongArchGenRegisterInfoMCDesc.inc", + "lib/Target/LoongArch/LoongArchGenRegisterInfoHeader.inc", + "lib/Target/LoongArch/LoongArchGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-subtarget"], + "lib/Target/LoongArch/LoongArchGenSubtargetInfo.inc", + ), + ], }, { "name": "Mips", "short_name": "Mips", - "tbl_outs": { - "lib/Target/Mips/MipsGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/Mips/MipsGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/Mips/MipsGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/Mips/MipsGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/Mips/MipsGenDisassemblerTables.inc": [ - "-gen-disassembler", - "-ignore-non-decodable-operands", - ], - "lib/Target/Mips/MipsGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/Mips/MipsGenExegesis.inc": ["-gen-exegesis"], - "lib/Target/Mips/MipsGenFastISel.inc": ["-gen-fast-isel"], - "lib/Target/Mips/MipsGenGlobalISel.inc": ["-gen-global-isel"], - "lib/Target/Mips/MipsGenPostLegalizeGICombiner.inc": [ - "-gen-global-isel-combiner", - "-combiners=MipsPostLegalizerCombiner", - ], - "lib/Target/Mips/MipsGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/Mips/MipsGenMCPseudoLowering.inc": ["-gen-pseudo-lowering"], - "lib/Target/Mips/MipsGenRegisterBank.inc": ["-gen-register-bank"], - "lib/Target/Mips/MipsGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/Mips/MipsGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-matcher"], + "lib/Target/Mips/MipsGenAsmMatcher.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/Mips/MipsGenAsmWriter.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/Mips/MipsGenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/Mips/MipsGenDAGISel.inc", + ), + ( + [ + "-gen-disassembler", + "-ignore-non-decodable-operands", + ], + "lib/Target/Mips/MipsGenDisassemblerTables.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/Mips/MipsGenMCCodeEmitter.inc", + ), + ( + ["-gen-exegesis"], + "lib/Target/Mips/MipsGenExegesis.inc", + ), + ( + ["-gen-fast-isel"], + "lib/Target/Mips/MipsGenFastISel.inc", + ), + ( + ["-gen-global-isel"], + "lib/Target/Mips/MipsGenGlobalISel.inc", + ), + ( + [ + "-gen-global-isel-combiner", + "-combiners=MipsPostLegalizerCombiner", + ], + "lib/Target/Mips/MipsGenPostLegalizeGICombiner.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/Mips/MipsGenInstrInfo.inc", + ), + ( + ["-gen-pseudo-lowering"], + "lib/Target/Mips/MipsGenMCPseudoLowering.inc", + ), + ( + ["-gen-register-bank"], + "lib/Target/Mips/MipsGenRegisterBank.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/Mips/MipsGenRegisterInfo.inc", + "lib/Target/Mips/MipsGenRegisterInfoEnums.inc", + "lib/Target/Mips/MipsGenRegisterInfoMCDesc.inc", + "lib/Target/Mips/MipsGenRegisterInfoHeader.inc", + 
"lib/Target/Mips/MipsGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-subtarget"], + "lib/Target/Mips/MipsGenSubtargetInfo.inc", + ), + ], }, { "name": "MSP430", "short_name": "MSP430", - "tbl_outs": { - "lib/Target/MSP430/MSP430GenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/MSP430/MSP430GenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/MSP430/MSP430GenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/MSP430/MSP430GenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/MSP430/MSP430GenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/MSP430/MSP430GenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/MSP430/MSP430GenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/MSP430/MSP430GenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/MSP430/MSP430GenSDNodeInfo.inc": ["-gen-sd-node-info"], - "lib/Target/MSP430/MSP430GenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-matcher"], + "lib/Target/MSP430/MSP430GenAsmMatcher.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/MSP430/MSP430GenAsmWriter.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/MSP430/MSP430GenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/MSP430/MSP430GenDAGISel.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/MSP430/MSP430GenDisassemblerTables.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/MSP430/MSP430GenInstrInfo.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/MSP430/MSP430GenMCCodeEmitter.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/MSP430/MSP430GenRegisterInfo.inc", + "lib/Target/MSP430/MSP430GenRegisterInfoEnums.inc", + "lib/Target/MSP430/MSP430GenRegisterInfoMCDesc.inc", + "lib/Target/MSP430/MSP430GenRegisterInfoHeader.inc", + "lib/Target/MSP430/MSP430GenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-sd-node-info"], + "lib/Target/MSP430/MSP430GenSDNodeInfo.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/MSP430/MSP430GenSubtargetInfo.inc", + ), + ], }, { "name": "NVPTX", "short_name": "NVPTX", - "tbl_outs": { - "lib/Target/NVPTX/NVPTXGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/NVPTX/NVPTXGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/NVPTX/NVPTXGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/NVPTX/NVPTXGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/NVPTX/NVPTXGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-register-info"], + [ + "lib/Target/NVPTX/NVPTXGenRegisterInfo.inc", + "lib/Target/NVPTX/NVPTXGenRegisterInfoEnums.inc", + "lib/Target/NVPTX/NVPTXGenRegisterInfoMCDesc.inc", + "lib/Target/NVPTX/NVPTXGenRegisterInfoHeader.inc", + "lib/Target/NVPTX/NVPTXGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-instr-info"], + "lib/Target/NVPTX/NVPTXGenInstrInfo.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/NVPTX/NVPTXGenAsmWriter.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/NVPTX/NVPTXGenDAGISel.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/NVPTX/NVPTXGenSubtargetInfo.inc", + ), + ], }, { "name": "PowerPC", "short_name": "PPC", - "tbl_outs": { - "lib/Target/PowerPC/PPCGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/PowerPC/PPCGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/PowerPC/PPCGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/PowerPC/PPCGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/PowerPC/PPCGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/PowerPC/PPCGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/PowerPC/PPCGenFastISel.inc": ["-gen-fast-isel"], - 
"lib/Target/PowerPC/PPCGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/PowerPC/PPCGenSubtargetInfo.inc": ["-gen-subtarget"], - "lib/Target/PowerPC/PPCGenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/PowerPC/PPCGenRegisterBank.inc": ["-gen-register-bank"], - "lib/Target/PowerPC/PPCGenGlobalISel.inc": ["-gen-global-isel"], - "lib/Target/PowerPC/PPCGenExegesis.inc": ["-gen-exegesis"], - }, + "tbl_outs": [ + ( + ["-gen-asm-writer"], + "lib/Target/PowerPC/PPCGenAsmWriter.inc", + ), + ( + ["-gen-asm-matcher"], + "lib/Target/PowerPC/PPCGenAsmMatcher.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/PowerPC/PPCGenMCCodeEmitter.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/PowerPC/PPCGenRegisterInfo.inc", + "lib/Target/PowerPC/PPCGenRegisterInfoEnums.inc", + "lib/Target/PowerPC/PPCGenRegisterInfoMCDesc.inc", + "lib/Target/PowerPC/PPCGenRegisterInfoHeader.inc", + "lib/Target/PowerPC/PPCGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-instr-info"], + "lib/Target/PowerPC/PPCGenInstrInfo.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/PowerPC/PPCGenDAGISel.inc", + ), + ( + ["-gen-fast-isel"], + "lib/Target/PowerPC/PPCGenFastISel.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/PowerPC/PPCGenCallingConv.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/PowerPC/PPCGenSubtargetInfo.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/PowerPC/PPCGenDisassemblerTables.inc", + ), + ( + ["-gen-register-bank"], + "lib/Target/PowerPC/PPCGenRegisterBank.inc", + ), + ( + ["-gen-global-isel"], + "lib/Target/PowerPC/PPCGenGlobalISel.inc", + ), + ( + ["-gen-exegesis"], + "lib/Target/PowerPC/PPCGenExegesis.inc", + ), + ], }, { "name": "RISCV", "short_name": "RISCV", - "tbl_outs": { - "lib/Target/RISCV/RISCVGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/RISCV/RISCVGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/RISCV/RISCVGenCompressInstEmitter.inc": ["-gen-compress-inst-emitter"], - "lib/Target/RISCV/RISCVGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/RISCV/RISCVGenDisassemblerTables.inc": [ - "-gen-disassembler", - "--specialize-decoders-per-bitwidth", - ], - "lib/Target/RISCV/RISCVGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/RISCV/RISCVGenMacroFusion.inc": ["-gen-macro-fusion-pred"], - "lib/Target/RISCV/RISCVGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/RISCV/RISCVGenMCPseudoLowering.inc": ["-gen-pseudo-lowering"], - "lib/Target/RISCV/RISCVGenRegisterBank.inc": ["-gen-register-bank"], - "lib/Target/RISCV/RISCVGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/RISCV/RISCVGenSubtargetInfo.inc": ["-gen-subtarget"], - "lib/Target/RISCV/RISCVGenSearchableTables.inc": ["-gen-searchable-tables"], - "lib/Target/RISCV/RISCVGenExegesis.inc": ["-gen-exegesis"], - "lib/Target/RISCV/RISCVGenSDNodeInfo.inc": ["-gen-sd-node-info"], - }, + "tbl_outs": [ + ( + ["-gen-asm-matcher"], + "lib/Target/RISCV/RISCVGenAsmMatcher.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/RISCV/RISCVGenAsmWriter.inc", + ), + ( + ["-gen-compress-inst-emitter"], + "lib/Target/RISCV/RISCVGenCompressInstEmitter.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/RISCV/RISCVGenDAGISel.inc", + ), + ( + [ + "-gen-disassembler", + "--specialize-decoders-per-bitwidth", + ], + "lib/Target/RISCV/RISCVGenDisassemblerTables.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/RISCV/RISCVGenInstrInfo.inc", + ), + ( + ["-gen-macro-fusion-pred"], + "lib/Target/RISCV/RISCVGenMacroFusion.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/RISCV/RISCVGenMCCodeEmitter.inc", + ), + 
( + ["-gen-pseudo-lowering"], + "lib/Target/RISCV/RISCVGenMCPseudoLowering.inc", + ), + ( + ["-gen-register-bank"], + "lib/Target/RISCV/RISCVGenRegisterBank.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/RISCV/RISCVGenRegisterInfo.inc", + "lib/Target/RISCV/RISCVGenRegisterInfoEnums.inc", + "lib/Target/RISCV/RISCVGenRegisterInfoMCDesc.inc", + "lib/Target/RISCV/RISCVGenRegisterInfoHeader.inc", + "lib/Target/RISCV/RISCVGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-subtarget"], + "lib/Target/RISCV/RISCVGenSubtargetInfo.inc", + ), + ( + ["-gen-searchable-tables"], + "lib/Target/RISCV/RISCVGenSearchableTables.inc", + ), + ( + ["-gen-exegesis"], + "lib/Target/RISCV/RISCVGenExegesis.inc", + ), + ( + ["-gen-sd-node-info"], + "lib/Target/RISCV/RISCVGenSDNodeInfo.inc", + ), + ], "tbl_deps": [ ":riscv_isel_target_gen", ], @@ -2473,135 +3013,396 @@ llvm_target_lib_list = [lib for lib in [ { "name": "Sparc", "short_name": "Sparc", - "tbl_outs": { - "lib/Target/Sparc/SparcGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/Sparc/SparcGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/Sparc/SparcGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/Sparc/SparcGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/Sparc/SparcGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/Sparc/SparcGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/Sparc/SparcGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/Sparc/SparcGenSubtargetInfo.inc": ["-gen-subtarget"], - "lib/Target/Sparc/SparcGenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/Sparc/SparcGenSearchableTables.inc": ["-gen-searchable-tables"], - "lib/Target/Sparc/SparcGenSDNodeInfo.inc": [ - "-gen-sd-node-info", - "-sdnode-namespace=SPISD", - ], - }, + "tbl_outs": [ + ( + ["-gen-asm-writer"], + "lib/Target/Sparc/SparcGenAsmWriter.inc", + ), + ( + ["-gen-asm-matcher"], + "lib/Target/Sparc/SparcGenAsmMatcher.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/Sparc/SparcGenMCCodeEmitter.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/Sparc/SparcGenRegisterInfo.inc", + "lib/Target/Sparc/SparcGenRegisterInfoEnums.inc", + "lib/Target/Sparc/SparcGenRegisterInfoMCDesc.inc", + "lib/Target/Sparc/SparcGenRegisterInfoHeader.inc", + "lib/Target/Sparc/SparcGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-instr-info"], + "lib/Target/Sparc/SparcGenInstrInfo.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/Sparc/SparcGenDAGISel.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/Sparc/SparcGenCallingConv.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/Sparc/SparcGenSubtargetInfo.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/Sparc/SparcGenDisassemblerTables.inc", + ), + ( + ["-gen-searchable-tables"], + "lib/Target/Sparc/SparcGenSearchableTables.inc", + ), + ( + [ + "-gen-sd-node-info", + "-sdnode-namespace=SPISD", + ], + "lib/Target/Sparc/SparcGenSDNodeInfo.inc", + ), + ], }, { "name": "SPIRV", "short_name": "SPIRV", - "tbl_outs": { - "lib/Target/SPIRV/SPIRVGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/SPIRV/SPIRVGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/SPIRV/SPIRVGenGlobalISel.inc": ["-gen-global-isel"], - "lib/Target/SPIRV/SPIRVGenPreLegalizeGICombiner.inc": [ - "-gen-global-isel-combiner", - "-combiners=SPIRVPreLegalizerCombiner", - ], - "lib/Target/SPIRV/SPIRVGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/SPIRV/SPIRVGenRegisterBank.inc": ["-gen-register-bank"], - "lib/Target/SPIRV/SPIRVGenRegisterInfo.inc": ["-gen-register-info"], - 
"lib/Target/SPIRV/SPIRVGenTables.inc": ["-gen-searchable-tables"], - "lib/Target/SPIRV/SPIRVGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-writer"], + "lib/Target/SPIRV/SPIRVGenAsmWriter.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/SPIRV/SPIRVGenMCCodeEmitter.inc", + ), + ( + ["-gen-global-isel"], + "lib/Target/SPIRV/SPIRVGenGlobalISel.inc", + ), + ( + [ + "-gen-global-isel-combiner", + "-combiners=SPIRVPreLegalizerCombiner", + ], + "lib/Target/SPIRV/SPIRVGenPreLegalizeGICombiner.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/SPIRV/SPIRVGenInstrInfo.inc", + ), + ( + ["-gen-register-bank"], + "lib/Target/SPIRV/SPIRVGenRegisterBank.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/SPIRV/SPIRVGenRegisterInfo.inc", + "lib/Target/SPIRV/SPIRVGenRegisterInfoEnums.inc", + "lib/Target/SPIRV/SPIRVGenRegisterInfoMCDesc.inc", + "lib/Target/SPIRV/SPIRVGenRegisterInfoHeader.inc", + "lib/Target/SPIRV/SPIRVGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-searchable-tables"], + "lib/Target/SPIRV/SPIRVGenTables.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/SPIRV/SPIRVGenSubtargetInfo.inc", + ), + ], }, { "name": "SystemZ", "short_name": "SystemZ", - "tbl_outs": { - "lib/Target/SystemZ/SystemZGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/SystemZ/SystemZGenGNUAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/SystemZ/SystemZGenHLASMAsmWriter.inc": [ - "-gen-asm-writer", - "-asmwriternum=1", - ], - "lib/Target/SystemZ/SystemZGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/SystemZ/SystemZGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/SystemZ/SystemZGenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/SystemZ/SystemZGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/SystemZ/SystemZGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/SystemZ/SystemZGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/SystemZ/SystemZGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-matcher"], + "lib/Target/SystemZ/SystemZGenAsmMatcher.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/SystemZ/SystemZGenGNUAsmWriter.inc", + ), + ( + [ + "-gen-asm-writer", + "-asmwriternum=1", + ], + "lib/Target/SystemZ/SystemZGenHLASMAsmWriter.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/SystemZ/SystemZGenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/SystemZ/SystemZGenDAGISel.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/SystemZ/SystemZGenDisassemblerTables.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/SystemZ/SystemZGenMCCodeEmitter.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/SystemZ/SystemZGenInstrInfo.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/SystemZ/SystemZGenRegisterInfo.inc", + "lib/Target/SystemZ/SystemZGenRegisterInfoEnums.inc", + "lib/Target/SystemZ/SystemZGenRegisterInfoMCDesc.inc", + "lib/Target/SystemZ/SystemZGenRegisterInfoHeader.inc", + "lib/Target/SystemZ/SystemZGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-subtarget"], + "lib/Target/SystemZ/SystemZGenSubtargetInfo.inc", + ), + ], }, { "name": "VE", "short_name": "VE", - "tbl_outs": { - "lib/Target/VE/VEGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/VE/VEGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/VE/VEGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/VE/VEGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/VE/VEGenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/VE/VEGenMCCodeEmitter.inc": ["-gen-emitter"], - 
"lib/Target/VE/VEGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/VE/VEGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/VE/VEGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-matcher"], + "lib/Target/VE/VEGenAsmMatcher.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/VE/VEGenAsmWriter.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/VE/VEGenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/VE/VEGenDAGISel.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/VE/VEGenDisassemblerTables.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/VE/VEGenMCCodeEmitter.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/VE/VEGenInstrInfo.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/VE/VEGenRegisterInfo.inc", + "lib/Target/VE/VEGenRegisterInfoEnums.inc", + "lib/Target/VE/VEGenRegisterInfoMCDesc.inc", + "lib/Target/VE/VEGenRegisterInfoHeader.inc", + "lib/Target/VE/VEGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-subtarget"], + "lib/Target/VE/VEGenSubtargetInfo.inc", + ), + ], }, { "name": "WebAssembly", "short_name": "WebAssembly", - "tbl_outs": { - "lib/Target/WebAssembly/WebAssemblyGenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/WebAssembly/WebAssemblyGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/WebAssembly/WebAssemblyGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/WebAssembly/WebAssemblyGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/WebAssembly/WebAssemblyGenFastISel.inc": ["-gen-fast-isel"], - "lib/Target/WebAssembly/WebAssemblyGenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/WebAssembly/WebAssemblyGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/WebAssembly/WebAssemblyGenSubtargetInfo.inc": ["-gen-subtarget"], - "lib/Target/WebAssembly/WebAssemblyGenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/WebAssembly/WebAssemblyGenSDNodeInfo.inc": ["-gen-sd-node-info"], - }, + "tbl_outs": [ + ( + ["-gen-disassembler"], + "lib/Target/WebAssembly/WebAssemblyGenDisassemblerTables.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/WebAssembly/WebAssemblyGenAsmWriter.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/WebAssembly/WebAssemblyGenInstrInfo.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/WebAssembly/WebAssemblyGenDAGISel.inc", + ), + ( + ["-gen-fast-isel"], + "lib/Target/WebAssembly/WebAssemblyGenFastISel.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/WebAssembly/WebAssemblyGenMCCodeEmitter.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/WebAssembly/WebAssemblyGenRegisterInfo.inc", + "lib/Target/WebAssembly/WebAssemblyGenRegisterInfoEnums.inc", + "lib/Target/WebAssembly/WebAssemblyGenRegisterInfoMCDesc.inc", + "lib/Target/WebAssembly/WebAssemblyGenRegisterInfoHeader.inc", + "lib/Target/WebAssembly/WebAssemblyGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-subtarget"], + "lib/Target/WebAssembly/WebAssemblyGenSubtargetInfo.inc", + ), + ( + ["-gen-asm-matcher"], + "lib/Target/WebAssembly/WebAssemblyGenAsmMatcher.inc", + ), + ( + ["-gen-sd-node-info"], + "lib/Target/WebAssembly/WebAssemblyGenSDNodeInfo.inc", + ), + ], }, { "name": "X86", "short_name": "X86", - "tbl_outs": { - "lib/Target/X86/X86GenRegisterBank.inc": ["-gen-register-bank"], - "lib/Target/X86/X86GenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/X86/X86GenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/X86/X86GenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/X86/X86GenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/X86/X86GenAsmWriter1.inc": [ - 
"-gen-asm-writer", - "-asmwriternum=1", - ], - "lib/Target/X86/X86GenAsmMatcher.inc": ["-gen-asm-matcher"], - "lib/Target/X86/X86GenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/X86/X86GenFastISel.inc": ["-gen-fast-isel"], - "lib/Target/X86/X86GenGlobalISel.inc": ["-gen-global-isel"], - "lib/Target/X86/X86GenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/X86/X86GenSubtargetInfo.inc": ["-gen-subtarget"], - "lib/Target/X86/X86GenFoldTables.inc": [ - "-gen-x86-fold-tables", - "-asmwriternum=1", - ], - "lib/Target/X86/X86GenInstrMapping.inc": ["-gen-x86-instr-mapping"], - "lib/Target/X86/X86GenExegesis.inc": ["-gen-exegesis"], - "lib/Target/X86/X86GenMnemonicTables.inc": [ - "-gen-x86-mnemonic-tables", - "-asmwriternum=1", - ], - }, + "tbl_outs": [ + ( + ["-gen-register-bank"], + "lib/Target/X86/X86GenRegisterBank.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/X86/X86GenRegisterInfo.inc", + "lib/Target/X86/X86GenRegisterInfoEnums.inc", + "lib/Target/X86/X86GenRegisterInfoMCDesc.inc", + "lib/Target/X86/X86GenRegisterInfoHeader.inc", + "lib/Target/X86/X86GenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-disassembler"], + "lib/Target/X86/X86GenDisassemblerTables.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/X86/X86GenInstrInfo.inc", + ), + ( + ["-gen-asm-writer"], + "lib/Target/X86/X86GenAsmWriter.inc", + ), + ( + [ + "-gen-asm-writer", + "-asmwriternum=1", + ], + "lib/Target/X86/X86GenAsmWriter1.inc", + ), + ( + ["-gen-asm-matcher"], + "lib/Target/X86/X86GenAsmMatcher.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/X86/X86GenDAGISel.inc", + ), + ( + ["-gen-fast-isel"], + "lib/Target/X86/X86GenFastISel.inc", + ), + ( + ["-gen-global-isel"], + "lib/Target/X86/X86GenGlobalISel.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/X86/X86GenCallingConv.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/X86/X86GenSubtargetInfo.inc", + ), + ( + [ + "-gen-x86-fold-tables", + "-asmwriternum=1", + ], + "lib/Target/X86/X86GenFoldTables.inc", + ), + ( + ["-gen-x86-instr-mapping"], + "lib/Target/X86/X86GenInstrMapping.inc", + ), + ( + ["-gen-exegesis"], + "lib/Target/X86/X86GenExegesis.inc", + ), + ( + [ + "-gen-x86-mnemonic-tables", + "-asmwriternum=1", + ], + "lib/Target/X86/X86GenMnemonicTables.inc", + ), + ], }, { "name": "XCore", "short_name": "XCore", - "tbl_outs": { - "lib/Target/XCore/XCoreGenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/XCore/XCoreGenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/XCore/XCoreGenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/XCore/XCoreGenDisassemblerTables.inc": ["-gen-disassembler"], - "lib/Target/XCore/XCoreGenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/XCore/XCoreGenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/XCore/XCoreGenSDNodeInfo.inc": ["-gen-sd-node-info"], - "lib/Target/XCore/XCoreGenSubtargetInfo.inc": ["-gen-subtarget"], - }, + "tbl_outs": [ + ( + ["-gen-asm-writer"], + "lib/Target/XCore/XCoreGenAsmWriter.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/XCore/XCoreGenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/XCore/XCoreGenDAGISel.inc", + ), + ( + ["-gen-disassembler"], + "lib/Target/XCore/XCoreGenDisassemblerTables.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/XCore/XCoreGenInstrInfo.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/XCore/XCoreGenRegisterInfo.inc", + "lib/Target/XCore/XCoreGenRegisterInfoEnums.inc", + "lib/Target/XCore/XCoreGenRegisterInfoMCDesc.inc", + "lib/Target/XCore/XCoreGenRegisterInfoHeader.inc", + 
"lib/Target/XCore/XCoreGenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-sd-node-info"], + "lib/Target/XCore/XCoreGenSDNodeInfo.inc", + ), + ( + ["-gen-subtarget"], + "lib/Target/XCore/XCoreGenSubtargetInfo.inc", + ), + ], }, ] if lib["name"] in llvm_targets] @@ -2639,16 +3440,46 @@ gentbl_cc_library( gentbl_cc_library( name = "r600_target_gen", strip_include_prefix = "lib/Target/AMDGPU", - tbl_outs = { - "lib/Target/AMDGPU/R600GenAsmWriter.inc": ["-gen-asm-writer"], - "lib/Target/AMDGPU/R600GenCallingConv.inc": ["-gen-callingconv"], - "lib/Target/AMDGPU/R600GenDAGISel.inc": ["-gen-dag-isel"], - "lib/Target/AMDGPU/R600GenDFAPacketizer.inc": ["-gen-dfa-packetizer"], - "lib/Target/AMDGPU/R600GenInstrInfo.inc": ["-gen-instr-info"], - "lib/Target/AMDGPU/R600GenMCCodeEmitter.inc": ["-gen-emitter"], - "lib/Target/AMDGPU/R600GenRegisterInfo.inc": ["-gen-register-info"], - "lib/Target/AMDGPU/R600GenSubtargetInfo.inc": ["-gen-subtarget"], - }, + tbl_outs = [ + ( + ["-gen-asm-writer"], + "lib/Target/AMDGPU/R600GenAsmWriter.inc", + ), + ( + ["-gen-callingconv"], + "lib/Target/AMDGPU/R600GenCallingConv.inc", + ), + ( + ["-gen-dag-isel"], + "lib/Target/AMDGPU/R600GenDAGISel.inc", + ), + ( + ["-gen-dfa-packetizer"], + "lib/Target/AMDGPU/R600GenDFAPacketizer.inc", + ), + ( + ["-gen-instr-info"], + "lib/Target/AMDGPU/R600GenInstrInfo.inc", + ), + ( + ["-gen-emitter"], + "lib/Target/AMDGPU/R600GenMCCodeEmitter.inc", + ), + ( + ["-gen-register-info"], + [ + "lib/Target/AMDGPU/R600GenRegisterInfo.inc", + "lib/Target/AMDGPU/R600GenRegisterInfoEnums.inc", + "lib/Target/AMDGPU/R600GenRegisterInfoMCDesc.inc", + "lib/Target/AMDGPU/R600GenRegisterInfoHeader.inc", + "lib/Target/AMDGPU/R600GenRegisterInfoTargetDesc.inc", + ], + ), + ( + ["-gen-subtarget"], + "lib/Target/AMDGPU/R600GenSubtargetInfo.inc", + ), + ], tblgen = ":llvm-tblgen", td_file = "lib/Target/AMDGPU/R600.td", deps = [ @@ -3381,7 +4212,10 @@ cc_library( gentbl_cc_library( name = "LibOptionsTableGen", strip_include_prefix = "lib/ToolDrivers/llvm-lib", - tbl_outs = {"lib/ToolDrivers/llvm-lib/Options.inc": ["-gen-opt-parser-defs"]}, + tbl_outs = [( + ["-gen-opt-parser-defs"], + "lib/ToolDrivers/llvm-lib/Options.inc", + )], tblgen = ":llvm-tblgen", td_file = "lib/ToolDrivers/llvm-lib/Options.td", deps = [":OptParserTdFiles"], diff --git a/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl b/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl index 35888aac37e17..d28a8854fa896 100644 --- a/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl +++ b/utils/bazel/llvm-project-overlay/mlir/tblgen.bzl @@ -153,7 +153,7 @@ def _gentbl_rule_impl(ctx): args.add("-o", ctx.outputs.out) ctx.actions.run( - outputs = [ctx.outputs.out], + outputs = [ctx.outputs.out] + ctx.outputs.additional_outputs, inputs = trans_srcs, executable = ctx.executable.tblgen, execution_requirements = {"supports-path-mapping": "1"}, @@ -195,6 +195,9 @@ gentbl_rule = rule( doc = "The output file for the TableGen invocation.", mandatory = True, ), + "additional_outputs": attr.output_list( + doc = "Extra output files from the TableGen invocation. The primary 'out' is used for the -o argument.", + ), "opts": attr.string_list( doc = "Additional command line options to add to the TableGen" + " invocation. For include arguments, prefer to use" + @@ -313,9 +316,12 @@ def gentbl_filegroup( name: The name of the generated filegroup rule for use in dependencies. tblgen: The binary used to produce the output. td_file: The primary table definitions file. 
-      tbl_outs: Either a dict {out: [opts]} or a list of tuples ([opts], out),
-        where each 'opts' is a list of options passed to tblgen, each option
-        being a string, and 'out' is the corresponding output file produced.
+      tbl_outs: Either a dict {out: [opts]}, a list of tuples ([opts], out),
+        or a list of tuples ([opts], [outs]). Each 'opts' is a list of options
+        passed to tblgen, each option being a string,
+        and 'out' is the corresponding output file produced. If 'outs' are used,
+        the first path in the list is passed to '-o' but tblgen is expected
+        to produce all listed outputs.
      td_srcs: See gentbl_rule.td_srcs
      includes: See gentbl_rule.includes
      deps: See gentbl_rule.deps
@@ -325,9 +331,14 @@ def gentbl_filegroup(
      **kwargs: Extra keyword arguments to pass to all generated rules.
    """
+    included_srcs = []
    if type(tbl_outs) == type({}):
        tbl_outs = [(v, k) for k, v in tbl_outs.items()]
-    for (opts, out) in tbl_outs:
+    for (opts, output_or_outputs) in tbl_outs:
+        outs = output_or_outputs if type(output_or_outputs) == type([]) else [output_or_outputs]
+        out = outs[0]
+        if not any([skip_opt in opts for skip_opt in skip_opts]):
+            included_srcs.extend(outs)
        first_opt = opts[0] if opts else ""
        rule_suffix = "_{}_{}".format(
            first_opt.replace("-", "_").replace("=", "_"),
@@ -343,6 +354,7 @@ def gentbl_filegroup(
            deps = deps,
            includes = includes,
            out = out,
+            additional_outputs = outs[1:],
            **kwargs
        )

@@ -364,7 +376,6 @@ def gentbl_filegroup(
            **kwargs
        )

-    included_srcs = [f for (opts, f) in tbl_outs if not any([skip_opt in opts for skip_opt in skip_opts])]
    native.filegroup(
        name = name,
        srcs = included_srcs,

From 6360bbbb6890c965016a98fab8ea76551f577c1f Mon Sep 17 00:00:00 2001
From: David Tenty
Date: Mon, 17 Nov 2025 13:46:37 -0500
Subject: [PATCH 30/67] [AIX] Raise soft memory limits to hard limits
 (#167928)

AIX's out-of-the-box memory soft limits are often insufficient to run
LLVM on reasonably sized inputs. Thus, we often encounter users who run
into spurious out-of-memory errors. This change raises the memory soft
limits to the hard limits at LLVM startup to prevent these types of
issues.
---
 llvm/lib/Support/InitLLVM.cpp | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/llvm/lib/Support/InitLLVM.cpp b/llvm/lib/Support/InitLLVM.cpp
index b8fbfd21c4f28..b90f4e0714458 100644
--- a/llvm/lib/Support/InitLLVM.cpp
+++ b/llvm/lib/Support/InitLLVM.cpp
@@ -32,6 +32,34 @@
 #endif
 #endif

+static void RaiseLimits() {
+#ifdef _AIX
+  // AIX has restrictive memory soft-limits out-of-box, so raise them if needed.
+  auto RaiseLimit = [](int resource) {
+    struct rlimit r;
+    getrlimit(resource, &r);
+
+    // Increase the soft limit to the hard limit, if necessary and
+    // possible.
+    if (r.rlim_cur != RLIM_INFINITY && r.rlim_cur != r.rlim_max) {
+      r.rlim_cur = r.rlim_max;
+      setrlimit(resource, &r);
+    }
+  };
+
+  // Address space size.
+  RaiseLimit(RLIMIT_AS);
+  // Heap size.
+  RaiseLimit(RLIMIT_DATA);
+  // Stack size.
+  RaiseLimit(RLIMIT_STACK);
+#ifdef RLIMIT_RSS
+  // Resident set size.
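+  // (Guarded by the #ifdef above because RLIMIT_RSS is not defined on
+  // every platform.)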
+ RaiseLimit(RLIMIT_RSS); +#endif +#endif +} + void CleanupStdHandles(void *Cookie) { llvm::raw_ostream *Outs = &llvm::outs(), *Errs = &llvm::errs(); Outs->flush(); @@ -67,6 +95,7 @@ InitLLVM::InitLLVM(int &Argc, const char **&Argv, StackPrinter.emplace(Argc, Argv); sys::PrintStackTraceOnErrorSignal(Argv[0]); install_out_of_memory_new_handler(); + RaiseLimits(); #ifdef __MVS__ From bb9df2e3bd7ec903f5040ec9e78bdc9e06561d67 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Mon, 17 Nov 2025 10:51:13 -0800 Subject: [PATCH 31/67] [lldb] Ensure FILE* access mode is correctly specified when creating a NativeFile. (#167764) If we open a `NativeFile` with a `FILE*`, the OpenOptions default to `eOpenOptionReadOnly`. This is an issue in python scripts if you try to write to one of the files like `print("Hi", file=lldb.debugger.GetOutputFileHandle())`. To address this, we need to specify the access mode whenever we create a `NativeFile` from a `FILE*`. I also added an assert on the `NativeFile` that validates the file is opened with the correct access mode and updated `NativeFile::Read` and `NativeFile::Write` to check the access mode. Before these changes: ``` $ lldb -b -O 'script lldb.debugger.GetOutputFileHandle().write("abc")' (lldb) script lldb.debugger.GetOutputFileHandle().write("abc") Traceback (most recent call last): File "", line 1, in io.UnsupportedOperation: not writable ``` After: ``` $ lldb -b -O 'script lldb.debugger.GetOutputFileHandle().write("abc")' (lldb) script lldb.debugger.GetOutputFileHandle().write("abc") abc3 ``` Fixes #122387 --- lldb/include/lldb/API/SBFile.h | 3 ++ lldb/include/lldb/Host/File.h | 5 ++- lldb/include/lldb/Host/StreamFile.h | 3 +- lldb/source/API/SBCommandReturnObject.cpp | 7 ++-- lldb/source/API/SBDebugger.cpp | 16 +++++---- lldb/source/API/SBFile.cpp | 17 ++++++++- lldb/source/API/SBInstruction.cpp | 5 +-- lldb/source/API/SBProcess.cpp | 4 ++- lldb/source/API/SBStream.cpp | 3 +- lldb/source/Core/Debugger.cpp | 9 +++-- lldb/source/Host/common/File.cpp | 42 +++++++++++++++++++---- lldb/source/Host/common/StreamFile.cpp | 3 +- lldb/unittests/Host/FileTest.cpp | 22 +++++++++++- 13 files changed, 113 insertions(+), 26 deletions(-) diff --git a/lldb/include/lldb/API/SBFile.h b/lldb/include/lldb/API/SBFile.h index ebdc5607b7942..8cf4fe1b405fa 100644 --- a/lldb/include/lldb/API/SBFile.h +++ b/lldb/include/lldb/API/SBFile.h @@ -27,7 +27,10 @@ class LLDB_API SBFile { SBFile(FileSP file_sp); #ifndef SWIG SBFile(const SBFile &rhs); + LLDB_DEPRECATED_FIXME("Use the constructor that specifies mode instead", + "SBFile(FILE*, const char*, bool)") SBFile(FILE *file, bool transfer_ownership); + SBFile(FILE *file, const char *mode, bool transfer_ownership); #endif SBFile(int fd, const char *mode, bool transfer_ownership); ~SBFile(); diff --git a/lldb/include/lldb/Host/File.h b/lldb/include/lldb/Host/File.h index 7402a2231735a..590c9fa523b29 100644 --- a/lldb/include/lldb/Host/File.h +++ b/lldb/include/lldb/Host/File.h @@ -66,6 +66,9 @@ class File : public IOObject { LLVM_MARK_AS_BITMASK_ENUM(/* largest_value= */ eOpenOptionInvalid) }; + static constexpr OpenOptions OpenOptionsModeMask = + eOpenOptionReadOnly | eOpenOptionWriteOnly | eOpenOptionReadWrite; + static mode_t ConvertOpenOptionsForPOSIXOpen(OpenOptions open_options); static llvm::Expected GetOptionsFromMode(llvm::StringRef mode); static bool DescriptorIsValid(int descriptor) { return descriptor >= 0; }; @@ -384,7 +387,7 @@ class NativeFile : public File { NativeFile(); - NativeFile(FILE *fh, bool 
transfer_ownership); + NativeFile(FILE *fh, OpenOptions options, bool transfer_ownership); NativeFile(int fd, OpenOptions options, bool transfer_ownership); diff --git a/lldb/include/lldb/Host/StreamFile.h b/lldb/include/lldb/Host/StreamFile.h index e37661a9938c0..8b01eeab6f586 100644 --- a/lldb/include/lldb/Host/StreamFile.h +++ b/lldb/include/lldb/Host/StreamFile.h @@ -81,7 +81,8 @@ class LockableStreamFile { LockableStreamFile(StreamFile &stream_file, Mutex &mutex) : m_file_sp(stream_file.GetFileSP()), m_mutex(mutex) {} LockableStreamFile(FILE *fh, bool transfer_ownership, Mutex &mutex) - : m_file_sp(std::make_shared(fh, transfer_ownership)), + : m_file_sp(std::make_shared(fh, File::eOpenOptionWriteOnly, + transfer_ownership)), m_mutex(mutex) {} LockableStreamFile(std::shared_ptr file_sp, Mutex &mutex) : m_file_sp(file_sp), m_mutex(mutex) {} diff --git a/lldb/source/API/SBCommandReturnObject.cpp b/lldb/source/API/SBCommandReturnObject.cpp index e78e213aa23af..da7e288e38d28 100644 --- a/lldb/source/API/SBCommandReturnObject.cpp +++ b/lldb/source/API/SBCommandReturnObject.cpp @@ -15,6 +15,7 @@ #include "lldb/API/SBValue.h" #include "lldb/API/SBValueList.h" #include "lldb/Core/StructuredDataImpl.h" +#include "lldb/Host/File.h" #include "lldb/Interpreter/CommandReturnObject.h" #include "lldb/Utility/ConstString.h" #include "lldb/Utility/Instrumentation.h" @@ -275,14 +276,16 @@ void SBCommandReturnObject::SetImmediateErrorFile(FILE *fh) { void SBCommandReturnObject::SetImmediateOutputFile(FILE *fh, bool transfer_ownership) { LLDB_INSTRUMENT_VA(this, fh, transfer_ownership); - FileSP file = std::make_shared(fh, transfer_ownership); + FileSP file = std::make_shared(fh, File::eOpenOptionWriteOnly, + transfer_ownership); ref().SetImmediateOutputFile(file); } void SBCommandReturnObject::SetImmediateErrorFile(FILE *fh, bool transfer_ownership) { LLDB_INSTRUMENT_VA(this, fh, transfer_ownership); - FileSP file = std::make_shared(fh, transfer_ownership); + FileSP file = std::make_shared(fh, File::eOpenOptionWriteOnly, + transfer_ownership); ref().SetImmediateErrorFile(file); } diff --git a/lldb/source/API/SBDebugger.cpp b/lldb/source/API/SBDebugger.cpp index 5c4c653d95a81..7a4bebfdf998e 100644 --- a/lldb/source/API/SBDebugger.cpp +++ b/lldb/source/API/SBDebugger.cpp @@ -327,8 +327,8 @@ void SBDebugger::SkipAppInitFiles(bool b) { void SBDebugger::SetInputFileHandle(FILE *fh, bool transfer_ownership) { LLDB_INSTRUMENT_VA(this, fh, transfer_ownership); if (m_opaque_sp) - m_opaque_sp->SetInputFile( - (FileSP)std::make_shared(fh, transfer_ownership)); + m_opaque_sp->SetInputFile((FileSP)std::make_shared( + fh, File::eOpenOptionReadOnly, transfer_ownership)); } SBError SBDebugger::SetInputString(const char *data) { @@ -385,7 +385,8 @@ SBError SBDebugger::SetOutputFile(FileSP file_sp) { void SBDebugger::SetOutputFileHandle(FILE *fh, bool transfer_ownership) { LLDB_INSTRUMENT_VA(this, fh, transfer_ownership); - SetOutputFile((FileSP)std::make_shared(fh, transfer_ownership)); + SetOutputFile((FileSP)std::make_shared( + fh, File::eOpenOptionWriteOnly, transfer_ownership)); } SBError SBDebugger::SetOutputFile(SBFile file) { @@ -405,7 +406,8 @@ SBError SBDebugger::SetOutputFile(SBFile file) { void SBDebugger::SetErrorFileHandle(FILE *fh, bool transfer_ownership) { LLDB_INSTRUMENT_VA(this, fh, transfer_ownership); - SetErrorFile((FileSP)std::make_shared(fh, transfer_ownership)); + SetErrorFile((FileSP)std::make_shared( + fh, File::eOpenOptionWriteOnly, transfer_ownership)); } SBError 
SBDebugger::SetErrorFile(FileSP file_sp) {
@@ -576,8 +578,10 @@ void SBDebugger::HandleProcessEvent(const SBProcess &process,
                                     FILE *err) {
   LLDB_INSTRUMENT_VA(this, process, event, out, err);

-  FileSP outfile = std::make_shared(out, false);
-  FileSP errfile = std::make_shared(err, false);
+  FileSP outfile =
+      std::make_shared(out, File::eOpenOptionWriteOnly, false);
+  FileSP errfile =
+      std::make_shared(err, File::eOpenOptionWriteOnly, false);

   return HandleProcessEvent(process, event, outfile, errfile);
 }
diff --git a/lldb/source/API/SBFile.cpp b/lldb/source/API/SBFile.cpp
index 2ae4b1481afbf..56909923d4b2d 100644
--- a/lldb/source/API/SBFile.cpp
+++ b/lldb/source/API/SBFile.cpp
@@ -39,7 +39,22 @@ SBFile::SBFile() { LLDB_INSTRUMENT_VA(this); }

 SBFile::SBFile(FILE *file, bool transfer_ownership) {
   LLDB_INSTRUMENT_VA(this, file, transfer_ownership);
-  m_opaque_sp = std::make_shared(file, transfer_ownership);
+  // For backwards compatibility, this defaulted to ReadOnly previously.
+  m_opaque_sp = std::make_shared(file, File::eOpenOptionReadOnly,
+                                 transfer_ownership);
+}
+
+SBFile::SBFile(FILE *file, const char *mode, bool transfer_ownership) {
+  LLDB_INSTRUMENT_VA(this, file, transfer_ownership);
+
+  auto options = File::GetOptionsFromMode(mode);
+  if (!options) {
+    llvm::consumeError(options.takeError());
+    return;
+  }
+
+  m_opaque_sp =
+      std::make_shared(file, options.get(), transfer_ownership);
 }

 SBFile::SBFile(int fd, const char *mode, bool transfer_owndership) {
diff --git a/lldb/source/API/SBInstruction.cpp b/lldb/source/API/SBInstruction.cpp
index 6755089af39a4..5921511f3b239 100644
--- a/lldb/source/API/SBInstruction.cpp
+++ b/lldb/source/API/SBInstruction.cpp
@@ -10,8 +10,8 @@
 #include "lldb/Utility/Instrumentation.h"

 #include "lldb/API/SBAddress.h"
-#include "lldb/API/SBFrame.h"
 #include "lldb/API/SBFile.h"
+#include "lldb/API/SBFrame.h"
 #include "lldb/API/SBStream.h"
 #include "lldb/API/SBTarget.h"

@@ -268,7 +268,8 @@ bool SBInstruction::GetDescription(lldb::SBStream &s) {
 void SBInstruction::Print(FILE *outp) {
   LLDB_INSTRUMENT_VA(this, outp);

-  FileSP out = std::make_shared(outp, /*take_ownership=*/false);
+  FileSP out = std::make_shared(outp, File::eOpenOptionWriteOnly,
+                                /*take_ownership=*/false);
   Print(out);
 }
diff --git a/lldb/source/API/SBProcess.cpp b/lldb/source/API/SBProcess.cpp
index d4be64b815369..14aa9432eed83 100644
--- a/lldb/source/API/SBProcess.cpp
+++ b/lldb/source/API/SBProcess.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//

 #include "lldb/API/SBProcess.h"
+#include "lldb/Host/File.h"
 #include "lldb/Utility/Instrumentation.h"

 #include

@@ -310,7 +311,8 @@ void SBProcess::ReportEventState(const SBEvent &event, SBFile out) const {

 void SBProcess::ReportEventState(const SBEvent &event, FILE *out) const {
   LLDB_INSTRUMENT_VA(this, event, out);

-  FileSP outfile = std::make_shared(out, false);
+  FileSP outfile =
+      std::make_shared(out, File::eOpenOptionWriteOnly, false);
   return ReportEventState(event, outfile);
 }
diff --git a/lldb/source/API/SBStream.cpp b/lldb/source/API/SBStream.cpp
index fc8f09a7bb9ae..2fc5fcfa8b0c4 100644
--- a/lldb/source/API/SBStream.cpp
+++ b/lldb/source/API/SBStream.cpp
@@ -116,7 +116,8 @@ void SBStream::RedirectToFile(const char *path, bool append) {
 void SBStream::RedirectToFileHandle(FILE *fh, bool transfer_fh_ownership) {
   LLDB_INSTRUMENT_VA(this, fh, transfer_fh_ownership);

-  FileSP file = std::make_unique(fh, transfer_fh_ownership);
+  FileSP file = std::make_unique(fh,
File::eOpenOptionReadWrite, + transfer_fh_ownership); return RedirectToFile(file); } diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp index b37d9d3ed85e3..02f38e9094ec5 100644 --- a/lldb/source/Core/Debugger.cpp +++ b/lldb/source/Core/Debugger.cpp @@ -965,7 +965,8 @@ llvm::StringRef Debugger::GetStaticBroadcasterClass() { Debugger::Debugger(lldb::LogOutputCallback log_callback, void *baton) : UserID(g_unique_id++), Properties(std::make_shared()), - m_input_file_sp(std::make_shared(stdin, NativeFile::Unowned)), + m_input_file_sp(std::make_shared( + stdin, File::eOpenOptionReadOnly, NativeFile::Unowned)), m_output_stream_sp(std::make_shared( stdout, NativeFile::Unowned, m_output_mutex)), m_error_stream_sp(std::make_shared( @@ -1172,7 +1173,8 @@ Status Debugger::SetInputString(const char *data) { return result; } - SetInputFile((FileSP)std::make_shared(commands_file, true)); + SetInputFile((FileSP)std::make_shared( + commands_file, File::eOpenOptionReadOnly, true)); return result; } @@ -1378,7 +1380,8 @@ void Debugger::AdoptTopIOHandlerFilesIfInvalid(FileSP &in, in = GetInputFileSP(); // If there is nothing, use stdin if (!in) - in = std::make_shared(stdin, NativeFile::Unowned); + in = std::make_shared(stdin, File::eOpenOptionReadOnly, + NativeFile::Unowned); } // If no STDOUT has been set, then set it appropriately if (!out || !out->GetUnlockedFile().IsValid()) { diff --git a/lldb/source/Host/common/File.cpp b/lldb/source/Host/common/File.cpp index 65b75bd647c5d..4fad93fca9ea3 100644 --- a/lldb/source/Host/common/File.cpp +++ b/lldb/source/Host/common/File.cpp @@ -249,8 +249,8 @@ uint32_t File::GetPermissions(Status &error) const { NativeFile::NativeFile() = default; -NativeFile::NativeFile(FILE *fh, bool transfer_ownership) - : m_stream(fh), m_own_stream(transfer_ownership) { +NativeFile::NativeFile(FILE *fh, OpenOptions options, bool transfer_ownership) + : m_stream(fh), m_options(options), m_own_stream(transfer_ownership) { #ifdef _WIN32 // In order to properly display non ASCII characters in Windows, we need to // use Windows APIs to print to the console. This is only required if the @@ -258,6 +258,26 @@ NativeFile::NativeFile(FILE *fh, bool transfer_ownership) int fd = _fileno(fh); is_windows_console = ::GetFileType((HANDLE)::_get_osfhandle(fd)) == FILE_TYPE_CHAR; +#else +#ifndef NDEBUG + int fd = fileno(fh); + if (fd != -1) { + int required_mode = ConvertOpenOptionsForPOSIXOpen(options) & O_ACCMODE; + int mode = fcntl(fd, F_GETFL); + if (mode != -1) { + mode &= O_ACCMODE; + // Check that the file is open with a valid subset of the requested file + // access mode, e.g. if we expected the file to be writable then ensure it + // was opened with O_WRONLY or O_RDWR. 
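+      // (Both values were masked with O_ACCMODE above, so only the access
+      // bits take part in the comparison.)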
+ assert( + (required_mode == O_RDWR && mode == O_RDWR) || + (required_mode == O_RDONLY && (mode == O_RDWR || mode == O_RDONLY) || + (required_mode == O_WRONLY && + (mode == O_RDWR || mode == O_WRONLY))) && + "invalid file access mode"); + } + } +#endif #endif } @@ -274,7 +294,8 @@ NativeFile::NativeFile(int fd, OpenOptions options, bool transfer_ownership) } bool NativeFile::IsValid() const { - std::scoped_lock lock(m_descriptor_mutex, m_stream_mutex); + std::scoped_lock lock(m_descriptor_mutex, + m_stream_mutex); return DescriptorIsValidUnlocked() || StreamIsValidUnlocked(); } @@ -343,7 +364,8 @@ FILE *NativeFile::GetStream() { } Status NativeFile::Close() { - std::scoped_lock lock(m_descriptor_mutex, m_stream_mutex); + std::scoped_lock lock(m_descriptor_mutex, + m_stream_mutex); Status error; @@ -548,6 +570,10 @@ Status NativeFile::Sync() { Status NativeFile::Read(void *buf, size_t &num_bytes) { Status error; + // Ensure the file is open for reading. + if ((m_options & File::OpenOptionsModeMask) == eOpenOptionWriteOnly) + return Status(std::make_error_code(std::errc::bad_file_descriptor)); + #if defined(MAX_READ_SIZE) if (num_bytes > MAX_READ_SIZE) { uint8_t *p = (uint8_t *)buf; @@ -612,6 +638,10 @@ Status NativeFile::Read(void *buf, size_t &num_bytes) { Status NativeFile::Write(const void *buf, size_t &num_bytes) { Status error; + // Ensure the file is open for writing. + if ((m_options & File::OpenOptionsModeMask) == File::eOpenOptionReadOnly) + return Status(std::make_error_code(std::errc::bad_file_descriptor)); + #if defined(MAX_WRITE_SIZE) if (num_bytes > MAX_WRITE_SIZE) { const uint8_t *p = (const uint8_t *)buf; @@ -776,8 +806,8 @@ Status NativeFile::Write(const void *buf, size_t &num_bytes, off_t &offset) { int fd = GetDescriptor(); if (fd != kInvalidDescriptor) { #ifndef _WIN32 - ssize_t bytes_written = - llvm::sys::RetryAfterSignal(-1, ::pwrite, m_descriptor, buf, num_bytes, offset); + ssize_t bytes_written = llvm::sys::RetryAfterSignal( + -1, ::pwrite, m_descriptor, buf, num_bytes, offset); if (bytes_written < 0) { num_bytes = 0; error = Status::FromErrno(); diff --git a/lldb/source/Host/common/StreamFile.cpp b/lldb/source/Host/common/StreamFile.cpp index 099980a0993c6..131412d81983b 100644 --- a/lldb/source/Host/common/StreamFile.cpp +++ b/lldb/source/Host/common/StreamFile.cpp @@ -27,7 +27,8 @@ StreamFile::StreamFile(int fd, bool transfer_ownership) : Stream() { } StreamFile::StreamFile(FILE *fh, bool transfer_ownership) : Stream() { - m_file_sp = std::make_shared(fh, transfer_ownership); + m_file_sp = std::make_shared(fh, File::eOpenOptionWriteOnly, + transfer_ownership); } StreamFile::StreamFile(const char *path, File::OpenOptions options, diff --git a/lldb/unittests/Host/FileTest.cpp b/lldb/unittests/Host/FileTest.cpp index d973d19430596..85697c49f6fce 100644 --- a/lldb/unittests/Host/FileTest.cpp +++ b/lldb/unittests/Host/FileTest.cpp @@ -8,6 +8,7 @@ #include "lldb/Host/File.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/FileUtilities.h" #include "llvm/Support/Path.h" @@ -35,7 +36,7 @@ TEST(File, GetWaitableHandleFileno) { FILE *stream = fdopen(fd, "r"); ASSERT_TRUE(stream); - NativeFile file(stream, true); + NativeFile file(stream, File::eOpenOptionReadWrite, true); #ifdef _WIN32 EXPECT_EQ(file.GetWaitableHandle(), (HANDLE)_get_osfhandle(fd)); #else @@ -67,3 +68,22 @@ TEST(File, GetStreamFromDescriptor) { EXPECT_EQ(file.GetWaitableHandle(), (file_t)fd); #endif } + +TEST(File, 
ReadOnlyModeNotWritable) {
+  const auto *Info = testing::UnitTest::GetInstance()->current_test_info();
+  llvm::SmallString<128> name;
+  int fd;
+  llvm::sys::fs::createTemporaryFile(llvm::Twine(Info->test_case_name()) + "-" +
+                                         Info->name(),
+                                     "test", fd, name);
+
+  llvm::FileRemover remover(name);
+  ASSERT_GE(fd, 0);
+
+  NativeFile file(fd, File::eOpenOptionReadOnly, true);
+  ASSERT_TRUE(file.IsValid());
+  llvm::StringLiteral buf = "Hello World";
+  size_t bytes_written = buf.size();
+  Status error = file.Write(buf.data(), bytes_written);
+  EXPECT_EQ(error.Fail(), true);
+}

From aba3269bc98733f56a4b694dddf816c32ff50341 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere
Date: Mon, 17 Nov 2025 10:55:27 -0800
Subject: [PATCH 32/67] [.gitignore] Ignore .claude and .gemini in
 subdirectories (#167029)

Currently `.claude/` and `.gemini/` are only ignored in the root of the
repo. Developers might conceivably run these tools in project
subdirectories, in which case these should be ignored as well.
---
 .gitignore | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index 860b8ea12abd4..a9d616286adf1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -54,9 +54,9 @@ autoconf/autom4te.cache
 /cmake-build*
 # Coding assistants' stuff
 /CLAUDE.md
-/.claude/
+.claude/
 /GEMINI.md
-/.gemini/
+.gemini/

 #==============================================================================#
 # Directories to ignore (do not add trailing '/'s, they skip symlinks).

From adeedad449d19087baa5ec4fbc246d1f6664b7d4 Mon Sep 17 00:00:00 2001
From: Keith Smiley
Date: Mon, 17 Nov 2025 10:56:00 -0800
Subject: [PATCH 33/67] [bazel] Port 900c517919794ff0ea83c6b15ffe03707a164800
 (#168423)

---
 utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index a94442af376e5..4932e674ba85c 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -2435,6 +2435,10 @@ llvm_target_lib_list = [lib for lib in [
                ["-gen-searchable-tables"],
                "lib/Target/AMDGPU/AMDGPUGenSearchableTables.inc",
            ),
+            (
+                ["-gen-sd-node-info"],
+                "lib/Target/AMDGPU/AMDGPUGenSDNodeInfo.inc",
+            ),
        ],
        "tbl_deps": [
            ":InstCombineTableGen",

From 18b5e2a7266bfe8f211be7ae1198e6bed4ab0c06 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere
Date: Mon, 17 Nov 2025 10:56:28 -0800
Subject: [PATCH 34/67] [lldb] Push down the SWIG module to avoid an import
 cycle (#166265)

This is a reland of #129135 (by dingxiangfei2009) with Vladislav's
(dzhidzhoev) fix on top.

Fixes #92603
---
 lldb/bindings/python/CMakeLists.txt | 8 ++++++--
 lldb/bindings/python/python.swig | 7 ++++++-
 .../ScriptInterpreter/Python/ScriptInterpreterPython.cpp | 1 +
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/lldb/bindings/python/CMakeLists.txt b/lldb/bindings/python/CMakeLists.txt
index 28a8af8f06319..2ebcf5a8e7aca 100644
--- a/lldb/bindings/python/CMakeLists.txt
+++ b/lldb/bindings/python/CMakeLists.txt
@@ -60,8 +60,10 @@ endfunction()

 function(finish_swig_python swig_target lldb_python_bindings_dir lldb_python_target_dir)
   # Add a Post-Build Event to copy over Python files and create the symlink to
   # liblldb.so for the Python API(hardlink on Windows).
+  # Note that Swig-generated code is located one level deeper in the `native`
+  # module, in order to avoid cyclic importing.
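+  # The import-time counterpart of this layout is the `from .native import`
+  # fallback in bindings/python/python.swig.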
add_custom_target(${swig_target} ALL VERBATIM - COMMAND ${CMAKE_COMMAND} -E make_directory ${lldb_python_target_dir} + COMMAND ${CMAKE_COMMAND} -E make_directory ${lldb_python_target_dir}/native/ DEPENDS ${lldb_python_bindings_dir}/lldb.py COMMENT "Python script sym-linking LLDB Python API") @@ -75,6 +77,8 @@ function(finish_swig_python swig_target lldb_python_bindings_dir lldb_python_tar "${LLDB_SOURCE_DIR}/source/Interpreter/embedded_interpreter.py" "${lldb_python_target_dir}") + create_python_package(${swig_target} ${lldb_python_target_dir} "native" FILES) + # Distribute the examples as python packages. create_python_package( ${swig_target} @@ -143,7 +147,7 @@ function(finish_swig_python swig_target lldb_python_bindings_dir lldb_python_tar endif() set(LIBLLDB_SYMLINK_OUTPUT_FILE "_lldb${LLDB_PYTHON_EXT_SUFFIX}") create_relative_symlink(${swig_target} ${LIBLLDB_SYMLINK_DEST} - ${lldb_python_target_dir} ${LIBLLDB_SYMLINK_OUTPUT_FILE}) + ${lldb_python_target_dir}/native/ ${LIBLLDB_SYMLINK_OUTPUT_FILE}) if (NOT WIN32) diff --git a/lldb/bindings/python/python.swig b/lldb/bindings/python/python.swig index b2823f98acac8..3d2caa65f1658 100644 --- a/lldb/bindings/python/python.swig +++ b/lldb/bindings/python/python.swig @@ -50,7 +50,12 @@ Older swig versions will simply ignore this setting. import $module except ImportError: # Relative import should work if we are being loaded by Python. - from . import $module" + # The cpython module built by swig is pushed one level down into + # the native submodule, because at this point the interpreter + # is still constructing the lldb module itself. + # Simply importing anything using `from . import` constitutes + # a cyclic importing. + from .native import $module" %enddef // The name of the module to be created. diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp index 3493fa9fef635..35a772c1454df 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp @@ -272,6 +272,7 @@ void ScriptInterpreterPython::SharedLibraryDirectoryHelper( // does. if (this_file.GetFileNameExtension() == ".pyd") { this_file.RemoveLastPathComponent(); // _lldb.pyd or _lldb_d.pyd + this_file.RemoveLastPathComponent(); // native this_file.RemoveLastPathComponent(); // lldb llvm::StringRef libdir = LLDB_PYTHON_RELATIVE_LIBDIR; for (auto it = llvm::sys::path::begin(libdir), From 7672a5cee12a299a083b93a6d304b27ab3f4707f Mon Sep 17 00:00:00 2001 From: Christopher Ferris Date: Mon, 17 Nov 2025 10:59:04 -0800 Subject: [PATCH 35/67] [scudo] Fix wrong return type. (#168157) --- compiler-rt/lib/scudo/standalone/primary64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h index 747b1a2233d32..c2401c86671d0 100644 --- a/compiler-rt/lib/scudo/standalone/primary64.h +++ b/compiler-rt/lib/scudo/standalone/primary64.h @@ -1394,7 +1394,7 @@ uptr SizeClassAllocator64::releaseToOSMaybe(RegionInfo *Region, Region->FreeListInfo.PushedBlocks) * BlockSize; if (UNLIKELY(BytesInFreeList == 0)) - return false; + return 0; // ==================================================================== // // 1. 
Check if we have enough free blocks and if it's worth doing a page From cd5d5b31bff0052be214357133ad3dd7f3f24a74 Mon Sep 17 00:00:00 2001 From: Dmitry Chigarev Date: Mon, 17 Nov 2025 20:00:03 +0100 Subject: [PATCH 36/67] [mlir][XeGPU] Use DistributeLayoutAttr instead of LayoutAttr for load gather/scatter ops (#167850) The PR changes the layout attribute type for `xegpu::LoadGatherOp/StoreScatterOp` from `LayoutAttr` to `DistributeLayoutAttr` to also support `xegpu.slice` layouts. Initially we [wanted to restrict slice layouts](https://github.com/llvm/llvm-project/pull/163414#discussion_r2478978798) from the attribute, but now it turns out there are actually valid use cases for that: ```mlir gpu.func @distribute_load_slice_attr() { %2 = memref.alloca() {alignment = 1024} : memref<4096xf32> %offset = arith.constant {layout_result_0 = #xegpu.layout } dense<0> : vector<256xindex> %mask = arith.constant {layout_result_0 = #xegpu.layout } dense<1> : vector<256xi1> %3 = xegpu.load %2[%offset], %mask <{chunk_size = 1, layout = #xegpu.slice<#xegpu.layout, dims = [0]>>} { layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]> } : memref<4096xf32>, vector<256xindex>, vector<256xi1> -> vector<256xf32> %4 = vector.broadcast %3 {layout_result_0 = #xegpu.layout} : vector<256xf32> to vector<256x256xf32> gpu.return } ``` Signed-off-by: dchigarev --- mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 8 ++++---- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 4 ++-- .../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 4 ++-- .../XeGPU/Transforms/XeGPUWgToSgDistribute.cpp | 12 +++++++----- .../Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir | 17 +++++++++++++++++ 5 files changed, 32 insertions(+), 13 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 689ebd0d1179a..4c67856b559b1 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -844,7 +844,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { OptionalAttr:$l1_hint, OptionalAttr:$l2_hint, OptionalAttr:$l3_hint, - OptionalAttr:$layout); + OptionalAttr:$layout); let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value); let extraClassDeclaration = extraBaseClassDeclaration # [{ @@ -903,7 +903,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { "xegpu::CachePolicyAttr": $l1_hint, "xegpu::CachePolicyAttr": $l2_hint, "xegpu::CachePolicyAttr": $l3_hint, - "xegpu::LayoutAttr": $layout)> + "xegpu::DistributeLayoutAttr": $layout)> ]; let hasVerifier = 1; @@ -988,7 +988,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { OptionalAttr:$l1_hint, OptionalAttr:$l2_hint, OptionalAttr:$l3_hint, - OptionalAttr:$layout); + OptionalAttr:$layout); let extraClassDeclaration = extraBaseClassDeclaration#[{ Type getDestType() { @@ -1046,7 +1046,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { "xegpu::CachePolicyAttr": $l1_hint, "xegpu::CachePolicyAttr": $l2_hint, "xegpu::CachePolicyAttr": $l3_hint, - "xegpu::LayoutAttr": $layout)> + "xegpu::DistributeLayoutAttr": $layout)> ]; let hasVerifier = 1; diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 4dd10bedc6d84..85c9a966f0fe8 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -901,7 +901,7 @@ void LoadGatherOp::build(OpBuilder &builder, OperationState &state, IntegerAttr chunk_size, 
xegpu::CachePolicyAttr l1_hint, xegpu::CachePolicyAttr l2_hint, xegpu::CachePolicyAttr l3_hint, - xegpu::LayoutAttr layout) { + DistributeLayoutAttr layout) { auto loc = source.getLoc(); int64_t size = static_cast(offsets.size()); auto type = VectorType::get(size, builder.getIndexType()); @@ -985,7 +985,7 @@ void StoreScatterOp::build( OpBuilder &builder, OperationState &state, Value value, Value dest, ArrayRef offsets, Value mask, IntegerAttr chunk_size, xegpu::CachePolicyAttr l1_hint, xegpu::CachePolicyAttr l2_hint, - xegpu::CachePolicyAttr l3_hint, xegpu::LayoutAttr layout) { + xegpu::CachePolicyAttr l3_hint, DistributeLayoutAttr layout) { auto loc = dest.getLoc(); int64_t size = static_cast(offsets.size()); auto type = VectorType::get(size, builder.getIndexType()); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp index c3bf9606693a8..330553564f81a 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp @@ -678,7 +678,7 @@ struct UnrollLoadGatherOpWithOffset pack(offsets, convertedOffsetTypes, *targetShape, loc, rewriter); } - auto layout = dyn_cast_if_present(op.getLayoutAttr()); + auto layout = op.getLayoutAttr(); if (layout) layout = layout.dropInstData(); @@ -778,7 +778,7 @@ struct UnrollStoreScatterOpWithOffsets SmallVector convertedValues = pack(op.getValue(), convertedValTypes, *targetShape, loc, rewriter); - auto layout = dyn_cast_if_present(op.getLayoutAttr()); + auto layout = op.getLayoutAttr(); if (layout) layout = layout.dropInstData(); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index 0a9ef0aa6df96..33d4b0457e5d3 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -889,8 +889,8 @@ struct WgToSgLoadGatherOpWithOffset return failure(); ArrayRef wgShape = resultType.getShape(); - xegpu::LayoutAttr layout = dyn_cast_if_present( - xegpu::getDistributeLayoutAttr(op.getResult())); + xegpu::DistributeLayoutAttr layout = + xegpu::getDistributeLayoutAttr(op.getResult()); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -913,10 +913,12 @@ struct WgToSgLoadGatherOpWithOffset VectorType newTy = VectorType::get(sgShape, resultType.getElementType()); for (auto [offsets, mask] : llvm::zip(adaptor.getOffsets(), adaptor.getMask())) { + auto newLayout = layout.dropSgLayoutAndData(); auto newLoadOp = xegpu::LoadGatherOp::create( rewriter, loc, newTy, op.getSource(), offsets, mask, chunkSizeAttr, op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr(), - layout.dropSgLayoutAndData()); + newLayout); + xegpu::setDistributeLayoutAttr(newLoadOp->getResult(0), newLayout); newLoadOps.push_back(newLoadOp); } rewriter.replaceOpWithMultiple(op, {newLoadOps}); @@ -941,8 +943,8 @@ struct WgToSgStoreScatterOpWithOffset if (!valueType) return failure(); - xegpu::LayoutAttr layout = dyn_cast_if_present( - xegpu::getDistributeLayoutAttr(op.getOperand(0))); + xegpu::DistributeLayoutAttr layout = + xegpu::getDistributeLayoutAttr(op.getOperand(0)); if (!layout || !layout.isForWorkgroup()) return failure(); diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir index 4fbb566cfbe73..5dde84e8e0bc2 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir @@ 
-547,4 +547,21 @@ gpu.module @test_distribution { %broadcast = vector.broadcast %arg0 {layout_result_0 = #xegpu.layout} : index to vector<4x1x1xindex> gpu.return } + + // CHECK-LABEL: distribute_load_slice_attr + gpu.func @distribute_load_slice_attr() { + %2 = memref.alloca() {alignment = 1024} : memref<4096xf32> + %offset = arith.constant {layout_result_0 = #xegpu.layout } dense<0> : vector<256xindex> + %mask = arith.constant {layout_result_0 = #xegpu.layout } dense<1> : vector<256xi1> + + // CHECK: %[[LOAD:.*]] = xegpu.load {{.*}} <{chunk_size = 1 : i64, layout = #xegpu.slice<#xegpu.layout, dims = [0]>}> + // CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} : + // CHECK-SAME: memref<4096xf32>, vector<32xindex>, vector<32xi1> -> vector<32xf32> + %3 = xegpu.load %2[%offset], %mask {chunk_size = 1, layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]> } : memref<4096xf32>, vector<256xindex>, vector<256xi1> -> vector<256xf32> + + // CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[LOAD]] {layout_result_0 = #xegpu.layout} : vector<32xf32> to vector<32x32xf32> + %4 = vector.broadcast %3 {layout_result_0 = + #xegpu.layout} : vector<256xf32> to vector<256x256xf32> + gpu.return + } } From bafb3f67880d716fcc0ad14f10d8a98699591cd5 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 17 Nov 2025 19:01:25 +0000 Subject: [PATCH 37/67] [LV] Add test with existing noalias metadata and runtime checks. Add test where we have loads with existing noalias metadata and noalias metadata gets added by loop versioning. --- .../test/Transforms/LoopVectorize/metadata.ll | 148 ++++++++++++++++++ 1 file changed, 148 insertions(+) diff --git a/llvm/test/Transforms/LoopVectorize/metadata.ll b/llvm/test/Transforms/LoopVectorize/metadata.ll index fe25d1b231efc..ed027e8b9a895 100644 --- a/llvm/test/Transforms/LoopVectorize/metadata.ll +++ b/llvm/test/Transforms/LoopVectorize/metadata.ll @@ -497,6 +497,129 @@ exit: ret void } +define void @noalias_metadata(ptr align 8 %dst, ptr align 8 %src) { +; CHECK-LABEL: define void @noalias_metadata( +; CHECK-SAME: ptr align 8 [[DST:%.*]], ptr align 8 [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[SRC4:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[DST3:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[DST3]], [[SRC4]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP4]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK: [[VECTOR_MEMCHECK]]: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 8 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[DST1]], 8 +; CHECK-NEXT: [[TMP22:%.*]] = sub i64 [[TMP5]], [[SRC2]] +; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP22]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP5]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[N_VEC]], 8 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr 
[[SRC]], i64 [[TMP23]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP26:%.*]] = mul i64 [[INDEX]], 8 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP26]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x ptr>, ptr [[NEXT_GEP]], align 8, !alias.scope [[META14:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x ptr> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: store ptr [[TMP7]], ptr [[DST]], align 8, !alias.scope [[META17:![0-9]+]], !noalias [[META19:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP24]], %[[MIDDLE_BLOCK]] ], [ [[SRC]], %[[ENTRY]] ], [ [[SRC]], %[[VECTOR_MEMCHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[PTR:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[VAL:%.*]] = load ptr, ptr [[PTR]], align 8 +; CHECK-NEXT: store ptr [[VAL]], ptr [[DST]], align 8, !noalias [[META23:![0-9]+]] +; CHECK-NEXT: [[PTR_NEXT]] = getelementptr inbounds i8, ptr [[PTR]], i64 8 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[PTR]], [[DST]] +; CHECK-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +; INTERLEAVE-LABEL: define void @noalias_metadata( +; INTERLEAVE-SAME: ptr align 8 [[DST:%.*]], ptr align 8 [[SRC:%.*]]) { +; INTERLEAVE-NEXT: [[ENTRY:.*]]: +; INTERLEAVE-NEXT: [[SRC4:%.*]] = ptrtoint ptr [[SRC]] to i64 +; INTERLEAVE-NEXT: [[DST3:%.*]] = ptrtoint ptr [[DST]] to i64 +; INTERLEAVE-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64 +; INTERLEAVE-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 +; INTERLEAVE-NEXT: [[TMP2:%.*]] = sub i64 [[DST3]], [[SRC4]] +; INTERLEAVE-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 3 +; INTERLEAVE-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1 +; INTERLEAVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP4]], 4 +; INTERLEAVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; INTERLEAVE: [[VECTOR_MEMCHECK]]: +; INTERLEAVE-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 8 +; INTERLEAVE-NEXT: [[TMP5:%.*]] = add i64 [[DST1]], 8 +; INTERLEAVE-NEXT: [[TMP22:%.*]] = sub i64 [[TMP5]], [[SRC2]] +; INTERLEAVE-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP22]] +; INTERLEAVE-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP5]] +; INTERLEAVE-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] +; INTERLEAVE-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; INTERLEAVE-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; INTERLEAVE: [[VECTOR_PH]]: +; INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 4 +; INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[N_MOD_VF]] +; INTERLEAVE-NEXT: [[TMP23:%.*]] = mul i64 [[N_VEC]], 8 +; INTERLEAVE-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP23]] +; INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]] +; INTERLEAVE: [[VECTOR_BODY]]: +; 
INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[TMP26:%.*]] = mul i64 [[INDEX]], 8 +; INTERLEAVE-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP26]] +; INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr ptr, ptr [[NEXT_GEP]], i32 2 +; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <2 x ptr>, ptr [[TMP7]], align 8, !alias.scope [[META14:![0-9]+]] +; INTERLEAVE-NEXT: [[TMP8:%.*]] = extractelement <2 x ptr> [[WIDE_LOAD]], i32 1 +; INTERLEAVE-NEXT: store ptr [[TMP8]], ptr [[DST]], align 8, !alias.scope [[META17:![0-9]+]], !noalias [[META19:![0-9]+]] +; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; INTERLEAVE-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; INTERLEAVE-NEXT: br i1 [[TMP28]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; INTERLEAVE: [[MIDDLE_BLOCK]]: +; INTERLEAVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]] +; INTERLEAVE-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; INTERLEAVE: [[SCALAR_PH]]: +; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP24]], %[[MIDDLE_BLOCK]] ], [ [[SRC]], %[[ENTRY]] ], [ [[SRC]], %[[VECTOR_MEMCHECK]] ] +; INTERLEAVE-NEXT: br label %[[LOOP:.*]] +; INTERLEAVE: [[LOOP]]: +; INTERLEAVE-NEXT: [[PTR:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; INTERLEAVE-NEXT: [[VAL:%.*]] = load ptr, ptr [[PTR]], align 8 +; INTERLEAVE-NEXT: store ptr [[VAL]], ptr [[DST]], align 8, !noalias [[META23:![0-9]+]] +; INTERLEAVE-NEXT: [[PTR_NEXT]] = getelementptr inbounds i8, ptr [[PTR]], i64 8 +; INTERLEAVE-NEXT: [[CMP:%.*]] = icmp eq ptr [[PTR]], [[DST]] +; INTERLEAVE-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP24:![0-9]+]] +; INTERLEAVE: [[EXIT]]: +; INTERLEAVE-NEXT: ret void +; +entry: + br label %loop + +loop: + %ptr = phi ptr [ %ptr.next, %loop ], [ %src, %entry ] + %val = load ptr, ptr %ptr, align 8 + store ptr %val, ptr %dst, align 8, !noalias !4 + %ptr.next = getelementptr inbounds i8, ptr %ptr, i64 8 + %cmp = icmp eq ptr %ptr, %dst + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + declare i64 @foo(i64) declare double @bar(double) @@ -510,6 +633,9 @@ attributes #1 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_N2v_bar(bar_ !1 = !{ i64 0, i64 2 } !2 = !{!"Simple C/C++ TBAA"} !3 = !{!"omnipotent char", !2, i64 0} +!4 = !{!5} +!5 = distinct !{!5, !6, !"g1"} +!6 = distinct !{!6, !"t2"} ;. ; CHECK: [[CHAR_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0, i64 0} @@ -526,6 +652,17 @@ attributes #1 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_N2v_bar(bar_ ; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META5]], [[META6]]} ; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META5]], [[META6]]} ; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META6]], [[META5]]} +; CHECK: [[META14]] = !{[[META15:![0-9]+]]} +; CHECK: [[META15]] = distinct !{[[META15]], [[META16:![0-9]+]]} +; CHECK: [[META16]] = distinct !{[[META16]], !"LVerDomain"} +; CHECK: [[META17]] = !{[[META18:![0-9]+]]} +; CHECK: [[META18]] = distinct !{[[META18]], [[META16]]} +; CHECK: [[META19]] = !{[[META20:![0-9]+]], [[META15]]} +; CHECK: [[META20]] = distinct !{[[META20]], [[META21:![0-9]+]], !"g1"} +; CHECK: [[META21]] = distinct !{[[META21]], !"t2"} +; CHECK: [[LOOP22]] = distinct !{[[LOOP22]], [[META5]], [[META6]]} +; CHECK: [[META23]] = !{[[META20]]} +; CHECK: [[LOOP24]] = distinct !{[[LOOP24]], [[META5]]} ;. 
; INTERLEAVE: [[CHAR_TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0, i64 0} ; INTERLEAVE: [[META1]] = !{!"omnipotent char", [[META2]]} @@ -541,4 +678,15 @@ attributes #1 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_N2v_bar(bar_ ; INTERLEAVE: [[LOOP11]] = distinct !{[[LOOP11]], [[META5]], [[META6]]} ; INTERLEAVE: [[LOOP12]] = distinct !{[[LOOP12]], [[META5]], [[META6]]} ; INTERLEAVE: [[LOOP13]] = distinct !{[[LOOP13]], [[META6]], [[META5]]} +; INTERLEAVE: [[META14]] = !{[[META15:![0-9]+]]} +; INTERLEAVE: [[META15]] = distinct !{[[META15]], [[META16:![0-9]+]]} +; INTERLEAVE: [[META16]] = distinct !{[[META16]], !"LVerDomain"} +; INTERLEAVE: [[META17]] = !{[[META18:![0-9]+]]} +; INTERLEAVE: [[META18]] = distinct !{[[META18]], [[META16]]} +; INTERLEAVE: [[META19]] = !{[[META20:![0-9]+]], [[META15]]} +; INTERLEAVE: [[META20]] = distinct !{[[META20]], [[META21:![0-9]+]], !"g1"} +; INTERLEAVE: [[META21]] = distinct !{[[META21]], !"t2"} +; INTERLEAVE: [[LOOP22]] = distinct !{[[LOOP22]], [[META5]], [[META6]]} +; INTERLEAVE: [[META23]] = !{[[META20]]} +; INTERLEAVE: [[LOOP24]] = distinct !{[[LOOP24]], [[META5]]} ;. From af6af8e4eb778acc1b655574cdf2a4086a9fdcce Mon Sep 17 00:00:00 2001 From: Keith Smiley Date: Mon, 17 Nov 2025 11:08:22 -0800 Subject: [PATCH 38/67] [bazel] Port 0a58e49c44ae7cca39b3eb219efed9f0581b8b0f (#168424) --- utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index 4932e674ba85c..d582f448c2213 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -3225,6 +3225,10 @@ llvm_target_lib_list = [lib for lib in [ ["-gen-subtarget"], "lib/Target/VE/VEGenSubtargetInfo.inc", ), + ( + ["-gen-sd-node-info"], + "lib/Target/VE/VEGenSDNodeInfo.inc", + ), ], }, { From c555522818ff3acaa928f4147546ecec81e579eb Mon Sep 17 00:00:00 2001 From: John Harrison Date: Mon, 17 Nov 2025 11:08:49 -0800 Subject: [PATCH 39/67] [lldb-dap] Migrating 'evaluate' to structured types. (#167720) Adding structured types for the evaluate request handler. This should be mostly a non-functional change. I did catch some spelling mistakes in our tests ('variable' vs 'variables'). 
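For reference, the general shape of the structured-types pattern used here is a plain
arguments struct plus a fromJSON overload built on llvm::json::ObjectMapper, so a
malformed request fails during parsing with a path-qualified error instead of deep
inside the handler. A minimal sketch, with simplified stand-in names rather than the
exact types this patch adds:

    #include "llvm/Support/JSON.h"
    #include <string>

    // Hypothetical, trimmed-down arguments struct; the real
    // protocol::EvaluateArguments below carries more fields.
    struct ExampleArgs {
      std::string expression; // required
      uint64_t frameId = 0;   // optional, defaulted
    };

    // Deserialization hook found by llvm::json via ADL. Returning false
    // surfaces an error such as "missing value at (root).expression",
    // which the new unit test checks for.
    bool fromJSON(const llvm::json::Value &Params, ExampleArgs &Args,
                  llvm::json::Path P) {
      llvm::json::ObjectMapper O(Params, P);
      return O && O.map("expression", Args.expression) &&
             O.mapOptional("frameId", Args.frameId);
    }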
--- .../test/tools/lldb-dap/dap_server.py | 3 +- .../lldb-dap/evaluate/TestDAP_evaluate.py | 194 ++++++++++--- .../Handler/EvaluateRequestHandler.cpp | 265 +++++------------- lldb/tools/lldb-dap/Handler/RequestHandler.h | 9 +- lldb/tools/lldb-dap/JSONUtils.cpp | 5 +- lldb/tools/lldb-dap/JSONUtils.h | 6 +- lldb/tools/lldb-dap/LLDBUtils.cpp | 11 +- lldb/tools/lldb-dap/LLDBUtils.h | 2 +- .../lldb-dap/Protocol/ProtocolRequests.cpp | 49 ++++ .../lldb-dap/Protocol/ProtocolRequests.h | 117 ++++++++ lldb/tools/lldb-dap/Protocol/ProtocolTypes.h | 3 +- lldb/unittests/DAP/ProtocolRequestsTest.cpp | 51 ++++ 12 files changed, 461 insertions(+), 254 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py index ac550962cfb85..a4ca090021f3f 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py @@ -978,9 +978,10 @@ def request_evaluate(self, expression, frameIndex=0, threadId=None, context=None return [] args_dict = { "expression": expression, - "context": context, "frameId": stackFrame["id"], } + if context: + args_dict["context"] = context command_dict = { "command": "evaluate", "type": "request", diff --git a/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py b/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py index 20a75f4076e42..3c233a5b43ebb 100644 --- a/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py +++ b/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py @@ -1,5 +1,5 @@ """ -Test lldb-dap completions request +Test lldb-dap evaluate request """ import re @@ -7,16 +7,67 @@ import lldbdap_testcase from lldbsuite.test.decorators import skipIfWindows from lldbsuite.test.lldbtest import line_number +from typing import TypedDict, Optional + + +class EvaluateResponseBody(TypedDict, total=False): + result: str + variablesReference: int + type: Optional[str] + memoryReference: Optional[str] + valueLocationReference: Optional[int] class TestDAP_evaluate(lldbdap_testcase.DAPTestCaseBase): - def assertEvaluate(self, expression, regex): + def assertEvaluate( + self, + expression, + result: str, + want_type="", + want_varref=False, + want_memref=True, + want_locref=False, + ): + resp = self.dap_server.request_evaluate(expression, context=self.context) + self.assertTrue( + resp["success"], f"Failed to evaluate expression {expression!r}" + ) + body: EvaluateResponseBody = resp["body"] self.assertRegex( - self.dap_server.request_evaluate(expression, context=self.context)["body"][ - "result" - ], - regex, + body["result"], + result, + f"Unexpected 'result' for expression {expression!r} in response body {body}", ) + if want_varref: + self.assertNotEqual( + body["variablesReference"], + 0, + f"Unexpected 'variablesReference' for expression {expression!r} in response body {body}", + ) + else: + self.assertEqual( + body["variablesReference"], + 0, + f"Unexpected 'variablesReference' for expression {expression!r} in response body {body}", + ) + if want_type: + self.assertEqual( + body["type"], + want_type, + f"Unexpected 'type' for expression {expression!r} in response body {body}", + ) + if want_memref: + self.assertIn( + "memoryReference", + body, + f"Unexpected 'memoryReference' for expression {expression!r} in response body {body}", + ) + if want_locref: + self.assertIn( + "valueLocationReference", + body, + f"Unexpected 'valueLocationReference' for expression {expression!r} in response body 
{body}", + ) def assertEvaluateFailure(self, expression): self.assertNotIn( @@ -71,29 +122,39 @@ def run_test_evaluate_expressions( self.continue_to_breakpoint(breakpoint_1) # Expressions at breakpoint 1, which is in main - self.assertEvaluate("var1", "20") + self.assertEvaluate("var1", "20", want_type="int") # Empty expression should equate to the previous expression. if context == "repl": self.assertEvaluate("", "20") else: self.assertEvaluateFailure("") - self.assertEvaluate("var2", "21") + self.assertEvaluate("var2", "21", want_type="int") if context == "repl": - self.assertEvaluate("", "21") - self.assertEvaluate("", "21") - self.assertEvaluate("static_int", "42") - self.assertEvaluate("non_static_int", "43") - self.assertEvaluate("struct1.foo", "15") - self.assertEvaluate("struct2->foo", "16") + self.assertEvaluate("", "21", want_type="int") + self.assertEvaluate("", "21", want_type="int") + self.assertEvaluate("static_int", "42", want_type="int") + self.assertEvaluate("non_static_int", "43", want_type="int") + self.assertEvaluate("struct1.foo", "15", want_type="int") + self.assertEvaluate("struct2->foo", "16", want_type="int") if self.isResultExpandedDescription(): self.assertEvaluate( "struct1", r"\(my_struct\) (struct1|\$\d+) = \(foo = 15\)", + want_type="my_struct", + want_varref=True, + ) + self.assertEvaluate( + "struct2", + r"\(my_struct \*\) (struct2|\$\d+) = 0x.*", + want_type="my_struct *", + want_varref=True, ) - self.assertEvaluate("struct2", r"\(my_struct \*\) (struct2|\$\d+) = 0x.*") self.assertEvaluate( - "struct3", r"\(my_struct \*\) (struct3|\$\d+) = nullptr" + "struct3", + r"\(my_struct \*\) (struct3|\$\d+) = nullptr", + want_type="my_struct *", + want_varref=True, ) else: self.assertEvaluate( @@ -103,16 +164,22 @@ def run_test_evaluate_expressions( if enableAutoVariableSummaries else "my_struct @ 0x" ), + want_varref=True, + ) + self.assertEvaluate( + "struct2", + "0x.* {foo:16}" if enableAutoVariableSummaries else "0x.*", + want_varref=True, + want_type="my_struct *", ) self.assertEvaluate( - "struct2", "0x.* {foo:16}" if enableAutoVariableSummaries else "0x.*" + "struct3", "0x.*0", want_varref=True, want_type="my_struct *" ) - self.assertEvaluate("struct3", "0x.*0") if context == "repl": # In the repl context expressions may be interpreted as lldb # commands since no variables have the same name as the command. 
- self.assertEvaluate("list", r".*") + self.assertEvaluate("list", r".*", want_memref=False) else: self.assertEvaluateFailure("list") # local variable of a_function @@ -121,10 +188,26 @@ def run_test_evaluate_expressions( self.assertEvaluateFailure("foo") # member of my_struct if self.isExpressionParsedExpected(): - self.assertEvaluate("a_function", "0x.*a.out`a_function.*") - self.assertEvaluate("a_function(1)", "1") - self.assertEvaluate("var2 + struct1.foo", "36") - self.assertEvaluate("foo_func", "0x.*a.out`foo_func.*") + self.assertEvaluate( + "a_function", + "0x.*a.out`a_function.*", + want_type="int (*)(int)", + want_varref=True, + want_memref=False, + want_locref=True, + ) + self.assertEvaluate( + "a_function(1)", "1", want_memref=False, want_type="int" + ) + self.assertEvaluate("var2 + struct1.foo", "36", want_memref=False) + self.assertEvaluate( + "foo_func", + "0x.*a.out`foo_func.*", + want_type="int (*)()", + want_varref=True, + want_memref=False, + want_locref=True, + ) self.assertEvaluate("foo_var", "44") else: self.assertEvaluateFailure("a_function") @@ -145,6 +228,8 @@ def run_test_evaluate_expressions( self.assertEvaluate( "struct1", r"\(my_struct\) (struct1|\$\d+) = \(foo = 15\)", + want_type="my_struct", + want_varref=True, ) else: self.assertEvaluate( @@ -154,15 +239,26 @@ def run_test_evaluate_expressions( if enableAutoVariableSummaries else "my_struct @ 0x" ), + want_type="my_struct", + want_varref=True, ) self.assertEvaluate("struct1.foo", "15") self.assertEvaluate("struct2->foo", "16") if self.isExpressionParsedExpected(): - self.assertEvaluate("a_function", "0x.*a.out`a_function.*") - self.assertEvaluate("a_function(1)", "1") - self.assertEvaluate("var2 + struct1.foo", "17") - self.assertEvaluate("foo_func", "0x.*a.out`foo_func.*") + self.assertEvaluate( + "a_function", + "0x.*a.out`a_function.*", + want_type="int (*)(int)", + want_varref=True, + want_memref=False, + want_locref=True, + ) + self.assertEvaluate("a_function(1)", "1", want_memref=False) + self.assertEvaluate("var2 + struct1.foo", "17", want_memref=False) + self.assertEvaluate( + "foo_func", "0x.*a.out`foo_func.*", want_varref=True, want_memref=False + ) self.assertEvaluate("foo_var", "44") else: self.assertEvaluateFailure("a_function") @@ -185,10 +281,18 @@ def run_test_evaluate_expressions( self.assertEvaluateFailure("var2 + struct1.foo") if self.isExpressionParsedExpected(): - self.assertEvaluate("a_function", "0x.*a.out`a_function.*") - self.assertEvaluate("a_function(1)", "1") - self.assertEvaluate("list + 1", "43") - self.assertEvaluate("foo_func", "0x.*a.out`foo_func.*") + self.assertEvaluate( + "a_function", + "0x.*a.out`a_function.*", + want_varref=True, + want_memref=False, + want_locref=True, + ) + self.assertEvaluate("a_function(1)", "1", want_memref=False) + self.assertEvaluate("list + 1", "43", want_memref=False) + self.assertEvaluate( + "foo_func", "0x.*a.out`foo_func.*", want_varref=True, want_memref=False + ) self.assertEvaluate("foo_var", "44") else: self.assertEvaluateFailure("a_function") @@ -199,26 +303,28 @@ def run_test_evaluate_expressions( # Now we check that values are updated after stepping self.continue_to_breakpoint(breakpoint_4) - self.assertEvaluate("my_vec", "size=2") + self.assertEvaluate("my_vec", "size=2", want_varref=True) self.continue_to_breakpoint(breakpoint_5) - self.assertEvaluate("my_vec", "size=3") + self.assertEvaluate("my_vec", "size=3", want_varref=True) - self.assertEvaluate("my_map", "size=2") + self.assertEvaluate("my_map", "size=2", want_varref=True) 
self.continue_to_breakpoint(breakpoint_6) - self.assertEvaluate("my_map", "size=3") + self.assertEvaluate("my_map", "size=3", want_varref=True) - self.assertEvaluate("my_bool_vec", "size=1") + self.assertEvaluate("my_bool_vec", "size=1", want_varref=True) self.continue_to_breakpoint(breakpoint_7) - self.assertEvaluate("my_bool_vec", "size=2") + self.assertEvaluate("my_bool_vec", "size=2", want_varref=True) self.continue_to_breakpoint(breakpoint_8) # Test memory read, especially with 'empty' repeat commands. if context == "repl": - self.assertEvaluate("memory read -c 1 &my_ints", ".* 05 .*\n") - self.assertEvaluate("", ".* 0a .*\n") - self.assertEvaluate("", ".* 0f .*\n") - self.assertEvaluate("", ".* 14 .*\n") - self.assertEvaluate("", ".* 19 .*\n") + self.assertEvaluate( + "memory read -c 1 &my_ints", ".* 05 .*\n", want_memref=False + ) + self.assertEvaluate("", ".* 0a .*\n", want_memref=False) + self.assertEvaluate("", ".* 0f .*\n", want_memref=False) + self.assertEvaluate("", ".* 14 .*\n", want_memref=False) + self.assertEvaluate("", ".* 19 .*\n", want_memref=False) self.continue_to_exit() @@ -245,4 +351,6 @@ def test_hover_evaluate_expressions(self): @skipIfWindows def test_variable_evaluate_expressions(self): # Tests expression evaluations that are triggered in the variable explorer - self.run_test_evaluate_expressions("variable", enableAutoVariableSummaries=True) + self.run_test_evaluate_expressions( + "variables", enableAutoVariableSummaries=True + ) diff --git a/lldb/tools/lldb-dap/Handler/EvaluateRequestHandler.cpp b/lldb/tools/lldb-dap/Handler/EvaluateRequestHandler.cpp index e1556846dff19..ea8c3a2a4a296 100644 --- a/lldb/tools/lldb-dap/Handler/EvaluateRequestHandler.cpp +++ b/lldb/tools/lldb-dap/Handler/EvaluateRequestHandler.cpp @@ -10,148 +10,31 @@ #include "EventHelper.h" #include "JSONUtils.h" #include "LLDBUtils.h" +#include "Protocol/ProtocolRequests.h" +#include "Protocol/ProtocolTypes.h" #include "RequestHandler.h" +#include "lldb/lldb-enumerations.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" + +using namespace llvm; +using namespace lldb_dap; +using namespace lldb_dap::protocol; namespace lldb_dap { -// "EvaluateRequest": { -// "allOf": [ { "$ref": "#/definitions/Request" }, { -// "type": "object", -// "description": "Evaluate request; value of command field is 'evaluate'. -// Evaluates the given expression in the context of the -// top most stack frame. The expression has access to any -// variables and arguments that are in scope.", -// "properties": { -// "command": { -// "type": "string", -// "enum": [ "evaluate" ] -// }, -// "arguments": { -// "$ref": "#/definitions/EvaluateArguments" -// } -// }, -// "required": [ "command", "arguments" ] -// }] -// }, -// "EvaluateArguments": { -// "type": "object", -// "description": "Arguments for 'evaluate' request.", -// "properties": { -// "expression": { -// "type": "string", -// "description": "The expression to evaluate." -// }, -// "frameId": { -// "type": "integer", -// "description": "Evaluate the expression in the scope of this stack -// frame. If not specified, the expression is evaluated -// in the global scope." -// }, -// "context": { -// "type": "string", -// "_enum": [ "watch", "repl", "hover" ], -// "enumDescriptions": [ -// "evaluate is run in a watch.", -// "evaluate is run from REPL console.", -// "evaluate is run from a data hover." -// ], -// "description": "The context in which the evaluate request is run." 
-// }, -// "format": { -// "$ref": "#/definitions/ValueFormat", -// "description": "Specifies details on how to format the Evaluate -// result." -// } -// }, -// "required": [ "expression" ] -// }, -// "EvaluateResponse": { -// "allOf": [ { "$ref": "#/definitions/Response" }, { -// "type": "object", -// "description": "Response to 'evaluate' request.", -// "properties": { -// "body": { -// "type": "object", -// "properties": { -// "result": { -// "type": "string", -// "description": "The result of the evaluate request." -// }, -// "type": { -// "type": "string", -// "description": "The optional type of the evaluate result." -// }, -// "presentationHint": { -// "$ref": "#/definitions/VariablePresentationHint", -// "description": "Properties of a evaluate result that can be -// used to determine how to render the result in -// the UI." -// }, -// "variablesReference": { -// "type": "number", -// "description": "If variablesReference is > 0, the evaluate -// result is structured and its children can be -// retrieved by passing variablesReference to the -// VariablesRequest." -// }, -// "namedVariables": { -// "type": "number", -// "description": "The number of named child variables. The -// client can use this optional information to -// present the variables in a paged UI and fetch -// them in chunks." -// }, -// "indexedVariables": { -// "type": "number", -// "description": "The number of indexed child variables. The -// client can use this optional information to -// present the variables in a paged UI and fetch -// them in chunks." -// }, -// "valueLocationReference": { -// "type": "integer", -// "description": "A reference that allows the client to request -// the location where the returned value is -// declared. For example, if a function pointer is -// returned, the adapter may be able to look up the -// function's location. This should be present only -// if the adapter is likely to be able to resolve -// the location.\n\nThis reference shares the same -// lifetime as the `variablesReference`. See -// 'Lifetime of Object References' in the -// Overview section for details." -// } -// "memoryReference": { -// "type": "string", -// "description": "A memory reference to a location appropriate -// for this result. For pointer type eval -// results, this is generally a reference to the -// memory address contained in the pointer. This -// attribute may be returned by a debug adapter -// if corresponding capability -// `supportsMemoryReferences` is true." -// }, -// }, -// "required": [ "result", "variablesReference" ] -// } -// }, -// "required": [ "body" ] -// }] -// } -void EvaluateRequestHandler::operator()( - const llvm::json::Object &request) const { - llvm::json::Object response; - FillResponse(request, response); - llvm::json::Object body; - const auto *arguments = request.getObject("arguments"); - lldb::SBFrame frame = dap.GetLLDBFrame(*arguments); - std::string expression = - GetString(arguments, "expression").value_or("").str(); - const llvm::StringRef context = GetString(arguments, "context").value_or(""); +/// Evaluates the given expression in the context of a stack frame. +/// +/// The expression has access to any variables and arguments that are in scope. 
+Expected +EvaluateRequestHandler::Run(const EvaluateArguments &arguments) const { + EvaluateResponseBody body; + lldb::SBFrame frame = dap.GetLLDBFrame(arguments.frameId); + std::string expression = arguments.expression; bool repeat_last_command = expression.empty() && dap.last_nonempty_var_expression.empty(); - if (context == "repl" && + if (arguments.context == protocol::eEvaluateContextRepl && (repeat_last_command || (!expression.empty() && dap.DetectReplMode(frame, expression, false) == ReplMode::Command))) { @@ -165,70 +48,60 @@ void EvaluateRequestHandler::operator()( } bool required_command_failed = false; - std::string result = RunLLDBCommands( + body.result = RunLLDBCommands( dap.debugger, llvm::StringRef(), {expression}, required_command_failed, /*parse_command_directives=*/false, /*echo_commands=*/false); + return body; + } - EmplaceSafeString(body, "result", result); - body.try_emplace("variablesReference", (int64_t)0); - } else { - if (context == "repl") { - // If the expression is empty and the last expression was for a - // variable, set the expression to the previous expression (repeat the - // evaluation); otherwise save the current non-empty expression for the - // next (possibly empty) variable expression. - if (expression.empty()) - expression = dap.last_nonempty_var_expression; - else - dap.last_nonempty_var_expression = expression; - } - // Always try to get the answer from the local variables if possible. If - // this fails, then if the context is not "hover", actually evaluate an - // expression using the expression parser. - // - // "frame variable" is more reliable than the expression parser in - // many cases and it is faster. - lldb::SBValue value = frame.GetValueForVariablePath( - expression.data(), lldb::eDynamicDontRunTarget); - - // Freeze dry the value in case users expand it later in the debug console - if (value.GetError().Success() && context == "repl") - value = value.Persist(); - - if (value.GetError().Fail() && context != "hover") - value = frame.EvaluateExpression(expression.data()); - - if (value.GetError().Fail()) { - response["success"] = llvm::json::Value(false); - // This error object must live until we're done with the pointer returned - // by GetCString(). 
- lldb::SBError error = value.GetError(); - const char *error_cstr = error.GetCString(); - if (error_cstr && error_cstr[0]) - EmplaceSafeString(response, "message", error_cstr); - else - EmplaceSafeString(response, "message", "evaluate failed"); - } else { - VariableDescription desc(value, - dap.configuration.enableAutoVariableSummaries); - EmplaceSafeString(body, "result", desc.GetResult(context)); - EmplaceSafeString(body, "type", desc.display_type_name); - int64_t var_ref = 0; - if (value.MightHaveChildren() || ValuePointsToCode(value)) - var_ref = dap.variables.InsertVariable( - value, /*is_permanent=*/context == "repl"); - if (value.MightHaveChildren()) - body.try_emplace("variablesReference", var_ref); - else - body.try_emplace("variablesReference", (int64_t)0); - if (lldb::addr_t addr = value.GetLoadAddress(); - addr != LLDB_INVALID_ADDRESS) - body.try_emplace("memoryReference", EncodeMemoryReference(addr)); - if (ValuePointsToCode(value)) - body.try_emplace("valueLocationReference", var_ref); - } + if (arguments.context == eEvaluateContextRepl) { + // If the expression is empty and the last expression was for a + // variable, set the expression to the previous expression (repeat the + // evaluation); otherwise save the current non-empty expression for the + // next (possibly empty) variable expression. + if (expression.empty()) + expression = dap.last_nonempty_var_expression; + else + dap.last_nonempty_var_expression = expression; } - response.try_emplace("body", std::move(body)); - dap.SendJSON(llvm::json::Value(std::move(response))); + + // Always try to get the answer from the local variables if possible. If + // this fails, then if the context is not "hover", actually evaluate an + // expression using the expression parser. + // + // "frame variable" is more reliable than the expression parser in + // many cases and it is faster. 
+ lldb::SBValue value = frame.GetValueForVariablePath( + expression.data(), lldb::eDynamicDontRunTarget); + + // Freeze dry the value in case users expand it later in the debug console + if (value.GetError().Success() && arguments.context == eEvaluateContextRepl) + value = value.Persist(); + + if (value.GetError().Fail() && arguments.context != eEvaluateContextHover) + value = frame.EvaluateExpression(expression.data()); + + if (value.GetError().Fail()) + return ToError(value.GetError(), /*show_user=*/false); + + VariableDescription desc(value, + dap.configuration.enableAutoVariableSummaries); + + body.result = desc.GetResult(arguments.context); + body.type = desc.display_type_name; + + if (value.MightHaveChildren() || ValuePointsToCode(value)) + body.variablesReference = dap.variables.InsertVariable( + value, /*is_permanent=*/arguments.context == eEvaluateContextRepl); + + if (lldb::addr_t addr = value.GetLoadAddress(); addr != LLDB_INVALID_ADDRESS) + body.memoryReference = EncodeMemoryReference(addr); + + if (ValuePointsToCode(value) && + body.variablesReference != LLDB_DAP_INVALID_VARRERF) + body.valueLocationReference = PackLocation(body.variablesReference, true); + + return body; } + } // namespace lldb_dap diff --git a/lldb/tools/lldb-dap/Handler/RequestHandler.h b/lldb/tools/lldb-dap/Handler/RequestHandler.h index bc22133d92453..65a52075ebd79 100644 --- a/lldb/tools/lldb-dap/Handler/RequestHandler.h +++ b/lldb/tools/lldb-dap/Handler/RequestHandler.h @@ -292,11 +292,14 @@ class DisconnectRequestHandler Run(const std::optional &args) const override; }; -class EvaluateRequestHandler : public LegacyRequestHandler { +class EvaluateRequestHandler + : public RequestHandler> { public: - using LegacyRequestHandler::LegacyRequestHandler; + using RequestHandler::RequestHandler; static llvm::StringLiteral GetCommand() { return "evaluate"; } - void operator()(const llvm::json::Object &request) const override; + llvm::Expected + Run(const protocol::EvaluateArguments &) const override; FeatureSet GetSupportedFeatures() const override { return {protocol::eAdapterFeatureEvaluateForHovers}; } diff --git a/lldb/tools/lldb-dap/JSONUtils.cpp b/lldb/tools/lldb-dap/JSONUtils.cpp index 1a3a6701b194d..81eadae03bb48 100644 --- a/lldb/tools/lldb-dap/JSONUtils.cpp +++ b/lldb/tools/lldb-dap/JSONUtils.cpp @@ -11,6 +11,7 @@ #include "ExceptionBreakpoint.h" #include "LLDBUtils.h" #include "Protocol/ProtocolBase.h" +#include "Protocol/ProtocolRequests.h" #include "ProtocolUtils.h" #include "lldb/API/SBAddress.h" #include "lldb/API/SBCompileUnit.h" @@ -817,10 +818,10 @@ VariableDescription::VariableDescription(lldb::SBValue v, evaluate_name = llvm::StringRef(evaluateStream.GetData()).str(); } -std::string VariableDescription::GetResult(llvm::StringRef context) { +std::string VariableDescription::GetResult(protocol::EvaluateContext context) { // In repl context, the results can be displayed as multiple lines so more // detailed descriptions can be returned. 
- if (context != "repl") + if (context != protocol::eEvaluateContextRepl) return display_value; if (!v.IsValid()) diff --git a/lldb/tools/lldb-dap/JSONUtils.h b/lldb/tools/lldb-dap/JSONUtils.h index 0c865a33a6ce4..329dc8ab02f99 100644 --- a/lldb/tools/lldb-dap/JSONUtils.h +++ b/lldb/tools/lldb-dap/JSONUtils.h @@ -10,7 +10,7 @@ #define LLDB_TOOLS_LLDB_DAP_JSONUTILS_H #include "DAPForward.h" -#include "Protocol/ProtocolTypes.h" +#include "Protocol/ProtocolRequests.h" #include "lldb/API/SBCompileUnit.h" #include "lldb/API/SBFormat.h" #include "lldb/API/SBType.h" @@ -28,7 +28,7 @@ namespace lldb_dap { -/// Emplace a StringRef in a json::Object after enusring that the +/// Emplace a StringRef in a json::Object after ensuring that the /// string is valid UTF8. If not, first call llvm::json::fixUTF8 /// before emplacing. /// @@ -351,7 +351,7 @@ struct VariableDescription { std::optional custom_name = {}); /// Returns a description of the value appropriate for the specified context. - std::string GetResult(llvm::StringRef context); + std::string GetResult(protocol::EvaluateContext context); }; /// Does the given variable have an associated value location? diff --git a/lldb/tools/lldb-dap/LLDBUtils.cpp b/lldb/tools/lldb-dap/LLDBUtils.cpp index 4db6caa1af38b..e2ba2ee64103d 100644 --- a/lldb/tools/lldb-dap/LLDBUtils.cpp +++ b/lldb/tools/lldb-dap/LLDBUtils.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "LLDBUtils.h" +#include "DAPError.h" #include "JSONUtils.h" #include "lldb/API/SBCommandInterpreter.h" #include "lldb/API/SBCommandReturnObject.h" @@ -17,6 +18,7 @@ #include "lldb/API/SBThread.h" #include "lldb/lldb-enumerations.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/Error.h" #include "llvm/Support/JSON.h" #include "llvm/Support/raw_ostream.h" @@ -214,13 +216,14 @@ GetStopDisassemblyDisplay(lldb::SBDebugger &debugger) { return result; } -llvm::Error ToError(const lldb::SBError &error) { +llvm::Error ToError(const lldb::SBError &error, bool show_user) { if (error.Success()) return llvm::Error::success(); - return llvm::createStringError( - std::error_code(error.GetError(), std::generic_category()), - error.GetCString()); + return llvm::make_error( + /*message=*/error.GetCString(), + /*EC=*/std::error_code(error.GetError(), std::generic_category()), + /*show_user=*/show_user); } std::string GetStringValue(const lldb::SBStructuredData &data) { diff --git a/lldb/tools/lldb-dap/LLDBUtils.h b/lldb/tools/lldb-dap/LLDBUtils.h index 9db721a47ccf7..a29d3d88789a0 100644 --- a/lldb/tools/lldb-dap/LLDBUtils.h +++ b/lldb/tools/lldb-dap/LLDBUtils.h @@ -243,7 +243,7 @@ class ScopeSyncMode { lldb::StopDisassemblyType GetStopDisassemblyDisplay(lldb::SBDebugger &debugger); /// Take ownership of the stored error. -llvm::Error ToError(const lldb::SBError &error); +llvm::Error ToError(const lldb::SBError &error, bool show_user = true); /// Provides the string value if this data structure is a string type. 
std::string GetStringValue(const lldb::SBStructuredData &data); diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp index 44ae79f8b9f43..ac01cfb95dd41 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp +++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.cpp @@ -8,6 +8,7 @@ #include "Protocol/ProtocolRequests.h" #include "JSONUtils.h" +#include "Protocol/ProtocolTypes.h" #include "lldb/lldb-defines.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/StringMap.h" @@ -639,6 +640,54 @@ json::Value toJSON(const ExceptionInfoResponseBody &ERB) { result.insert({"description", ERB.description}); if (ERB.details.has_value()) result.insert({"details", *ERB.details}); + return result; +} + +static bool fromJSON(const llvm::json::Value &Params, EvaluateContext &C, + llvm::json::Path P) { + auto rawContext = Params.getAsString(); + if (!rawContext) { + P.report("expected a string"); + return false; + } + C = StringSwitch(*rawContext) + .Case("watch", EvaluateContext::eEvaluateContextWatch) + .Case("repl", EvaluateContext::eEvaluateContextRepl) + .Case("hover", EvaluateContext::eEvaluateContextHover) + .Case("clipboard", EvaluateContext::eEvaluateContextClipboard) + .Case("variables", EvaluateContext::eEvaluateContextVariables) + .Default(eEvaluateContextUnknown); + return true; +} + +bool fromJSON(const llvm::json::Value &Params, EvaluateArguments &Args, + llvm::json::Path P) { + json::ObjectMapper O(Params, P); + return O && O.map("expression", Args.expression) && + O.mapOptional("frameId", Args.frameId) && + O.mapOptional("line", Args.line) && + O.mapOptional("column", Args.column) && + O.mapOptional("source", Args.source) && + O.mapOptional("context", Args.context) && + O.mapOptional("format", Args.format); +} + +llvm::json::Value toJSON(const EvaluateResponseBody &Body) { + json::Object result{{"result", Body.result}, + {"variablesReference", Body.variablesReference}}; + + if (!Body.type.empty()) + result.insert({"type", Body.type}); + if (Body.presentationHint) + result.insert({"presentationHint", Body.presentationHint}); + if (Body.namedVariables) + result.insert({"namedVariables", Body.namedVariables}); + if (Body.indexedVariables) + result.insert({"indexedVariables", Body.indexedVariables}); + if (!Body.memoryReference.empty()) + result.insert({"memoryReference", Body.memoryReference}); + if (Body.valueLocationReference != LLDB_DAP_INVALID_VALUE_LOC) + result.insert({"valueLocationReference", Body.valueLocationReference}); return result; } diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h index b894f2b4ed44d..c1e1e93f1e44a 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h +++ b/lldb/tools/lldb-dap/Protocol/ProtocolRequests.h @@ -1061,6 +1061,123 @@ struct ExceptionInfoResponseBody { }; llvm::json::Value toJSON(const ExceptionInfoResponseBody &); +/// The context in which the evaluate request is used. +enum EvaluateContext : unsigned { + /// An unspecified or unknown evaluate context. + eEvaluateContextUnknown = 0, + /// 'watch': evaluate is called from a watch view context. + eEvaluateContextWatch = 1, + /// 'repl': evaluate is called from a REPL context. + eEvaluateContextRepl = 2, + /// 'hover': evaluate is called to generate the debug hover contents. + /// This value should only be used if the corresponding capability + /// `supportsEvaluateForHovers` is true. 
+ eEvaluateContextHover = 3, + /// 'clipboard': evaluate is called to generate clipboard contents. + /// This value should only be used if the corresponding capability + /// `supportsClipboardContext` is true. + eEvaluateContextClipboard = 4, + /// 'variables': evaluate is called from a variables view context. + eEvaluateContextVariables = 5, +}; + +/// Arguments for `evaluate` request. +struct EvaluateArguments { + /// The expression to evaluate. + std::string expression; + + /// Evaluate the expression in the scope of this stack frame. If not + /// specified, the expression is evaluated in the global scope. + uint64_t frameId = LLDB_DAP_INVALID_FRAME_ID; + + /// The contextual line where the expression should be evaluated. In the + /// 'hover' context, this should be set to the start of the expression being + /// hovered. + uint32_t line = LLDB_INVALID_LINE_NUMBER; + + /// The contextual column where the expression should be evaluated. This may + /// be provided if `line` is also provided. + /// + /// It is measured in UTF-16 code units and the client capability + /// `columnsStartAt1` determines whether it is 0- or 1-based. + uint32_t column = LLDB_INVALID_COLUMN_NUMBER; + + /// The contextual source in which the `line` is found. This must be provided + /// if `line` is provided. + std::optional source; + + /// The context in which the evaluate request is used. + /// Values: + /// 'watch': evaluate is called from a watch view context. + /// 'repl': evaluate is called from a REPL context. + /// 'hover': evaluate is called to generate the debug hover contents. + /// This value should only be used if the corresponding capability + /// `supportsEvaluateForHovers` is true. + /// 'clipboard': evaluate is called to generate clipboard contents. + /// This value should only be used if the corresponding capability + /// `supportsClipboardContext` is true. + /// 'variables': evaluate is called from a variables view context. + /// etc. + EvaluateContext context = eEvaluateContextUnknown; + + /// Specifies details on how to format the result. + /// The attribute is only honored by a debug adapter if the corresponding + /// capability `supportsValueFormattingOptions` is true. + std::optional format; +}; +bool fromJSON(const llvm::json::Value &, EvaluateArguments &, llvm::json::Path); + +/// Response to 'evaluate' request. +struct EvaluateResponseBody { + /// The result of the evaluate request. + std::string result; + + /// The type of the evaluate result. + /// This attribute should only be returned by a debug adapter if the + /// corresponding capability `supportsVariableType` is true. + std::string type; + + /// Properties of an evaluate result that can be used to determine how to + /// render the result in the UI. + std::optional presentationHint; + + /// If `variablesReference` is > 0, the evaluate result is structured and its + /// children can be retrieved by passing `variablesReference` to the + /// `variables` request as long as execution remains suspended. See 'Lifetime + /// of Object References' in the Overview section for details. + int64_t variablesReference = 0; + + /// The number of named child variables. + /// The client can use this information to present the variables in a paged + /// UI and fetch them in chunks. + /// The value should be less than or equal to 2147483647 (2^31-1). + uint32_t namedVariables = 0; + + /// The number of indexed child variables. + /// The client can use this information to present the variables in a paged + /// UI and fetch them in chunks. 
+ /// The value should be less than or equal to 2147483647 (2^31-1). + uint32_t indexedVariables = 0; + + /// A memory reference to a location appropriate for this result. + /// For pointer type eval results, this is generally a reference to the + /// memory address contained in the pointer. + /// This attribute may be returned by a debug adapter if corresponding + /// capability `supportsMemoryReferences` is true. + std::string memoryReference; + + /// A reference that allows the client to request the location where the + /// returned value is declared. For example, if a function pointer is + /// returned, the adapter may be able to look up the function's location. + /// This should be present only if the adapter is likely to be able to + /// resolve the location. + /// + /// This reference shares the same lifetime as the `variablesReference`. See + /// 'Lifetime of Object References' in the Overview section for details. + uint64_t valueLocationReference = LLDB_DAP_INVALID_VALUE_LOC; +}; +llvm::json::Value toJSON(const EvaluateResponseBody &); + } // namespace lldb_dap::protocol #endif diff --git a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h index 6d85c74377bd3..690a1d684d0e9 100644 --- a/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h +++ b/lldb/tools/lldb-dap/Protocol/ProtocolTypes.h @@ -28,8 +28,9 @@ #include #include -#define LLDB_DAP_INVALID_VARRERF UINT64_MAX +#define LLDB_DAP_INVALID_VARRERF INT64_MAX #define LLDB_DAP_INVALID_SRC_REF 0 +#define LLDB_DAP_INVALID_VALUE_LOC 0 namespace lldb_dap::protocol { diff --git a/lldb/unittests/DAP/ProtocolRequestsTest.cpp b/lldb/unittests/DAP/ProtocolRequestsTest.cpp index 498195dc09325..ba9aef1e5fcc5 100644 --- a/lldb/unittests/DAP/ProtocolRequestsTest.cpp +++ b/lldb/unittests/DAP/ProtocolRequestsTest.cpp @@ -67,3 +67,54 @@ TEST(ProtocolRequestsTest, ExceptionInfoResponseBody) { ASSERT_THAT_EXPECTED(expected_opt, llvm::Succeeded()); EXPECT_EQ(PrettyPrint(*expected_opt), PrettyPrint(body)); } + +TEST(ProtocolRequestsTest, EvaluateArguments) { + llvm::Expected expected = parse(R"({ + "expression": "hello world", + "context": "repl" + })"); + ASSERT_THAT_EXPECTED(expected, llvm::Succeeded()); + EXPECT_EQ(expected->expression, "hello world"); + EXPECT_EQ(expected->context, eEvaluateContextRepl); + + // Check required keys; + EXPECT_THAT_EXPECTED(parse(R"({})"), + FailedWithMessage("missing value at (root).expression")); +} + +TEST(ProtocolRequestsTest, EvaluateResponseBody) { + EvaluateResponseBody body; + body.result = "hello world"; + body.variablesReference = 7; + + // Check required keys. + Expected expected = parse(R"({ + "result": "hello world", + "variablesReference": 7 + })"); + + ASSERT_THAT_EXPECTED(expected, llvm::Succeeded()); + EXPECT_EQ(PrettyPrint(*expected), PrettyPrint(body)); + + // Check optional keys. 
+ body.result = "'abc'"; + body.type = "string"; + body.variablesReference = 42; + body.namedVariables = 1; + body.indexedVariables = 2; + body.memoryReference = "0x123"; + body.valueLocationReference = 22; + + Expected expected_opt = parse(R"({ + "result": "'abc'", + "type": "string", + "variablesReference": 42, + "namedVariables": 1, + "indexedVariables": 2, + "memoryReference": "0x123", + "valueLocationReference": 22 + })"); + + ASSERT_THAT_EXPECTED(expected_opt, llvm::Succeeded()); + EXPECT_EQ(PrettyPrint(*expected_opt), PrettyPrint(body)); +} From c1c22cd3e16beb3937eb0d11da014451397be5d6 Mon Sep 17 00:00:00 2001 From: Amit Kumar Pandey Date: Tue, 18 Nov 2025 00:43:09 +0530 Subject: [PATCH 40/67] [ASan][HIP] Add ASan declarations and macros. (#167522) This patch adds the following device ASan hooks and guarded macros in __clang_hip_libdevice_declares.h - Function Declarations - __asan_poison_memory_region - __asan_unpoison_memory_region - __asan_address_is_poisoned - __asan_region_is_poisoned - Macros - ASAN_POISON_MEMORY_REGION - ASAN_UNPOISON_MEMORY_REGION --- .../Headers/__clang_hip_libdevice_declares.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/clang/lib/Headers/__clang_hip_libdevice_declares.h b/clang/lib/Headers/__clang_hip_libdevice_declares.h index fa8d918248dd0..fad9c6ca7ffc5 100644 --- a/clang/lib/Headers/__clang_hip_libdevice_declares.h +++ b/clang/lib/Headers/__clang_hip_libdevice_declares.h @@ -338,6 +338,23 @@ __device__ __attribute__((const)) __2f16 __ocml_sqrt_2f16(__2f16); __device__ __attribute__((const)) __2f16 __ocml_trunc_2f16(__2f16); __device__ __attribute__((const)) __2f16 __ocml_pown_2f16(__2f16, __2i16); +__device__ void __asan_poison_memory_region(const void *addr, + __SIZE_TYPE__ size); +__device__ void __asan_unpoison_memory_region(const void *addr, + __SIZE_TYPE__ size); +__device__ int __asan_address_is_poisoned(const void *addr); +__device__ void *__asan_region_is_poisoned(void *beg, __SIZE_TYPE__ size); + +#if __has_feature(address_sanitizer) +#define ASAN_POISON_MEMORY_REGION(addr, size) \ + __asan_poison_memory_region((addr), (size)) +#define ASAN_UNPOISON_MEMORY_REGION(addr, size) \ + __asan_unpoison_memory_region((addr), (size)) +#else +#define ASAN_POISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size)) +#define ASAN_UNPOISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size)) +#endif + #ifdef __cplusplus } // extern "C" #endif From 69b4190d5f1b483524f2f539f373960ef8de8d84 Mon Sep 17 00:00:00 2001 From: Guy David Date: Mon, 17 Nov 2025 21:39:10 +0200 Subject: [PATCH 41/67] [AArch64] Optimize extending loads of small vectors (#163064) Reduces the total amount of loads and the amount of moves between SIMD registers and general-purpose registers. 
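As a sketch of the kind of code that benefits (written with Clang vector extensions,
which are an assumption of this example and not part of the patch), a sign-extending
load of two i8 elements now lowers to a single half-register load plus widening shifts
instead of two scalar loads and two GPR-to-SIMD moves; the before/after sequences in
the comments are taken from the updated CHECK lines in aarch64-load-ext.ll:

    #include <cstdint>

    typedef int8_t v2i8 __attribute__((vector_size(2)));
    typedef int32_t v2i32 __attribute__((vector_size(8)));

    // Loads <2 x i8> and sign-extends to <2 x i32>.
    v2i32 sext_v2i8(const v2i8 *p) {
      // Previously: ldrsb w8, [x0]; ldrsb w9, [x0, #1]
      //             fmov s0, w8;    mov v0.s[1], w9
      // Now:        ldr h0, [x0]
      //             sshll v0.8h, v0.8b, #0
      //             sshll v0.4s, v0.4h, #0
      return __builtin_convertvector(*p, v2i32);
    }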
--- .../Target/AArch64/AArch64ISelLowering.cpp | 148 +++++++--- llvm/test/CodeGen/AArch64/aarch64-load-ext.ll | 264 ++++++++++++++++-- llvm/test/CodeGen/AArch64/aarch64-smull.ll | 12 +- llvm/test/CodeGen/AArch64/add.ll | 27 +- llvm/test/CodeGen/AArch64/andorxor.ll | 81 +++--- llvm/test/CodeGen/AArch64/bitcast.ll | 6 +- llvm/test/CodeGen/AArch64/ctlz.ll | 18 +- llvm/test/CodeGen/AArch64/ctpop.ll | 18 +- llvm/test/CodeGen/AArch64/cttz.ll | 16 +- llvm/test/CodeGen/AArch64/extbinopload.ll | 26 +- llvm/test/CodeGen/AArch64/load.ll | 11 +- llvm/test/CodeGen/AArch64/mul.ll | 27 +- llvm/test/CodeGen/AArch64/sadd_sat_vec.ll | 22 +- llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll | 8 +- llvm/test/CodeGen/AArch64/ssub_sat_vec.ll | 22 +- llvm/test/CodeGen/AArch64/store.ll | 7 +- llvm/test/CodeGen/AArch64/sub.ll | 27 +- .../AArch64/sve-fixed-length-ext-loads.ll | 8 +- .../AArch64/sve-fixed-length-masked-gather.ll | 13 +- .../sve-fixed-length-masked-scatter.ll | 13 +- llvm/test/CodeGen/AArch64/uadd_sat_vec.ll | 26 +- llvm/test/CodeGen/AArch64/usub_sat_vec.ll | 26 +- llvm/test/CodeGen/AArch64/v3f-to-int.ll | 15 +- .../AArch64/vec-combine-compare-to-bitmask.ll | 7 +- .../AArch64/vec3-loads-ext-trunc-stores.ll | 67 +++-- 25 files changed, 567 insertions(+), 348 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 35836af3c874b..42567883b2594 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1427,12 +1427,24 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::BITCAST, MVT::v2i16, Custom); setOperationAction(ISD::BITCAST, MVT::v4i8, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Custom); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom); setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom); setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom); - setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom); setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom); setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, MVT::v2i16, Custom); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, MVT::v2i16, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i32, MVT::v2i16, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Custom); + setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Custom); + setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Custom); // ADDP custom lowering for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) @@ -6728,8 +6740,34 @@ bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(SDValue Extend, return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2; } +/// Helper function to check if a small vector load can be optimized. 
+static bool isEligibleForSmallVectorLoadOpt(LoadSDNode *LD, + const AArch64Subtarget &Subtarget) { + if (!Subtarget.isNeonAvailable()) + return false; + if (LD->isVolatile()) + return false; + + EVT MemVT = LD->getMemoryVT(); + if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8 && MemVT != MVT::v2i16) + return false; + + Align Alignment = LD->getAlign(); + Align RequiredAlignment = Align(MemVT.getStoreSize().getFixedValue()); + if (Subtarget.requiresStrictAlign() && Alignment < RequiredAlignment) + return false; + + return true; +} + bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { EVT ExtVT = ExtVal.getValueType(); + // Small, illegal vectors can be extended inreg. + if (auto *Load = dyn_cast(ExtVal.getOperand(0))) { + if (ExtVT.isFixedLengthVector() && ExtVT.getStoreSizeInBits() <= 128 && + isEligibleForSmallVectorLoadOpt(Load, *Subtarget)) + return true; + } if (!ExtVT.isScalableVector() && !Subtarget->useSVEForFixedLengthVectors()) return false; @@ -7188,12 +7226,86 @@ SDValue AArch64TargetLowering::LowerStore128(SDValue Op, return Result; } +/// Helper function to optimize loads of extended small vectors. +/// These patterns would otherwise get scalarized into inefficient sequences. +static SDValue tryLowerSmallVectorExtLoad(LoadSDNode *Load, SelectionDAG &DAG) { + const AArch64Subtarget &Subtarget = DAG.getSubtarget(); + if (!isEligibleForSmallVectorLoadOpt(Load, Subtarget)) + return SDValue(); + + EVT MemVT = Load->getMemoryVT(); + EVT ResVT = Load->getValueType(0); + unsigned NumElts = ResVT.getVectorNumElements(); + unsigned DstEltBits = ResVT.getScalarSizeInBits(); + unsigned SrcEltBits = MemVT.getScalarSizeInBits(); + + unsigned ExtOpcode; + switch (Load->getExtensionType()) { + case ISD::EXTLOAD: + case ISD::ZEXTLOAD: + ExtOpcode = ISD::ZERO_EXTEND; + break; + case ISD::SEXTLOAD: + ExtOpcode = ISD::SIGN_EXTEND; + break; + case ISD::NON_EXTLOAD: + return SDValue(); + } + + SDLoc DL(Load); + SDValue Chain = Load->getChain(); + SDValue BasePtr = Load->getBasePtr(); + const MachinePointerInfo &PtrInfo = Load->getPointerInfo(); + Align Alignment = Load->getAlign(); + + // Load the data as an FP scalar to avoid issues with integer loads. 
+ unsigned LoadBits = MemVT.getStoreSizeInBits(); + MVT ScalarLoadType = MVT::getFloatingPointVT(LoadBits); + SDValue ScalarLoad = + DAG.getLoad(ScalarLoadType, DL, Chain, BasePtr, PtrInfo, Alignment); + + MVT ScalarToVecTy = MVT::getVectorVT(ScalarLoadType, 128 / LoadBits); + SDValue ScalarToVec = + DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ScalarToVecTy, ScalarLoad); + MVT BitcastTy = + MVT::getVectorVT(MVT::getIntegerVT(SrcEltBits), 128 / SrcEltBits); + SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, BitcastTy, ScalarToVec); + + SDValue Res = Bitcast; + unsigned CurrentEltBits = Res.getValueType().getScalarSizeInBits(); + unsigned CurrentNumElts = Res.getValueType().getVectorNumElements(); + while (CurrentEltBits < DstEltBits) { + if (Res.getValueSizeInBits() >= 128) { + CurrentNumElts = CurrentNumElts / 2; + MVT ExtractVT = + MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), CurrentNumElts); + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, Res, + DAG.getConstant(0, DL, MVT::i64)); + } + CurrentEltBits = CurrentEltBits * 2; + MVT ExtVT = + MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), CurrentNumElts); + Res = DAG.getNode(ExtOpcode, DL, ExtVT, Res); + } + + if (CurrentNumElts != NumElts) { + MVT FinalVT = MVT::getVectorVT(MVT::getIntegerVT(CurrentEltBits), NumElts); + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, FinalVT, Res, + DAG.getConstant(0, DL, MVT::i64)); + } + + return DAG.getMergeValues({Res, ScalarLoad.getValue(1)}, DL); +} + SDValue AArch64TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); LoadSDNode *LoadNode = cast(Op); assert(LoadNode && "Expected custom lowering of a load node"); + if (SDValue Result = tryLowerSmallVectorExtLoad(LoadNode, DAG)) + return Result; + if (LoadNode->getMemoryVT() == MVT::i64x8) { SmallVector Ops; SDValue Base = LoadNode->getBasePtr(); @@ -7212,37 +7324,7 @@ SDValue AArch64TargetLowering::LowerLOAD(SDValue Op, return DAG.getMergeValues({Loaded, Chain}, DL); } - // Custom lowering for extending v4i8 vector loads. - EVT VT = Op->getValueType(0); - assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32"); - - if (LoadNode->getMemoryVT() != MVT::v4i8) - return SDValue(); - - // Avoid generating unaligned loads. 
- if (Subtarget->requiresStrictAlign() && LoadNode->getAlign() < Align(4)) - return SDValue(); - - unsigned ExtType; - if (LoadNode->getExtensionType() == ISD::SEXTLOAD) - ExtType = ISD::SIGN_EXTEND; - else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD || - LoadNode->getExtensionType() == ISD::EXTLOAD) - ExtType = ISD::ZERO_EXTEND; - else - return SDValue(); - - SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(), - LoadNode->getBasePtr(), MachinePointerInfo()); - SDValue Chain = Load.getValue(1); - SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load); - SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec); - SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC); - Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext, - DAG.getConstant(0, DL, MVT::i64)); - if (VT == MVT::v4i32) - Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext); - return DAG.getMergeValues({Ext, Chain}, DL); + return SDValue(); } SDValue AArch64TargetLowering::LowerVECTOR_COMPRESS(SDValue Op, diff --git a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll index 317feb5ad9ad0..0ef2b31d00daa 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-load-ext.ll @@ -22,17 +22,16 @@ define <2 x i16> @test0(ptr %i16_ptr, i64 %inc) { define <2 x i16> @test1(ptr %v2i16_ptr) { ; CHECK-LE-LABEL: test1: ; CHECK-LE: // %bb.0: -; CHECK-LE-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-LE-NEXT: add x8, x0, #2 -; CHECK-LE-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: test1: ; CHECK-BE: // %bb.0: -; CHECK-BE-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-BE-NEXT: add x8, x0, #2 -; CHECK-BE-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s ; CHECK-BE-NEXT: ret %v2i16 = load <2 x i16>, ptr %v2i16_ptr @@ -66,17 +65,18 @@ define <2 x i16> @test2(ptr %i16_ptr, i64 %inc) { define <2 x i8> @test3(ptr %v2i8_ptr) { ; CHECK-LE-LABEL: test3: ; CHECK-LE: // %bb.0: -; CHECK-LE-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-LE-NEXT: add x8, x0, #1 -; CHECK-LE-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: test3: ; CHECK-BE: // %bb.0: -; CHECK-BE-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-BE-NEXT: add x8, x0, #1 -; CHECK-BE-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s ; CHECK-BE-NEXT: ret %v2i8 = load <2 x i8>, ptr %v2i8_ptr @@ -105,19 +105,18 @@ define <4 x i8> @test4(ptr %v4i8_ptr) { define <2 x i32> @fsext_v2i32(ptr %a) { ; CHECK-LE-LABEL: fsext_v2i32: ; CHECK-LE: // %bb.0: -; CHECK-LE-NEXT: ldrsb w8, [x0] -; CHECK-LE-NEXT: ldrsb w9, [x0, #1] -; CHECK-LE-NEXT: fmov s0, w8 -; CHECK-LE-NEXT: mov v0.s[1], w9 +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: fsext_v2i32: ; CHECK-BE: // %bb.0: -; CHECK-BE-NEXT: ldrsb w8, [x0] -; CHECK-BE-NEXT: ldrsb w9, [x0, #1] -; CHECK-BE-NEXT: 
fmov s0, w8 -; CHECK-BE-NEXT: mov v0.s[1], w9 +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s ; CHECK-BE-NEXT: ret %x = load <2 x i8>, ptr %a @@ -249,19 +248,18 @@ define i32 @loadExti32(ptr %ref) { define <2 x i16> @fsext_v2i16(ptr %a) { ; CHECK-LE-LABEL: fsext_v2i16: ; CHECK-LE: // %bb.0: -; CHECK-LE-NEXT: ldrsb w8, [x0] -; CHECK-LE-NEXT: ldrsb w9, [x0, #1] -; CHECK-LE-NEXT: fmov s0, w8 -; CHECK-LE-NEXT: mov v0.s[1], w9 +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-LE-NEXT: ret ; ; CHECK-BE-LABEL: fsext_v2i16: ; CHECK-BE: // %bb.0: -; CHECK-BE-NEXT: ldrsb w8, [x0] -; CHECK-BE-NEXT: ldrsb w9, [x0, #1] -; CHECK-BE-NEXT: fmov s0, w8 -; CHECK-BE-NEXT: mov v0.s[1], w9 +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-BE-NEXT: rev64 v0.2s, v0.2s ; CHECK-BE-NEXT: ret %x = load <2 x i8>, ptr %a @@ -497,3 +495,213 @@ define <4 x i8> @strict_align_unaligned(ptr %v4i8_ptr) "target-features"="+stric %v4i8 = load <4 x i8>, ptr %v4i8_ptr, align 1 ret <4 x i8> %v4i8 } + +define <2 x i16> @zext_v2i8_v2i16(ptr %a) { +; CHECK-LE-LABEL: zext_v2i8_v2i16: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i8_v2i16: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %x = load <2 x i8>, ptr %a + %y = zext <2 x i8> %x to <2 x i16> + ret <2 x i16> %y +} + +define <2 x i32> @zext_v2i8_v2i32(ptr %a) { +; CHECK-LE-LABEL: zext_v2i8_v2i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i8_v2i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %x = load <2 x i8>, ptr %a + %y = zext <2 x i8> %x to <2 x i32> + ret <2 x i32> %y +} + +define <2 x i64> @zext_v2i8_v2i64(ptr %a) { +; CHECK-LE-LABEL: zext_v2i8_v2i64: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i8_v2i64: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <2 x i8>, ptr %a + %y = zext <2 x i8> %x to <2 x i64> + ret <2 x i64> %y +} + +define <2 x i32> @zext_v2i16_v2i32(ptr %a) { +; CHECK-LE-LABEL: zext_v2i16_v2i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: 
// kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i16_v2i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %x = load <2 x i16>, ptr %a + %y = zext <2 x i16> %x to <2 x i32> + ret <2 x i32> %y +} + +define <2 x i64> @zext_v2i16_v2i64(ptr %a) { +; CHECK-LE-LABEL: zext_v2i16_v2i64: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v2i16_v2i64: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <2 x i16>, ptr %a + %y = zext <2 x i16> %x to <2 x i64> + ret <2 x i64> %y +} + +define <4 x i32> @zext_v4i16_v4i32(ptr %a) { +; CHECK-LE-LABEL: zext_v4i16_v4i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr d0, [x0] +; CHECK-LE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: zext_v4i16_v4i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ld1 { v0.4h }, [x0] +; CHECK-BE-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.4s, v0.4s +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <4 x i16>, ptr %a + %y = zext <4 x i16> %x to <4 x i32> + ret <4 x i32> %y +} + +define <2 x i64> @sext_v2i8_v2i64(ptr %a) { +; CHECK-LE-LABEL: sext_v2i8_v2i64: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr h0, [x0] +; CHECK-LE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: sext_v2i8_v2i64: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr h0, [x0] +; CHECK-BE-NEXT: rev16 v0.8b, v0.8b +; CHECK-BE-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <2 x i8>, ptr %a + %y = sext <2 x i8> %x to <2 x i64> + ret <2 x i64> %y +} + +define <2 x i32> @sext_v2i16_v2i32(ptr %a) { +; CHECK-LE-LABEL: sext_v2i16_v2i32: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: sext_v2i16_v2i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.2s, v0.2s +; CHECK-BE-NEXT: ret + %x = load <2 x i16>, ptr %a + %y = sext <2 x i16> %x to <2 x i32> + ret <2 x i32> %y +} + +define <2 x i64> @sext_v2i16_v2i64(ptr %a) { +; CHECK-LE-LABEL: sext_v2i16_v2i64: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: ldr s0, [x0] +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: sext_v2i16_v2i64: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ldr s0, [x0] +; CHECK-BE-NEXT: rev32 v0.4h, v0.4h +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: sshll v0.2d, v0.2s, #0 +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <2 x i16>, ptr %a + %y = sext <2 x i16> %x to <2 x i64> + ret <2 x i64> %y +} + +define <4 x i32> @sext_v4i16_v4i32(ptr %a) { +; CHECK-LE-LABEL: sext_v4i16_v4i32: +; CHECK-LE: // %bb.0: +; 
CHECK-LE-NEXT: ldr d0, [x0] +; CHECK-LE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-LE-NEXT: ret +; +; CHECK-BE-LABEL: sext_v4i16_v4i32: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: ld1 { v0.4h }, [x0] +; CHECK-BE-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BE-NEXT: rev64 v0.4s, v0.4s +; CHECK-BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-BE-NEXT: ret + %x = load <4 x i16>, ptr %a + %y = sext <4 x i16> %x to <4 x i32> + ret <4 x i32> %y +} diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll index e85e808921c87..a302ddf483caa 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll @@ -219,21 +219,17 @@ define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind { define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind { ; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64: ; CHECK-NEON: // %bb.0: -; CHECK-NEON-NEXT: ldrh w8, [x0, #2] -; CHECK-NEON-NEXT: ldr h0, [x0] +; CHECK-NEON-NEXT: ldr s0, [x0] ; CHECK-NEON-NEXT: ldr d1, [x1] -; CHECK-NEON-NEXT: mov v0.d[1], x8 -; CHECK-NEON-NEXT: xtn v0.2s, v0.2d +; CHECK-NEON-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s ; CHECK-NEON-NEXT: ret ; ; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64: ; CHECK-SVE: // %bb.0: -; CHECK-SVE-NEXT: ldrh w8, [x0, #2] -; CHECK-SVE-NEXT: ldr h0, [x0] +; CHECK-SVE-NEXT: ldr s0, [x0] ; CHECK-SVE-NEXT: ldr d1, [x1] -; CHECK-SVE-NEXT: mov v0.d[1], x8 -; CHECK-SVE-NEXT: xtn v0.2s, v0.2d +; CHECK-SVE-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s ; CHECK-SVE-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/add.ll b/llvm/test/CodeGen/AArch64/add.ll index 96168cb80196f..7502db4c5aa93 100644 --- a/llvm/test/CodeGen/AArch64/add.ll +++ b/llvm/test/CodeGen/AArch64/add.ll @@ -56,13 +56,11 @@ entry: define void @v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] -; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: stur b1, [x0, #1] @@ -101,10 +99,9 @@ define void @v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -228,13 +225,9 @@ entry: define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] -; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] diff --git 
a/llvm/test/CodeGen/AArch64/andorxor.ll b/llvm/test/CodeGen/AArch64/andorxor.ll index a7875dbebd0e6..d8d003c85eed6 100644 --- a/llvm/test/CodeGen/AArch64/andorxor.ll +++ b/llvm/test/CodeGen/AArch64/andorxor.ll @@ -176,12 +176,12 @@ entry: define void @and_v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: and_v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] @@ -212,12 +212,12 @@ entry: define void @or_v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: or_v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] @@ -248,12 +248,12 @@ entry: define void @xor_v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: xor_v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] @@ -293,10 +293,9 @@ define void @and_v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -345,10 +344,9 @@ define void @or_v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -397,10 +395,9 @@ define void @xor_v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, 
[x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -698,12 +695,10 @@ entry: define void @and_v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: and_v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: zip1 v0.4h, v0.4h, v0.4h +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] @@ -734,12 +729,10 @@ entry: define void @or_v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: or_v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: orr v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] @@ -770,12 +763,10 @@ entry: define void @xor_v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: xor_v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: eor v0.8b, v0.8b, v1.8b ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] diff --git a/llvm/test/CodeGen/AArch64/bitcast.ll b/llvm/test/CodeGen/AArch64/bitcast.ll index 20f19fddf790a..002e6cd509bec 100644 --- a/llvm/test/CodeGen/AArch64/bitcast.ll +++ b/llvm/test/CodeGen/AArch64/bitcast.ll @@ -433,12 +433,8 @@ define <2 x i16> @bitcast_v4i8_v2i16(<4 x i8> %a, <4 x i8> %b){ ; CHECK-SD-NEXT: sub sp, sp, #16 ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: add v0.4h, v0.4h, v1.4h -; CHECK-SD-NEXT: add x8, sp, #12 ; CHECK-SD-NEXT: uzp1 v0.8b, v0.8b, v0.8b -; CHECK-SD-NEXT: str s0, [sp, #12] -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x8] -; CHECK-SD-NEXT: orr x8, x8, #0x2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/ctlz.ll b/llvm/test/CodeGen/AArch64/ctlz.ll index 04124609eec74..b1b869ec9e1ff 100644 --- a/llvm/test/CodeGen/AArch64/ctlz.ll +++ b/llvm/test/CodeGen/AArch64/ctlz.ll @@ -6,11 +6,10 @@ define void @v2i8(ptr %p1) { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ldrb w8, [x0] -; CHECK-SD-NEXT: ldrb w9, [x0, #1] +; CHECK-SD-NEXT: ldr h1, [x0] ; CHECK-SD-NEXT: movi v0.2s, #24 -; CHECK-SD-NEXT: fmov s1, w8 -; CHECK-SD-NEXT: mov v1.s[1], w9 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: clz v1.2s, v1.2s ; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] @@ -47,10 +46,9 @@ define void @v3i8(ptr %p1) { ; CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: 
str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -145,11 +143,9 @@ entry: define void @v2i16(ptr %p1) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ldrh w8, [x0] -; CHECK-SD-NEXT: ldrh w9, [x0, #2] +; CHECK-SD-NEXT: ldr s1, [x0] ; CHECK-SD-NEXT: movi v0.2s, #16 -; CHECK-SD-NEXT: fmov s1, w8 -; CHECK-SD-NEXT: mov v1.s[1], w9 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: clz v1.2s, v1.2s ; CHECK-SD-NEXT: sub v0.2s, v1.2s, v0.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] diff --git a/llvm/test/CodeGen/AArch64/ctpop.ll b/llvm/test/CodeGen/AArch64/ctpop.ll index d547b6bec5b83..9c59f1b233b5d 100644 --- a/llvm/test/CodeGen/AArch64/ctpop.ll +++ b/llvm/test/CodeGen/AArch64/ctpop.ll @@ -6,10 +6,9 @@ define void @v2i8(ptr %p1) { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ldrb w8, [x0] -; CHECK-SD-NEXT: ldrb w9, [x0, #1] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: mov v0.s[1], w9 +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: cnt v0.8b, v0.8b ; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b ; CHECK-SD-NEXT: uaddlp v0.2s, v0.4h @@ -46,10 +45,9 @@ define void @v3i8(ptr %p1) { ; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -143,10 +141,8 @@ entry: define void @v2i16(ptr %p1) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ldrh w8, [x0] -; CHECK-SD-NEXT: ldrh w9, [x0, #2] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: mov v0.s[1], w9 +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: cnt v0.8b, v0.8b ; CHECK-SD-NEXT: uaddlp v0.4h, v0.8b ; CHECK-SD-NEXT: uaddlp v0.2s, v0.4h diff --git a/llvm/test/CodeGen/AArch64/cttz.ll b/llvm/test/CodeGen/AArch64/cttz.ll index fc9bf2c0aca65..c9181b4c312d1 100644 --- a/llvm/test/CodeGen/AArch64/cttz.ll +++ b/llvm/test/CodeGen/AArch64/cttz.ll @@ -6,10 +6,10 @@ define void @v2i8(ptr %p1) { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: add x8, x0, #1 +; CHECK-SD-NEXT: ldr h0, [x0] ; CHECK-SD-NEXT: movi v1.2s, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: orr v0.2s, #1, lsl #8 ; CHECK-SD-NEXT: sub v1.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: bic v0.8b, v1.8b, v0.8b @@ -59,10 +59,9 @@ define void @v3i8(ptr %p1) { ; CHECK-SD-NEXT: sub v0.4h, v1.4h, v0.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -219,10 +218,9 @@ entry: define void @v2i16(ptr %p1) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: add x8, x0, #2 +; CHECK-SD-NEXT: 
ldr s0, [x0] ; CHECK-SD-NEXT: movi v1.2s, #1 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: orr v0.2s, #1, lsl #16 ; CHECK-SD-NEXT: sub v1.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: bic v0.8b, v1.8b, v0.8b diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll index cabb0e7278e40..d18cff51c6101 100644 --- a/llvm/test/CodeGen/AArch64/extbinopload.ll +++ b/llvm/test/CodeGen/AArch64/extbinopload.ll @@ -263,16 +263,14 @@ define <16 x i16> @load_v16i8(ptr %p) { define <2 x i16> @std_v2i8_v2i16(ptr %p) { ; CHECK-LABEL: std_v2i8_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w8, [x0, #2] -; CHECK-NEXT: ldrb w9, [x0, #3] -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: fmov s1, w8 -; CHECK-NEXT: mov v0.s[1], w9 -; CHECK-NEXT: ldrb w9, [x0, #1] -; CHECK-NEXT: mov v1.s[1], w9 +; CHECK-NEXT: ldr h0, [x0, #2] +; CHECK-NEXT: ldr h1, [x0] +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: shl v0.2s, v0.2s, #3 -; CHECK-NEXT: add v0.2s, v1.2s, v0.2s +; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %l1 = load <2 x i8>, ptr %p %q = getelementptr i8, ptr %p, i32 2 @@ -1394,12 +1392,12 @@ define <4 x i32> @volatile(ptr %p) { ; CHECK: // %bb.0: ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: ldr s0, [x0] -; CHECK-NEXT: ldr s1, [x0, #4] -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ldr s0, [x0, #4] +; CHECK-NEXT: ldr s1, [x0] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v1.4s, v1.4h, #3 -; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #3 +; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %l1b = load volatile float, ptr %p diff --git a/llvm/test/CodeGen/AArch64/load.ll b/llvm/test/CodeGen/AArch64/load.ll index c4bb6e37d6eaf..b138fa4085427 100644 --- a/llvm/test/CodeGen/AArch64/load.ll +++ b/llvm/test/CodeGen/AArch64/load.ll @@ -230,9 +230,9 @@ define <2 x i64> @load_v2i64(ptr %ptr) { define <2 x i8> @load_v2i8(ptr %ptr, <2 x i8> %b) { ; CHECK-SD-LABEL: load_v2i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-SD-NEXT: ret ; @@ -269,9 +269,8 @@ define <32 x i8> @load_v32i8(ptr %ptr) { define <2 x i16> @load_v2i16(ptr %ptr) { ; CHECK-SD-LABEL: load_v2i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/mul.ll b/llvm/test/CodeGen/AArch64/mul.ll index 9c69a6f03b858..475bd22c6ebcb 100644 --- a/llvm/test/CodeGen/AArch64/mul.ll +++ b/llvm/test/CodeGen/AArch64/mul.ll @@ -68,13 +68,11 @@ entry: define void @v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { 
v1.b }[4], [x9] -; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: umull v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: stur b1, [x0, #1] @@ -113,10 +111,9 @@ define void @v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -240,13 +237,9 @@ entry: define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] -; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: umull v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll index 1c4a504d0ab70..b31a5ea0b5d79 100644 --- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll @@ -159,12 +159,12 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: shl v1.2s, v1.2s, #24 ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-SD-NEXT: sqadd v0.2s, v0.2s, v1.2s @@ -212,12 +212,10 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: shl v1.2s, v1.2s, #16 ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16 ; CHECK-SD-NEXT: sqadd v0.2s, v0.2s, v1.2s diff --git a/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll b/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll index 3e708b0678fbc..297b25ed075e4 100644 --- a/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/sitofp-to-tbl.ll @@ -244,11 +244,9 @@ define void @sitofp_v2i8_to_v2f64(ptr %src, ptr %dst) { ; CHECK-NEXT: mov x8, xzr ; CHECK-NEXT: .LBB3_1: // %loop ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add x9, x0, x8, lsl #1 -; CHECK-NEXT: ldrsb w10, [x9] -; 
CHECK-NEXT: ldrsb w9, [x9, #1] -; CHECK-NEXT: fmov s0, w10 -; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: ldr h0, [x0, x8, lsl #1] +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-NEXT: scvtf v0.2d, v0.2d ; CHECK-NEXT: str q0, [x1, x8, lsl #4] diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll index 3af858713525b..02eb40b412efd 100644 --- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll @@ -159,12 +159,12 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: shl v1.2s, v1.2s, #24 ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-SD-NEXT: sqsub v0.2s, v0.2s, v1.2s @@ -212,12 +212,10 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: shl v1.2s, v1.2s, #16 ; CHECK-SD-NEXT: shl v0.2s, v0.2s, #16 ; CHECK-SD-NEXT: sqsub v0.2s, v0.2s, v1.2s diff --git a/llvm/test/CodeGen/AArch64/store.ll b/llvm/test/CodeGen/AArch64/store.ll index 3a9f12b838702..1dc55fccc3dac 100644 --- a/llvm/test/CodeGen/AArch64/store.ll +++ b/llvm/test/CodeGen/AArch64/store.ll @@ -207,13 +207,12 @@ define void @store_v3i8(<3 x i8> %a, ptr %ptr){ ; CHECK-SD-NEXT: sub sp, sp, #16 ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: strb w2, [x3, #2] ; CHECK-SD-NEXT: mov v0.h[1], w1 ; CHECK-SD-NEXT: mov v0.h[2], w2 ; CHECK-SD-NEXT: xtn v0.8b, v0.8h -; CHECK-SD-NEXT: str s0, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] -; CHECK-SD-NEXT: strb w2, [x3, #2] -; CHECK-SD-NEXT: strh w8, [x3] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: str h0, [x3] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/sub.ll b/llvm/test/CodeGen/AArch64/sub.ll index 5e278d59b6591..dd920b98e18eb 100644 --- a/llvm/test/CodeGen/AArch64/sub.ll +++ b/llvm/test/CodeGen/AArch64/sub.ll @@ -56,13 +56,11 @@ entry: define void @v2i8(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.b }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.b }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #1 -; CHECK-SD-NEXT: add x9, x1, #1 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x9] -; CHECK-SD-NEXT: sub v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: usubl v0.4s, v0.4h, v1.4h ; 
CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x0] ; CHECK-SD-NEXT: stur b1, [x0, #1] @@ -101,10 +99,9 @@ define void @v3i8(ptr %p1, ptr %p2) { ; CHECK-SD-NEXT: sub v0.4h, v0.4h, v1.4h ; CHECK-SD-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; CHECK-SD-NEXT: mov h0, v0.h[2] -; CHECK-SD-NEXT: str s1, [sp, #12] -; CHECK-SD-NEXT: ldrh w8, [sp, #12] +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: stur b0, [x0, #2] -; CHECK-SD-NEXT: strh w8, [x0] +; CHECK-SD-NEXT: str h1, [x0] ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret ; @@ -228,13 +225,9 @@ entry: define void @v2i16(ptr %p1, ptr %p2) { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ld1 { v0.h }[0], [x0] -; CHECK-SD-NEXT: ld1 { v1.h }[0], [x1] -; CHECK-SD-NEXT: add x8, x0, #2 -; CHECK-SD-NEXT: add x9, x1, #2 -; CHECK-SD-NEXT: ld1 { v0.h }[2], [x8] -; CHECK-SD-NEXT: ld1 { v1.h }[2], [x9] -; CHECK-SD-NEXT: sub v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: usubl v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x0] ; CHECK-SD-NEXT: str h1, [x0, #2] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll index ba7bee9a94bac..a77c74ab67b80 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll @@ -7,8 +7,10 @@ target triple = "aarch64-unknown-linux-gnu" define <4 x i32> @load_zext_v4i16i32(ptr %ap) vscale_range(2,0) #0 { ; CHECK-LABEL: load_zext_v4i16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldp s0, s1, [x0] ; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %ap %val = zext <4 x i16> %a to <4 x i32> @@ -97,8 +99,10 @@ define void @load_zext_v64i16i32(ptr %ap, ptr %b) #0 { define <4 x i32> @load_sext_v4i16i32(ptr %ap) vscale_range(2,0) #0 { ; CHECK-LABEL: load_sext_v4i16i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldp s0, s1, [x0] ; CHECK-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-NEXT: sshll v1.4s, v1.4h, #0 +; CHECK-NEXT: mov v0.d[1], v1.d[0] ; CHECK-NEXT: ret %a = load <4 x i16>, ptr %ap %val = sext <4 x i16> %a to <4 x i32> diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll index 6fd5b820a2242..b457e0307fbe1 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -12,11 +12,10 @@ target triple = "aarch64-unknown-linux-gnu" define void @masked_gather_v2i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_gather_v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: ldrb w9, [x0, #1] +; CHECK-NEXT: ldr h0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: cmeq v0.2s, v0.2s, #0 ; CHECK-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 @@ -165,11 +164,9 @@ define void @masked_gather_v32i8(ptr %a, ptr %b) vscale_range(16,0) #0 { define void @masked_gather_v2i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_gather_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrh w8, [x0] -; CHECK-NEXT: ldrh w9, [x0, #2] +; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fmov s0, w8 
-; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: cmeq v0.2s, v0.2s, #0 ; CHECK-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll index ed03f9b322432..4fb3bf7392d4e 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -12,11 +12,10 @@ target triple = "aarch64-unknown-linux-gnu" define void @masked_scatter_v2i8(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_scatter_v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: ldrb w9, [x0, #1] +; CHECK-NEXT: ldr h0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: cmeq v1.2s, v0.2s, #0 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: sshll v1.2d, v1.2s, #0 @@ -159,11 +158,9 @@ define void @masked_scatter_v32i8(ptr %a, ptr %b) vscale_range(16,0) #0 { define void @masked_scatter_v2i16(ptr %a, ptr %b) vscale_range(2,0) #0 { ; CHECK-LABEL: masked_scatter_v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrh w8, [x0] -; CHECK-NEXT: ldrh w9, [x0, #2] +; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: ptrue p0.d, vl2 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: mov v0.s[1], w9 +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-NEXT: cmeq v1.2s, v0.2s, #0 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 ; CHECK-NEXT: sshll v1.2d, v1.2s, #0 diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll index 3cfb24aaccb11..cd02d18e61643 100644 --- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll @@ -156,16 +156,12 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldrb w8, [x0] -; CHECK-SD-NEXT: ldrb w9, [x1] +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] ; CHECK-SD-NEXT: movi d2, #0x0000ff000000ff -; CHECK-SD-NEXT: ldrb w10, [x0, #1] -; CHECK-SD-NEXT: ldrb w11, [x1, #1] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: fmov s1, w9 -; CHECK-SD-NEXT: mov v0.s[1], w10 -; CHECK-SD-NEXT: mov v1.s[1], w11 -; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: umin v0.2s, v0.2s, v2.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x2] @@ -210,16 +206,10 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldrh w8, [x0] -; CHECK-SD-NEXT: ldrh w9, [x1] +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] ; CHECK-SD-NEXT: movi d2, #0x00ffff0000ffff -; CHECK-SD-NEXT: ldrh w10, [x0, #2] -; CHECK-SD-NEXT: ldrh w11, [x1, #2] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: fmov s1, w9 -; CHECK-SD-NEXT: mov v0.s[1], w10 -; CHECK-SD-NEXT: mov v1.s[1], w11 -; CHECK-SD-NEXT: add v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: umin v0.2s, v0.2s, v2.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x2] diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll index a71cf95a728db..ef70137e6deee 100644 
--- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll +++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll @@ -156,14 +156,12 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i8(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i8: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldrb w8, [x0] -; CHECK-SD-NEXT: ldrb w9, [x1] -; CHECK-SD-NEXT: ldrb w10, [x0, #1] -; CHECK-SD-NEXT: ldrb w11, [x1, #1] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: fmov s1, w9 -; CHECK-SD-NEXT: mov v0.s[1], w10 -; CHECK-SD-NEXT: mov v1.s[1], w11 +; CHECK-SD-NEXT: ldr h0, [x0] +; CHECK-SD-NEXT: ldr h1, [x1] +; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-SD-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: uqsub v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str b0, [x2] @@ -208,14 +206,10 @@ define void @v4i16(ptr %px, ptr %py, ptr %pz) nounwind { define void @v2i16(ptr %px, ptr %py, ptr %pz) nounwind { ; CHECK-SD-LABEL: v2i16: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: ldrh w8, [x0] -; CHECK-SD-NEXT: ldrh w9, [x1] -; CHECK-SD-NEXT: ldrh w10, [x0, #2] -; CHECK-SD-NEXT: ldrh w11, [x1, #2] -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: fmov s1, w9 -; CHECK-SD-NEXT: mov v0.s[1], w10 -; CHECK-SD-NEXT: mov v1.s[1], w11 +; CHECK-SD-NEXT: ldr s0, [x0] +; CHECK-SD-NEXT: ldr s1, [x1] +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-SD-NEXT: uqsub v0.2s, v0.2s, v1.2s ; CHECK-SD-NEXT: mov s1, v0.s[1] ; CHECK-SD-NEXT: str h0, [x2] diff --git a/llvm/test/CodeGen/AArch64/v3f-to-int.ll b/llvm/test/CodeGen/AArch64/v3f-to-int.ll index f6553b6acec9d..6d4061fb02cff 100644 --- a/llvm/test/CodeGen/AArch64/v3f-to-int.ll +++ b/llvm/test/CodeGen/AArch64/v3f-to-int.ll @@ -1,9 +1,18 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s -; CHECK-LABEL: convert_v3f32 -; CHECK: strb -; CHECK: strh define void @convert_v3f32() { +; CHECK-LABEL: convert_v3f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: str wzr, [sp, #12] +; CHECK-NEXT: ldr s0, [sp, #12] +; CHECK-NEXT: strb wzr, [x8] +; CHECK-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-NEXT: str h0, [x8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret entry: br label %bb diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll index 80029fb717575..ee74984125f77 100644 --- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll +++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll @@ -896,16 +896,13 @@ define <2 x i8> @vector_to_vector_cast(<16 x i1> %arg) nounwind { ; CHECK-SD-NEXT: shl.16b v0, v0, #7 ; CHECK-SD-NEXT: adrp x8, lCPI20_0@PAGE ; CHECK-SD-NEXT: ldr q1, [x8, lCPI20_0@PAGEOFF] -; CHECK-SD-NEXT: add x8, sp, #14 ; CHECK-SD-NEXT: cmlt.16b v0, v0, #0 ; CHECK-SD-NEXT: and.16b v0, v0, v1 ; CHECK-SD-NEXT: ext.16b v1, v0, v0, #8 ; CHECK-SD-NEXT: zip1.16b v0, v0, v1 ; CHECK-SD-NEXT: addv.8h h0, v0 -; CHECK-SD-NEXT: str h0, [sp, #14] -; CHECK-SD-NEXT: ld1.b { v0 }[0], [x8] -; CHECK-SD-NEXT: orr x8, x8, #0x1 -; CHECK-SD-NEXT: ld1.b { v0 }[4], [x8] +; CHECK-SD-NEXT: ushll.8h v0, v0, #0 +; CHECK-SD-NEXT: ushll.4s v0, v0, #0 ; CHECK-SD-NEXT: ; kill: def $d0 killed $d0 killed $q0 ; CHECK-SD-NEXT: add sp, sp, #16 ; CHECK-SD-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll 
b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index 7d3f5bc270d6b..60414adba75fc 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -372,13 +372,13 @@ define void @store_trunc_from_64bits(ptr %src, ptr %dst) { ; BE-NEXT: ldr s0, [x0] ; BE-NEXT: ldrh w8, [x0, #4] ; BE-NEXT: rev32 v0.4h, v0.4h +; BE-NEXT: strb w8, [x1, #2] ; BE-NEXT: mov v0.h[2], w8 ; BE-NEXT: uzp1 v0.8b, v0.8b, v0.8b ; BE-NEXT: rev32 v0.16b, v0.16b -; BE-NEXT: str s0, [sp, #12] -; BE-NEXT: ldrh w9, [sp, #12] -; BE-NEXT: strb w8, [x1, #2] -; BE-NEXT: strh w9, [x1] +; BE-NEXT: rev32 v0.4h, v0.4h +; BE-NEXT: ushll v0.4s, v0.4h, #0 +; BE-NEXT: str h0, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret entry: @@ -422,10 +422,10 @@ define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #2] -; BE-NEXT: strh w8, [x1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret entry: @@ -604,10 +604,10 @@ define void @shift_trunc_store(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #2] -; BE-NEXT: strh w8, [x1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -638,10 +638,10 @@ define void @shift_trunc_store_default_align(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #2] -; BE-NEXT: strh w8, [x1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -672,10 +672,10 @@ define void @shift_trunc_store_align_4(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #2] -; BE-NEXT: strh w8, [x1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -706,10 +706,10 @@ define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #3] -; BE-NEXT: sturh w8, [x1, #1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: stur h1, [x1, #1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -741,10 +741,10 @@ define void @shift_trunc_store_const_offset_3(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #5] -; BE-NEXT: sturh w8, [x1, #3] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: stur h1, [x1, #3] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -764,10 +764,9 @@ define 
void @shift_trunc_volatile_store(ptr %src, ptr %dst) { ; CHECK-NEXT: shrn.4h v0, v0, #16 ; CHECK-NEXT: uzp1.8b v1, v0, v0 ; CHECK-NEXT: mov h0, v0[2] -; CHECK-NEXT: str s1, [sp, #12] -; CHECK-NEXT: ldrh w8, [sp, #12] +; CHECK-NEXT: ushll.4s v1, v1, #0 ; CHECK-NEXT: stur b0, [x1, #2] -; CHECK-NEXT: strh w8, [x1] +; CHECK-NEXT: str h1, [x1] ; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret ; @@ -780,10 +779,10 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #12] -; BE-NEXT: ldrh w8, [sp, #12] ; BE-NEXT: stur b0, [x1, #2] -; BE-NEXT: strh w8, [x1] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x1] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i32>, ptr %src @@ -832,10 +831,10 @@ define void @load_v3i8_zext_to_3xi32_add_trunc_store(ptr %src) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #8] -; BE-NEXT: ldrh w8, [sp, #8] ; BE-NEXT: stur b0, [x0, #2] -; BE-NEXT: strh w8, [x0] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x0] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i8>, ptr %src, align 1 @@ -885,10 +884,10 @@ define void @load_v3i8_sext_to_3xi32_add_trunc_store(ptr %src) { ; BE-NEXT: uzp1 v1.8b, v0.8b, v0.8b ; BE-NEXT: mov h0, v0.h[2] ; BE-NEXT: rev32 v1.16b, v1.16b -; BE-NEXT: str s1, [sp, #8] -; BE-NEXT: ldrh w8, [sp, #8] ; BE-NEXT: stur b0, [x0, #2] -; BE-NEXT: strh w8, [x0] +; BE-NEXT: rev32 v1.4h, v1.4h +; BE-NEXT: ushll v1.4s, v1.4h, #0 +; BE-NEXT: str h1, [x0] ; BE-NEXT: add sp, sp, #16 ; BE-NEXT: ret %l = load <3 x i8>, ptr %src, align 1 From 21e0b56d7afc2f1af0ad5b728fcc039bfe1d37ff Mon Sep 17 00:00:00 2001 From: David Green Date: Mon, 17 Nov 2025 19:47:36 +0000 Subject: [PATCH 42/67] [AArch64][GlobalISel] Add basic GISel test coverage for lround and llround. 
NFC --- .../test/CodeGen/AArch64/llround-conv-fp16.ll | 8 ++- llvm/test/CodeGen/AArch64/llround-conv.ll | 67 ++++++++++++------- llvm/test/CodeGen/AArch64/lround-conv-fp16.ll | 8 ++- llvm/test/CodeGen/AArch64/lround-conv.ll | 55 +++++++++------ 4 files changed, 88 insertions(+), 50 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/llround-conv-fp16.ll b/llvm/test/CodeGen/AArch64/llround-conv-fp16.ll index 4bf65e7d6fd08..cb042757a4a42 100644 --- a/llvm/test/CodeGen/AArch64/llround-conv-fp16.ll +++ b/llvm/test/CodeGen/AArch64/llround-conv-fp16.ll @@ -1,6 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK-NOFP16 ; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK-FP16 +; RUN: llc < %s -mtriple=aarch64 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK-NOFP16,CHECK-GI +; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK-FP16,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for testmhhs +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmhws +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmhxs define i16 @testmhhs(half %x) { ; CHECK-NOFP16-LABEL: testmhhs: @@ -55,5 +61,3 @@ entry: %0 = tail call i64 @llvm.llround.i64.f16(half %x) ret i64 %0 } - -declare i64 @llvm.llround.i64.f16(half) nounwind readnone diff --git a/llvm/test/CodeGen/AArch64/llround-conv.ll b/llvm/test/CodeGen/AArch64/llround-conv.ll index 797136037f0e9..4cc089804ce97 100644 --- a/llvm/test/CodeGen/AArch64/llround-conv.ll +++ b/llvm/test/CodeGen/AArch64/llround-conv.ll @@ -1,60 +1,75 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc < %s -mtriple=aarch64 -mattr=+neon | FileCheck %s +; RUN: llc < %s -mtriple=aarch64 -mattr=+neon -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for testmswl +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmsll -; CHECK-LABEL: testmsws: -; CHECK: fcvtas x0, s0 -; CHECK: ret define i32 @testmsws(float %x) { +; CHECK-LABEL: testmsws: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtas x0, s0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret entry: - %0 = tail call i64 @llvm.llround.f32(float %x) + %0 = tail call i64 @llvm.llround.i64.f32(float %x) %conv = trunc i64 %0 to i32 ret i32 %conv } -; CHECK-LABEL: testmsxs: -; CHECK: fcvtas x0, s0 -; CHECK-NEXT: ret define i64 @testmsxs(float %x) { +; CHECK-LABEL: testmsxs: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtas x0, s0 +; CHECK-NEXT: ret entry: - %0 = tail call i64 @llvm.llround.f32(float %x) + %0 = tail call i64 @llvm.llround.i64.f32(float %x) ret i64 %0 } -; CHECK-LABEL: testmswd: -; CHECK: fcvtas x0, d0 -; CHECK: ret define i32 @testmswd(double %x) { +; CHECK-LABEL: testmswd: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtas x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret entry: - %0 = tail call i64 @llvm.llround.f64(double %x) + %0 = tail call i64 @llvm.llround.i64.f64(double %x) %conv = trunc i64 %0 to i32 ret i32 %conv } -; CHECK-LABEL: testmsxd: -; CHECK: fcvtas x0, d0 -; CHECK-NEXT: ret define i64 @testmsxd(double %x) { +; CHECK-LABEL: testmsxd: +; 
CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtas x0, d0 +; CHECK-NEXT: ret entry: - %0 = tail call i64 @llvm.llround.f64(double %x) + %0 = tail call i64 @llvm.llround.i64.f64(double %x) ret i64 %0 } -; CHECK-LABEL: testmswl: -; CHECK: bl llroundl define i32 @testmswl(fp128 %x) { +; CHECK-LABEL: testmswl: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl llroundl +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret entry: - %0 = tail call i64 @llvm.llround.f128(fp128 %x) + %0 = tail call i64 @llvm.llround.i64.f128(fp128 %x) %conv = trunc i64 %0 to i32 ret i32 %conv } -; CHECK-LABEL: testmsll: -; CHECK: b llroundl define i64 @testmsll(fp128 %x) { +; CHECK-LABEL: testmsll: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: b llroundl entry: - %0 = tail call i64 @llvm.llround.f128(fp128 %x) + %0 = tail call i64 @llvm.llround.i64.f128(fp128 %x) ret i64 %0 } - -declare i64 @llvm.llround.f32(float) nounwind readnone -declare i64 @llvm.llround.f64(double) nounwind readnone -declare i64 @llvm.llround.f128(fp128) nounwind readnone diff --git a/llvm/test/CodeGen/AArch64/lround-conv-fp16.ll b/llvm/test/CodeGen/AArch64/lround-conv-fp16.ll index bf78fd456eac0..a29dea0eb9f9f 100644 --- a/llvm/test/CodeGen/AArch64/lround-conv-fp16.ll +++ b/llvm/test/CodeGen/AArch64/lround-conv-fp16.ll @@ -1,6 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK-NOFP16 ; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK-FP16 +; RUN: llc < %s -mtriple=aarch64 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK-NOFP16,CHECK-GI +; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK-FP16,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for testmhhs +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmhws +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmhxs define i16 @testmhhs(half %x) { ; CHECK-NOFP16-LABEL: testmhhs: @@ -55,5 +61,3 @@ entry: %0 = tail call i64 @llvm.lround.i64.f16(half %x) ret i64 %0 } - -declare i64 @llvm.lround.i64.f16(half) nounwind readnone diff --git a/llvm/test/CodeGen/AArch64/lround-conv.ll b/llvm/test/CodeGen/AArch64/lround-conv.ll index 678d3149f20cc..0bf82b538e70c 100644 --- a/llvm/test/CodeGen/AArch64/lround-conv.ll +++ b/llvm/test/CodeGen/AArch64/lround-conv.ll @@ -1,60 +1,75 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc < %s -mtriple=aarch64 -mattr=+neon | FileCheck %s +; RUN: llc < %s -mtriple=aarch64 -mattr=+neon -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for testmswl +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for testmsll -; CHECK-LABEL: testmsws: -; CHECK: fcvtas x0, s0 -; CHECK: ret define i32 @testmsws(float %x) { +; CHECK-LABEL: testmsws: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtas x0, s0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret entry: %0 = tail call i64 @llvm.lround.i64.f32(float %x) %conv = trunc i64 %0 to i32 ret i32 
%conv } -; CHECK-LABEL: testmsxs: -; CHECK: fcvtas x0, s0 -; CHECK-NEXT: ret define i64 @testmsxs(float %x) { +; CHECK-LABEL: testmsxs: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtas x0, s0 +; CHECK-NEXT: ret entry: %0 = tail call i64 @llvm.lround.i64.f32(float %x) ret i64 %0 } -; CHECK-LABEL: testmswd: -; CHECK: fcvtas x0, d0 -; CHECK: ret define i32 @testmswd(double %x) { +; CHECK-LABEL: testmswd: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtas x0, d0 +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret entry: %0 = tail call i64 @llvm.lround.i64.f64(double %x) %conv = trunc i64 %0 to i32 ret i32 %conv } -; CHECK-LABEL: testmsxd: -; CHECK: fcvtas x0, d0 -; CHECK-NEXT: ret define i64 @testmsxd(double %x) { +; CHECK-LABEL: testmsxd: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtas x0, d0 +; CHECK-NEXT: ret entry: %0 = tail call i64 @llvm.lround.i64.f64(double %x) ret i64 %0 } -; CHECK-LABEL: testmswl: -; CHECK: bl lroundl define i32 @testmswl(fp128 %x) { +; CHECK-LABEL: testmswl: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl lroundl +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret entry: %0 = tail call i64 @llvm.lround.i64.f128(fp128 %x) %conv = trunc i64 %0 to i32 ret i32 %conv } -; CHECK-LABEL: testmsll: -; CHECK: b lroundl define i64 @testmsll(fp128 %x) { +; CHECK-LABEL: testmsll: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: b lroundl entry: %0 = tail call i64 @llvm.lround.i64.f128(fp128 %x) ret i64 %0 } - -declare i64 @llvm.lround.i64.f32(float) nounwind readnone -declare i64 @llvm.lround.i64.f64(double) nounwind readnone -declare i64 @llvm.lround.i64.f128(fp128) nounwind readnone From 320c18a066b29e90ab5f3ef33b6c510f28edeb80 Mon Sep 17 00:00:00 2001 From: Sergei Barannikov Date: Mon, 17 Nov 2025 23:03:45 +0300 Subject: [PATCH 43/67] [SystemZ] TableGen-erate node descriptions (#168113) This allows SDNodes to be validated against their expected type profiles and reduces the number of changes required to add a new node. There is only one node that is missing a description -- `GET_CCMASK`, others were successfully imported. Part of #119709. 
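For reference, a minimal sketch of the kind of definition the generated
SystemZGenSDNodeInfo.inc is derived from, in the style of
SystemZOperators.td (the names and type constraints below are
illustrative, not an exact hunk from this change):

  // One i32 result (the CC value) and three operands: the two values
  // to compare, which must have the same type, plus an i32 holding the
  // SystemZICMP comparison kind.
  def SDT_ZICmp : SDTypeProfile<1, 3,
                                [SDTCisVT<0, i32>,
                                 SDTCisSameAs<1, 2>,
                                 SDTCisVT<3, i32>]>;
  def z_icmp : SDNode<"SystemZISD::ICMP", SDT_ZICmp>;

Given such a definition, -gen-sd-node-info emits both the node-name
string (replacing the hand-maintained OPCODE list in
getTargetNodeName()) and a type profile that each node's operands and
results can be validated against.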
Pull Request: https://github.com/llvm/llvm-project/pull/168113 --- llvm/lib/Target/SystemZ/CMakeLists.txt | 1 + .../Target/SystemZ/SystemZISelLowering.cpp | 147 ------- llvm/lib/Target/SystemZ/SystemZISelLowering.h | 386 ------------------ llvm/lib/Target/SystemZ/SystemZOperators.td | 279 ++++++++++++- .../SystemZ/SystemZSelectionDAGInfo.cpp | 20 +- .../Target/SystemZ/SystemZSelectionDAGInfo.h | 29 +- 6 files changed, 295 insertions(+), 567 deletions(-) diff --git a/llvm/lib/Target/SystemZ/CMakeLists.txt b/llvm/lib/Target/SystemZ/CMakeLists.txt index 0d8f3eac6ee4f..6d94a755322df 100644 --- a/llvm/lib/Target/SystemZ/CMakeLists.txt +++ b/llvm/lib/Target/SystemZ/CMakeLists.txt @@ -11,6 +11,7 @@ tablegen(LLVM SystemZGenDisassemblerTables.inc -gen-disassembler) tablegen(LLVM SystemZGenInstrInfo.inc -gen-instr-info) tablegen(LLVM SystemZGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM SystemZGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM SystemZGenSDNodeInfo.inc -gen-sd-node-info) tablegen(LLVM SystemZGenSubtargetInfo.inc -gen-subtarget) add_public_tablegen_target(SystemZCommonTableGen) diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 58109acc92015..dfd76f9b0427f 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -7423,153 +7423,6 @@ SystemZTargetLowering::ReplaceNodeResults(SDNode *N, return LowerOperationWrapper(N, Results, DAG); } -const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { -#define OPCODE(NAME) case SystemZISD::NAME: return "SystemZISD::" #NAME - switch ((SystemZISD::NodeType)Opcode) { - case SystemZISD::FIRST_NUMBER: break; - OPCODE(RET_GLUE); - OPCODE(CALL); - OPCODE(SIBCALL); - OPCODE(TLS_GDCALL); - OPCODE(TLS_LDCALL); - OPCODE(PCREL_WRAPPER); - OPCODE(PCREL_OFFSET); - OPCODE(ICMP); - OPCODE(FCMP); - OPCODE(STRICT_FCMP); - OPCODE(STRICT_FCMPS); - OPCODE(TM); - OPCODE(BR_CCMASK); - OPCODE(SELECT_CCMASK); - OPCODE(ADJDYNALLOC); - OPCODE(PROBED_ALLOCA); - OPCODE(POPCNT); - OPCODE(SMUL_LOHI); - OPCODE(UMUL_LOHI); - OPCODE(SDIVREM); - OPCODE(UDIVREM); - OPCODE(SADDO); - OPCODE(SSUBO); - OPCODE(UADDO); - OPCODE(USUBO); - OPCODE(ADDCARRY); - OPCODE(SUBCARRY); - OPCODE(GET_CCMASK); - OPCODE(MVC); - OPCODE(NC); - OPCODE(OC); - OPCODE(XC); - OPCODE(CLC); - OPCODE(MEMSET_MVC); - OPCODE(STPCPY); - OPCODE(STRCMP); - OPCODE(SEARCH_STRING); - OPCODE(IPM); - OPCODE(TBEGIN); - OPCODE(TBEGIN_NOFLOAT); - OPCODE(TEND); - OPCODE(BYTE_MASK); - OPCODE(ROTATE_MASK); - OPCODE(REPLICATE); - OPCODE(JOIN_DWORDS); - OPCODE(SPLAT); - OPCODE(MERGE_HIGH); - OPCODE(MERGE_LOW); - OPCODE(SHL_DOUBLE); - OPCODE(PERMUTE_DWORDS); - OPCODE(PERMUTE); - OPCODE(PACK); - OPCODE(PACKS_CC); - OPCODE(PACKLS_CC); - OPCODE(UNPACK_HIGH); - OPCODE(UNPACKL_HIGH); - OPCODE(UNPACK_LOW); - OPCODE(UNPACKL_LOW); - OPCODE(VSHL_BY_SCALAR); - OPCODE(VSRL_BY_SCALAR); - OPCODE(VSRA_BY_SCALAR); - OPCODE(VROTL_BY_SCALAR); - OPCODE(SHL_DOUBLE_BIT); - OPCODE(SHR_DOUBLE_BIT); - OPCODE(VSUM); - OPCODE(VACC); - OPCODE(VSCBI); - OPCODE(VAC); - OPCODE(VSBI); - OPCODE(VACCC); - OPCODE(VSBCBI); - OPCODE(VMAH); - OPCODE(VMALH); - OPCODE(VME); - OPCODE(VMLE); - OPCODE(VMO); - OPCODE(VMLO); - OPCODE(VICMPE); - OPCODE(VICMPH); - OPCODE(VICMPHL); - OPCODE(VICMPES); - OPCODE(VICMPHS); - OPCODE(VICMPHLS); - OPCODE(VFCMPE); - OPCODE(STRICT_VFCMPE); - OPCODE(STRICT_VFCMPES); - OPCODE(VFCMPH); - OPCODE(STRICT_VFCMPH); - OPCODE(STRICT_VFCMPHS); - OPCODE(VFCMPHE); - OPCODE(STRICT_VFCMPHE); - 
OPCODE(STRICT_VFCMPHES); - OPCODE(VFCMPES); - OPCODE(VFCMPHS); - OPCODE(VFCMPHES); - OPCODE(VFTCI); - OPCODE(VEXTEND); - OPCODE(STRICT_VEXTEND); - OPCODE(VROUND); - OPCODE(STRICT_VROUND); - OPCODE(VTM); - OPCODE(SCMP128HI); - OPCODE(UCMP128HI); - OPCODE(VFAE_CC); - OPCODE(VFAEZ_CC); - OPCODE(VFEE_CC); - OPCODE(VFEEZ_CC); - OPCODE(VFENE_CC); - OPCODE(VFENEZ_CC); - OPCODE(VISTR_CC); - OPCODE(VSTRC_CC); - OPCODE(VSTRCZ_CC); - OPCODE(VSTRS_CC); - OPCODE(VSTRSZ_CC); - OPCODE(TDC); - OPCODE(ATOMIC_SWAPW); - OPCODE(ATOMIC_LOADW_ADD); - OPCODE(ATOMIC_LOADW_SUB); - OPCODE(ATOMIC_LOADW_AND); - OPCODE(ATOMIC_LOADW_OR); - OPCODE(ATOMIC_LOADW_XOR); - OPCODE(ATOMIC_LOADW_NAND); - OPCODE(ATOMIC_LOADW_MIN); - OPCODE(ATOMIC_LOADW_MAX); - OPCODE(ATOMIC_LOADW_UMIN); - OPCODE(ATOMIC_LOADW_UMAX); - OPCODE(ATOMIC_CMP_SWAPW); - OPCODE(ATOMIC_CMP_SWAP); - OPCODE(ATOMIC_LOAD_128); - OPCODE(ATOMIC_STORE_128); - OPCODE(ATOMIC_CMP_SWAP_128); - OPCODE(LRV); - OPCODE(STRV); - OPCODE(VLER); - OPCODE(VSTER); - OPCODE(STCKF); - OPCODE(PREFETCH); - OPCODE(ADA_ENTRY); - } - return nullptr; -#undef OPCODE -} - // Return true if VT is a vector whose elements are a whole number of bytes // in width. Also check for presence of vector support. bool SystemZTargetLowering::canTreatAsByteVector(EVT VT) const { diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index d5b76031766dd..13a1cd1614a53 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -22,390 +22,6 @@ #include namespace llvm { -namespace SystemZISD { -enum NodeType : unsigned { - FIRST_NUMBER = ISD::BUILTIN_OP_END, - - // Return with a glue operand. Operand 0 is the chain operand. - RET_GLUE, - - // Calls a function. Operand 0 is the chain operand and operand 1 - // is the target address. The arguments start at operand 2. - // There is an optional glue operand at the end. - CALL, - SIBCALL, - - // TLS calls. Like regular calls, except operand 1 is the TLS symbol. - // (The call target is implicitly __tls_get_offset.) - TLS_GDCALL, - TLS_LDCALL, - - // Wraps a TargetGlobalAddress that should be loaded using PC-relative - // accesses (LARL). Operand 0 is the address. - PCREL_WRAPPER, - - // Used in cases where an offset is applied to a TargetGlobalAddress. - // Operand 0 is the full TargetGlobalAddress and operand 1 is a - // PCREL_WRAPPER for an anchor point. This is used so that we can - // cheaply refer to either the full address or the anchor point - // as a register base. - PCREL_OFFSET, - - // Integer comparisons. There are three operands: the two values - // to compare, and an integer of type SystemZICMP. - ICMP, - - // Floating-point comparisons. The two operands are the values to compare. - FCMP, - - // Test under mask. The first operand is ANDed with the second operand - // and the condition codes are set on the result. The third operand is - // a boolean that is true if the condition codes need to distinguish - // between CCMASK_TM_MIXED_MSB_0 and CCMASK_TM_MIXED_MSB_1 (which the - // register forms do but the memory forms don't). - TM, - - // Branches if a condition is true. Operand 0 is the chain operand; - // operand 1 is the 4-bit condition-code mask, with bit N in - // big-endian order meaning "branch if CC=N"; operand 2 is the - // target block and operand 3 is the flag operand. - BR_CCMASK, - - // Selects between operand 0 and operand 1. 
Operand 2 is the - // mask of condition-code values for which operand 0 should be - // chosen over operand 1; it has the same form as BR_CCMASK. - // Operand 3 is the flag operand. - SELECT_CCMASK, - - // Evaluates to the gap between the stack pointer and the - // base of the dynamically-allocatable area. - ADJDYNALLOC, - - // For allocating stack space when using stack clash protector. - // Allocation is performed by block, and each block is probed. - PROBED_ALLOCA, - - // Count number of bits set in operand 0 per byte. - POPCNT, - - // Wrappers around the ISD opcodes of the same name. The output is GR128. - // Input operands may be GR64 or GR32, depending on the instruction. - SMUL_LOHI, - UMUL_LOHI, - SDIVREM, - UDIVREM, - - // Add/subtract with overflow/carry. These have the same operands as - // the corresponding standard operations, except with the carry flag - // replaced by a condition code value. - SADDO, SSUBO, UADDO, USUBO, ADDCARRY, SUBCARRY, - - // Set the condition code from a boolean value in operand 0. - // Operand 1 is a mask of all condition-code values that may result of this - // operation, operand 2 is a mask of condition-code values that may result - // if the boolean is true. - // Note that this operation is always optimized away, we will never - // generate any code for it. - GET_CCMASK, - - // Use a series of MVCs to copy bytes from one memory location to another. - // The operands are: - // - the target address - // - the source address - // - the constant length - // - // This isn't a memory opcode because we'd need to attach two - // MachineMemOperands rather than one. - MVC, - - // Similar to MVC, but for logic operations (AND, OR, XOR). - NC, - OC, - XC, - - // Use CLC to compare two blocks of memory, with the same comments - // as for MVC. - CLC, - - // Use MVC to set a block of memory after storing the first byte. - MEMSET_MVC, - - // Use an MVST-based sequence to implement stpcpy(). - STPCPY, - - // Use a CLST-based sequence to implement strcmp(). The two input operands - // are the addresses of the strings to compare. - STRCMP, - - // Use an SRST-based sequence to search a block of memory. The first - // operand is the end address, the second is the start, and the third - // is the character to search for. CC is set to 1 on success and 2 - // on failure. - SEARCH_STRING, - - // Store the CC value in bits 29 and 28 of an integer. - IPM, - - // Transaction begin. The first operand is the chain, the second - // the TDB pointer, and the third the immediate control field. - // Returns CC value and chain. - TBEGIN, - TBEGIN_NOFLOAT, - - // Transaction end. Just the chain operand. Returns CC value and chain. - TEND, - - // Create a vector constant by filling byte N of the result with bit - // 15-N of the single operand. - BYTE_MASK, - - // Create a vector constant by replicating an element-sized RISBG-style mask. - // The first operand specifies the starting set bit and the second operand - // specifies the ending set bit. Both operands count from the MSB of the - // element. - ROTATE_MASK, - - // Replicate a GPR scalar value into all elements of a vector. - REPLICATE, - - // Create a vector from two i64 GPRs. - JOIN_DWORDS, - - // Replicate one element of a vector into all elements. The first operand - // is the vector and the second is the index of the element to replicate. - SPLAT, - - // Interleave elements from the high half of operand 0 and the high half - // of operand 1. - MERGE_HIGH, - - // Likewise for the low halves. 
- MERGE_LOW, - - // Concatenate the vectors in the first two operands, shift them left - // by the third operand, and take the first half of the result. - SHL_DOUBLE, - - // Take one element of the first v2i64 operand and the one element of - // the second v2i64 operand and concatenate them to form a v2i64 result. - // The third operand is a 4-bit value of the form 0A0B, where A and B - // are the element selectors for the first operand and second operands - // respectively. - PERMUTE_DWORDS, - - // Perform a general vector permute on vector operands 0 and 1. - // Each byte of operand 2 controls the corresponding byte of the result, - // in the same way as a byte-level VECTOR_SHUFFLE mask. - PERMUTE, - - // Pack vector operands 0 and 1 into a single vector with half-sized elements. - PACK, - - // Likewise, but saturate the result and set CC. PACKS_CC does signed - // saturation and PACKLS_CC does unsigned saturation. - PACKS_CC, - PACKLS_CC, - - // Unpack the first half of vector operand 0 into double-sized elements. - // UNPACK_HIGH sign-extends and UNPACKL_HIGH zero-extends. - UNPACK_HIGH, - UNPACKL_HIGH, - - // Likewise for the second half. - UNPACK_LOW, - UNPACKL_LOW, - - // Shift/rotate each element of vector operand 0 by the number of bits - // specified by scalar operand 1. - VSHL_BY_SCALAR, - VSRL_BY_SCALAR, - VSRA_BY_SCALAR, - VROTL_BY_SCALAR, - - // Concatenate the vectors in the first two operands, shift them left/right - // bitwise by the third operand, and take the first/last half of the result. - SHL_DOUBLE_BIT, - SHR_DOUBLE_BIT, - - // For each element of the output type, sum across all sub-elements of - // operand 0 belonging to the corresponding element, and add in the - // rightmost sub-element of the corresponding element of operand 1. - VSUM, - - // Compute carry/borrow indication for add/subtract. - VACC, VSCBI, - // Add/subtract with carry/borrow. - VAC, VSBI, - // Compute carry/borrow indication for add/subtract with carry/borrow. - VACCC, VSBCBI, - - // High-word multiply-and-add. - VMAH, VMALH, - // Widen and multiply even/odd vector elements. - VME, VMLE, VMO, VMLO, - - // Compare integer vector operands 0 and 1 to produce the usual 0/-1 - // vector result. VICMPE is for equality, VICMPH for "signed greater than" - // and VICMPHL for "unsigned greater than". - VICMPE, - VICMPH, - VICMPHL, - - // Likewise, but also set the condition codes on the result. - VICMPES, - VICMPHS, - VICMPHLS, - - // Compare floating-point vector operands 0 and 1 to produce the usual 0/-1 - // vector result. VFCMPE is for "ordered and equal", VFCMPH for "ordered and - // greater than" and VFCMPHE for "ordered and greater than or equal to". - VFCMPE, - VFCMPH, - VFCMPHE, - - // Likewise, but also set the condition codes on the result. - VFCMPES, - VFCMPHS, - VFCMPHES, - - // Test floating-point data class for vectors. - VFTCI, - - // Extend the even f32 elements of vector operand 0 to produce a vector - // of f64 elements. - VEXTEND, - - // Round the f64 elements of vector operand 0 to f32s and store them in the - // even elements of the result. - VROUND, - - // AND the two vector operands together and set CC based on the result. - VTM, - - // i128 high integer comparisons. - SCMP128HI, - UCMP128HI, - - // String operations that set CC as a side-effect. - VFAE_CC, - VFAEZ_CC, - VFEE_CC, - VFEEZ_CC, - VFENE_CC, - VFENEZ_CC, - VISTR_CC, - VSTRC_CC, - VSTRCZ_CC, - VSTRS_CC, - VSTRSZ_CC, - - // Test Data Class. 
- // - // Operand 0: the value to test - // Operand 1: the bit mask - TDC, - - // z/OS XPLINK ADA Entry - // Wraps a TargetGlobalAddress that should be loaded from a function's - // AssociatedData Area (ADA). Tha ADA is passed to the function by the - // caller in the XPLink ABI defined register R5. - // Operand 0: the GlobalValue/External Symbol - // Operand 1: the ADA register - // Operand 2: the offset (0 for the first and 8 for the second element in the - // function descriptor) - ADA_ENTRY, - - // Strict variants of scalar floating-point comparisons. - // Quiet and signaling versions. - FIRST_STRICTFP_OPCODE, - STRICT_FCMP = FIRST_STRICTFP_OPCODE, - STRICT_FCMPS, - - // Strict variants of vector floating-point comparisons. - // Quiet and signaling versions. - STRICT_VFCMPE, - STRICT_VFCMPH, - STRICT_VFCMPHE, - STRICT_VFCMPES, - STRICT_VFCMPHS, - STRICT_VFCMPHES, - - // Strict variants of VEXTEND and VROUND. - STRICT_VEXTEND, - STRICT_VROUND, - LAST_STRICTFP_OPCODE = STRICT_VROUND, - - // Wrappers around the inner loop of an 8- or 16-bit ATOMIC_SWAP or - // ATOMIC_LOAD_. - // - // Operand 0: the address of the containing 32-bit-aligned field - // Operand 1: the second operand of , in the high bits of an i32 - // for everything except ATOMIC_SWAPW - // Operand 2: how many bits to rotate the i32 left to bring the first - // operand into the high bits - // Operand 3: the negative of operand 2, for rotating the other way - // Operand 4: the width of the field in bits (8 or 16) - FIRST_MEMORY_OPCODE, - ATOMIC_SWAPW = FIRST_MEMORY_OPCODE, - ATOMIC_LOADW_ADD, - ATOMIC_LOADW_SUB, - ATOMIC_LOADW_AND, - ATOMIC_LOADW_OR, - ATOMIC_LOADW_XOR, - ATOMIC_LOADW_NAND, - ATOMIC_LOADW_MIN, - ATOMIC_LOADW_MAX, - ATOMIC_LOADW_UMIN, - ATOMIC_LOADW_UMAX, - - // A wrapper around the inner loop of an ATOMIC_CMP_SWAP. - // - // Operand 0: the address of the containing 32-bit-aligned field - // Operand 1: the compare value, in the low bits of an i32 - // Operand 2: the swap value, in the low bits of an i32 - // Operand 3: how many bits to rotate the i32 left to bring the first - // operand into the high bits - // Operand 4: the negative of operand 2, for rotating the other way - // Operand 5: the width of the field in bits (8 or 16) - ATOMIC_CMP_SWAPW, - - // Atomic compare-and-swap returning CC value. - // Val, CC, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) - ATOMIC_CMP_SWAP, - - // 128-bit atomic load. - // Val, OUTCHAIN = ATOMIC_LOAD_128(INCHAIN, ptr) - ATOMIC_LOAD_128, - - // 128-bit atomic store. - // OUTCHAIN = ATOMIC_STORE_128(INCHAIN, val, ptr) - ATOMIC_STORE_128, - - // 128-bit atomic compare-and-swap. - // Val, CC, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) - ATOMIC_CMP_SWAP_128, - - // Byte swapping load/store. Same operands as regular load/store. - LRV, STRV, - - // Element swapping load/store. Same operands as regular load/store. - VLER, VSTER, - - // Use STORE CLOCK FAST to store current TOD clock value. - STCKF, - - // Prefetch from the second operand using the 4-bit control code in - // the first operand. The code is 1 for a load prefetch and 2 for - // a store prefetch. - PREFETCH, - LAST_MEMORY_OPCODE = PREFETCH, -}; - -// Return true if OPCODE is some kind of PC-relative address. 
-inline bool isPCREL(unsigned Opcode) { - return Opcode == PCREL_WRAPPER || Opcode == PCREL_OFFSET; -} -} // end namespace SystemZISD namespace SystemZICMP { // Describes whether an integer comparison needs to be signed or unsigned, @@ -532,8 +148,6 @@ class SystemZTargetLowering : public TargetLowering { return true; } - const char *getTargetNodeName(unsigned Opcode) const override; - // This function currently returns cost for srl/ipm/cc sequence for merging. CondMergingParams getJumpConditionMergingParams(Instruction::BinaryOps Opc, const Value *Lhs, diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td index 547d3dcf92804..a02cafaaafcdf 100644 --- a/llvm/lib/Target/SystemZ/SystemZOperators.td +++ b/llvm/lib/Target/SystemZ/SystemZOperators.td @@ -265,74 +265,151 @@ def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_CallSeqEnd, SDNPOutGlue]>; def global_offset_table : SDNode<"ISD::GLOBAL_OFFSET_TABLE", SDTPtrLeaf>; -// Nodes for SystemZISD::*. See SystemZISelLowering.h for more details. +// Return with a glue operand. Operand 0 is the chain operand. def z_retglue : SDNode<"SystemZISD::RET_GLUE", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; + +// Calls a function. Operand 0 is the chain operand and operand 1 +// is the target address. The arguments start at operand 2. +// There is an optional glue operand at the end. def z_call : SDNode<"SystemZISD::CALL", SDT_ZCall, [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>; def z_sibcall : SDNode<"SystemZISD::SIBCALL", SDT_ZCall, [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue, SDNPVariadic]>; +// TLS calls. Like regular calls, except operand 1 is the TLS symbol. +// (The call target is implicitly __tls_get_offset.) def z_tls_gdcall : SDNode<"SystemZISD::TLS_GDCALL", SDT_ZCall, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPVariadic]>; def z_tls_ldcall : SDNode<"SystemZISD::TLS_LDCALL", SDT_ZCall, [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPVariadic]>; + +// Wraps a TargetGlobalAddress that should be loaded using PC-relative +// accesses (LARL). Operand 0 is the address. def z_pcrel_wrapper : SDNode<"SystemZISD::PCREL_WRAPPER", SDT_ZWrapPtr, []>; + +// Used in cases where an offset is applied to a TargetGlobalAddress. +// Operand 0 is the full TargetGlobalAddress and operand 1 is a +// PCREL_WRAPPER for an anchor point. This is used so that we can +// cheaply refer to either the full address or the anchor point +// as a register base. def z_pcrel_offset : SDNode<"SystemZISD::PCREL_OFFSET", SDT_ZWrapOffset, []>; + +// Integer comparisons. There are three operands: the two values +// to compare, and an integer of type SystemZICMP. def z_icmp : SDNode<"SystemZISD::ICMP", SDT_ZICmp>; + +// Floating-point comparisons. The two operands are the values to compare. def z_fcmp : SDNode<"SystemZISD::FCMP", SDT_ZCmp>; -def z_strict_fcmp : SDNode<"SystemZISD::STRICT_FCMP", SDT_ZCmp, - [SDNPHasChain]>; -def z_strict_fcmps : SDNode<"SystemZISD::STRICT_FCMPS", SDT_ZCmp, - [SDNPHasChain]>; + +let IsStrictFP = true in { + // Strict variants of scalar floating-point comparisons. + // Quiet and signaling versions. + def z_strict_fcmp : SDNode<"SystemZISD::STRICT_FCMP", SDT_ZCmp, + [SDNPHasChain]>; + def z_strict_fcmps : SDNode<"SystemZISD::STRICT_FCMPS", SDT_ZCmp, + [SDNPHasChain]>; +} + +// Test under mask. The first operand is ANDed with the second operand +// and the condition codes are set on the result. 
The third operand is +// a boolean that is true if the condition codes need to distinguish +// between CCMASK_TM_MIXED_MSB_0 and CCMASK_TM_MIXED_MSB_1 (which the +// register forms do but the memory forms don't). def z_tm : SDNode<"SystemZISD::TM", SDT_ZICmp>; + +// Branches if a condition is true. Operand 0 is the chain operand; +// operand 1 is the 4-bit condition-code mask, with bit N in +// big-endian order meaning "branch if CC=N"; operand 2 is the +// target block and operand 3 is the flag operand. def z_br_ccmask_1 : SDNode<"SystemZISD::BR_CCMASK", SDT_ZBRCCMask, [SDNPHasChain]>; + +// Selects between operand 0 and operand 1. Operand 2 is the +// mask of condition-code values for which operand 0 should be +// chosen over operand 1; it has the same form as BR_CCMASK. +// Operand 3 is the flag operand. def z_select_ccmask_1 : SDNode<"SystemZISD::SELECT_CCMASK", SDT_ZSelectCCMask>; + +// Store the CC value in bits 29 and 28 of an integer. def z_ipm_1 : SDNode<"SystemZISD::IPM", SDT_ZIPM>; + +// Evaluates to the gap between the stack pointer and the +// base of the dynamically-allocatable area. def z_adjdynalloc : SDNode<"SystemZISD::ADJDYNALLOC", SDT_ZAdjDynAlloc>; + +// For allocating stack space when using stack clash protector. +// Allocation is performed by block, and each block is probed. def z_probed_alloca : SDNode<"SystemZISD::PROBED_ALLOCA", SDT_ZProbedAlloca, [SDNPHasChain]>; + +// Count number of bits set in operand 0 per byte. def z_popcnt : SDNode<"SystemZISD::POPCNT", SDTIntUnaryOp>; + +// Wrappers around the ISD opcodes of the same name. The output is GR128. +// Input operands may be GR64 or GR32, depending on the instruction. def z_smul_lohi : SDNode<"SystemZISD::SMUL_LOHI", SDT_ZGR128Binary>; def z_umul_lohi : SDNode<"SystemZISD::UMUL_LOHI", SDT_ZGR128Binary>; def z_sdivrem : SDNode<"SystemZISD::SDIVREM", SDT_ZGR128Binary>; def z_udivrem : SDNode<"SystemZISD::UDIVREM", SDT_ZGR128Binary>; + +// Add/subtract with overflow/carry. These have the same operands as +// the corresponding standard operations, except with the carry flag +// replaced by a condition code value. def z_saddo : SDNode<"SystemZISD::SADDO", SDT_ZBinaryWithFlags>; def z_ssubo : SDNode<"SystemZISD::SSUBO", SDT_ZBinaryWithFlags>; def z_uaddo : SDNode<"SystemZISD::UADDO", SDT_ZBinaryWithFlags>; def z_usubo : SDNode<"SystemZISD::USUBO", SDT_ZBinaryWithFlags>; def z_addcarry_1 : SDNode<"SystemZISD::ADDCARRY", SDT_ZBinaryWithCarry>; def z_subcarry_1 : SDNode<"SystemZISD::SUBCARRY", SDT_ZBinaryWithCarry>; + +// Compute carry/borrow indication for add/subtract. def z_vacc : SDNode<"SystemZISD::VACC", SDTIntBinOp>; -def z_vac : SDNode<"SystemZISD::VAC", SDT_ZTernary>; -def z_vaccc : SDNode<"SystemZISD::VACCC", SDT_ZTernary>; def z_vscbi : SDNode<"SystemZISD::VSCBI", SDTIntBinOp>; + +// Add/subtract with carry/borrow. +def z_vac : SDNode<"SystemZISD::VAC", SDT_ZTernary>; def z_vsbi : SDNode<"SystemZISD::VSBI", SDT_ZTernary>; + +// Compute carry/borrow indication for add/subtract with carry/borrow. +def z_vaccc : SDNode<"SystemZISD::VACCC", SDT_ZTernary>; def z_vsbcbi : SDNode<"SystemZISD::VSBCBI", SDT_ZTernary>; + +// High-word multiply-and-add. def z_vmah : SDNode<"SystemZISD::VMAH", SDT_ZTernary>; def z_vmalh : SDNode<"SystemZISD::VMALH", SDT_ZTernary>; + +// Widen and multiply even/odd vector elements. 
def z_vme : SDNode<"SystemZISD::VME", SDT_ZBinaryConv>; def z_vmle : SDNode<"SystemZISD::VMLE", SDT_ZBinaryConv>; def z_vmo : SDNode<"SystemZISD::VMO", SDT_ZBinaryConv>; def z_vmlo : SDNode<"SystemZISD::VMLO", SDT_ZBinaryConv>; +// Byte swapping load/store. Same operands as regular load/store. def z_loadbswap : SDNode<"SystemZISD::LRV", SDTLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def z_storebswap : SDNode<"SystemZISD::STRV", SDTStore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +// Element swapping load/store. Same operands as regular load/store. def z_loadeswap : SDNode<"SystemZISD::VLER", SDTLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; def z_storeeswap : SDNode<"SystemZISD::VSTER", SDTStore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +// Use STORE CLOCK FAST to store current TOD clock value. def z_stckf : SDNode<"SystemZISD::STCKF", SDT_ZStoreInherent, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +// Test Data Class. +// +// Operand 0: the value to test +// Operand 1: the bit mask def z_tdc : SDNode<"SystemZISD::TDC", SDT_ZTest>; def z_eh_sjlj_setjmp : SDNode<"ISD::EH_SJLJ_SETJMP", SDT_ZSetJmp, @@ -346,26 +423,75 @@ def z_vector_insert : SDNode<"ISD::INSERT_VECTOR_ELT", SDT_ZInsertVectorElt>; def z_vector_extract : SDNode<"ISD::EXTRACT_VECTOR_ELT", SDT_ZExtractVectorElt>; + +// Create a vector constant by filling byte N of the result with bit +// 15-N of the single operand. def z_byte_mask : SDNode<"SystemZISD::BYTE_MASK", SDT_ZReplicate>; + +// Create a vector constant by replicating an element-sized RISBG-style mask. +// The first operand specifies the starting set bit and the second operand +// specifies the ending set bit. Both operands count from the MSB of the +// element. def z_rotate_mask : SDNode<"SystemZISD::ROTATE_MASK", SDT_ZRotateMask>; + +// Replicate a GPR scalar value into all elements of a vector. def z_replicate : SDNode<"SystemZISD::REPLICATE", SDT_ZReplicate>; + +// Create a vector from two i64 GPRs. def z_join_dwords : SDNode<"SystemZISD::JOIN_DWORDS", SDT_ZJoinDwords>; + +// Replicate one element of a vector into all elements. The first operand +// is the vector and the second is the index of the element to replicate. def z_splat : SDNode<"SystemZISD::SPLAT", SDT_ZVecBinaryInt>; + +// Interleave elements from the high half of operand 0 and the high half +// of operand 1. def z_merge_high : SDNode<"SystemZISD::MERGE_HIGH", SDT_ZVecBinary>; + +// Likewise for the low halves. def z_merge_low : SDNode<"SystemZISD::MERGE_LOW", SDT_ZVecBinary>; + +// Concatenate the vectors in the first two operands, shift them left +// by the third operand, and take the first half of the result. def z_shl_double : SDNode<"SystemZISD::SHL_DOUBLE", SDT_ZVecTernaryInt>; + +// Concatenate the vectors in the first two operands, shift them left/right +// bitwise by the third operand, and take the first/last half of the result. def z_shl_double_bit : SDNode<"SystemZISD::SHL_DOUBLE_BIT", SDT_ZVecTernaryInt>; def z_shr_double_bit : SDNode<"SystemZISD::SHR_DOUBLE_BIT", SDT_ZVecTernaryInt>; + +// Take one element of the first v2i64 operand and the one element of +// the second v2i64 operand and concatenate them to form a v2i64 result. +// The third operand is a 4-bit value of the form 0A0B, where A and B +// are the element selectors for the first operand and second operands +// respectively. def z_permute_dwords : SDNode<"SystemZISD::PERMUTE_DWORDS", SDT_ZVecTernaryInt>; + +// Perform a general vector permute on vector operands 0 and 1. 
+// Each byte of operand 2 controls the corresponding byte of the result, +// in the same way as a byte-level VECTOR_SHUFFLE mask. def z_permute : SDNode<"SystemZISD::PERMUTE", SDT_ZVecTernary>; + +// Pack vector operands 0 and 1 into a single vector with half-sized elements. def z_pack : SDNode<"SystemZISD::PACK", SDT_ZVecBinaryConv>; + +// Likewise, but saturate the result and set CC. PACKS_CC does signed +// saturation and PACKLS_CC does unsigned saturation. def z_packs_cc : SDNode<"SystemZISD::PACKS_CC", SDT_ZVecBinaryConvCC>; def z_packls_cc : SDNode<"SystemZISD::PACKLS_CC", SDT_ZVecBinaryConvCC>; + +// Unpack the first half of vector operand 0 into double-sized elements. +// UNPACK_HIGH sign-extends and UNPACKL_HIGH zero-extends. def z_unpack_high : SDNode<"SystemZISD::UNPACK_HIGH", SDT_ZVecUnpack>; def z_unpackl_high : SDNode<"SystemZISD::UNPACKL_HIGH", SDT_ZVecUnpack>; + +// Likewise for the second half. def z_unpack_low : SDNode<"SystemZISD::UNPACK_LOW", SDT_ZVecUnpack>; def z_unpackl_low : SDNode<"SystemZISD::UNPACKL_LOW", SDT_ZVecUnpack>; + +// Shift/rotate each element of vector operand 0 by the number of bits +// specified by scalar operand 1. def z_vshl_by_scalar : SDNode<"SystemZISD::VSHL_BY_SCALAR", SDT_ZVecBinaryInt>; def z_vsrl_by_scalar : SDNode<"SystemZISD::VSRL_BY_SCALAR", @@ -374,40 +500,75 @@ def z_vsra_by_scalar : SDNode<"SystemZISD::VSRA_BY_SCALAR", SDT_ZVecBinaryInt>; def z_vrotl_by_scalar : SDNode<"SystemZISD::VROTL_BY_SCALAR", SDT_ZVecBinaryInt>; + +// For each element of the output type, sum across all sub-elements of +// operand 0 belonging to the corresponding element, and add in the +// rightmost sub-element of the corresponding element of operand 1. def z_vsum : SDNode<"SystemZISD::VSUM", SDT_ZBinaryConv>; + +// Compare integer vector operands 0 and 1 to produce the usual 0/-1 +// vector result. VICMPE is for equality, VICMPH for "signed greater than" +// and VICMPHL for "unsigned greater than". def z_vicmpe : SDNode<"SystemZISD::VICMPE", SDT_ZVecCompare>; def z_vicmph : SDNode<"SystemZISD::VICMPH", SDT_ZVecCompare>; def z_vicmphl : SDNode<"SystemZISD::VICMPHL", SDT_ZVecCompare>; + +// Likewise, but also set the condition codes on the result. def z_vicmpes : SDNode<"SystemZISD::VICMPES", SDT_ZVecCompareCC>; def z_vicmphs : SDNode<"SystemZISD::VICMPHS", SDT_ZVecCompareCC>; def z_vicmphls : SDNode<"SystemZISD::VICMPHLS", SDT_ZVecCompareCC>; + +// Compare floating-point vector operands 0 and 1 to produce the usual 0/-1 +// vector result. VFCMPE is for "ordered and equal", VFCMPH for "ordered and +// greater than" and VFCMPHE for "ordered and greater than or equal to". def z_vfcmpe : SDNode<"SystemZISD::VFCMPE", SDT_ZVecBinaryConv>; -def z_strict_vfcmpe : SDNode<"SystemZISD::STRICT_VFCMPE", - SDT_ZVecBinaryConv, [SDNPHasChain]>; -def z_strict_vfcmpes : SDNode<"SystemZISD::STRICT_VFCMPES", - SDT_ZVecBinaryConv, [SDNPHasChain]>; def z_vfcmph : SDNode<"SystemZISD::VFCMPH", SDT_ZVecBinaryConv>; -def z_strict_vfcmph : SDNode<"SystemZISD::STRICT_VFCMPH", - SDT_ZVecBinaryConv, [SDNPHasChain]>; -def z_strict_vfcmphs : SDNode<"SystemZISD::STRICT_VFCMPHS", - SDT_ZVecBinaryConv, [SDNPHasChain]>; def z_vfcmphe : SDNode<"SystemZISD::VFCMPHE", SDT_ZVecBinaryConv>; -def z_strict_vfcmphe : SDNode<"SystemZISD::STRICT_VFCMPHE", - SDT_ZVecBinaryConv, [SDNPHasChain]>; -def z_strict_vfcmphes : SDNode<"SystemZISD::STRICT_VFCMPHES", - SDT_ZVecBinaryConv, [SDNPHasChain]>; + +// Likewise, but also set the condition codes on the result. 
def z_vfcmpes : SDNode<"SystemZISD::VFCMPES", SDT_ZVecBinaryConvCC>; def z_vfcmphs : SDNode<"SystemZISD::VFCMPHS", SDT_ZVecBinaryConvCC>; def z_vfcmphes : SDNode<"SystemZISD::VFCMPHES", SDT_ZVecBinaryConvCC>; + +// Extend the even f32 elements of vector operand 0 to produce a vector +// of f64 elements. def z_vextend : SDNode<"SystemZISD::VEXTEND", SDT_ZVecUnaryConv>; -def z_strict_vextend : SDNode<"SystemZISD::STRICT_VEXTEND", - SDT_ZVecUnaryConv, [SDNPHasChain]>; + +// Round the f64 elements of vector operand 0 to f32s and store them in the +// even elements of the result. def z_vround : SDNode<"SystemZISD::VROUND", SDT_ZVecUnaryConv>; -def z_strict_vround : SDNode<"SystemZISD::STRICT_VROUND", + +let IsStrictFP = true in { + // Strict variants of vector floating-point comparisons. + // Quiet and signaling versions. + def z_strict_vfcmpe : SDNode<"SystemZISD::STRICT_VFCMPE", + SDT_ZVecBinaryConv, [SDNPHasChain]>; + def z_strict_vfcmph : SDNode<"SystemZISD::STRICT_VFCMPH", + SDT_ZVecBinaryConv, [SDNPHasChain]>; + def z_strict_vfcmphe : SDNode<"SystemZISD::STRICT_VFCMPHE", + SDT_ZVecBinaryConv, [SDNPHasChain]>; + def z_strict_vfcmpes : SDNode<"SystemZISD::STRICT_VFCMPES", + SDT_ZVecBinaryConv, [SDNPHasChain]>; + def z_strict_vfcmphs : SDNode<"SystemZISD::STRICT_VFCMPHS", + SDT_ZVecBinaryConv, [SDNPHasChain]>; + def z_strict_vfcmphes : SDNode<"SystemZISD::STRICT_VFCMPHES", + SDT_ZVecBinaryConv, [SDNPHasChain]>; + + // Strict variants of VEXTEND and VROUND. + def z_strict_vextend : SDNode<"SystemZISD::STRICT_VEXTEND", + SDT_ZVecUnaryConv, [SDNPHasChain]>; + def z_strict_vround : SDNode<"SystemZISD::STRICT_VROUND", SDT_ZVecUnaryConv, [SDNPHasChain]>; +} + +// AND the two vector operands together and set CC based on the result. def z_vtm : SDNode<"SystemZISD::VTM", SDT_ZCmp>; + +// i128 high integer comparisons. def z_scmp128hi : SDNode<"SystemZISD::SCMP128HI", SDT_ZCmp>; def z_ucmp128hi : SDNode<"SystemZISD::UCMP128HI", SDT_ZCmp>; + +// String operations that set CC as a side-effect. def z_vfae_cc : SDNode<"SystemZISD::VFAE_CC", SDT_ZVecTernaryIntCC>; def z_vfaez_cc : SDNode<"SystemZISD::VFAEZ_CC", SDT_ZVecTernaryIntCC>; def z_vfee_cc : SDNode<"SystemZISD::VFEE_CC", SDT_ZVecBinaryCC>; @@ -423,12 +584,24 @@ def z_vstrs_cc : SDNode<"SystemZISD::VSTRS_CC", SDT_ZVecTernaryConvCC>; def z_vstrsz_cc : SDNode<"SystemZISD::VSTRSZ_CC", SDT_ZVecTernaryConvCC>; + +// Test floating-point data class for vectors. def z_vftci : SDNode<"SystemZISD::VFTCI", SDT_ZVecBinaryConvIntCC>; class AtomicWOp : SDNode<"SystemZISD::"#name, profile, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; +// Wrappers around the inner loop of an 8- or 16-bit ATOMIC_SWAP or +// ATOMIC_LOAD_. +// +// Operand 0: the address of the containing 32-bit-aligned field +// Operand 1: the second operand of , in the high bits of an i32 +// for everything except ATOMIC_SWAPW +// Operand 2: how many bits to rotate the i32 left to bring the first +// operand into the high bits +// Operand 3: the negative of operand 2, for rotating the other way +// Operand 4: the width of the field in bits (8 or 16) def z_atomic_swapw : AtomicWOp<"ATOMIC_SWAPW">; def z_atomic_loadw_add : AtomicWOp<"ATOMIC_LOADW_ADD">; def z_atomic_loadw_sub : AtomicWOp<"ATOMIC_LOADW_SUB">; @@ -441,55 +614,117 @@ def z_atomic_loadw_max : AtomicWOp<"ATOMIC_LOADW_MAX">; def z_atomic_loadw_umin : AtomicWOp<"ATOMIC_LOADW_UMIN">; def z_atomic_loadw_umax : AtomicWOp<"ATOMIC_LOADW_UMAX">; +// Atomic compare-and-swap returning CC value. 
+// Val, CC, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) def z_atomic_cmp_swap : SDNode<"SystemZISD::ATOMIC_CMP_SWAP", SDT_ZAtomicCmpSwap, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; + +// A wrapper around the inner loop of an ATOMIC_CMP_SWAP. +// +// Operand 0: the address of the containing 32-bit-aligned field +// Operand 1: the compare value, in the low bits of an i32 +// Operand 2: the swap value, in the low bits of an i32 +// Operand 3: how many bits to rotate the i32 left to bring the first +// operand into the high bits +// Operand 4: the negative of operand 3, for rotating the other way +// Operand 5: the width of the field in bits (8 or 16) def z_atomic_cmp_swapw : SDNode<"SystemZISD::ATOMIC_CMP_SWAPW", SDT_ZAtomicCmpSwapW, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; +// 128-bit atomic load. +// Val, OUTCHAIN = ATOMIC_LOAD_128(INCHAIN, ptr) def z_atomic_load_128 : SDNode<"SystemZISD::ATOMIC_LOAD_128", SDT_ZAtomicLoad128, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// 128-bit atomic store. +// OUTCHAIN = ATOMIC_STORE_128(INCHAIN, val, ptr) def z_atomic_store_128 : SDNode<"SystemZISD::ATOMIC_STORE_128", SDT_ZAtomicStore128, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +// 128-bit atomic compare-and-swap. +// Val, CC, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) def z_atomic_cmp_swap_128 : SDNode<"SystemZISD::ATOMIC_CMP_SWAP_128", SDT_ZAtomicCmpSwap128, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; +// Use a series of MVCs to copy bytes from one memory location to another. +// The operands are: +// - the target address +// - the source address +// - the constant length +// +// This isn't a memory opcode because we'd need to attach two +// MachineMemOperands rather than one. def z_mvc : SDNode<"SystemZISD::MVC", SDT_ZMemMemLength, [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; + +// Similar to MVC, but for logic operations (AND, OR, XOR). def z_nc : SDNode<"SystemZISD::NC", SDT_ZMemMemLength, [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; def z_oc : SDNode<"SystemZISD::OC", SDT_ZMemMemLength, [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; def z_xc : SDNode<"SystemZISD::XC", SDT_ZMemMemLength, [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; + +// Use CLC to compare two blocks of memory, with the same comments +// as for MVC. def z_clc : SDNode<"SystemZISD::CLC", SDT_ZMemMemLengthCC, [SDNPHasChain, SDNPMayLoad]>; + +// Use MVC to set a block of memory after storing the first byte. def z_memset_mvc : SDNode<"SystemZISD::MEMSET_MVC", SDT_ZMemsetMVC, [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; + +// Use a CLST-based sequence to implement strcmp(). The two input operands +// are the addresses of the strings to compare. def z_strcmp : SDNode<"SystemZISD::STRCMP", SDT_ZStringCC, [SDNPHasChain, SDNPMayLoad]>; + +// Use an MVST-based sequence to implement stpcpy(). def z_stpcpy : SDNode<"SystemZISD::STPCPY", SDT_ZString, [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; + +// Use an SRST-based sequence to search a block of memory. The first +// operand is the end address, the second is the start, and the third +// is the character to search for. CC is set to 1 on success and 2 +// on failure. def z_search_string : SDNode<"SystemZISD::SEARCH_STRING", SDT_ZStringCC, [SDNPHasChain, SDNPMayLoad]>; + +// Prefetch from the second operand using the 4-bit control code in +// the first operand. The code is 1 for a load prefetch and 2 for +// a store prefetch.
def z_prefetch : SDNode<"SystemZISD::PREFETCH", SDT_ZPrefetch, [SDNPHasChain, SDNPMayLoad, SDNPMayStore, SDNPMemOperand]>; +// Transaction begin. The first operand is the chain, the second +// the TDB pointer, and the third the immediate control field. +// Returns CC value and chain. def z_tbegin : SDNode<"SystemZISD::TBEGIN", SDT_ZTBegin, [SDNPHasChain, SDNPMayStore, SDNPSideEffect]>; def z_tbegin_nofloat : SDNode<"SystemZISD::TBEGIN_NOFLOAT", SDT_ZTBegin, [SDNPHasChain, SDNPMayStore, SDNPSideEffect]>; + +// Transaction end. Just the chain operand. Returns CC value and chain. def z_tend : SDNode<"SystemZISD::TEND", SDT_ZTEnd, [SDNPHasChain, SDNPSideEffect]>; +// z/OS XPLINK ADA Entry +// Wraps a TargetGlobalAddress that should be loaded from a function's +// AssociatedData Area (ADA). The ADA is passed to the function by the +// caller in the XPLink ABI defined register R5. +// Operand 0: the GlobalValue/External Symbol +// Operand 1: the ADA register +// Operand 2: the offset (0 for the first and 8 for the second element in the +// function descriptor) def z_ada_entry : SDNode<"SystemZISD::ADA_ENTRY", SDT_ZADAENTRY>; diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp index eb00d484af693..88feba8adce0e 100644 --- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp @@ -10,21 +10,27 @@ // //===----------------------------------------------------------------------===// +#include "SystemZSelectionDAGInfo.h" #include "SystemZTargetMachine.h" #include "llvm/CodeGen/SelectionDAG.h" +#define GET_SDNODE_DESC +#include "SystemZGenSDNodeInfo.inc" + using namespace llvm; #define DEBUG_TYPE "systemz-selectiondag-info" -bool SystemZSelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const { - return Opcode >= SystemZISD::FIRST_MEMORY_OPCODE && - Opcode <= SystemZISD::LAST_MEMORY_OPCODE; -} +SystemZSelectionDAGInfo::SystemZSelectionDAGInfo() + : SelectionDAGGenTargetInfo(SystemZGenSDNodeInfo) {} + +const char *SystemZSelectionDAGInfo::getTargetNodeName(unsigned Opcode) const { + switch (static_cast(Opcode)) { + case SystemZISD::GET_CCMASK: + return "SystemZISD::GET_CCMASK"; + } -bool SystemZSelectionDAGInfo::isTargetStrictFPOpcode(unsigned Opcode) const { - return Opcode >= SystemZISD::FIRST_STRICTFP_OPCODE && - Opcode <= SystemZISD::LAST_STRICTFP_OPCODE; + return SelectionDAGGenTargetInfo::getTargetNodeName(Opcode); } static unsigned getMemMemLenAdj(unsigned Op) { diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h index 200566f9646c1..d25fddab65161 100644 --- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h @@ -15,15 +15,34 @@ #include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#define GET_SDNODE_ENUM +#include "SystemZGenSDNodeInfo.inc" + namespace llvm { +namespace SystemZISD { + +enum NodeType : unsigned { + // Set the condition code from a boolean value in operand 0. + // Operand 1 is a mask of all condition-code values that may result from this + // operation, operand 2 is a mask of condition-code values that may result + // if the boolean is true. + // Note that this operation is always optimized away, we will never + // generate any code for it.
+ GET_CCMASK = GENERATED_OPCODE_END, +}; -class SystemZSelectionDAGInfo : public SelectionDAGTargetInfo { -public: - explicit SystemZSelectionDAGInfo() = default; +// Return true if OPCODE is some kind of PC-relative address. +inline bool isPCREL(unsigned Opcode) { + return Opcode == PCREL_WRAPPER || Opcode == PCREL_OFFSET; +} - bool isTargetMemoryOpcode(unsigned Opcode) const override; +} // namespace SystemZISD + +class SystemZSelectionDAGInfo : public SelectionDAGGenTargetInfo { +public: + SystemZSelectionDAGInfo(); - bool isTargetStrictFPOpcode(unsigned Opcode) const override; + const char *getTargetNodeName(unsigned Opcode) const override; SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Src, From b32c434426a181590234f65f2e32f43735bf5b5a Mon Sep 17 00:00:00 2001 From: Prabhu Rajasekaran Date: Mon, 17 Nov 2025 12:08:39 -0800 Subject: [PATCH 44/67] [libc][Github] Perform baremetal libc builds (#167583) Currently there are no 32-bit presubmit builds for libc. This PR performs a 32-bit build only (no tests) to check that changes that land in libc do not break 32-bit builds. Co-authored-by: Aiden Grossman --- .github/workflows/libc-fullbuild-tests.yml | 83 +++++++++++++++---- libc/cmake/caches/armv6m-none-eabi.cmake | 8 ++ libc/cmake/caches/armv7em-none-eabi.cmake | 8 ++ libc/cmake/caches/armv7m-none-eabi.cmake | 8 ++ .../caches/armv8.1m.main-none-eabi.cmake | 8 ++ libc/cmake/caches/armv8m.main-none-eabi.cmake | 8 ++ libc/cmake/caches/baremetal_common.cmake | 21 +++++ libc/cmake/caches/riscv32-unknown-elf.cmake | 4 + 8 files changed, 132 insertions(+), 16 deletions(-) create mode 100644 libc/cmake/caches/armv6m-none-eabi.cmake create mode 100644 libc/cmake/caches/armv7em-none-eabi.cmake create mode 100644 libc/cmake/caches/armv7m-none-eabi.cmake create mode 100644 libc/cmake/caches/armv8.1m.main-none-eabi.cmake create mode 100644 libc/cmake/caches/armv8m.main-none-eabi.cmake create mode 100644 libc/cmake/caches/baremetal_common.cmake create mode 100644 libc/cmake/caches/riscv32-unknown-elf.cmake diff --git a/.github/workflows/libc-fullbuild-tests.yml b/.github/workflows/libc-fullbuild-tests.yml index 3a048aeb9405b..c5b7f606a115a 100644 --- a/.github/workflows/libc-fullbuild-tests.yml +++ b/.github/workflows/libc-fullbuild-tests.yml @@ -48,6 +48,42 @@ jobs: cpp_compiler: clang++-22 target: x86_64-unknown-uefi-llvm include_scudo: OFF + - os: ubuntu-24.04 + build_type: MinSizeRel + c_compiler: clang-22 + cpp_compiler: clang++-22 + target: armv6m-none-eabi + include_scudo: OFF + - os: ubuntu-24.04 + build_type: MinSizeRel + c_compiler: clang-22 + cpp_compiler: clang++-22 + target: armv7m-none-eabi + include_scudo: OFF + - os: ubuntu-24.04 + build_type: MinSizeRel + c_compiler: clang-22 + cpp_compiler: clang++-22 + target: armv7em-none-eabi + include_scudo: OFF + - os: ubuntu-24.04 + build_type: MinSizeRel + c_compiler: clang-22 + cpp_compiler: clang++-22 + target: armv8m.main-none-eabi + include_scudo: OFF + - os: ubuntu-24.04 + build_type: MinSizeRel + c_compiler: clang-22 + cpp_compiler: clang++-22 + target: armv8.1m.main-none-eabi + include_scudo: OFF + - os: ubuntu-24.04 + build_type: MinSizeRel + c_compiler: clang-22 + cpp_compiler: clang++-22 + target: riscv32-unknown-elf + include_scudo: OFF # TODO: add back gcc build when it is fixed # - c_compiler: gcc # cpp_compiler: g++ @@ -93,28 +129,39 @@ jobs: run: | export RUNTIMES="libc" + export CMAKE_FLAGS=" + -G Ninja + -S ${{ github.workspace }}/runtimes + -B ${{
steps.strings.outputs.build-output-dir }} + -DCMAKE_ASM_COMPILER=${{ matrix.c_compiler }} + -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} + -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }} + -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} + -DCMAKE_C_COMPILER_LAUNCHER=sccache + -DCMAKE_CXX_COMPILER_LAUNCHER=sccache + -DCMAKE_INSTALL_PREFIX=${{ steps.strings.outputs.build-install-dir }}" + if [[ ${{ matrix.include_scudo}} == "ON" ]]; then export RUNTIMES="$RUNTIMES;compiler-rt" - export CMAKE_FLAGS=" + export CMAKE_FLAGS="$CMAKE_FLAGS -DLLVM_LIBC_INCLUDE_SCUDO=ON -DCOMPILER_RT_BUILD_SCUDO_STANDALONE_WITH_LLVM_LIBC=ON -DCOMPILER_RT_BUILD_GWP_ASAN=OFF -DCOMPILER_RT_SCUDO_STANDALONE_BUILD_SHARED=OFF" fi - cmake -B ${{ steps.strings.outputs.build-output-dir }} \ - -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }} \ - -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DCMAKE_C_COMPILER_LAUNCHER=sccache \ - -DCMAKE_CXX_COMPILER_LAUNCHER=sccache \ - -DCMAKE_INSTALL_PREFIX=${{ steps.strings.outputs.build-install-dir }} \ - -DLLVM_RUNTIME_TARGETS=${{ matrix.target }} \ - -DLLVM_ENABLE_RUNTIMES="$RUNTIMES" \ - -DLLVM_LIBC_FULL_BUILD=ON \ - -G Ninja \ - -S ${{ github.workspace }}/runtimes \ - $CMAKE_FLAGS + case "${{ matrix.target }}" in + *-none-eabi|riscv32-unknown-elf) + cmake $CMAKE_FLAGS \ + -C ${{ github.workspace }}/libc/cmake/caches/${{ matrix.target }}.cmake + ;; + *) + cmake -DLLVM_RUNTIME_TARGETS=${{ matrix.target }} \ + -DLLVM_ENABLE_RUNTIMES="$RUNTIMES" \ + -DLLVM_LIBC_FULL_BUILD=ON \ + $CMAKE_FLAGS + ;; + esac - name: Build run: > @@ -124,8 +171,12 @@ jobs: --target install - name: Test - # Skip UEFI tests until we have testing set up. - if: ${{ ! endsWith(matrix.target, '-uefi-llvm') }} + # Skip UEFI and baremetal tests until we have testing set up. 
+ if: ${{ + !endsWith(matrix.target, '-uefi-llvm') && + !endsWith(matrix.target, '-none-eabi') && + matrix.target != 'riscv32-unknown-elf' + }} run: > cmake --build ${{ steps.strings.outputs.build-output-dir }} diff --git a/libc/cmake/caches/armv6m-none-eabi.cmake b/libc/cmake/caches/armv6m-none-eabi.cmake new file mode 100644 index 0000000000000..1f463ae5c0ead --- /dev/null +++ b/libc/cmake/caches/armv6m-none-eabi.cmake @@ -0,0 +1,8 @@ +set(CMAKE_SYSTEM_PROCESSOR arm CACHE STRING "") +set(RUNTIMES_TARGET_TRIPLE "armv6m-none-eabi" CACHE STRING "") + +foreach(lang C;CXX;ASM) + set(CMAKE_${lang}_FLAGS "-march=armv6m -mcpu=cortex-m0plus -mfloat-abi=soft -Wno-atomic-alignment \"-Dvfprintf(stream, format, vlist)=vprintf(format, vlist)\" \"-Dfprintf(stream, format, ...)=printf(format)\" \"-Dfputs(string, stream)=puts(string)\" -D_LIBCPP_PRINT=1" CACHE STRING "") +endforeach() + +include(${CMAKE_CURRENT_LIST_DIR}/baremetal_common.cmake) diff --git a/libc/cmake/caches/armv7em-none-eabi.cmake b/libc/cmake/caches/armv7em-none-eabi.cmake new file mode 100644 index 0000000000000..afbe9c87dffe1 --- /dev/null +++ b/libc/cmake/caches/armv7em-none-eabi.cmake @@ -0,0 +1,8 @@ +set(CMAKE_SYSTEM_PROCESSOR arm CACHE STRING "") +set(RUNTIMES_TARGET_TRIPLE "armv7em-none-eabi" CACHE STRING "") + +foreach(lang C;CXX;ASM) + set(CMAKE_${lang}_FLAGS "-march=armv7em -mcpu=cortex-m4 -mfloat-abi=soft -Wno-atomic-alignment \"-Dvfprintf(stream, format, vlist)=vprintf(format, vlist)\" \"-Dfprintf(stream, format, ...)=printf(format)\" \"-Dfputs(string, stream)=puts(string)\" -D_LIBCPP_PRINT=1" CACHE STRING "") +endforeach() + +include(${CMAKE_CURRENT_LIST_DIR}/baremetal_common.cmake) diff --git a/libc/cmake/caches/armv7m-none-eabi.cmake b/libc/cmake/caches/armv7m-none-eabi.cmake new file mode 100644 index 0000000000000..796adb2f31148 --- /dev/null +++ b/libc/cmake/caches/armv7m-none-eabi.cmake @@ -0,0 +1,8 @@ +set(CMAKE_SYSTEM_PROCESSOR arm CACHE STRING "") +set(RUNTIMES_TARGET_TRIPLE "armv7m-none-eabi" CACHE STRING "") + +foreach(lang C;CXX;ASM) + set(CMAKE_${lang}_FLAGS "-march=armv7m -mcpu=cortex-m4 -mfloat-abi=soft -Wno-atomic-alignment \"-Dvfprintf(stream, format, vlist)=vprintf(format, vlist)\" \"-Dfprintf(stream, format, ...)=printf(format)\" \"-Dfputs(string, stream)=puts(string)\" -D_LIBCPP_PRINT=1" CACHE STRING "") +endforeach() + +include(${CMAKE_CURRENT_LIST_DIR}/baremetal_common.cmake) diff --git a/libc/cmake/caches/armv8.1m.main-none-eabi.cmake b/libc/cmake/caches/armv8.1m.main-none-eabi.cmake new file mode 100644 index 0000000000000..4095facce46ac --- /dev/null +++ b/libc/cmake/caches/armv8.1m.main-none-eabi.cmake @@ -0,0 +1,8 @@ +set(CMAKE_SYSTEM_PROCESSOR arm CACHE STRING "") +set(RUNTIMES_TARGET_TRIPLE "armv8.1m.main-none-eabi" CACHE STRING "") + +foreach(lang C;CXX;ASM) + set(CMAKE_${lang}_FLAGS "-mfloat-abi=hard -march=armv8.1-m.main+mve.fp+fp.dp -mcpu=cortex-m55" CACHE STRING "") +endforeach() + +include(${CMAKE_CURRENT_LIST_DIR}/baremetal_common.cmake) diff --git a/libc/cmake/caches/armv8m.main-none-eabi.cmake b/libc/cmake/caches/armv8m.main-none-eabi.cmake new file mode 100644 index 0000000000000..4b69f6a822e71 --- /dev/null +++ b/libc/cmake/caches/armv8m.main-none-eabi.cmake @@ -0,0 +1,8 @@ +set(CMAKE_SYSTEM_PROCESSOR arm CACHE STRING "") +set(RUNTIMES_TARGET_TRIPLE "armv8m.main-none-eabi" CACHE STRING "") + +foreach(lang C;CXX;ASM) + set(CMAKE_${lang}_FLAGS "-mfloat-abi=softfp -march=armv8m.main+fp+dsp -mcpu=cortex-m33" CACHE STRING "") +endforeach() + 
+include(${CMAKE_CURRENT_LIST_DIR}/baremetal_common.cmake) diff --git a/libc/cmake/caches/baremetal_common.cmake b/libc/cmake/caches/baremetal_common.cmake new file mode 100644 index 0000000000000..c0d665d790393 --- /dev/null +++ b/libc/cmake/caches/baremetal_common.cmake @@ -0,0 +1,21 @@ +# Expects target triple to be passed as `RUNTIMES_TARGET_TRIPLE` + +set(CMAKE_SYSTEM_NAME Generic CACHE STRING "") +set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY CACHE STRING "") +set(LLVM_ENABLE_RUNTIMES "libc" CACHE STRING "") +set(LLVM_INCLUDE_TESTS OFF CACHE BOOL "") +set(CMAKE_C_COMPILER_WORKS ON CACHE BOOL "") +set(CMAKE_CXX_COMPILER_WORKS ON CACHE BOOL "") +set(CMAKE_SYSROOT "" CACHE STRING "") +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) + +set(CMAKE_C_COMPILER_TARGET ${RUNTIMES_TARGET_TRIPLE} CACHE STRING "") +set(CMAKE_CXX_COMPILER_TARGET ${RUNTIMES_TARGET_TRIPLE} CACHE STRING "") +set(CMAKE_ASM_COMPILER_TARGET ${RUNTIMES_TARGET_TRIPLE} CACHE STRING "") +set(LLVM_DEFAULT_TARGET_TRIPLE ${RUNTIMES_TARGET_TRIPLE} CACHE STRING "") +set(LIBC_TARGET_TRIPLE ${RUNTIMES_TARGET_TRIPLE} CACHE STRING "") + +set(LLVM_LIBC_FULL_BUILD "ON" CACHE BOOL "") diff --git a/libc/cmake/caches/riscv32-unknown-elf.cmake b/libc/cmake/caches/riscv32-unknown-elf.cmake new file mode 100644 index 0000000000000..960fb2bb51a4f --- /dev/null +++ b/libc/cmake/caches/riscv32-unknown-elf.cmake @@ -0,0 +1,4 @@ +set(CMAKE_SYSTEM_PROCESSOR RISCV CACHE STRING "") +set(RUNTIMES_TARGET_TRIPLE "riscv32-unknown-elf" CACHE STRING "") + +include(${CMAKE_CURRENT_LIST_DIR}/baremetal_common.cmake) From 557a6b826b865cd1797ae421f59f286609b94e59 Mon Sep 17 00:00:00 2001 From: Ebuka Ezike Date: Mon, 17 Nov 2025 20:15:33 +0000 Subject: [PATCH 45/67] [lldb][NFC] use llvm::erase_if to remove non matching types (#168279) --- lldb/source/Symbol/Symtab.cpp | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/lldb/source/Symbol/Symtab.cpp b/lldb/source/Symbol/Symtab.cpp index 6080703998ff2..9964ae492bc00 100644 --- a/lldb/source/Symbol/Symtab.cpp +++ b/lldb/source/Symbol/Symtab.cpp @@ -722,15 +722,11 @@ Symtab::AppendSymbolIndexesWithNameAndType(ConstString symbol_name, std::vector &indexes) { std::lock_guard guard(m_mutex); - if (AppendSymbolIndexesWithName(symbol_name, indexes) > 0) { - std::vector::iterator pos = indexes.begin(); - while (pos != indexes.end()) { - if (symbol_type == eSymbolTypeAny || - m_symbols[*pos].GetType() == symbol_type) - ++pos; - else - pos = indexes.erase(pos); - } + if (AppendSymbolIndexesWithName(symbol_name, indexes) > 0 && + symbol_type != eSymbolTypeAny) { + llvm::erase_if(indexes, [this, symbol_type](uint32_t index) { + return m_symbols[index].GetType() != symbol_type; + }); } return indexes.size(); } @@ -742,15 +738,11 @@ uint32_t Symtab::AppendSymbolIndexesWithNameAndType( std::lock_guard guard(m_mutex); if (AppendSymbolIndexesWithName(symbol_name, symbol_debug_type, - symbol_visibility, indexes) > 0) { - std::vector::iterator pos = indexes.begin(); - while (pos != indexes.end()) { - if (symbol_type == eSymbolTypeAny || - m_symbols[*pos].GetType() == symbol_type) - ++pos; - else - pos = indexes.erase(pos); - } + symbol_visibility, indexes) > 0 && + symbol_type != eSymbolTypeAny) { + llvm::erase_if(indexes, [this, symbol_type](uint32_t index) { + return m_symbols[index].GetType() != symbol_type; + }); } return indexes.size(); } From 
bac8d01a4da14802ec03907d094f3bbc68f6a5cc Mon Sep 17 00:00:00 2001 From: Pranav Kant Date: Mon, 17 Nov 2025 12:18:01 -0800 Subject: [PATCH 46/67] [bazel][libc] Fixes #165219 (#168429) --- utils/bazel/llvm-project-overlay/libc/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 788c6570081a2..a27abbd5b386a 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -1103,6 +1103,7 @@ libc_support_library( ":func_realloc", ":hdr_stdio_macros", ":hdr_stdio_overlay", + ":string_memory_utils", ":types_off_t", ], ) From 3fb374256b2fcd3dc091612c6c18a6ad6b6bf138 Mon Sep 17 00:00:00 2001 From: Pranav Kant Date: Mon, 17 Nov 2025 12:38:57 -0800 Subject: [PATCH 47/67] [bazel] Fix #168113 (#168434) --- utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index d582f448c2213..d021f9da38dbb 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -3177,6 +3177,10 @@ llvm_target_lib_list = [lib for lib in [ ["-gen-subtarget"], "lib/Target/SystemZ/SystemZGenSubtargetInfo.inc", ), + ( + ["-gen-sd-node-info"], + "lib/Target/SystemZ/SystemZGenSDNodeInfo.inc", + ) ], }, { From 4bec74a9fb82b70db0c1acfc3d1d92d8003d51fd Mon Sep 17 00:00:00 2001 From: Pranav Kant Date: Mon, 17 Nov 2025 12:42:46 -0800 Subject: [PATCH 48/67] [mlir][bazel] Fix #168066 (#168435) --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 153c7eeedd0ab..452380a8953ff 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -13105,6 +13105,7 @@ cc_library( ":RuntimeVerifiableOpInterface", ":ShapedOpInterfaces", ":SideEffectInterfaces", + ":UBDialect", ":ValueBoundsOpInterface", ":ViewLikeInterface", "//llvm:Support", From b00588ffb4f518605b3a1778458e38f21784b9fa Mon Sep 17 00:00:00 2001 From: Alan Li Date: Mon, 17 Nov 2025 15:46:14 -0500 Subject: [PATCH 49/67] Fix bazel dep caused by f5b73760 (#168436) From 321b9d190b32c2c10bbd59761e34ef0305bdb954 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 17 Nov 2025 15:45:29 +0000 Subject: [PATCH 50/67] [VPlan] Replace VPIRMetadata::addMetadata with setMetadata. (NFC) Replace addMetadata with setMetadata, which sets metadata, updating existing entries or adding a new entry otherwise. This isn't strictly needed at the moment, but will be needed for follow-up patches. --- llvm/lib/Transforms/Vectorize/VPlan.h | 19 +++++++++++-------- .../Vectorize/VPlanConstruction.cpp | 6 +++--- .../Transforms/Vectorize/VPlanTransforms.cpp | 2 +- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 0932922c07126..67fa294d095bd 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -970,14 +970,17 @@ class VPIRMetadata { /// Add all metadata to \p I. void applyMetadata(Instruction &I) const; - /// Add metadata with kind \p Kind and \p Node. 
- void addMetadata(unsigned Kind, MDNode *Node) { - assert(none_of(Metadata, - [Kind](const std::pair &P) { - return P.first == Kind; - }) && - "Kind must appear at most once in Metadata"); - Metadata.emplace_back(Kind, Node); + /// Set metadata with kind \p Kind to \p Node. If metadata with \p Kind + /// already exists, it will be replaced. Otherwise, it will be added. + void setMetadata(unsigned Kind, MDNode *Node) { + auto It = + llvm::find_if(Metadata, [Kind](const std::pair &P) { + return P.first == Kind; + }); + if (It != Metadata.end()) + It->second = Node; + else + Metadata.emplace_back(Kind, Node); } /// Intersect this VPIRMetada object with \p MD, keeping only metadata diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 4ffd5577d31a4..aed85271350c8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -672,7 +672,7 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond, MDBuilder MDB(Plan.getContext()); MDNode *BranchWeights = MDB.createBranchWeights(CheckBypassWeights, /*IsExpected=*/false); - Term->addMetadata(LLVMContext::MD_prof, BranchWeights); + Term->setMetadata(LLVMContext::MD_prof, BranchWeights); } } @@ -756,7 +756,7 @@ void VPlanTransforms::addMinimumIterationCheck( MDBuilder MDB(Plan.getContext()); MDNode *BranchWeights = MDB.createBranchWeights( ArrayRef(MinItersBypassWeights, 2), /*IsExpected=*/false); - Term->addMetadata(LLVMContext::MD_prof, BranchWeights); + Term->setMetadata(LLVMContext::MD_prof, BranchWeights); } } @@ -793,7 +793,7 @@ void VPlanTransforms::addMinimumVectorEpilogueIterationCheck( MDBuilder MDB(Plan.getContext()); MDNode *BranchWeights = MDB.createBranchWeights(Weights, /*IsExpected=*/false); - Branch->addMetadata(LLVMContext::MD_prof, BranchWeights); + Branch->setMetadata(LLVMContext::MD_prof, BranchWeights); } /// If \p RedPhiR is used by a ComputeReductionResult recipe, return it. diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index bbeb447de45cb..3e2c47e4556a6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -4458,7 +4458,7 @@ void VPlanTransforms::addBranchWeightToMiddleTerminator( MDBuilder MDB(Plan.getContext()); MDNode *BranchWeights = MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false); - MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights); + MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights); } /// Create and return a ResumePhi for \p WideIV, unless it is truncated. 
If the From 54c2c7cf0da21bf7d85f144aa6cb6875e2a9373a Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Mon, 17 Nov 2025 13:01:52 -0800 Subject: [PATCH 51/67] [LLDB] Fix test compilation errors under asan (NFC) (#168408) https://green.lab.llvm.org/job/llvm.org/view/LLDB/job/lldb-cmake-sanitized/2744/consoleText --- lldb/packages/Python/lldbsuite/test/make/Makefile.rules | 4 +++- .../commands/target/auto-install-main-executable/Makefile | 2 +- .../macosx/find-dsym/bundle-with-dot-in-filename/Makefile | 4 ++-- lldb/test/API/macosx/find-dsym/deep-bundle/Makefile | 4 ++-- lldb/test/API/macosx/posix_spawn/Makefile | 6 +++--- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules index 0122fe8409c29..55dbd3934860f 100644 --- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules +++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules @@ -339,9 +339,11 @@ endif # library to make ASAN tests work for most users, including the bots. ifeq "$(OS)" "Darwin" ifneq "$(ASAN_OPTIONS)" "" -LDFLAGS += -Wl,-lto_library -Wl,$(shell dirname $(shell xcrun -find clang))/../lib/libLTO.dylib +ASAN_LDFLAGS = -Wl,-lto_library -Wl,$(shell dirname $(shell xcrun -find clang))/../lib/libLTO.dylib endif endif +LDFLAGS += $(ASAN_LDFLAGS) + OBJECTS = EXE ?= a.out diff --git a/lldb/test/API/commands/target/auto-install-main-executable/Makefile b/lldb/test/API/commands/target/auto-install-main-executable/Makefile index 07e6c9a1d0f15..d0578fb699d1b 100644 --- a/lldb/test/API/commands/target/auto-install-main-executable/Makefile +++ b/lldb/test/API/commands/target/auto-install-main-executable/Makefile @@ -6,4 +6,4 @@ a.out: a.device.out include Makefile.rules a.device.out: - $(CXX) $(CXXFLAGS) -DBUILD=74 -o $@ $(SRCDIR)/main.cpp + $(CXX) $(ASAN_LDFLAGS) $(CXXFLAGS) -DBUILD=74 -o $@ $(SRCDIR)/main.cpp diff --git a/lldb/test/API/macosx/find-dsym/bundle-with-dot-in-filename/Makefile b/lldb/test/API/macosx/find-dsym/bundle-with-dot-in-filename/Makefile index 12781fd847768..f13584041fb51 100644 --- a/lldb/test/API/macosx/find-dsym/bundle-with-dot-in-filename/Makefile +++ b/lldb/test/API/macosx/find-dsym/bundle-with-dot-in-filename/Makefile @@ -5,7 +5,7 @@ all: clean $(EXE) include Makefile.rules $(EXE): - $(CC) $(CFLAGS) -dynamiclib -o com.apple.sbd $(SRCDIR)/bundle.c + $(CC) $(ASAN_LDFLAGS) $(CFLAGS) -dynamiclib -o com.apple.sbd $(SRCDIR)/bundle.c mkdir com.apple.sbd.xpc mv com.apple.sbd com.apple.sbd.xpc/ mkdir -p com.apple.sbd.xpc.dSYM/Contents/Resources/DWARF @@ -13,7 +13,7 @@ $(EXE): rm -rf com.apple.sbd.dSYM mkdir hide.app tar cf - com.apple.sbd.xpc com.apple.sbd.xpc.dSYM | ( cd hide.app;tar xBpf -) - $(CC) $(CFLAGS) -o find-bundle-with-dots-in-fn $(SRCDIR)/main.c + $(CC) $(ASAN_LDFLAGS) $(CFLAGS) -o find-bundle-with-dots-in-fn $(SRCDIR)/main.c clean:: rm -rf a.out a.out.dSYM hide.app com.apple.sbd com.apple.sbd.dSYM com.apple.sbd.xpc com.apple.sbd.xpc.dSYM find-bundle-with-dots-in-fn find-bundle-with-dots-in-fn.dSYM diff --git a/lldb/test/API/macosx/find-dsym/deep-bundle/Makefile b/lldb/test/API/macosx/find-dsym/deep-bundle/Makefile index 806c840c9f2ee..c041d9e7a0e95 100644 --- a/lldb/test/API/macosx/find-dsym/deep-bundle/Makefile +++ b/lldb/test/API/macosx/find-dsym/deep-bundle/Makefile @@ -4,7 +4,7 @@ all: clean $(EXE) include Makefile.rules $(EXE): - $(CC) $(CFLAGS) -install_name $(shell pwd)/MyFramework.framework/Versions/A/MyFramework -dynamiclib -o MyFramework $(SRCDIR)/myframework.c + $(CC) 
$(ASAN_LDFLAGS) $(CFLAGS) -install_name $(shell pwd)/MyFramework.framework/Versions/A/MyFramework -dynamiclib -o MyFramework $(SRCDIR)/myframework.c mkdir -p MyFramework.framework/Versions/A/Headers mkdir -p MyFramework.framework/Versions/A/Resources cp MyFramework MyFramework.framework/Versions/A @@ -18,7 +18,7 @@ $(EXE): mkdir hide.app rm -f MyFramework tar cf - MyFramework.framework MyFramework.framework.dSYM | ( cd hide.app;tar xBpf -) - $(CC) $(CFLAGS) -o deep-bundle $(SRCDIR)/main.c -F. -framework MyFramework + $(CC) $(ASAN_LDFLAGS) $(CFLAGS) -o deep-bundle $(SRCDIR)/main.c -F. -framework MyFramework clean:: rm -rf a.out a.out.dSYM deep-bundle deep-bundle.dSYM MyFramework.framework MyFramework.framework.dSYM MyFramework MyFramework.dSYM hide.app diff --git a/lldb/test/API/macosx/posix_spawn/Makefile b/lldb/test/API/macosx/posix_spawn/Makefile index 7ae46ca95828d..cbdee9122e3f2 100644 --- a/lldb/test/API/macosx/posix_spawn/Makefile +++ b/lldb/test/API/macosx/posix_spawn/Makefile @@ -6,13 +6,13 @@ include Makefile.rules all: fat.out x86_64.out: x86_64.c - $(CC) -isysroot $(SDKROOT) -target x86_64-apple-macosx10.9 -o x86_64.out $< + $(CC) $(ASAN_LDFLAGS) -isysroot $(SDKROOT) -target x86_64-apple-macosx10.9 -o x86_64.out $< x86_64h.out: x86_64h.c - $(CC) -isysroot $(SDKROOT) -target x86_64h-apple-macosx10.9 -o x86_64h.out $< + $(CC) $(ASAN_LDFLAGS) -isysroot $(SDKROOT) -target x86_64h-apple-macosx10.9 -o x86_64h.out $< arm64.out: arm64.c - $(CC) -isysroot $(SDKROOT) -target arm64-apple-macosx10.9 -o arm64.out $< + $(CC) $(ASAN_LDFLAGS) -isysroot $(SDKROOT) -target arm64-apple-macosx10.9 -o arm64.out $< fat.out: x86_64.out x86_64h.out arm64.out $(LIPO) -o fat.out -create $^ From 24c524d01423dd4b922fd4118613717a1b7e7f41 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Mon, 17 Nov 2025 16:07:28 -0500 Subject: [PATCH 52/67] [libc++] Enable compiler-rt when performing a bootstrapping build (#167065) Otherwise, we end up using whatever system-provided compiler runtime is available, which doesn't work on macOS since compiler-rt is located inside the toolchain path, which can't be found by default. However, disable the tests for compiler-rt since those are linking against the system C++ standard library while using the just-built libc++ headers, which is non-sensical and leads to undefined references on macOS. --- libcxx/docs/VendorDocumentation.rst | 14 ++++++++------ libcxx/utils/ci/run-buildbot | 3 ++- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/libcxx/docs/VendorDocumentation.rst b/libcxx/docs/VendorDocumentation.rst index 15677c7428263..b14c7a70aee04 100644 --- a/libcxx/docs/VendorDocumentation.rst +++ b/libcxx/docs/VendorDocumentation.rst @@ -81,12 +81,14 @@ CMake invocation at ``/llvm``: .. code-block:: bash $ mkdir build - $ cmake -G Ninja -S llvm -B build -DLLVM_ENABLE_PROJECTS="clang" \ # Configure - -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind" \ - -DLLVM_RUNTIME_TARGETS="" - $ ninja -C build runtimes # Build - $ ninja -C build check-runtimes # Test - $ ninja -C build install-runtimes # Install + $ cmake -G Ninja -S llvm -B build \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DLLVM_ENABLE_PROJECTS="clang" \ # Configure + -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind;compiler-rt" \ + -DLLVM_RUNTIME_TARGETS="" + $ ninja -C build runtimes # Build + $ ninja -C build check-runtimes # Test + $ ninja -C build install-runtimes # Install .. 
note:: - This type of build is also commonly called a "Runtimes build", but we would like to move diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot index d265dddebe11f..7442361627104 100755 --- a/libcxx/utils/ci/run-buildbot +++ b/libcxx/utils/ci/run-buildbot @@ -366,11 +366,12 @@ bootstrapping-build) -DCMAKE_BUILD_TYPE=Release \ -DCMAKE_INSTALL_PREFIX="${INSTALL_DIR}" \ -DLLVM_ENABLE_PROJECTS="clang;lldb" \ - -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind" \ + -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind;compiler-rt" \ -DLLVM_RUNTIME_TARGETS="$(${CXX} --print-target-triple)" \ -DLLVM_HOST_TRIPLE="$(${CXX} --print-target-triple)" \ -DLLVM_TARGETS_TO_BUILD="host" \ -DRUNTIMES_BUILD_ALLOW_DARWIN=ON \ + -DCOMPILER_RT_INCLUDE_TESTS=OFF \ -DLLVM_ENABLE_ASSERTIONS=ON \ -DLLVM_LIT_ARGS="-sv --xunit-xml-output test-results.xml --timeout=1500 --time-tests" From aae2b891e8b28adafde9be1ee2ddd327aa72ccfa Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Mon, 17 Nov 2025 16:07:45 -0500 Subject: [PATCH 53/67] [libc++] Replace a few .compile.fail.cpp tests by proper clang-verify tests (#167346) We want to eliminate all .compile.fail.cpp tests since they are brittle: these tests pass regardless of the specific compilation error, which means that e.g. a missing include will render the test null. This is not an exhaustive pass, just a few tests I stumbled upon. --- ...mpile.fail.cpp => gets-removed.verify.cpp} | 10 +-- .../re.tokiter.cnstr/array.compile.fail.cpp | 40 ----------- .../re.tokiter.cnstr/init.compile.fail.cpp | 37 ---------- .../re.tokiter.cnstr/int.compile.fail.cpp | 36 ---------- .../temporary-objects.verify.cpp | 72 +++++++++++++++++++ .../re.tokiter.cnstr/vector.compile.fail.cpp | 41 ----------- 6 files changed, 75 insertions(+), 161 deletions(-) rename libcxx/test/std/input.output/file.streams/c.files/{gets.compile.fail.cpp => gets-removed.verify.cpp} (70%) delete mode 100644 libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/array.compile.fail.cpp delete mode 100644 libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/init.compile.fail.cpp delete mode 100644 libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/int.compile.fail.cpp create mode 100644 libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/temporary-objects.verify.cpp delete mode 100644 libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/vector.compile.fail.cpp diff --git a/libcxx/test/std/input.output/file.streams/c.files/gets.compile.fail.cpp b/libcxx/test/std/input.output/file.streams/c.files/gets-removed.verify.cpp similarity index 70% rename from libcxx/test/std/input.output/file.streams/c.files/gets.compile.fail.cpp rename to libcxx/test/std/input.output/file.streams/c.files/gets-removed.verify.cpp index 1a92cc925e2aa..281ef37e92d27 100644 --- a/libcxx/test/std/input.output/file.streams/c.files/gets.compile.fail.cpp +++ b/libcxx/test/std/input.output/file.streams/c.files/gets-removed.verify.cpp @@ -7,15 +7,11 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++03, c++11 -// test -// gets +// Verify that std::gets has been removed in C++14 and later #include -int main(int, char**) -{ - (void) std::gets((char *) NULL); - - return 0; +void f(char const* str) { + (void)std::gets(str); // expected-error {{no member named 'gets' in namespace 'std'}} } diff --git a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/array.compile.fail.cpp b/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/array.compile.fail.cpp deleted file 
mode 100644 index a03fd52c03562..0000000000000 --- a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/array.compile.fail.cpp +++ /dev/null @@ -1,40 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// - -// class regex_iterator - -// template -// regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, -// const regex_type&& re, -// const int (&submatches)[N], -// regex_constants::match_flag_type m = -// regex_constants::match_default); - -#include -#include -#include -#include "test_macros.h" - -#if TEST_STD_VER < 14 -#error -#endif - -int main(int, char**) -{ - { - std::regex phone_numbers("\\d{3}-(\\d{4})"); - const char phone_book[] = "start 555-1234, 555-2345, 555-3456 end"; - const int indices[] = {-1, 0, 1}; - std::cregex_token_iterator i(std::begin(phone_book), std::end(phone_book)-1, - std::regex("\\d{3}-\\d{4}"), indices); - } - - return 0; -} diff --git a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/init.compile.fail.cpp b/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/init.compile.fail.cpp deleted file mode 100644 index b6913e6b32d12..0000000000000 --- a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/init.compile.fail.cpp +++ /dev/null @@ -1,37 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// - -// class regex_iterator - -// regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, -// const regex_type&& re, -// initializer_list submatches, -// regex_constants::match_flag_type m = -// regex_constants::match_default); - -#include -#include -#include "test_macros.h" - -#if TEST_STD_VER < 14 -#error -#endif - -int main(int, char**) -{ - { - std::regex phone_numbers("\\d{3}-(\\d{4})"); - const char phone_book[] = "start 555-1234, 555-2345, 555-3456 end"; - std::cregex_token_iterator i(std::begin(phone_book), std::end(phone_book)-1, - std::regex("\\d{3}-\\d{4}"), {-1, 0, 1}); - } - - return 0; -} diff --git a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/int.compile.fail.cpp b/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/int.compile.fail.cpp deleted file mode 100644 index 3c39d4983e26c..0000000000000 --- a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/int.compile.fail.cpp +++ /dev/null @@ -1,36 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// - -// class regex_iterator - -// regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, -// const regex_type&& re, int submatch = 0, -// regex_constants::match_flag_type m = -// regex_constants::match_default); - -#include -#include -#include "test_macros.h" - -#if TEST_STD_VER < 14 -#error -#endif - -int main(int, char**) -{ - { - std::regex phone_numbers("\\d{3}-\\d{4}"); - const char phone_book[] = "start 555-1234, 555-2345, 555-3456 end"; - std::cregex_token_iterator i(std::begin(phone_book), std::end(phone_book)-1, - std::regex("\\d{3}-\\d{4}"), -1); - } - - return 0; -} diff --git a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/temporary-objects.verify.cpp b/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/temporary-objects.verify.cpp new file mode 100644 index 0000000000000..b1ab0f337de2f --- /dev/null +++ b/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/temporary-objects.verify.cpp @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11 + +// Ensure that we don't allow iterators into temporary std::regex objects. + +// +// +// class regex_iterator +// +// regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, +// const regex_type&& re, int submatch = 0, +// regex_constants::match_flag_type m = +// regex_constants::match_default); +// +// template +// regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, +// const regex_type&& re, +// const int (&submatches)[N], +// regex_constants::match_flag_type m = +// regex_constants::match_default); +// +// regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, +// const regex_type&& re, +// initializer_list submatches, +// regex_constants::match_flag_type m = +// regex_constants::match_default); +// +// template +// regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, +// const regex_type&& re, +// const std::vector& submatches, +// regex_constants::match_flag_type m = +// regex_constants::match_default); + +#include +#include +#include + +void f() { + std::regex phone_numbers("\\d{3}-\\d{4}"); + const char phone_book[] = "start 555-1234, 555-2345, 555-3456 end"; + + { // int submatch + std::cregex_token_iterator i(std::begin(phone_book), std::end(phone_book) - 1, std::regex("\\d{3}-\\d{4}"), -1); + // expected-error@-1 {{call to deleted constructor of 'std::cregex_token_iterator'}} + } + { // const int (&submatches)[N] + const int indices[] = {-1, 0, 1}; + std::cregex_token_iterator i( + std::begin(phone_book), std::end(phone_book) - 1, std::regex("\\d{3}-\\d{4}"), indices); + // expected-error@-2 {{call to deleted constructor of 'std::cregex_token_iterator'}} + } + { // initializer_list submatches + std::cregex_token_iterator i( + std::begin(phone_book), std::end(phone_book) - 1, std::regex("\\d{3}-\\d{4}"), {-1, 0, 1}); + // expected-error@-2 {{call to deleted constructor of 'std::cregex_token_iterator'}} + } + { // const std::vector& submatches + std::vector v; + v.push_back(-1); + v.push_back(-1); + 
std::cregex_token_iterator i(std::begin(phone_book), std::end(phone_book) - 1, std::regex("\\d{3}-\\d{4}"), v); + // expected-error@-1 {{call to deleted constructor of 'std::cregex_token_iterator'}} + } +} diff --git a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/vector.compile.fail.cpp b/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/vector.compile.fail.cpp deleted file mode 100644 index 9b07df9d1a783..0000000000000 --- a/libcxx/test/std/re/re.iter/re.tokiter/re.tokiter.cnstr/vector.compile.fail.cpp +++ /dev/null @@ -1,41 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// - -// class regex_iterator - -// template -// regex_token_iterator(BidirectionalIterator a, BidirectionalIterator b, -// const regex_type&& re, -// const std::vector& submatches, -// regex_constants::match_flag_type m = -// regex_constants::match_default); - -#include -#include -#include "test_macros.h" - -#if TEST_STD_VER < 14 -#error -#endif - -int main(int, char**) -{ - { - std::regex phone_numbers("\\d{3}-(\\d{4})"); - const char phone_book[] = "start 555-1234, 555-2345, 555-3456 end"; - std::vector v; - v.push_back(-1); - v.push_back(-1); - std::cregex_token_iterator i(std::begin(phone_book), std::end(phone_book)-1, - std::regex("\\d{3}-\\d{4}"), v); - } - - return 0; -} From 7693f124ff7fbeacce66ef3012fef119b40db330 Mon Sep 17 00:00:00 2001 From: Pranav Kant Date: Mon, 17 Nov 2025 13:08:06 -0800 Subject: [PATCH 54/67] [mlir][bazel] Fix #167957 (#168441) --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 452380a8953ff..1421ec553f251 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -10318,6 +10318,8 @@ cc_library( ), includes = ["include"], deps = [ + ":FunctionInterfaces", + ":IR", ":OpenACCDialect", ":OpenACCOpsIncGen", ":OpenACCPassIncGen", @@ -10325,6 +10327,7 @@ cc_library( ":Support", ":ViewLikeInterface", "//llvm:Support", + "//llvm:ir_headers", ], ) From 0d8c29409ceeba7fc0561bae2b9d4e4e4e936cba Mon Sep 17 00:00:00 2001 From: Daniel Wedzicha Date: Mon, 17 Nov 2025 17:13:24 -0400 Subject: [PATCH 55/67] Fixed typo in llvm-otool (#168395) --- llvm/tools/llvm-objdump/OtoolOpts.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/tools/llvm-objdump/OtoolOpts.td b/llvm/tools/llvm-objdump/OtoolOpts.td index dc7a5b445cffe..706d9e0182f58 100644 --- a/llvm/tools/llvm-objdump/OtoolOpts.td +++ b/llvm/tools/llvm-objdump/OtoolOpts.td @@ -14,7 +14,7 @@ def G : Flag<["-"], "G">, HelpText<"print data-in-code table">; def h : Flag<["-"], "h">, HelpText<"print mach header">; def I : Flag<["-"], "I">, HelpText<"print indirect symbol table">; def j : Flag<["-"], "j">, HelpText<"print opcode bytes">; -def l : Flag<["-"], "l">, HelpText<"print load commnads">; +def l : Flag<["-"], "l">, HelpText<"print load commands">; def L : Flag<["-"], "L">, HelpText<"print used shared libraries">; def mcpu_EQ : Joined<["-"], "mcpu=">, HelpText<"select cpu for disassembly">; def o : Flag<["-"], "o">, HelpText<"print Objective-C segment">; From 
ed617bd78082bb569059f2f698e41cbba5317afb Mon Sep 17 00:00:00 2001 From: Pranav Kant Date: Mon, 17 Nov 2025 13:21:59 -0800 Subject: [PATCH 56/67] [bazel][buildifier] reformat changes in #168434 (#168443) --- utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index d021f9da38dbb..85c64ffd58ca6 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -3180,7 +3180,7 @@ llvm_target_lib_list = [lib for lib in [ ( ["-gen-sd-node-info"], "lib/Target/SystemZ/SystemZGenSDNodeInfo.inc", - ) + ), ], }, { From 3cba379e3d9bd2f929f5625fe38d17c34f4b7bb7 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 17 Nov 2025 21:28:49 +0000 Subject: [PATCH 57/67] [VPlan] Populate and use VPIRMetadata from VPInstructions (NFC) (#167253) Update VPlan to populate VPIRMetadata during VPInstruction construction and use it when creating widened recipes, instead of constructing VPIRMetadata from the underlying IR instruction each time. This centralizes VPIRMetadata in VPInstructions and ensures metadata is consistently available throughout VPlan transformations. PR: https://github.com/llvm/llvm-project/pull/167253 --- .../Vectorize/LoopVectorizationPlanner.h | 41 ++++++------ .../Transforms/Vectorize/LoopVectorize.cpp | 39 ++++++------ .../Transforms/Vectorize/VPRecipeBuilder.h | 10 +-- llvm/lib/Transforms/Vectorize/VPlan.h | 62 +++++++++---------- .../Vectorize/VPlanConstruction.cpp | 37 ++++++++--- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 12 ---- llvm/lib/Transforms/Vectorize/VPlanSLP.cpp | 3 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 35 ++++++----- .../Transforms/Vectorize/VPlanTransforms.h | 3 +- .../Transforms/Vectorize/VPlanTest.cpp | 8 +-- 10 files changed, 127 insertions(+), 123 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 5dc3175382254..f533a47150a7b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -63,9 +63,11 @@ class VPBuilder { } VPInstruction *createInstruction(unsigned Opcode, - ArrayRef Operands, DebugLoc DL, + ArrayRef Operands, + const VPIRMetadata &MD, DebugLoc DL, const Twine &Name = "") { - return tryInsertInstruction(new VPInstruction(Opcode, Operands, DL, Name)); + return tryInsertInstruction( + new VPInstruction(Opcode, Operands, {}, MD, DL, Name)); } public: @@ -150,17 +152,17 @@ class VPBuilder { /// its underlying Instruction. 
VPInstruction *createNaryOp(unsigned Opcode, ArrayRef Operands, Instruction *Inst = nullptr, + const VPIRMetadata &MD = {}, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { - DebugLoc DL = DebugLoc::getUnknown(); - if (Inst) - DL = Inst->getDebugLoc(); - VPInstruction *NewVPInst = createInstruction(Opcode, Operands, DL, Name); + VPInstruction *NewVPInst = tryInsertInstruction( + new VPInstruction(Opcode, Operands, {}, MD, DL, Name)); NewVPInst->setUnderlyingValue(Inst); return NewVPInst; } VPInstruction *createNaryOp(unsigned Opcode, ArrayRef Operands, DebugLoc DL, const Twine &Name = "") { - return createInstruction(Opcode, Operands, DL, Name); + return createInstruction(Opcode, Operands, {}, DL, Name); } VPInstruction *createNaryOp(unsigned Opcode, ArrayRef Operands, const VPIRFlags &Flags, @@ -174,8 +176,8 @@ class VPBuilder { Type *ResultTy, const VPIRFlags &Flags = {}, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { - return tryInsertInstruction( - new VPInstructionWithType(Opcode, Operands, ResultTy, Flags, DL, Name)); + return tryInsertInstruction(new VPInstructionWithType( + Opcode, Operands, ResultTy, Flags, {}, DL, Name)); } VPInstruction *createOverflowingOp( @@ -189,13 +191,14 @@ class VPBuilder { VPInstruction *createNot(VPValue *Operand, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { - return createInstruction(VPInstruction::Not, {Operand}, DL, Name); + return createInstruction(VPInstruction::Not, {Operand}, {}, DL, Name); } VPInstruction *createAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { - return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, DL, Name); + return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, {}, DL, + Name); } VPInstruction *createOr(VPValue *LHS, VPValue *RHS, @@ -210,20 +213,18 @@ class VPBuilder { VPInstruction *createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { - return tryInsertInstruction( - new VPInstruction(VPInstruction::LogicalAnd, {LHS, RHS}, DL, Name)); + return createNaryOp(VPInstruction::LogicalAnd, {LHS, RHS}, DL, Name); } VPInstruction * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "", std::optional FMFs = std::nullopt) { - auto *Select = - FMFs ? 
new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal}, - *FMFs, {}, DL, Name) - : new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal}, - DL, Name); - return tryInsertInstruction(Select); + if (!FMFs) + return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal}, DL, + Name); + return tryInsertInstruction(new VPInstruction( + Instruction::Select, {Cond, TrueVal, FalseVal}, *FMFs, {}, DL, Name)); } /// Create a new ICmp VPInstruction with predicate \p Pred and operands \p A @@ -306,7 +307,7 @@ class VPBuilder { const VPIRFlags &Flags = {}, const VPIRMetadata &Metadata = {}) { return tryInsertInstruction( - new VPInstructionWithType(Opcode, Op, ResultTy, DL, Flags, Metadata)); + new VPInstructionWithType(Opcode, Op, ResultTy, Flags, Metadata, DL)); } VPValue *createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 10bd6cd471152..356d759b94799 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7616,14 +7616,13 @@ VPWidenMemoryRecipe *VPRecipeBuilder::tryToWidenMemory(VPInstruction *VPI, } if (VPI->getOpcode() == Instruction::Load) { auto *Load = cast(I); - return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, - VPIRMetadata(*Load, LVer), I->getDebugLoc()); + return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse, *VPI, + VPI->getDebugLoc()); } StoreInst *Store = cast(I); return new VPWidenStoreRecipe(*Store, Ptr, VPI->getOperand(0), Mask, - Consecutive, Reverse, - VPIRMetadata(*Store, LVer), VPI->getDebugLoc()); + Consecutive, Reverse, *VPI, VPI->getDebugLoc()); } /// Creates a VPWidenIntOrFpInductionRecipe for \p PhiR. If needed, it will @@ -7751,7 +7750,7 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(VPInstruction *VPI, }, Range); if (ShouldUseVectorIntrinsic) - return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(), + return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(), *VPI, VPI->getDebugLoc()); Function *Variant = nullptr; @@ -7843,7 +7842,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) { auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, VPI->getDebugLoc()); Ops[1] = SafeRHS; - return new VPWidenRecipe(*I, Ops); + return new VPWidenRecipe(*I, Ops, *VPI, VPI->getDebugLoc()); } [[fallthrough]]; } @@ -7889,7 +7888,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) { // For other binops, the legacy cost model only checks the second operand. 
NewOps[1] = GetConstantViaSCEV(NewOps[1]); } - return new VPWidenRecipe(*I, NewOps); + return new VPWidenRecipe(*I, NewOps, *VPI, VPI->getDebugLoc()); } case Instruction::ExtractValue: { SmallVector NewOps(VPI->operands()); @@ -7897,7 +7896,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) { assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index"); unsigned Idx = EVI->getIndices()[0]; NewOps.push_back(Plan.getConstantInt(32, Idx)); - return new VPWidenRecipe(*I, NewOps); + return new VPWidenRecipe(*I, NewOps, *VPI, VPI->getDebugLoc()); } }; } @@ -7981,8 +7980,8 @@ VPReplicateRecipe *VPRecipeBuilder::handleReplication(VPInstruction *VPI, assert((Range.Start.isScalar() || !IsUniform || !IsPredicated || (Range.Start.isScalable() && isa(I))) && "Should not predicate a uniform recipe"); - auto *Recipe = new VPReplicateRecipe(I, VPI->operands(), IsUniform, - BlockInMask, VPIRMetadata(*I, LVer)); + auto *Recipe = + new VPReplicateRecipe(I, VPI->operands(), IsUniform, BlockInMask, *VPI); return Recipe; } @@ -8235,13 +8234,14 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R, return new VPWidenGEPRecipe(cast(Instr), R->operands()); if (VPI->getOpcode() == Instruction::Select) - return new VPWidenSelectRecipe(*cast(Instr), R->operands()); + return new VPWidenSelectRecipe(*cast(Instr), R->operands(), + *VPI); if (Instruction::isCast(VPI->getOpcode())) { auto *CastR = cast(R); auto *CI = cast(Instr); return new VPWidenCastRecipe(CI->getOpcode(), VPI->getOperand(0), - CastR->getResultType(), *CI); + CastR->getResultType(), *CI, *VPI); } return tryToWiden(VPI); @@ -8269,7 +8269,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(VPInstruction *Reduction, SmallVector Ops; Ops.push_back(Plan.getOrAddLiveIn(Zero)); Ops.push_back(BinOp); - BinOp = new VPWidenRecipe(*ReductionI, Ops); + BinOp = new VPWidenRecipe(*ReductionI, Ops, VPIRMetadata(), + ReductionI->getDebugLoc()); Builder.insert(BinOp->getDefiningRecipe()); ReductionOpcode = Instruction::Add; } @@ -8302,7 +8303,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, // candidates built later for specific VF ranges. auto VPlan0 = VPlanTransforms::buildVPlan0( OrigLoop, *LI, Legal->getWidestInductionType(), - getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE); + getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE, &LVer); auto MaxVFTimes2 = MaxVF * 2; for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { @@ -8408,7 +8409,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // VPInstructions in the loop. // --------------------------------------------------------------------------- VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, - Builder, BlockMaskCache, LVer); + Builder, BlockMaskCache); // TODO: Handle partial reductions with EVL tail folding. if (!CM.foldTailWithEVL()) RecipeBuilder.collectScaledReductions(Range); @@ -8453,9 +8454,9 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) { // Only create recipe for the final invariant store of the reduction. 
if (Legal->isInvariantStoreOfReduction(SI)) { - auto *Recipe = - new VPReplicateRecipe(SI, R.operands(), true /* IsUniform */, - nullptr /*Mask*/, VPIRMetadata(*SI, LVer)); + auto *Recipe = new VPReplicateRecipe( + SI, R.operands(), true /* IsUniform */, nullptr /*Mask*/, + *cast(SingleDef)); Recipe->insertBefore(*MiddleVPBB, MBIP); } R.eraseFromParent(); @@ -8606,7 +8607,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) { // addScalarResumePhis. DenseMap BlockMaskCache; VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE, - Builder, BlockMaskCache, nullptr /*LVer*/); + Builder, BlockMaskCache); for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { if (isa(&R)) continue; diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index a7000aff06379..87280b83fc0e5 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -84,10 +84,6 @@ class VPRecipeBuilder { /// A mapping of partial reduction exit instructions to their scaling factor. DenseMap ScaledReductionMap; - /// Loop versioning instance for getting noalias metadata guaranteed by - /// runtime checks. - LoopVersioning *LVer; - /// Check if \p I can be widened at the start of \p Range and possibly /// decrease the range such that the returned value holds for the entire \p /// Range. The function should not be called for memory instructions or calls. @@ -144,11 +140,9 @@ class VPRecipeBuilder { LoopVectorizationLegality *Legal, LoopVectorizationCostModel &CM, PredicatedScalarEvolution &PSE, VPBuilder &Builder, - DenseMap &BlockMaskCache, - LoopVersioning *LVer) + DenseMap &BlockMaskCache) : Plan(Plan), OrigLoop(OrigLoop), TLI(TLI), TTI(TTI), Legal(Legal), - CM(CM), PSE(PSE), Builder(Builder), BlockMaskCache(BlockMaskCache), - LVer(LVer) {} + CM(CM), PSE(PSE), Builder(Builder), BlockMaskCache(BlockMaskCache) {} std::optional getScalingForReduction(const Instruction *ExitInst) { auto It = ScaledReductionMap.find(ExitInst); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 67fa294d095bd..c81834e401726 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -65,7 +65,6 @@ class VPReplicateRecipe; class VPlanSlp; class Value; class LoopVectorizationCostModel; -class LoopVersioning; struct VPCostContext; @@ -958,10 +957,6 @@ class VPIRMetadata { /// \p I. VPIRMetadata(Instruction &I) { getMetadataToPropagate(&I, Metadata); } - /// Adds metatadata that can be preserved from the original instruction - /// \p I and noalias metadata guaranteed by runtime checks using \p LVer. - VPIRMetadata(Instruction &I, LoopVersioning *LVer); - /// Copy constructor for cloning. 
VPIRMetadata(const VPIRMetadata &Other) = default; @@ -1120,11 +1115,7 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, public: VPInstruction(unsigned Opcode, ArrayRef Operands, - DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") - : VPInstruction(Opcode, Operands, {}, {}, DL, Name) {} - - VPInstruction(unsigned Opcode, ArrayRef Operands, - const VPIRFlags &Flags, const VPIRMetadata &MD = {}, + const VPIRFlags &Flags = {}, const VPIRMetadata &MD = {}, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = ""); VP_CLASSOF_IMPL(VPDef::VPInstructionSC) @@ -1214,14 +1205,10 @@ class VPInstructionWithType : public VPInstruction { public: VPInstructionWithType(unsigned Opcode, ArrayRef Operands, - Type *ResultTy, const VPIRFlags &Flags, DebugLoc DL, + Type *ResultTy, const VPIRFlags &Flags = {}, + const VPIRMetadata &Metadata = {}, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") - : VPInstruction(Opcode, Operands, Flags, {}, DL, Name), - ResultTy(ResultTy) {} - - VPInstructionWithType(unsigned Opcode, ArrayRef Operands, - Type *ResultTy, DebugLoc DL, const VPIRFlags &Flags, - const VPIRMetadata &Metadata, const Twine &Name = "") : VPInstruction(Opcode, Operands, Flags, Metadata, DL, Name), ResultTy(ResultTy) {} @@ -1250,7 +1237,7 @@ class VPInstructionWithType : public VPInstruction { VPInstruction *clone() override { auto *New = new VPInstructionWithType(getOpcode(), operands(), getResultType(), - *this, getDebugLoc(), getName()); + *this, *this, getDebugLoc(), getName()); New->setUnderlyingValue(getUnderlyingValue()); return New; } @@ -1334,7 +1321,7 @@ class VPPhiAccessors { struct LLVM_ABI_FOR_TEST VPPhi : public VPInstruction, public VPPhiAccessors { VPPhi(ArrayRef Operands, DebugLoc DL, const Twine &Name = "") - : VPInstruction(Instruction::PHI, Operands, DL, Name) {} + : VPInstruction(Instruction::PHI, Operands, {}, {}, DL, Name) {} static inline bool classof(const VPUser *U) { auto *VPI = dyn_cast(U); @@ -1478,9 +1465,10 @@ class LLVM_ABI_FOR_TEST VPWidenRecipe : public VPRecipeWithIRFlags, : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, Flags, DL), VPIRMetadata(Metadata), Opcode(Opcode) {} - VPWidenRecipe(Instruction &I, ArrayRef Operands) - : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I), VPIRMetadata(I), - Opcode(I.getOpcode()) {} + VPWidenRecipe(Instruction &I, ArrayRef Operands, + const VPIRMetadata &Metadata, DebugLoc DL) + : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I), + VPIRMetadata(Metadata), Opcode(I.getOpcode()) {} ~VPWidenRecipe() override = default; @@ -1521,13 +1509,12 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { public: VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, - CastInst &UI) - : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, UI), VPIRMetadata(UI), - Opcode(Opcode), ResultTy(ResultTy) { + CastInst &UI, const VPIRMetadata &Metadata) + : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, UI), + VPIRMetadata(Metadata), Opcode(Opcode), ResultTy(ResultTy) { assert(UI.getOpcode() == Opcode && "opcode of underlying cast doesn't match"); } - VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, const VPIRFlags &Flags = {}, const VPIRMetadata &Metadata = {}, @@ -1590,18 +1577,23 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { public: VPWidenIntrinsicRecipe(CallInst &CI, Intrinsic::ID VectorIntrinsicID, ArrayRef CallArguments, Type *Ty, + const VPIRMetadata &MD = {}, DebugLoc DL = 
DebugLoc::getUnknown()) : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, CI), - VPIRMetadata(CI), VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty), + VPIRMetadata(MD), VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty), MayReadFromMemory(CI.mayReadFromMemory()), MayWriteToMemory(CI.mayWriteToMemory()), MayHaveSideEffects(CI.mayHaveSideEffects()) {} VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID, ArrayRef CallArguments, Type *Ty, + const VPIRFlags &Flags = {}, + const VPIRMetadata &Metadata = {}, DebugLoc DL = DebugLoc::getUnknown()) - : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, DL), - VPIRMetadata(), VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty) { + : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments, Flags, + DL), + VPIRMetadata(Metadata), VectorIntrinsicID(VectorIntrinsicID), + ResultTy(Ty) { LLVMContext &Ctx = Ty->getContext(); AttributeSet Attrs = Intrinsic::getFnAttributes(Ctx, VectorIntrinsicID); MemoryEffects ME = Attrs.getMemoryEffects(); @@ -1617,9 +1609,10 @@ class VPWidenIntrinsicRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { VPWidenIntrinsicRecipe *clone() override { if (Value *CI = getUnderlyingValue()) return new VPWidenIntrinsicRecipe(*cast(CI), VectorIntrinsicID, - operands(), ResultTy, getDebugLoc()); + operands(), ResultTy, *this, + getDebugLoc()); return new VPWidenIntrinsicRecipe(VectorIntrinsicID, operands(), ResultTy, - getDebugLoc()); + *this, *this, getDebugLoc()); } VP_CLASSOF_IMPL(VPDef::VPWidenIntrinsicSC) @@ -1760,15 +1753,16 @@ class VPHistogramRecipe : public VPRecipeBase { /// instruction. struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { - VPWidenSelectRecipe(SelectInst &I, ArrayRef Operands) + VPWidenSelectRecipe(SelectInst &I, ArrayRef Operands, + const VPIRMetadata &MD = {}) : VPRecipeWithIRFlags(VPDef::VPWidenSelectSC, Operands, I), - VPIRMetadata(I) {} + VPIRMetadata(MD) {} ~VPWidenSelectRecipe() override = default; VPWidenSelectRecipe *clone() override { return new VPWidenSelectRecipe(*cast(getUnderlyingInstr()), - operands()); + operands(), *this); } VP_CLASSOF_IMPL(VPDef::VPWidenSelectSC) diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index aed85271350c8..612202d049774 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -22,6 +22,7 @@ #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/Transforms/Utils/LoopVersioning.h" #define DEBUG_TYPE "vplan" @@ -37,6 +38,9 @@ class PlainCFGBuilder { // Loop Info analysis. LoopInfo *LI; + // Loop versioning for alias metadata. + LoopVersioning *LVer; + // Vectorization plan that we are working on. std::unique_ptr Plan; @@ -65,8 +69,8 @@ class PlainCFGBuilder { void createVPInstructionsForVPBB(VPBasicBlock *VPBB, BasicBlock *BB); public: - PlainCFGBuilder(Loop *Lp, LoopInfo *LI) - : TheLoop(Lp), LI(LI), Plan(std::make_unique(Lp)) {} + PlainCFGBuilder(Loop *Lp, LoopInfo *LI, LoopVersioning *LVer) + : TheLoop(Lp), LI(LI), LVer(LVer), Plan(std::make_unique(Lp)) {} /// Build plain CFG for TheLoop and connect it to Plan's entry. std::unique_ptr buildPlainCFG(); @@ -186,7 +190,8 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, // recipes. 
if (Br->isConditional()) { VPValue *Cond = getOrCreateVPOperand(Br->getCondition()); - VPIRBuilder.createNaryOp(VPInstruction::BranchOnCond, {Cond}, Inst); + VPIRBuilder.createNaryOp(VPInstruction::BranchOnCond, {Cond}, Inst, + VPIRMetadata(*Inst), Inst->getDebugLoc()); } // Skip the rest of the Instruction processing for Branch instructions. @@ -200,7 +205,8 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, SmallVector Ops = {getOrCreateVPOperand(SI->getCondition())}; for (auto Case : SI->cases()) Ops.push_back(getOrCreateVPOperand(Case.getCaseValue())); - VPIRBuilder.createNaryOp(Instruction::Switch, Ops, Inst); + VPIRBuilder.createNaryOp(Instruction::Switch, Ops, Inst, + VPIRMetadata(*Inst), Inst->getDebugLoc()); continue; } @@ -228,6 +234,18 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, VPPredToIncomingValue.lookup(Pred->getExitingBasicBlock())); } } else { + // Build VPIRMetadata from the instruction and add loop versioning + // metadata for loads and stores. + VPIRMetadata MD(*Inst); + if (isa(Inst) && LVer) { + const auto &[AliasScopeMD, NoAliasMD] = + LVer->getNoAliasMetadataFor(Inst); + if (AliasScopeMD) + MD.setMetadata(LLVMContext::MD_alias_scope, AliasScopeMD); + if (NoAliasMD) + MD.setMetadata(LLVMContext::MD_noalias, NoAliasMD); + } + // Translate LLVM-IR operands into VPValue operands and set them in the // new VPInstruction. SmallVector VPOperands; @@ -236,12 +254,14 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB, if (auto *CI = dyn_cast(Inst)) { NewR = VPIRBuilder.createScalarCast(CI->getOpcode(), VPOperands[0], - CI->getType(), CI->getDebugLoc()); + CI->getType(), CI->getDebugLoc(), + {}, MD); NewR->setUnderlyingValue(CI); } else { // Build VPInstruction for any arbitrary Instruction without specific // representation in VPlan. 
- NewR = VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst); + NewR = VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst, MD, + Inst->getDebugLoc()); } } @@ -537,8 +557,9 @@ static void addInitialSkeleton(VPlan &Plan, Type *InductionTy, DebugLoc IVDL, std::unique_ptr VPlanTransforms::buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, - DebugLoc IVDL, PredicatedScalarEvolution &PSE) { - PlainCFGBuilder Builder(TheLoop, &LI); + DebugLoc IVDL, PredicatedScalarEvolution &PSE, + LoopVersioning *LVer) { + PlainCFGBuilder Builder(TheLoop, &LI, LVer); std::unique_ptr VPlan0 = Builder.buildPlainCFG(); addInitialSkeleton(*VPlan0, InductionTy, IVDL, PSE, TheLoop); return VPlan0; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index e2a8e495d5ed5..fca6554ad77c6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -36,7 +36,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/LoopUtils.h" -#include "llvm/Transforms/Utils/LoopVersioning.h" #include using namespace llvm; @@ -1674,17 +1673,6 @@ void VPIRPhi::printRecipe(raw_ostream &O, const Twine &Indent, } #endif -VPIRMetadata::VPIRMetadata(Instruction &I, LoopVersioning *LVer) - : VPIRMetadata(I) { - if (!LVer || !isa(&I)) - return; - const auto &[AliasScopeMD, NoAliasMD] = LVer->getNoAliasMetadataFor(&I); - if (AliasScopeMD) - Metadata.emplace_back(LLVMContext::MD_alias_scope, AliasScopeMD); - if (NoAliasMD) - Metadata.emplace_back(LLVMContext::MD_noalias, NoAliasMD); -} - void VPIRMetadata::applyMetadata(Instruction &I) const { for (const auto &[Kind, Node] : Metadata) I.setMetadata(Kind, Node); diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp index 1453c6623625b..3b5cc9fcb9820 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp @@ -517,7 +517,8 @@ VPInstruction *VPlanSlp::buildGraph(ArrayRef Values) { assert(CombinedOperands.size() > 0 && "Need more some operands"); auto *Inst = cast(Values[0])->getUnderlyingInstr(); - auto *VPI = new VPInstruction(Opcode, CombinedOperands, Inst->getDebugLoc()); + auto *VPI = + new VPInstruction(Opcode, CombinedOperands, {}, {}, Inst->getDebugLoc()); LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " " << Values[0] << "\n"); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 3e2c47e4556a6..89118b49bed44 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -85,20 +85,19 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( Ingredient.getDebugLoc()); } } else { - assert(isa(&Ingredient) && - "only VPInstructions expected here"); + auto *VPI = cast(&Ingredient); assert(!isa(Inst) && "phis should be handled above"); // Create VPWidenMemoryRecipe for loads and stores. 
if (LoadInst *Load = dyn_cast(Inst)) { NewRecipe = new VPWidenLoadRecipe( *Load, Ingredient.getOperand(0), nullptr /*Mask*/, - false /*Consecutive*/, false /*Reverse*/, VPIRMetadata(*Load), + false /*Consecutive*/, false /*Reverse*/, *VPI, Ingredient.getDebugLoc()); } else if (StoreInst *Store = dyn_cast(Inst)) { NewRecipe = new VPWidenStoreRecipe( *Store, Ingredient.getOperand(1), Ingredient.getOperand(0), - nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, - VPIRMetadata(*Store), Ingredient.getDebugLoc()); + nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI, + Ingredient.getDebugLoc()); } else if (GetElementPtrInst *GEP = dyn_cast(Inst)) { NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands()); } else if (CallInst *CI = dyn_cast(Inst)) { @@ -107,15 +106,17 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( return false; NewRecipe = new VPWidenIntrinsicRecipe( *CI, getVectorIntrinsicIDForCall(CI, &TLI), - drop_end(Ingredient.operands()), CI->getType(), + drop_end(Ingredient.operands()), CI->getType(), *VPI, CI->getDebugLoc()); } else if (SelectInst *SI = dyn_cast(Inst)) { - NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands()); + NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands(), *VPI); } else if (auto *CI = dyn_cast(Inst)) { - NewRecipe = new VPWidenCastRecipe( - CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), *CI); + NewRecipe = + new VPWidenCastRecipe(CI->getOpcode(), Ingredient.getOperand(0), + CI->getType(), *CI, *VPI); } else { - NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands()); + NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI, + Ingredient.getDebugLoc()); } } @@ -1705,8 +1706,9 @@ static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, Ops.append({ALM, Plan.getOrAddLiveIn( ConstantInt::get(IntegerType::getInt64Ty(Ctx), VF.getKnownMinValue() * Part))}); - auto *Ext = new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops, - IntegerType::getInt1Ty(Ctx), DL); + auto *Ext = + new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops, + IntegerType::getInt1Ty(Ctx), {}, {}, DL); Extracts[Part] = Ext; Ext->insertAfter(ALM); } @@ -1845,7 +1847,7 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, // The vector region contains header phis for which we cannot remove the // loop region yet. 
auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()}, - Term->getDebugLoc()); + {}, {}, Term->getDebugLoc()); ExitingVPBB->appendRecipe(BOC); } @@ -2679,13 +2681,13 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS)))) return new VPWidenIntrinsicRecipe( Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL}, - TypeInfo.inferScalarType(LHS), CurRecipe.getDebugLoc()); + TypeInfo.inferScalarType(LHS), {}, {}, CurRecipe.getDebugLoc()); if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS), m_VPValue(RHS)))) return new VPWidenIntrinsicRecipe( Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL}, - TypeInfo.inferScalarType(LHS), CurRecipe.getDebugLoc()); + TypeInfo.inferScalarType(LHS), {}, {}, CurRecipe.getDebugLoc()); return nullptr; } @@ -2753,7 +2755,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { VPWidenIntrinsicRecipe *VPSplice = new VPWidenIntrinsicRecipe( Intrinsic::experimental_vp_splice, {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL}, - TypeInfo.inferScalarType(R.getVPSingleValue()), R.getDebugLoc()); + TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {}, + R.getDebugLoc()); VPSplice->insertBefore(&R); R.getVPSingleValue()->replaceAllUsesWith(VPSplice); ToErase.push_back(&R); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index e3bde8a47dcbc..a44a4f69c917b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -23,6 +23,7 @@ namespace llvm { class InductionDescriptor; class Instruction; +class LoopVersioning; class PHINode; class ScalarEvolution; class PredicatedScalarEvolution; @@ -99,7 +100,7 @@ struct VPlanTransforms { /// >[ ] <-- original loop exit block(s), wrapped in VPIRBasicBlocks. LLVM_ABI_FOR_TEST static std::unique_ptr buildVPlan0(Loop *TheLoop, LoopInfo &LI, Type *InductionTy, DebugLoc IVDL, - PredicatedScalarEvolution &PSE); + PredicatedScalarEvolution &PSE, LoopVersioning *LVer = nullptr); /// Update \p Plan to account for all early exits. 
LLVM_ABI_FOR_TEST static void handleEarlyExits(VPlan &Plan, diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index ee7fa175ca918..0e76c64f09f59 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -1009,7 +1009,7 @@ TEST_F(VPRecipeTest, CastVPWidenRecipeToVPUser) { SmallVector Args; Args.push_back(Op1); Args.push_back(Op2); - VPWidenRecipe WidenR(*AI, make_range(Args.begin(), Args.end())); + VPWidenRecipe WidenR(*AI, Args, VPIRMetadata(), DebugLoc()); checkVPRecipeCastImpl(&WidenR); delete AI; @@ -1092,7 +1092,7 @@ TEST_F(VPRecipeTest, CastVPWidenCastRecipeToVPUser) { IntegerType *Int64 = IntegerType::get(C, 64); auto *Cast = CastInst::CreateZExtOrBitCast(PoisonValue::get(Int32), Int64); VPValue *Op1 = Plan.getOrAddLiveIn(ConstantInt::get(Int32, 1)); - VPWidenCastRecipe Recipe(Instruction::ZExt, Op1, Int64, *Cast); + VPWidenCastRecipe Recipe(Instruction::ZExt, Op1, Int64, *Cast, {}); checkVPRecipeCastImpl(&Recipe); delete Cast; @@ -1263,7 +1263,7 @@ TEST_F(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { SmallVector Args; Args.push_back(Op1); Args.push_back(Op2); - VPWidenRecipe Recipe(*AI, make_range(Args.begin(), Args.end())); + VPWidenRecipe Recipe(*AI, Args, VPIRMetadata(), DebugLoc()); EXPECT_FALSE(Recipe.mayHaveSideEffects()); EXPECT_FALSE(Recipe.mayReadFromMemory()); EXPECT_FALSE(Recipe.mayWriteToMemory()); @@ -1468,7 +1468,7 @@ TEST_F(VPRecipeTest, dumpRecipeInPlan) { Args.push_back(ExtVPV1); Args.push_back(ExtVPV2); VPWidenRecipe *WidenR = - new VPWidenRecipe(*AI, make_range(Args.begin(), Args.end())); + new VPWidenRecipe(*AI, Args, VPIRMetadata(), DebugLoc()); VPBB1->appendRecipe(WidenR); { From 92c8c87c49100e3f14e3ec46abf47f27191f8b53 Mon Sep 17 00:00:00 2001 From: Alexey Samsonov Date: Mon, 17 Nov 2025 13:42:12 -0800 Subject: [PATCH 58/67] [libc] Implement wcstod and wcstold. (#168020) These are simply implemented as specializations of strtofloatingpoint for double / long double and for wchar_t. The unit tests are copied from the strtod / strtold ones. 
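For illustration, a minimal caller-side sketch of the semantics these entrypoints provide (plain ISO C usage of wcstod, not code from this patch; the input literal is made up):

    #include <errno.h>
    #include <stdio.h>
    #include <wchar.h>

    int main(void) {
      const wchar_t *input = L"3.14159xyz";
      wchar_t *end = NULL;

      errno = 0;
      /* Parses the longest valid prefix ("3.14159") and sets end to the
         first unconsumed character ('x'). On overflow it returns +/-HUGE_VAL
         and sets errno to ERANGE, which is what the unit tests exercise. */
      double value = wcstod(input, &end);

      printf("value=%f consumed=%td errno=%d\n", value, end - input, errno);
      return 0;
    }

wcstold behaves the same way but returns a long double.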
--- libc/config/linux/x86_64/entrypoints.txt | 2 + libc/include/wchar.yaml | 14 + libc/src/wchar/CMakeLists.txt | 22 + libc/src/wchar/wcstod.cpp | 30 ++ libc/src/wchar/wcstod.h | 20 + libc/src/wchar/wcstold.cpp | 30 ++ libc/src/wchar/wcstold.h | 21 + libc/test/src/wchar/CMakeLists.txt | 29 +- libc/test/src/wchar/wcstod_test.cpp | 586 +++++++++++++++++++++++ libc/test/src/wchar/wcstold_test.cpp | 262 ++++++++++ 10 files changed, 1015 insertions(+), 1 deletion(-) create mode 100644 libc/src/wchar/wcstod.cpp create mode 100644 libc/src/wchar/wcstod.h create mode 100644 libc/src/wchar/wcstold.cpp create mode 100644 libc/src/wchar/wcstold.h create mode 100644 libc/test/src/wchar/wcstod_test.cpp create mode 100644 libc/test/src/wchar/wcstold_test.cpp diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index d3bcad470b3e1..5036c9438a503 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -398,9 +398,11 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.wchar.wmemchr libc.src.wchar.wcpcpy libc.src.wchar.wcpncpy + libc.src.wchar.wcstod libc.src.wchar.wcstof libc.src.wchar.wcstok libc.src.wchar.wcstol + libc.src.wchar.wcstold libc.src.wchar.wcstoll libc.src.wchar.wcstoul libc.src.wchar.wcstoull diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml index faceb9bb4e12d..a524c7f56bed0 100644 --- a/libc/include/wchar.yaml +++ b/libc/include/wchar.yaml @@ -367,3 +367,17 @@ functions: arguments: - type: const wchar_t *__restrict - type: wchar_t **__restrict + - name: wcstod + standards: + - stdc + return_type: double + arguments: + - type: const wchar_t *__restrict + - type: wchar_t **__restrict + - name: wcstold + standards: + - stdc + return_type: long double + arguments: + - type: const wchar_t *__restrict + - type: wchar_t **__restrict diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt index e3fac9fb80529..e6d9af9eacf73 100644 --- a/libc/src/wchar/CMakeLists.txt +++ b/libc/src/wchar/CMakeLists.txt @@ -110,6 +110,28 @@ add_entrypoint_object( libc.src.errno.errno ) +add_entrypoint_object( + wcstod + SRCS + wcstod.cpp + HDRS + wcstod.h + DEPENDS + libc.src.__support.str_to_float + libc.src.errno.errno +) + +add_entrypoint_object( + wcstold + SRCS + wcstold.cpp + HDRS + wcstold.h + DEPENDS + libc.src.__support.str_to_float + libc.src.errno.errno +) + add_entrypoint_object( wcstok SRCS diff --git a/libc/src/wchar/wcstod.cpp b/libc/src/wchar/wcstod.cpp new file mode 100644 index 0000000000000..95351c304c0ff --- /dev/null +++ b/libc/src/wchar/wcstod.cpp @@ -0,0 +1,30 @@ +//===-- Implementation of wcstod ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/wchar/wcstod.h" +#include "src/__support/common.h" +#include "src/__support/libc_errno.h" +#include "src/__support/macros/config.h" +#include "src/__support/str_to_float.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(double, wcstod, + (const wchar_t *__restrict str, + wchar_t **__restrict str_end)) { + auto result = internal::strtofloatingpoint(str); + if (result.has_error()) + libc_errno = result.error; + + if (str_end != nullptr) + *str_end = const_cast(str + result.parsed_len); + + return result.value; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/wchar/wcstod.h b/libc/src/wchar/wcstod.h new file mode 100644 index 0000000000000..ff397b93d405d --- /dev/null +++ b/libc/src/wchar/wcstod.h @@ -0,0 +1,20 @@ +//===-- Implementation header for wcstod ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_WCHAR_WCSTOD_H +#define LLVM_LIBC_SRC_WCHAR_WCSTOD_H + +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +double wcstod(const wchar_t *__restrict str, wchar_t **__restrict str_end); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_WCHAR_WCSTOD_H diff --git a/libc/src/wchar/wcstold.cpp b/libc/src/wchar/wcstold.cpp new file mode 100644 index 0000000000000..ffbc3f248b883 --- /dev/null +++ b/libc/src/wchar/wcstold.cpp @@ -0,0 +1,30 @@ +//===-- Implementation of wcstold -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/wchar/wcstold.h" +#include "src/__support/common.h" +#include "src/__support/libc_errno.h" +#include "src/__support/macros/config.h" +#include "src/__support/str_to_float.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(long double, wcstold, + (const wchar_t *__restrict str, + wchar_t **__restrict str_end)) { + auto result = internal::strtofloatingpoint(str); + if (result.has_error()) + libc_errno = result.error; + + if (str_end != nullptr) + *str_end = const_cast(str + result.parsed_len); + + return result.value; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/wchar/wcstold.h b/libc/src/wchar/wcstold.h new file mode 100644 index 0000000000000..1525362b33571 --- /dev/null +++ b/libc/src/wchar/wcstold.h @@ -0,0 +1,21 @@ +//===-- Implementation header for wcstold -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_WCHAR_WCSTOLD_H +#define LLVM_LIBC_SRC_WCHAR_WCSTOLD_H + +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +long double wcstold(const wchar_t *__restrict str, + wchar_t **__restrict str_end); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_WCHAR_WCSTOLD_H diff --git a/libc/test/src/wchar/CMakeLists.txt b/libc/test/src/wchar/CMakeLists.txt index 122cad2575327..a62a30fe00124 100644 --- a/libc/test/src/wchar/CMakeLists.txt +++ b/libc/test/src/wchar/CMakeLists.txt @@ -538,5 +538,32 @@ add_libc_test( DEPENDS libc.src.wchar.wcstof libc.test.UnitTest.ErrnoCheckingTest - libc.test.UnitTest.LibcFPTestHelpers + LINK_LIBRARIES + LibcFPTestHelpers +) + +add_libc_test( + wcstod_test + SUITE + libc_wchar_unittests + SRCS + wcstod_test.cpp + DEPENDS + libc.src.wchar.wcstod + libc.test.UnitTest.ErrnoCheckingTest + LINK_LIBRARIES + LibcFPTestHelpers +) + +add_libc_test( + wcstold_test + SUITE + libc_wchar_unittests + SRCS + wcstold_test.cpp + DEPENDS + libc.src.__support.FPUtil.fp_bits + libc.src.__support.uint128 + libc.src.wchar.wcstold + libc.test.UnitTest.ErrnoCheckingTest ) diff --git a/libc/test/src/wchar/wcstod_test.cpp b/libc/test/src/wchar/wcstod_test.cpp new file mode 100644 index 0000000000000..0c2b82cfba898 --- /dev/null +++ b/libc/test/src/wchar/wcstod_test.cpp @@ -0,0 +1,586 @@ +//===-- Unittests for wcstod ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/wchar/wcstod.h" + +#include "src/__support/FPUtil/FPBits.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" +#include "test/UnitTest/RoundingModeUtils.h" +#include "test/UnitTest/Test.h" + +#include + +using LIBC_NAMESPACE::fputil::testing::ForceRoundingModeTest; +using LIBC_NAMESPACE::fputil::testing::RoundingMode; + +using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; +using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + +class LlvmLibcWcstodTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest, + ForceRoundingModeTest { +public: + void run_test(const wchar_t *inputString, const ptrdiff_t expectedStrLen, + const uint64_t expectedRawData, const int expectedErrno = 0) { + // expectedRawData is the expected double result as a uint64_t, organized + // according to IEEE754: + // + // +-- 1 Sign Bit +-- 52 Mantissa bits + // | | + // | +-------------------------+------------------------+ + // | | | + // SEEEEEEEEEEEMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM + // | | + // +----+----+ + // | + // +-- 11 Exponent Bits + // + // This is so that the result can be compared in parts. 
+    wchar_t *str_end = nullptr;
+
+    LIBC_NAMESPACE::fputil::FPBits<double> expected_fp =
+        LIBC_NAMESPACE::fputil::FPBits<double>(expectedRawData);
+
+    double result = LIBC_NAMESPACE::wcstod(inputString, &str_end);
+    if (expectedErrno == 0)
+      EXPECT_THAT(result, Succeeds(expected_fp.get_val()));
+    else
+      EXPECT_THAT(result, Fails(expectedErrno, expected_fp.get_val()));
+    EXPECT_EQ(str_end - inputString, expectedStrLen);
+  }
+};
+
+TEST_F(LlvmLibcWcstodTest, SimpleTest) {
+  run_test(L"123", 3, uint64_t(0x405ec00000000000));
+
+  // This should fail on Eisel-Lemire, forcing a fallback to simple decimal
+  // conversion.
+  run_test(L"12345678901234549760", 20, uint64_t(0x43e56a95319d63d8));
+
+  // Found while looking for difficult test cases here:
+  // https://github.com/nigeltao/parse-number-fxx-test-data/blob/main/more-test-cases/golang-org-issue-36657.txt
+  run_test(L"1090544144181609348835077142190", 31,
+           uint64_t(0x462b8779f2474dfb));
+
+  run_test(L"0x123", 5, uint64_t(0x4072300000000000));
+}
+
+// These are tests that have caused problems in the past.
+TEST_F(LlvmLibcWcstodTest, SpecificFailures) {
+  run_test(L"3E70000000000000", 16, uint64_t(0x7FF0000000000000), ERANGE);
+  run_test(L"358416272e-33", 13, uint64_t(0x3adbbb2a68c9d0b9));
+  run_test(L"2.16656806400000023841857910156251e9", 36,
+           uint64_t(0x41e0246690000001));
+  run_test(L"27949676547093071875", 20, uint64_t(0x43f83e132bc608c9));
+  run_test(
+      L"10000000000000000000000000000000000000000000000000000000000000000000000"
+      "000000000000000000000000000000000000000000000000000000000000000000000000"
+      "000000000000000000000000000000000000000000000000000000000000000000000000"
+      "000000000000000000000000000000000000000000000000000000000000000000000000"
+      "000000000000000000000000000000000000000000000000000000000000000000000000"
+      "000000000000000000000000000000000000000000000000000000000000000000000000"
+      "000000000000000000000000000000000000000000000000000000000000000000000000"
+      "000000000000000000000000000000000000000000000000000000000000000000000000"
+      "000000000000000000000000000000000000000000000000000000000000000000000000"
+      "000000000000000000000000000000000000000000000000000000000000000000000000"
+      "000000000000000000000000000000000000000000000000000000000000000000000000"
+      "0000000000e-800",
+      806, 0x3ff0000000000000);
+  run_test(
+      L"10000000000000000000000000000000000000000000000000000000000000000000000"
+      "000000000000000000000000000000000000000000000000000000000000000000000000"
+      "000000000000000000000000000000000000000000000000000000000000000000000000"
+      "000000000000000000000000000000000000000000000000000000000000000000000000"
+      "000000000000000000000000000000000000000000000000000000000000000000000000"
+      "000000000000000000000000000000000000000000000000000000000000000000000000"
+      "000000000000000000000000000000000000000000000000000000000000000000000000"
+      "000000000000000000000000000000000000000000000000000000000000000000000000"
+      "000000000000000000000000000000000000000000000000000000000000000000000000"
+      "000000000000000000000000000000000000000000000000000000000000000000000000"
+      "000000000000000000000000000000000000000000000000000000000000000000000000"
+      "0000000000e-799",
+      806, 0x4024000000000000);
+  run_test(
+      L"10000000000000000000000000000000000000000000000000000000000000000000000"
+      "000000000000000000000000000000000000000000000000000000000000000000000000"
+      "000000000000000000000000000000000000000000000000000000000000000000000000"
+      "000000000000000000000000000000000000000000000000000000000000000000000000"
+
"000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "00000000000e-800", + 807, 0x4024000000000000); + run_test( + L"10000000000000000000000000000000000000000000000000000000000000000e-64", + 69, 0x3ff0000000000000); + run_test( + L"10000000000000000000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000000000e-128", + 134, 0x3ff0000000000000); + run_test(L"100000000000000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000e-256", + 262, 0x3ff0000000000000); + run_test(L"100000000000000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000000000000000000" + "0000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000e-512", + 518, 0x3ff0000000000000); + run_test( + L"10000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000e-1024", + 1031, 0x3ff0000000000000); + run_test( + L"0" + "100000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + 
"000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "00000000000000000e-1024", + 1032, 0x3ff0000000000000); +} + +TEST_F(LlvmLibcWcstodTest, FuzzFailures) { + run_test(L"-\xff\xff\xff\xff\xff\xff\xff\x01", 0, uint64_t(0)); + run_test(L"-.????", 0, uint64_t(0)); + run_test( + L"44444444444444444444444444444444444444444444444444A44444444444444444" + "44444444444*\x99\xff\xff\xff\xff", + 50, uint64_t(0x4a3e68fdd0e0b2d8)); + run_test(L"-NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNKNNNNNNNNNNNNNNNNNN?" + "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN?", + 0, uint64_t(0)); + run_test(L"0x.666E40", 9, uint64_t(0x3fd99b9000000000)); + + // glibc version 2.36 and higher (not tested with lower versions) disagrees + // with this result, but ours is correct for the nearest rounding mode. See + // this bug: https://sourceware.org/bugzilla/show_bug.cgi?id=30220 + run_test(L"0x30000002222225p-1077", 22, uint64_t(0x0006000000444445), ERANGE); + + // This value triggered a bug by having an exponent exactly equal to the + // maximum. The overflow checks would accept a value less than the max value + // as valid and greater than the max value as invalid (and set it to the max), + // but an exponent of exactly max value hit the else condition which is + // intended for underflow and set the exponent to the min exponent. + run_test( + L"18477446000000000000000000000000000005230000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000935166201543003765631683711878842" + "388777446000000000000430037600000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000005238581124701719460000000" + "000000000017194600000000000000000070046000000000000000000000000100000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000002000000000000000" + "000000000000056316837118788423887774460000000000000000000000000000052385" + "811247017194600000000000000000171946000000000000000000700460000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000002000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000523858112470171946000000" + "000000000001719460000000000000000007004600000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "0200000000000000000E608", + 1462, uint64_t(0x7ff0000000000000), ERANGE); + + // Same as above but for hex. 
+ run_test(L"0x0164810157p2047", 17, uint64_t(0x7ff0000000000000), ERANGE); + + // This test ensures that only the correct number of characters is accepted. + // An exponent symbol followed by a sign isn't a valid exponent. + run_test(L"2e+", 1, uint64_t(0x4000000000000000)); + run_test(L"0x2p+", 3, uint64_t(0x4000000000000000)); + + // This bug was in the handling of very large exponents in the exponent + // marker. Previously anything greater than 10,000 would be set to 10,000. + // This caused incorrect behavior if there were more than 10,000 '0's in the + // input number, and then a correspondingly large exponent. This test case has + // 24,744 zeroes. + run_test( + L"0x." + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + 
"000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + 
"000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + 
"000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + 
"000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + 
"000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + 
"000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + 
"000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000000000000000000000000000" + "000000000000000000000000000000000000000000000000fp551615", + 24755, uint64_t(0x7ff0000000000000), ERANGE); +} diff --git a/libc/test/src/wchar/wcstold_test.cpp b/libc/test/src/wchar/wcstold_test.cpp new file mode 100644 index 0000000000000..3a7fdfce3e732 --- /dev/null +++ b/libc/test/src/wchar/wcstold_test.cpp @@ -0,0 +1,262 @@ +//===-- Unittests for wcstold ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/uint128.h"
+#include "src/wchar/wcstold.h"
+
+#include "test/UnitTest/ErrnoCheckingTest.h"
+#include "test/UnitTest/Test.h"
+
+#include <stddef.h>
+
+#if defined(LIBC_TYPES_LONG_DOUBLE_IS_FLOAT64)
+#define SELECT_CONST(val, _, __) val
+#elif defined(LIBC_TYPES_LONG_DOUBLE_IS_X86_FLOAT80)
+#define SELECT_CONST(_, val, __) val
+#elif defined(LIBC_TYPES_LONG_DOUBLE_IS_FLOAT128)
+#define SELECT_CONST(_, __, val) val
+#else
+#error "Unknown long double type"
+#endif
+
+class LlvmLibcWcstoldTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest {
+public:
+#if defined(LIBC_TYPES_LONG_DOUBLE_IS_FLOAT64)
+  void run_test(const wchar_t *inputString, const ptrdiff_t expectedStrLen,
+                const uint64_t expectedRawData, const int expectedErrno = 0)
+#else
+  void run_test(const wchar_t *inputString, const ptrdiff_t expectedStrLen,
+                const UInt128 expectedRawData, const int expectedErrno = 0)
+#endif
+  {
+    // expectedRawData64 is the expected long double result as a uint64_t,
+    // organized according to the IEEE754 double precision format:
+    //
+    // +-- 1 Sign Bit                        +-- 52 Mantissa bits
+    // |                                     |
+    // |           +-------------------------+------------------------+
+    // |           |                                                  |
+    // SEEEEEEEEEEEMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
+    //  |         |
+    //  +----+----+
+    //       |
+    //       +-- 11 Exponent Bits
+
+    // expectedRawData80 is the expected long double result as a UInt128,
+    // organized according to the x86 extended precision format:
+    //
+    // +-- 1 Sign Bit
+    // |
+    // |               +-- 1 Integer part bit (1 unless this is a subnormal)
+    // |               |
+    // SEEEEEEEEEEEEEEEIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM...M
+    //  |             | |                                                      |
+    //  +------+------+ +---------------------------+--------------------------+
+    //         |                                    |
+    //         +-- 15 Exponent Bits                 +-- 63 Mantissa bits
+
+    // expectedRawData128 is the expected long double result as a UInt128,
+    // organized according to IEEE754 quadruple precision format:
+    //
+    // +-- 1 Sign Bit  +-- 112 Mantissa bits
+    // |               |
+    // |               +----------------------------+--------------------------+
+    // |               |                                                       |
+    // SEEEEEEEEEEEEEEEMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM...M
+    //  |             |
+    //  +------+------+
+    //         |
+    //         +-- 15 Exponent Bits
+    wchar_t *str_end = nullptr;
+
+    using FPBits = LIBC_NAMESPACE::fputil::FPBits<long double>;
+    FPBits expected_fp =
+        FPBits(static_cast<FPBits::StorageType>(expectedRawData));
+    const int expected_errno = expectedErrno;
+
+    long double result = LIBC_NAMESPACE::wcstold(inputString, &str_end);
+
+    LIBC_NAMESPACE::fputil::FPBits<long double> actual_fp =
+        LIBC_NAMESPACE::fputil::FPBits<long double>();
+    actual_fp = LIBC_NAMESPACE::fputil::FPBits<long double>(result);
+
+    EXPECT_EQ(str_end - inputString, expectedStrLen);
+
+    EXPECT_EQ(actual_fp.uintval(), expected_fp.uintval());
+    EXPECT_EQ(actual_fp.is_neg(), expected_fp.is_neg());
+    EXPECT_EQ(actual_fp.get_exponent(), expected_fp.get_exponent());
+    EXPECT_EQ(actual_fp.get_mantissa(), expected_fp.get_mantissa());
+    ASSERT_ERRNO_EQ(expected_errno);
+  }
+};
+
+TEST_F(LlvmLibcWcstoldTest, SimpleTest) {
+  run_test(L"123", 3,
+           SELECT_CONST(uint64_t(0x405ec00000000000),
+                        UInt128(0x4005f60000) << 40,
+                        UInt128(0x4005ec0000000000) << 64));
+
+  // This should fail on Eisel-Lemire, forcing a fallback to simple decimal
+  // conversion.
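+  // (Illustrative note, not in the original comments: Eisel-Lemire bails out
+  // when its fixed-width approximation cannot decide the final mantissa bit,
+  // which happens for inputs like this one that land very close to the
+  // midpoint between two adjacent representable values.)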
+ run_test(L"12345678901234549760", 20, + SELECT_CONST(uint64_t(0x43e56a95319d63d8), + (UInt128(0x403eab54a9) << 40) + UInt128(0x8ceb1ec400), + (UInt128(0x403e56a95319d63d) << 64) + + UInt128(0x8800000000000000))); + + // Found while looking for difficult test cases here: + // https://github.com/nigeltao/parse-number-fxx-test-data/blob/main/more-test-cases/golang-org-issue-36657.txt + run_test(L"1090544144181609348835077142190", 31, + SELECT_CONST(uint64_t(0x462b8779f2474dfb), + (UInt128(0x4062dc3bcf) << 40) + UInt128(0x923a6fd402), + (UInt128(0x4062b8779f2474df) << 64) + + UInt128(0xa804bfd8c6d5c000))); + + run_test(L"0x123", 5, + SELECT_CONST(uint64_t(0x4072300000000000), + (UInt128(0x4007918000) << 40), + (UInt128(0x4007230000000000) << 64))); +} + +// These are tests that have caused problems for doubles in the past. +TEST_F(LlvmLibcWcstoldTest, Float64SpecificFailures) { + run_test(L"3E70000000000000", 16, + SELECT_CONST(uint64_t(0x7FF0000000000000), + (UInt128(0x7fff800000) << 40), + (UInt128(0x7fff000000000000) << 64)), + ERANGE); + run_test(L"358416272e-33", 13, + SELECT_CONST(uint64_t(0x3adbbb2a68c9d0b9), + (UInt128(0x3fadddd953) << 40) + UInt128(0x464e85c400), + (UInt128(0x3fadbbb2a68c9d0b) << 64) + + UInt128(0x8800e7969e1c5fc8))); + run_test(L"2.16656806400000023841857910156251e9", 36, + SELECT_CONST(uint64_t(0x41e0246690000001), + (UInt128(0x401e812334) << 40) + UInt128(0x8000000400), + (UInt128(0x401e024669000000) << 64) + + UInt128(0x800000000000018))); + run_test(L"27949676547093071875", 20, + SELECT_CONST(uint64_t(0x43f83e132bc608c9), + (UInt128(0x403fc1f099) << 40) + UInt128(0x5e30464402), + (UInt128(0x403f83e132bc608c) << 64) + + UInt128(0x8803000000000000))); +} + +TEST_F(LlvmLibcWcstoldTest, Float80SpecificFailures) { + run_test(L"777777777777777777777777777777777777777777777777777777777777777777" + "7777777777777777777777777777777777", + 100, + SELECT_CONST(uint64_t(0x54ac729b8fcaf734), + (UInt128(0x414ae394dc) << 40) + UInt128(0x7e57b9a0c2), + (UInt128(0x414ac729b8fcaf73) << 64) + + UInt128(0x4184a3d793224129))); +} + +TEST_F(LlvmLibcWcstoldTest, MaxSizeNumbers) { + run_test(L"1.1897314953572317650e4932", 26, + SELECT_CONST(uint64_t(0x7FF0000000000000), + (UInt128(0x7ffeffffff) << 40) + UInt128(0xffffffffff), + (UInt128(0x7ffeffffffffffff) << 64) + + UInt128(0xfffd57322e3f8675)), + SELECT_CONST(ERANGE, 0, 0)); + run_test(L"1.18973149535723176508e4932", 27, + SELECT_CONST(uint64_t(0x7FF0000000000000), + (UInt128(0x7fff800000) << 40), + (UInt128(0x7ffeffffffffffff) << 64) + + UInt128(0xffffd2478338036c)), + SELECT_CONST(ERANGE, ERANGE, 0)); +} + +// These tests check subnormal behavior for 80 bit and 128 bit floats. They will +// be too small for 64 bit floats. 
+TEST_F(LlvmLibcWcstoldTest, SubnormalTests) { + run_test(L"1e-4950", 7, + SELECT_CONST(uint64_t(0), (UInt128(0x00000000000000000003)), + (UInt128(0x000000000000000000057c9647e1a018))), + ERANGE); + run_test(L"1.89e-4951", 10, + SELECT_CONST(uint64_t(0), (UInt128(0x00000000000000000001)), + (UInt128(0x0000000000000000000109778a006738))), + ERANGE); + run_test(L"4e-4966", 7, + SELECT_CONST(uint64_t(0), (UInt128(0)), + (UInt128(0x00000000000000000000000000000001))), + ERANGE); +} + +TEST_F(LlvmLibcWcstoldTest, SmallNormalTests) { + run_test(L"3.37e-4932", 10, + SELECT_CONST( + uint64_t(0), (UInt128(0x1804cf7) << 40) + UInt128(0x908850712), + (UInt128(0x10099ee12110a) << 64) + UInt128(0xe24b75c0f50dc0c)), + SELECT_CONST(ERANGE, 0, 0)); +} + +TEST_F(LlvmLibcWcstoldTest, ComplexHexadecimalTests) { + run_test(L"0x1p16383", 9, + SELECT_CONST(0x7ff0000000000000, (UInt128(0x7ffe800000) << 40), + (UInt128(0x7ffe000000000000) << 64)), + SELECT_CONST(ERANGE, 0, 0)); + run_test(L"0x123456789abcdef", 17, + SELECT_CONST(0x43723456789abcdf, + (UInt128(0x403791a2b3) << 40) + UInt128(0xc4d5e6f780), + (UInt128(0x403723456789abcd) << 64) + + UInt128(0xef00000000000000))); + run_test(L"0x123456789abcdef0123456789ABCDEF", 33, + SELECT_CONST(0x47723456789abcdf, + (UInt128(0x407791a2b3) << 40) + UInt128(0xc4d5e6f781), + (UInt128(0x407723456789abcd) << 64) + + UInt128(0xef0123456789abce))); +} + +TEST_F(LlvmLibcWcstoldTest, InfTests) { + run_test(L"INF", 3, + SELECT_CONST(0x7ff0000000000000, (UInt128(0x7fff800000) << 40), + (UInt128(0x7fff000000000000) << 64))); + run_test(L"INFinity", 8, + SELECT_CONST(0x7ff0000000000000, (UInt128(0x7fff800000) << 40), + (UInt128(0x7fff000000000000) << 64))); + run_test(L"-inf", 4, + SELECT_CONST(0xfff0000000000000, (UInt128(0xffff800000) << 40), + (UInt128(0xffff000000000000) << 64))); +} + +TEST_F(LlvmLibcWcstoldTest, NaNTests) { + run_test(L"NaN", 3, + SELECT_CONST(0x7ff8000000000000, (UInt128(0x7fffc00000) << 40), + (UInt128(0x7fff800000000000) << 64))); + run_test(L"-nAn", 4, + SELECT_CONST(0xfff8000000000000, (UInt128(0xffffc00000) << 40), + (UInt128(0xffff800000000000) << 64))); + run_test(L"NaN()", 5, + SELECT_CONST(0x7ff8000000000000, (UInt128(0x7fffc00000) << 40), + (UInt128(0x7fff800000000000) << 64))); + run_test(L"NaN(1234)", 9, + SELECT_CONST(0x7ff80000000004d2, + (UInt128(0x7fffc00000) << 40) + UInt128(0x4d2), + (UInt128(0x7fff800000000000) << 64) + UInt128(0x4d2))); + run_test(L"NaN(0xffffffffffff)", 19, + SELECT_CONST(0x7ff8ffffffffffff, + (UInt128(0x7fffc000ff) << 40) + UInt128(0xffffffffff), + (UInt128(0x7fff800000000000) << 64) + + UInt128(0xffffffffffff))); + run_test(L"NaN(0xfffffffffffff)", 20, + SELECT_CONST(0x7fffffffffffffff, + (UInt128(0x7fffc00fff) << 40) + UInt128(0xffffffffff), + (UInt128(0x7fff800000000000) << 64) + + UInt128(0xfffffffffffff))); + run_test(L"NaN(0xffffffffffffffff)", 23, + SELECT_CONST(0x7fffffffffffffff, + (UInt128(0x7fffffffff) << 40) + UInt128(0xffffffffff), + (UInt128(0x7fff800000000000) << 64) + + UInt128(0xffffffffffffffff))); + run_test(L"NaN( 1234)", 3, + SELECT_CONST(0x7ff8000000000000, (UInt128(0x7fffc00000) << 40), + (UInt128(0x7fff800000000000) << 64))); +} From 3f60d220514c4be00e548a17a85c2fa8fa89cc35 Mon Sep 17 00:00:00 2001 From: Ziqing Luo Date: Mon, 17 Nov 2025 13:43:53 -0800 Subject: [PATCH 59/67] [-Wunsafe-buffer-usage] Fold the expression "cond ? 
E1 : E2" when checking safe patterns, if "cond" is a constant (#167989) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In `-Wunsafe-buffer-usage`, many safe pattern checks can benefit from constant folding. This commit improves null-terminated pointer checks by folding conditional expressions. rdar://159374822 --------- Co-authored-by: Balázs Benics --- clang/lib/Analysis/UnsafeBufferUsage.cpp | 26 +++++++++++++--- ...n-unsafe-buffer-usage-fold-conditional.cpp | 31 +++++++++++++++++++ 2 files changed, 52 insertions(+), 5 deletions(-) create mode 100644 clang/test/SemaCXX/warn-unsafe-buffer-usage-fold-conditional.cpp diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp index f5a368636c43d..da155d31d4a88 100644 --- a/clang/lib/Analysis/UnsafeBufferUsage.cpp +++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp @@ -781,9 +781,25 @@ struct LibcFunNamePrefixSuffixParser { } }; +// Constant fold a conditional expression 'cond ? A : B' to +// - 'A', if 'cond' has constant true value; +// - 'B', if 'cond' has constant false value. +static const Expr *tryConstantFoldConditionalExpr(const Expr *E, + const ASTContext &Ctx) { + // FIXME: more places can use this function + if (const auto *CE = dyn_cast(E)) { + bool CondEval; + + if (CE->getCond()->EvaluateAsBooleanCondition(CondEval, Ctx)) + return CondEval ? CE->getLHS() : CE->getRHS(); + } + return E; +} + // A pointer type expression is known to be null-terminated, if it has the // form: E.c_str(), for any expression E of `std::string` type. -static bool isNullTermPointer(const Expr *Ptr) { +static bool isNullTermPointer(const Expr *Ptr, ASTContext &Ctx) { + Ptr = tryConstantFoldConditionalExpr(Ptr, Ctx); if (isa(Ptr->IgnoreParenImpCasts())) return true; if (isa(Ptr->IgnoreParenImpCasts())) @@ -874,7 +890,7 @@ static bool hasUnsafeFormatOrSArg(const CallExpr *Call, const Expr *&UnsafeArg, const Expr *Arg = Call->getArg(ArgIdx); - if (isNullTermPointer(Arg)) + if (isNullTermPointer(Arg, Ctx)) // If Arg is a null-terminated pointer, it is safe anyway. return true; // continue parsing @@ -922,8 +938,8 @@ static bool hasUnsafeFormatOrSArg(const CallExpr *Call, const Expr *&UnsafeArg, // (including the format argument) is unsafe pointer. 
   return llvm::any_of(
       llvm::make_range(Call->arg_begin() + FmtArgIdx, Call->arg_end()),
-      [&UnsafeArg](const Expr *Arg) -> bool {
-        if (Arg->getType()->isPointerType() && !isNullTermPointer(Arg)) {
+      [&UnsafeArg, &Ctx](const Expr *Arg) -> bool {
+        if (Arg->getType()->isPointerType() && !isNullTermPointer(Arg, Ctx)) {
           UnsafeArg = Arg;
           return true;
         }
@@ -1175,7 +1191,7 @@ static bool hasUnsafePrintfStringArg(const CallExpr &Node, ASTContext &Ctx,
     // We don't really recognize this "normal" printf, the only thing we
     // can do is to require all pointers to be null-terminated:
     for (const auto *Arg : Node.arguments())
-      if (Arg->getType()->isPointerType() && !isNullTermPointer(Arg)) {
+      if (Arg->getType()->isPointerType() && !isNullTermPointer(Arg, Ctx)) {
         Result.addNode(Tag, DynTypedNode::create(*Arg));
         return true;
       }
diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-fold-conditional.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-fold-conditional.cpp
new file mode 100644
index 0000000000000..b4f30b533bc4b
--- /dev/null
+++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-fold-conditional.cpp
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 -fsyntax-only -Wno-all -Wunsafe-buffer-usage -verify %s -std=c++20
+// RUN: %clang_cc1 -fsyntax-only -Wno-all -Wunsafe-buffer-usage -verify %s -x c
+// expected-no-diagnostics
+
+typedef struct {} FILE;
+int fprintf( FILE* stream, const char* format, ... );
+FILE * stderr;
+
+#define DEBUG_ASSERT_MESSAGE(name, assertion, label, message, file, line, value) \
+  fprintf(stderr, "AssertMacros: %s, %s file: %s, line: %d, value: %lld\n",       \
+          assertion, (message!=0) ? message : "", file, line, (long long) (value));
+
+
+#define Require(assertion, exceptionLabel)                                     \
+  do                                                                           \
+  {                                                                            \
+    if ( __builtin_expect(!(assertion), 0) ) {                                 \
+      DEBUG_ASSERT_MESSAGE(                                                    \
+          "DEBUG_ASSERT_COMPONENT_NAME_STRING",                                \
+          #assertion, #exceptionLabel, 0, __FILE__, __LINE__, 0);              \
+      goto exceptionLabel;                                                     \
+    }                                                                          \
+  } while ( 0 )
+
+
+void f(int x, int y) {
+  Require(x == y, L1);
+ L1:
+  return;
+}

From 909c9aacead077b14e2bff123d09641d08939fe5 Mon Sep 17 00:00:00 2001
From: Erick Ochoa Lopez
Date: Mon, 17 Nov 2025 16:51:52 -0500
Subject: [PATCH 60/67] [mlir][amdgpu] Add lowerings for ScaledExtPacked816
 (#168123)

* Adds lowerings for amdgpu.scaled_ext_packed816
* updates verifiers
---
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp           | 191 +++++++++++++++++-
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp  |  45 +++--
 .../AMDGPUToROCDL/amdgpu-to-rocdl.mlir        |   1 +
 .../AMDGPUToROCDL/cvt_scale_pk-gfx1250.mlir   | 164 +++++++++++++++
 mlir/test/Dialect/AMDGPU/invalid.mlir         |  32 ---
 5 files changed, 379 insertions(+), 54 deletions(-)
 create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/cvt_scale_pk-gfx1250.mlir

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index a5831559558ac..edc6565f44f00 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -43,6 +43,7 @@ constexpr Chipset kGfx908 = Chipset(9, 0, 8);
 constexpr Chipset kGfx90a = Chipset(9, 0, 0xa);
 constexpr Chipset kGfx942 = Chipset(9, 4, 2);
 constexpr Chipset kGfx950 = Chipset(9, 5, 0);
+constexpr Chipset kGfx1250 = Chipset(12, 5, 0);
 
 /// Convert an unsigned number `val` to i32.
 static Value convertUnsignedToI32(ConversionPatternRewriter &rewriter,
@@ -1149,7 +1150,7 @@ static std::optional<StringRef> wmmaOpToIntrinsic(WMMAOp wmma,
                                  k, isRDNA3);
 
   // Handle gfx1250.
-  if (chipset == Chipset{12, 5, 0})
+  if (chipset == kGfx1250)
     return wmmaOpToIntrinsicGfx1250(elemSourceType, elemBSourceType,
                                     elemDestType, k);
 
@@ -1300,7 +1301,7 @@ struct WMMAOpLowering : public ConvertOpToLLVMPattern<WMMAOp> {
     if (chipset.majorVersion != 11 && chipset.majorVersion != 12)
       return op->emitOpError("WMMA only supported on gfx11 and gfx12");
 
-    bool isGFX1250 = chipset >= Chipset(12, 5, 0);
+    bool isGFX1250 = chipset >= kGfx1250;
 
     // The WMMA operations represent vectors of bf16s as vectors of i16s
     // (except on gfx1250), so we need to bitcast bfloats to i16 and then
@@ -1505,6 +1506,19 @@ struct ExtPackedFp8OpLowering final
       ConversionPatternRewriter &rewriter) const override;
 };
 
+struct ScaledExtPacked816OpLowering final
+    : public ConvertOpToLLVMPattern<ScaledExtPacked816Op> {
+  ScaledExtPacked816OpLowering(const LLVMTypeConverter &converter,
+                               Chipset chipset)
+      : ConvertOpToLLVMPattern<ScaledExtPacked816Op>(converter),
+        chipset(chipset) {}
+  Chipset chipset;
+
+  LogicalResult
+  matchAndRewrite(ScaledExtPacked816Op op, ScaledExtPacked816OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+
 struct PackedTrunc2xFp8OpLowering final
     : public ConvertOpToLLVMPattern<PackedTrunc2xFp8Op> {
   PackedTrunc2xFp8OpLowering(const LLVMTypeConverter &converter,
@@ -1613,6 +1627,170 @@ LogicalResult ExtPackedFp8OpLowering::matchAndRewrite(
   return success();
 }
 
+int32_t getScaleSel(int32_t blockSize, unsigned bitWidth,
+                    int32_t firstScaleLane, int32_t firstScaleByte) {
+  // When lowering amdgpu.scaled_ext_packed816 to rocdl.cvt.scale.pk*.f*.f*
+  // operations, the attributes blockSize, sourceType, firstScaleLane and
+  // firstScaleByte are merged into a single attribute scaleSel. This is how
+  // those values are merged together.
+  assert(llvm::is_contained({16, 32}, blockSize));
+  assert(llvm::is_contained(llvm::ArrayRef{4, 6, 8}, bitWidth));
+
+  const bool is_fp8 = bitWidth == 8;
+  const bool is_block_16 = blockSize == 16;
+
+  if (!is_fp8) {
+    int bit_0 = is_block_16;
+    assert(llvm::is_contained({0, 1, 2}, firstScaleByte));
+    int bit_1 = (firstScaleByte == 2) << 1;
+    assert(llvm::is_contained({0, 1}, firstScaleLane));
+    int bit_2 = firstScaleLane << 2;
+    return bit_2 | bit_1 | bit_0;
+  }
+
+  int bit_0 = is_block_16;
+  // firstScaleByte is guaranteed to be defined by two bits.
+  assert(llvm::is_contained({0, 1, 2, 3}, firstScaleByte));
+  int bit_2_and_1 = firstScaleByte << 1;
+  assert(llvm::is_contained({0, 1}, firstScaleLane));
+  int bit_3 = firstScaleLane << 3;
+  int bits = bit_3 | bit_2_and_1 | bit_0;
+  // These are invalid cases.
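+  // (Expanded note, inferred from the op verifier rather than stated here
+  // originally: with bits = lane << 3 | byte << 1 | is_block_16, the rejected
+  // encodings are exactly the fp8 lane/byte combinations the hardware cannot
+  // express. For example 0b0101 decodes to (lane 0, byte 2, block 16), but
+  // block-16 fp8 only supports (lane, byte) = (0, 0) or (1, 2).)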
+  assert(!llvm::is_contained(
+      {0b0011, 0b0101, 0b0111, 0b1000, 0b1001, 0b1011, 0b1111}, bits));
+  return bits;
+}
+
+static std::optional<StringRef>
+scaledExtPacked816ToIntrinsic(Type srcElemType, Type destElemType) {
+  using fp4 = Float4E2M1FNType;
+  using fp8 = Float8E4M3FNType;
+  using bf8 = Float8E5M2Type;
+  using fp6 = Float6E2M3FNType;
+  using bf6 = Float6E3M2FNType;
+  if (isa<fp4>(srcElemType)) {
+    if (destElemType.isF16())
+      return ROCDL::CvtPkScalePk8F16Fp4Op::getOperationName();
+    if (destElemType.isBF16())
+      return ROCDL::CvtPkScalePk8Bf16Fp4Op::getOperationName();
+    if (destElemType.isF32())
+      return ROCDL::CvtPkScalePk8F32Fp4Op::getOperationName();
+    return std::nullopt;
+  }
+  if (isa<fp8>(srcElemType)) {
+    if (destElemType.isF16())
+      return ROCDL::CvtPkScalePk8F16Fp8Op::getOperationName();
+    if (destElemType.isBF16())
+      return ROCDL::CvtPkScalePk8Bf16Fp8Op::getOperationName();
+    if (destElemType.isF32())
+      return ROCDL::CvtPkScalePk8F32Fp8Op::getOperationName();
+    return std::nullopt;
+  }
+  if (isa<bf8>(srcElemType)) {
+    if (destElemType.isF16())
+      return ROCDL::CvtPkScalePk8F16Bf8Op::getOperationName();
+    if (destElemType.isBF16())
+      return ROCDL::CvtPkScalePk8Bf16Bf8Op::getOperationName();
+    if (destElemType.isF32())
+      return ROCDL::CvtPkScalePk8F32Bf8Op::getOperationName();
+    return std::nullopt;
+  }
+  if (isa<fp6>(srcElemType)) {
+    if (destElemType.isF16())
+      return ROCDL::CvtPkScalePk16F16Fp6Op::getOperationName();
+    if (destElemType.isBF16())
+      return ROCDL::CvtPkScalePk16Bf16Fp6Op::getOperationName();
+    if (destElemType.isF32())
+      return ROCDL::CvtPkScalePk16F32Fp6Op::getOperationName();
+    return std::nullopt;
+  }
+  if (isa<bf6>(srcElemType)) {
+    if (destElemType.isF16())
+      return ROCDL::CvtPkScalePk16F16Bf6Op::getOperationName();
+    if (destElemType.isBF16())
+      return ROCDL::CvtPkScalePk16Bf16Bf6Op::getOperationName();
+    if (destElemType.isF32())
+      return ROCDL::CvtPkScalePk16F32Bf6Op::getOperationName();
+    return std::nullopt;
+  }
+  llvm_unreachable("invalid combination of element types for packed conversion "
+                   "instructions");
+}
+
+LogicalResult ScaledExtPacked816OpLowering::matchAndRewrite(
+    ScaledExtPacked816Op op, ScaledExtPacked816OpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  using fp4 = Float4E2M1FNType;
+  using fp8 = Float8E4M3FNType;
+  using bf8 = Float8E5M2Type;
+  using fp6 = Float6E2M3FNType;
+  using bf6 = Float6E3M2FNType;
+  Location loc = op.getLoc();
+  if (chipset != kGfx1250) {
+    return rewriter.notifyMatchFailure(
+        loc,
+        "Scaled fp packed conversion instructions are not available on target "
+        "architecture and their emulation is not implemented");
+  }
+  int32_t firstScaleLane = op.getFirstScaleLane();
+  int32_t firstScaleByte = op.getFirstScaleByte();
+  int32_t blockSize = op.getBlockSize();
+  auto sourceType = cast<VectorType>(op.getSource().getType());
+  auto srcElemType = cast<FloatType>(sourceType.getElementType());
+  unsigned bitWidth = srcElemType.getWidth();
+  int32_t scaleSel =
+      getScaleSel(blockSize, bitWidth, firstScaleLane, firstScaleByte);
+
+  auto targetType = cast<VectorType>(op.getResult().getType());
+  auto destElemType = cast<FloatType>(targetType.getElementType());
+  IntegerType i32 = rewriter.getI32Type();
+  Value castedScale =
+      LLVM::BitcastOp::create(rewriter, loc, i32, adaptor.getScale());
+
+  Value source = adaptor.getSource();
+  Type llvmResultType = typeConverter->convertType(op.getResult().getType());
+  Type packedType = nullptr;
+  if (isa<fp4>(srcElemType)) {
+    packedType = i32;
+    packedType = getTypeConverter()->convertType(packedType);
+  } else if (isa<fp8, bf8>(srcElemType)) {
+    packedType = VectorType::get(2, i32);
+    packedType = getTypeConverter()->convertType(packedType);
+  } else if (isa<fp6, bf6>(srcElemType)) {
+    packedType = VectorType::get(3, i32);
+    packedType = getTypeConverter()->convertType(packedType);
+  } else {
+    llvm_unreachable("invalid element type for packed scaled ext");
+  }
+
+  if (!packedType || !llvmResultType) {
+    return rewriter.notifyMatchFailure(op, "type conversion failed");
+  }
+
+  Value castedSource =
+      LLVM::BitcastOp::create(rewriter, loc, packedType, source);
+
+  std::optional<StringRef> maybeIntrinsic =
+      scaledExtPacked816ToIntrinsic(srcElemType, destElemType);
+  if (!maybeIntrinsic.has_value())
+    return op.emitOpError(
+        "no intrinsic matching packed scaled conversion on the given chipset");
+
+  OperationState loweredOp(loc, *maybeIntrinsic);
+  loweredOp.addTypes({llvmResultType});
+  loweredOp.addOperands({castedSource, castedScale});
+
+  SmallVector<NamedAttribute> attrs;
+  attrs.push_back(
+      NamedAttribute("scaleSel", rewriter.getI32IntegerAttr(scaleSel)));
+
+  loweredOp.addAttributes(attrs);
+  Operation *lowered = rewriter.create(loweredOp);
+  rewriter.replaceOp(op, lowered);
+
+  return success();
+}
+
 LogicalResult ScaledExtPackedOpLowering::matchAndRewrite(
     ScaledExtPackedOp op, ScaledExtPackedOpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
@@ -2151,9 +2329,10 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
                ROCDL::RawPtrBufferAtomicCmpSwap>,
            AMDGPUDPPLowering, MemoryCounterWaitOpLowering, LDSBarrierOpLowering,
            SchedBarrierOpLowering, MFMAOpLowering, ScaledMFMAOpLowering,
-           WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPackedOpLowering,
-           PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering,
-           PackedStochRoundFp8OpLowering, GatherToLDSOpLowering,
-           TransposeLoadOpLowering, AMDGPUPermlaneLowering>(converter, chipset);
+           WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPacked816OpLowering,
+           ScaledExtPackedOpLowering, PackedScaledTruncOpLowering,
+           PackedTrunc2xFp8OpLowering, PackedStochRoundFp8OpLowering,
+           GatherToLDSOpLowering, TransposeLoadOpLowering,
+           AMDGPUPermlaneLowering>(converter, chipset);
   patterns.add(converter);
 }
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 5c35823678576..d55f3cec47c1f 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -343,28 +343,41 @@ void RawBufferAtomicCmpswapOp::getCanonicalizationPatterns(
 //===----------------------------------------------------------------------===//

 LogicalResult ScaledExtPacked816Op::verify() {
   int blockSize = getBlockSize();
-  assert((blockSize == 16 || blockSize == 32) && "invalid block size");
+  assert(llvm::is_contained({16, 32}, blockSize) && "invalid block size");
   int firstScaleByte = getFirstScaleByte();
+  int firstScaleLane = getFirstScaleLane();
   auto sourceType = cast<VectorType>(getSource().getType());
   Type elementType = sourceType.getElementType();
   auto floatType = cast<FloatType>(elementType);
-  int bitWidth = floatType.getWidth();
+  unsigned bitWidth = floatType.getWidth();

-  if (llvm::is_contained({4, 6}, bitWidth) && blockSize == 16 &&
-      !llvm::is_contained({0, 1}, firstScaleByte)) {
-    return emitOpError("blockSize of 16 can only have firstScaleByte be 0 or 1 "
-                       "for f4 and f6.");
-  }
-  if (llvm::is_contained({4, 6}, bitWidth) && blockSize == 32 &&
-      !llvm::is_contained({0, 2}, firstScaleByte)) {
-    return emitOpError("blockSize of 32 can only have firstScaleByte be 0 or 2 "
-                       "for f4 and f6.");
-  }
-  if (bitWidth == 8 &&
blockSize == 16 && - !llvm::is_contained({0, 2}, firstScaleByte)) { - return emitOpError( - "blockSize of 16 can only have firstScaleByte be 0 or 2 for f8."); + assert(llvm::is_contained(llvm::ArrayRef{4, 6, 8}, bitWidth)); + + const bool is_fp8 = bitWidth == 8; + const bool is_block_16 = blockSize == 16; + + if (!is_fp8) { + if (is_block_16) { + if (!llvm::is_contained({0, 1}, firstScaleByte)) { + return emitOpError("blockSize of 16 can only have firstScaleByte be 0 " + "or 1 for f4 and f6."); + } + } else { + if (!llvm::is_contained({0, 2}, firstScaleByte)) { + return emitOpError("blockSize of 32 can only have firstScaleByte be 0 " + "or 2 for f4 and f6."); + } + } + } else { + if (is_block_16) { + bool is_valid = ((firstScaleLane == 0) && (firstScaleByte == 0)) || + ((firstScaleLane == 1) && (firstScaleByte == 2)); + if (!is_valid) { + return emitOpError("blockSize of 16 can only have (firstScaleLane, " + "firstScaleByte) be (0, 0) or (1, 2) for f8."); + } + } } return success(); diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir index 2fd3df6dcfa71..432b8876696a9 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir @@ -456,3 +456,4 @@ func.func @sched_barrier() { amdgpu.sched_barrier allow = func.return } + diff --git a/mlir/test/Conversion/AMDGPUToROCDL/cvt_scale_pk-gfx1250.mlir b/mlir/test/Conversion/AMDGPUToROCDL/cvt_scale_pk-gfx1250.mlir new file mode 100644 index 0000000000000..d2391140ce056 --- /dev/null +++ b/mlir/test/Conversion/AMDGPUToROCDL/cvt_scale_pk-gfx1250.mlir @@ -0,0 +1,164 @@ +// RUN: mlir-opt %s --convert-amdgpu-to-rocdl=chipset=gfx1250 --split-input-file --verify-diagnostics \ +// RUN: | FileCheck %s + +// CHECK-LABEL: @scaled_ext_packed816_fp4 +// CHECK-SAME: (%[[SOURCE:.+]]: vector<8xf4E2M1FN>, %[[SCALE:.+]]: vector<4xf8E8M0FNU>) +func.func @scaled_ext_packed816_fp4(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { + // CHECK: %[[SCALE_4xi8:.+]] = builtin.unrealized_conversion_cast %[[SCALE]] : vector<4xf8E8M0FNU> to vector<4xi8> + // CHECK: %[[SOURCE_8xi4:.+]] = builtin.unrealized_conversion_cast %[[SOURCE]] : vector<8xf4E2M1FN> to vector<8xi4> + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_i32:.+]] = llvm.bitcast %[[SOURCE_8xi4]] : vector<8xi4> to i32 + // CHECK: rocdl.cvt.scale.pk8.f16.fp4 %[[SOURCE_i32]], %[[SCALE_i32]][0] : vector<8xf16> + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_i32:.+]] = llvm.bitcast %[[SOURCE_8xi4]] : vector<8xi4> to i32 + // CHECK: rocdl.cvt.scale.pk8.bf16.fp4 %[[SOURCE_i32]], %[[SCALE_i32]][0] : vector<8xbf16> + %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xbf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_i32:.+]] = llvm.bitcast %[[SOURCE_8xi4]] : vector<8xi4> to i32 + // CHECK: rocdl.cvt.scale.pk8.f32.fp4 %[[SOURCE_i32]], %[[SCALE_i32]][0] : vector<8xf32> + %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : 
vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf32> + func.return %ret0, %ret1, %ret2: vector<8xf16>, vector<8xbf16>, vector<8xf32> +} + +// CHECK-LABEL: @scaled_ext_packed816_fp8 +// CHECK-SAME: (%[[SOURCE:.+]]: vector<8xf8E4M3FN>, %[[SCALE:.+]]: vector<4xf8E8M0FNU>) +func.func @scaled_ext_packed816_fp8(%v: vector<8xf8E4M3FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { + // CHECK: %[[SCALE_4xi8:.+]] = builtin.unrealized_conversion_cast %[[SCALE]] : vector<4xf8E8M0FNU> to vector<4xi8> + // CHECK: %[[SOURCE_8xi8:.+]] = builtin.unrealized_conversion_cast %[[SOURCE]] : vector<8xf8E4M3FN> to vector<8xi8> + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32> + // CHECK: rocdl.cvt.scale.pk8.f16.fp8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xf16> + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32> + // CHECK: rocdl.cvt.scale.pk8.bf16.fp8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xbf16> + %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xbf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32> + // CHECK: rocdl.cvt.scale.pk8.f32.fp8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xf32> + %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E4M3FN>, vector<4xf8E8M0FNU> -> vector<8xf32> + + func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32> +} + +// CHECK-LABEL: @scaled_ext_packed816_bf8 +// CHECK-SAME: (%[[SOURCE:.+]]: vector<8xf8E5M2>, %[[SCALE:.+]]: vector<4xf8E8M0FNU>) +func.func @scaled_ext_packed816_bf8(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) -> (vector<8xf16>, vector<8xbf16>, vector<8xf32>) { + // CHECK: %[[SCALE_4xi8:.+]] = builtin.unrealized_conversion_cast %[[SCALE]] : vector<4xf8E8M0FNU> to vector<4xi8> + // CHECK: %[[SOURCE_8xi8:.+]] = builtin.unrealized_conversion_cast %[[SOURCE]] : vector<8xf8E5M2> to vector<8xi8> + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32> + // CHECK: %[[RES:.+]] = rocdl.cvt.scale.pk8.f16.bf8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xf16> + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32> + // CHECK: rocdl.cvt.scale.pk8.bf16.bf8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xbf16> + %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xbf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // 
CHECK: %[[SOURCE_v2xi32:.+]] = llvm.bitcast %[[SOURCE_8xi8]] : vector<8xi8> to vector<2xi32> + // CHECK: rocdl.cvt.scale.pk8.f32.bf8 %[[SOURCE_v2xi32]], %[[SCALE_i32]][0] : vector<8xf32> + %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf32> + func.return %ret0, %ret1, %ret2 : vector<8xf16>, vector<8xbf16>, vector<8xf32> +} + + +// CHECK-LABEL: @scaled_ext_packed816_fp6 +// CHECK-SAME: (%[[SOURCE:.+]]: vector<16xf6E2M3FN>, %[[SCALE:.+]]: vector<4xf8E8M0FNU>) +func.func @scaled_ext_packed816_fp6(%v: vector<16xf6E2M3FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) { + // CHECK-DAG: %[[SCALE_4xi8:.+]] = builtin.unrealized_conversion_cast %[[SCALE]] : vector<4xf8E8M0FNU> to vector<4xi8> + // CHECK-DAG: %[[SOURCE_16xi6:.+]] = builtin.unrealized_conversion_cast %[[SOURCE]] : vector<16xf6E2M3FN> to vector<16xi6> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32> + // CHECK: rocdl.cvt.scale.pk16.f16.fp6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xf16> + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32> + // CHECK: rocdl.cvt.scale.pk16.bf16.fp6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xbf16> + %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xbf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32> + // CHECK: rocdl.cvt.scale.pk16.f32.fp6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xf32> + %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E2M3FN>, vector<4xf8E8M0FNU> -> vector<16xf32> + return %ret0, %ret1, %ret2: vector<16xf16>, vector<16xbf16>, vector<16xf32> +} + +// CHECK-LABEL: @scaled_ext_packed816_bf6 +// CHECK-SAME: (%[[SOURCE:.+]]: vector<16xf6E3M2FN>, %[[SCALE:.+]]: vector<4xf8E8M0FNU>) +func.func @scaled_ext_packed816_bf6(%v: vector<16xf6E3M2FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>, vector<16xbf16>, vector<16xf32>) { + // CHECK-DAG: %[[SCALE_4xi8:.+]] = builtin.unrealized_conversion_cast %[[SCALE]] : vector<4xf8E8M0FNU> to vector<4xi8> + // CHECK-DAG: %[[SOURCE_16xi6:.+]] = builtin.unrealized_conversion_cast %[[SOURCE]] : vector<16xf6E3M2FN> to vector<16xi6> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32> + // CHECK: rocdl.cvt.scale.pk16.f16.bf6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xf16> + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32> + // 
CHECK: rocdl.cvt.scale.pk16.bf16.bf6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xbf16> + %ret1 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xbf16> + + // CHECK: %[[SCALE_i32:.+]] = llvm.bitcast %[[SCALE_4xi8]] : vector<4xi8> to i32 + // CHECK: %[[SOURCE_v3xi32:.+]] = llvm.bitcast %[[SOURCE_16xi6]] : vector<16xi6> to vector<3xi32> + // CHECK: rocdl.cvt.scale.pk16.f32.bf6 %[[SOURCE_v3xi32]], %[[SCALE_i32]][0] : vector<16xf32> + %ret2 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf32> + return %ret0, %ret1, %ret2: vector<16xf16>, vector<16xbf16>, vector<16xf32> +} + +// ----- + +func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_16(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 can only have firstScaleByte be 0 or 1 for f4 and f6}} + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(2) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16> + func.return +} + +// ----- + +func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_32(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 32 can only have firstScaleByte be 0 or 2 for f4 and f6.}} + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(1) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16> + func.return +} + +// ----- + +func.func @amdgpu.scaled_ext_packed816_invalid_attributes_for_f8(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 can only have (firstScaleLane, firstScaleByte) be (0, 0) or (1, 2) for f8.}} + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(1) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16> + func.return +} + +// ----- + +func.func @amdgpu.scaled_ext_packed816_invalid_input_output_sizes(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op failed to verify that all of {source, res} have same shape}} + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<16xf16> + func.return +} + +// ----- + +func.func @amdgpu.scaled_ext_packed816_invalid_src_elem_type(%v: vector<16xf16>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf16>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op operand #0 must be}} + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf16>, vector<4xf8E8M0FNU> -> vector<16xf16> + return %ret0: vector<16xf16> +} + +// ----- + +func.func @amdgpu.scaled_ext_packed816_invalid_dst_elem_type(%v: vector<16xf6E3M2FN>, %scale: vector<4xf8E8M0FNU>) -> (vector<16xf64>) { + // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op result #0 must be vector}} + %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(0) : vector<16xf6E3M2FN>, vector<4xf8E8M0FNU> -> vector<16xf64> + return %ret0: vector<16xf64> +} diff --git a/mlir/test/Dialect/AMDGPU/invalid.mlir 
b/mlir/test/Dialect/AMDGPU/invalid.mlir
index 5c8cc8b67c4b3..61fdf29a78cbd 100644
--- a/mlir/test/Dialect/AMDGPU/invalid.mlir
+++ b/mlir/test/Dialect/AMDGPU/invalid.mlir
@@ -333,38 +333,6 @@ func.func @gather_to_lds_non_lds(%idx1 : index, %mem1 : memref<32xf16>, %mem2 :

 // -----

-func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_16(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) {
-  // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 can only have firstScaleByte be 0 or 1 for f4 and f6}}
-  %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(2) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16>
-  func.return
-}
-
-// -----
-
-func.func @amdgpu.scaled_ext_packed816_invalid_block_size_and_first_scale_byte_32(%v: vector<8xf4E2M1FN>, %scale: vector<4xf8E8M0FNU>) {
-  // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 32 can only have firstScaleByte be 0 or 2 for f4 and f6.}}
-  %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(32) firstScaleLane(0) firstScaleByte(1) : vector<8xf4E2M1FN>, vector<4xf8E8M0FNU> -> vector<8xf16>
-  func.return
-}
-
-// -----
-
-func.func @amdgpu.scaled_ext_packed816_invalid_attributes_for_f8(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) {
-  // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op blockSize of 16 can only have firstScaleByte be 0 or 2 for f8.}}
-  %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(1) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<8xf16>
-  func.return
-}
-
-// -----
-
-func.func @amdgpu.scaled_ext_packed816_invalid_input_output_sizes(%v: vector<8xf8E5M2>, %scale: vector<4xf8E8M0FNU>) {
-  // expected-error@+1 {{'amdgpu.scaled_ext_packed816' op failed to verify that all of {source, res} have same shape}}
-  %ret0 = amdgpu.scaled_ext_packed816 %v scale(%scale) blockSize(16) firstScaleLane(0) firstScaleByte(0) : vector<8xf8E5M2>, vector<4xf8E8M0FNU> -> vector<16xf16>
-  func.return
-}
-
-// -----
-
 func.func @scaled_mfma_invalid_m(%arg0 : vector<4xf8E8M0FNU>, %arg1 : vector<32xf4E2M1FN>, %arg2 : vector<16xf32>) -> vector<16xf32> {
   // expected-error@+1 {{'amdgpu.scaled_mfma' op attribute 'm' failed to satisfy constraint: 32-bit signless integer attribute whose value is one of {16, 32}}}
   %0 = amdgpu.scaled_mfma 8x32x64 (%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 : vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<4xf8E8M0FNU>, vector<32xf4E2M1FN>, vector<16xf32>

From 7d0a2082bffb162f79fd739c79f2bf0b552b9007 Mon Sep 17 00:00:00 2001
From: Guy David
Date: Tue, 18 Nov 2025 00:05:54 +0200
Subject: [PATCH 61/67] [AArch64] Treat COPY between cross-register banks as
 expensive (#167661)

The motivation is to allow passes such as MachineLICM to hoist trivial
FMOV instructions out of loops; previously it did not do so even when
the RHS was a constant. On most architectures, these expensive move
instructions have a latency of 2-6 cycles and are certainly not as
cheap as a 0-1 cycle move.
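As a minimal illustration (a C++ sketch, not part of the patch itself;
`use_float` is a placeholder name mirroring the MIR tests below), this is
the kind of loop that benefits:

    void use_float(float);

    void call_in_loop(int n) {
      // Passing the constant requires an fmov of wzr into an FPR; with
      // cross-bank COPYs no longer treated as free, MachineLICM can hoist
      // that move out of the loop instead of re-emitting it per iteration.
      for (int i = 0; i < n; ++i)
        use_float(0.0f);
    }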
--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 25 +++ .../CodeGen/AArch64/licm-regclass-copy.mir | 197 ++++++++++++++++++ 2 files changed, 222 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/licm-regclass-copy.mir diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 221812f1ebc7b..00fe8ee8b9b4d 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -1144,6 +1144,28 @@ static bool isCheapImmediate(const MachineInstr &MI, unsigned BitSize) { return Is.size() <= 2; } +// Check if a COPY instruction is cheap. +static bool isCheapCopy(const MachineInstr &MI, const AArch64RegisterInfo &RI) { + assert(MI.isCopy() && "Expected COPY instruction"); + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + + // Cross-bank copies (e.g., between GPR and FPR) are expensive on AArch64, + // typically requiring an FMOV instruction with a 2-6 cycle latency. + auto GetRegClass = [&](Register Reg) -> const TargetRegisterClass * { + if (Reg.isVirtual()) + return MRI.getRegClass(Reg); + if (Reg.isPhysical()) + return RI.getMinimalPhysRegClass(Reg); + return nullptr; + }; + const TargetRegisterClass *DstRC = GetRegClass(MI.getOperand(0).getReg()); + const TargetRegisterClass *SrcRC = GetRegClass(MI.getOperand(1).getReg()); + if (DstRC && SrcRC && !RI.getCommonSubClass(DstRC, SrcRC)) + return false; + + return MI.isAsCheapAsAMove(); +} + // FIXME: this implementation should be micro-architecture dependent, so a // micro-architecture target hook should be introduced here in future. bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { @@ -1157,6 +1179,9 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { default: return MI.isAsCheapAsAMove(); + case TargetOpcode::COPY: + return isCheapCopy(MI, RI); + case AArch64::ADDWrs: case AArch64::ADDXrs: case AArch64::SUBWrs: diff --git a/llvm/test/CodeGen/AArch64/licm-regclass-copy.mir b/llvm/test/CodeGen/AArch64/licm-regclass-copy.mir new file mode 100644 index 0000000000000..6a10df68ddc71 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/licm-regclass-copy.mir @@ -0,0 +1,197 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=aarch64 -run-pass=early-machinelicm -o - %s | FileCheck %s + +# This test verifies that cross-bank copies (e.g., GPR to FPR, FPR to GPR) +# are hoisted out of loops by MachineLICM, as they are expensive on AArch64. + +--- | + declare void @use_float(float) + declare void @use_int(i32) + + define void @gpr_to_fpr_virtual_copy_hoisted() { + ret void + } + + define void @gpr_to_fpr_physical_copy_hoisted() { + ret void + } + + define void @fpr_to_gpr_virtual_copy_hoisted() { + ret void + } +... 
+--- +name: gpr_to_fpr_virtual_copy_hoisted +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: gpr_to_fpr_virtual_copy_hoisted + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $w0, $w1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32 = COPY $w1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY $wzr + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr32all = COPY [[COPY2]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:fpr32 = COPY [[COPY1]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr32common = PHI [[COPY3]], %bb.0, %5, %bb.2 + ; CHECK-NEXT: [[SUBSWrr:%[0-9]+]]:gpr32 = SUBSWrr [[PHI]], [[COPY]], implicit-def $nzcv + ; CHECK-NEXT: Bcc 1, %bb.3, implicit $nzcv + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $s0 = COPY [[COPY4]] + ; CHECK-NEXT: BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp + ; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[PHI]], 1, 0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr32all = COPY [[ADDWri]] + ; CHECK-NEXT: B %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: RET_ReallyLR + bb.0: + liveins: $w0, $w1 + %1:gpr32 = COPY $w0 + %0:gpr32 = COPY $w1 + %3:gpr32all = COPY $wzr + %2:gpr32all = COPY %3:gpr32all + + bb.1: + %4:gpr32common = PHI %2:gpr32all, %bb.0, %5:gpr32all, %bb.2 + %6:gpr32 = SUBSWrr %4:gpr32common, %1:gpr32, implicit-def $nzcv + Bcc 1, %bb.3, implicit $nzcv + B %bb.2 + + bb.2: + %7:fpr32 = COPY %0:gpr32 + $s0 = COPY %7:fpr32 + BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp + %8:gpr32sp = ADDWri %4:gpr32common, 1, 0 + %5:gpr32all = COPY %8:gpr32sp + B %bb.1 + + bb.3: + RET_ReallyLR + +... 
+--- +name: gpr_to_fpr_physical_copy_hoisted +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: gpr_to_fpr_physical_copy_hoisted + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr32all = COPY $wzr + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY $wzr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr32common = PHI [[COPY2]], %bb.0, %4, %bb.2 + ; CHECK-NEXT: [[SUBSWrr:%[0-9]+]]:gpr32 = SUBSWrr [[PHI]], [[COPY]], implicit-def $nzcv + ; CHECK-NEXT: Bcc 1, %bb.3, implicit $nzcv + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $s0 = COPY [[COPY3]] + ; CHECK-NEXT: BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp + ; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[PHI]], 1, 0 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr32all = COPY [[ADDWri]] + ; CHECK-NEXT: B %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: RET_ReallyLR + bb.0: + liveins: $w0 + %1:gpr32 = COPY $w0 + %3:gpr32all = COPY $wzr + %2:gpr32all = COPY %3:gpr32all + + bb.1: + %4:gpr32common = PHI %2:gpr32all, %bb.0, %5:gpr32all, %bb.2 + %6:gpr32 = SUBSWrr %4:gpr32common, %1:gpr32, implicit-def $nzcv + Bcc 1, %bb.3, implicit $nzcv + B %bb.2 + + bb.2: + %7:fpr32 = COPY $wzr + $s0 = COPY %7:fpr32 + BL @use_float, implicit-def dead $lr, implicit $sp, implicit $s0, implicit-def $sp + %8:gpr32sp = ADDWri %4:gpr32common, 1, 0 + %5:gpr32all = COPY %8:gpr32sp + B %bb.1 + + bb.3: + RET_ReallyLR + +... 
+--- +name: fpr_to_gpr_virtual_copy_hoisted +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: fpr_to_gpr_virtual_copy_hoisted + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $w0, $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr32 = COPY $w0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr32 = COPY $s0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr32all = COPY $wzr + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr32all = COPY [[COPY2]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr32 = COPY [[COPY1]] + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr32common = PHI [[COPY3]], %bb.0, %5, %bb.2 + ; CHECK-NEXT: [[SUBSWrr:%[0-9]+]]:gpr32 = SUBSWrr [[PHI]], [[COPY]], implicit-def $nzcv + ; CHECK-NEXT: Bcc 1, %bb.3, implicit $nzcv + ; CHECK-NEXT: B %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $w0 = COPY [[COPY4]] + ; CHECK-NEXT: BL @use_int, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp + ; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32sp = ADDWri [[PHI]], 1, 0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr32all = COPY [[ADDWri]] + ; CHECK-NEXT: B %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: RET_ReallyLR + bb.0: + liveins: $w0, $s0 + %1:gpr32 = COPY $w0 + %0:fpr32 = COPY $s0 + %3:gpr32all = COPY $wzr + %2:gpr32all = COPY %3:gpr32all + + bb.1: + %4:gpr32common = PHI %2:gpr32all, %bb.0, %5:gpr32all, %bb.2 + %6:gpr32 = SUBSWrr %4:gpr32common, %1:gpr32, implicit-def $nzcv + Bcc 1, %bb.3, implicit $nzcv + B %bb.2 + + bb.2: + %7:gpr32 = COPY %0:fpr32 + $w0 = COPY %7:gpr32 + BL @use_int, implicit-def dead $lr, implicit $sp, implicit $w0, implicit-def $sp + %8:gpr32sp = ADDWri %4:gpr32common, 1, 0 + %5:gpr32all = COPY %8:gpr32sp + B %bb.1 + + bb.3: + RET_ReallyLR + +... From 6245a4f875700594281022de6e38fba4439f5edf Mon Sep 17 00:00:00 2001 From: Ben Kallus Date: Mon, 17 Nov 2025 22:14:08 +0000 Subject: [PATCH 62/67] Add support for the .base64 directive (#165549) Starting in version 15, GCC emits a `.base64` directive instead of `.string` or `.ascii` for char arrays of length `>= 3`. See [this godbolt link](https://godbolt.org/z/ebhe3oenv) for an example. This patch adds support for the .base64 directive to AsmParser.cpp, so tools like `llvm-mc` can process the output of GCC more effectively. This addresses #165499. 
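For reference, a minimal sketch (not part of the patch) of the decode step
the directive performs, built on `llvm::decodeBase64` from Support; the
helper name is illustrative. `.base64 "YWJjeHl6"` emits the same bytes as
`.ascii "abcxyz"`:

    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/Base64.h"
    #include "llvm/Support/Error.h"
    #include <string>
    #include <vector>

    // Decode a base64 payload into raw bytes, as .base64 does before
    // handing them to the streamer via emitBytes.
    static bool decodePayload(llvm::StringRef Payload, std::string &Bytes) {
      std::vector<char> Decoded;
      if (llvm::Error E = llvm::decodeBase64(Payload, Decoded)) {
        llvm::consumeError(std::move(E));
        return false; // the parser reports "failed to base64 decode string data"
      }
      Bytes.assign(Decoded.begin(), Decoded.end());
      return true;
    }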
---
 llvm/lib/MC/MCParser/AsmParser.cpp        | 37 +++++++++++++++++++++++
 llvm/test/MC/AsmParser/directive_base64.s | 37 +++++++++++++++++++++++
 2 files changed, 74 insertions(+)
 create mode 100644 llvm/test/MC/AsmParser/directive_base64.s

diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp
index 3c9ab8e108ddd..c9cce7d69cc01 100644
--- a/llvm/lib/MC/MCParser/AsmParser.cpp
+++ b/llvm/lib/MC/MCParser/AsmParser.cpp
@@ -46,6 +46,7 @@
 #include "llvm/MC/MCSymbolMachO.h"
 #include "llvm/MC/MCTargetOptions.h"
 #include "llvm/MC/MCValue.h"
+#include "llvm/Support/Base64.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -528,6 +529,7 @@ class AsmParser : public MCAsmParser {
     DK_LTO_SET_CONDITIONAL,
     DK_CFI_MTE_TAGGED_FRAME,
     DK_MEMTAG,
+    DK_BASE64,
     DK_END
   };

@@ -550,6 +552,7 @@ class AsmParser : public MCAsmParser {
   // ".ascii", ".asciz", ".string"
   bool parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated);
+  bool parseDirectiveBase64(); // ".base64"
   bool parseDirectiveReloc(SMLoc DirectiveLoc); // ".reloc"
   bool parseDirectiveValue(StringRef IDVal, unsigned Size); // ".byte", ".long", ...
@@ -1951,6 +1954,8 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
     case DK_ASCIZ:
     case DK_STRING:
       return parseDirectiveAscii(IDVal, true);
+    case DK_BASE64:
+      return parseDirectiveBase64();
     case DK_BYTE:
     case DK_DC_B:
       return parseDirectiveValue(IDVal, 1);
@@ -3074,6 +3079,37 @@ bool AsmParser::parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) {
   return parseMany(parseOp);
 }

+/// parseDirectiveBase64:
+///   ::= .base64 "string" (, "string" )*
+bool AsmParser::parseDirectiveBase64() {
+  auto parseOp = [&]() -> bool {
+    if (checkForValidSection())
+      return true;
+
+    if (getTok().isNot(AsmToken::String)) {
+      return true;
+    }
+
+    std::vector<char> Decoded;
+    std::string const str = getTok().getStringContents().str();
+    if (check(str.empty(), "expected nonempty string")) {
+      return true;
+    }
+
+    llvm::Error e = decodeBase64(str, Decoded);
+    if (e) {
+      consumeError(std::move(e));
+      return Error(Lexer.getLoc(), "failed to base64 decode string data");
+    }
+
+    getStreamer().emitBytes(std::string(Decoded.begin(), Decoded.end()));
+    Lex();
+    return false;
+  };
+
+  return check(parseMany(parseOp), "expected string");
+}
+
 /// parseDirectiveReloc
 ///  ::= .reloc expression , identifier [ , expression ]
 bool AsmParser::parseDirectiveReloc(SMLoc DirectiveLoc) {
@@ -5343,6 +5379,7 @@ void AsmParser::initializeDirectiveKindMap() {
   DirectiveKindMap[".asciz"] = DK_ASCIZ;
   DirectiveKindMap[".string"] = DK_STRING;
   DirectiveKindMap[".byte"] = DK_BYTE;
+  DirectiveKindMap[".base64"] = DK_BASE64;
   DirectiveKindMap[".short"] = DK_SHORT;
   DirectiveKindMap[".value"] = DK_VALUE;
   DirectiveKindMap[".2byte"] = DK_2BYTE;
diff --git a/llvm/test/MC/AsmParser/directive_base64.s b/llvm/test/MC/AsmParser/directive_base64.s
new file mode 100644
index 0000000000000..46a477eef51dc
--- /dev/null
+++ b/llvm/test/MC/AsmParser/directive_base64.s
@@ -0,0 +1,37 @@
+# RUN: llvm-mc -triple i386-unknown-unknown %s | FileCheck %s
+# RUN: not llvm-mc -triple i386-unknown-unknown -defsym=ERR=1 -o /dev/null %s 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+
+    .data
+# CHECK-LABEL: TEST0:
+# CHECK-NEXT: .byte 0
+TEST0:
+    .base64 "AA=="
+
+# CHECK-LABEL: TEST1:
+# CHECK-NEXT: .ascii "abcxyz"
+TEST1:
+    .base64 "YWJjeHl6"
+
+# CHECK-LABEL: TEST2:
+# CHECK-NEXT: .byte 1
+# CHECK-NEXT: .byte 2
+TEST2:
+    .base64 "AQ=="
+    .base64 "Ag=="
+
+# CHECK-LABEL: TEST3:
+# CHECK-NEXT: .byte 1
+# CHECK-NEXT: .byte 2
+TEST3:
+    .base64 "AQ==", "Ag=="
+
+.ifdef ERR
+# CHECK-ERROR: [[#@LINE+1]]:17: error: expected string
+    .base64 not-a-string
+
+# CHECK-ERROR: [[#@LINE+1]]:17: error: failed to base64 decode string data
+    .base64 "AA"
+
+# CHECK-ERROR: [[#@LINE+1]]:17: error: expected nonempty string
+    .base64 ""
+.endif

From 88b3969dfe20bd624209e44d5c8cef365d9484e0 Mon Sep 17 00:00:00 2001
From: John Harrison
Date: Mon, 17 Nov 2025 14:19:22 -0800
Subject: [PATCH 63/67] [lldb-dap] Address a unit test race condition during
 initialization. (#167981)

During the initialization sequence in our tests, the first 'threads'
response should only be kept if the process is actually stopped;
otherwise we will have stale data.

In VSCode, during the debug session startup sequence, a 'threads' request
is made immediately after 'configurationDone'. This initial request
retrieves the main thread's name and id so the UI can be populated.
However, in our tests we do not want to cache this value unless the
process is actually stopped. We do need to make this initial request
because lldb-dap caches the initial thread list during configurationDone
before the process is resumed, and making the call ensures the cached
initial threads are purged.

I noticed this in a CI job for another review
(https://github.com/llvm/llvm-project/actions/runs/19348261989/job/55353961798)
where the tests incorrectly failed to fetch the threads prior to
validating the thread names.
---
 .../lldbsuite/test/tools/lldb-dap/dap_server.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
index a4ca090021f3f..f85ab1910a2eb 100644
--- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
+++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py
@@ -191,6 +191,11 @@ class NotSupportedError(KeyError):

 class DebugCommunication(object):
+    @property
+    def is_stopped(self) -> bool:
+        """Returns True if the debuggee is stopped, otherwise False."""
+        return len(self.thread_stop_reasons) > 0 or self.exit_status is not None
+
     def __init__(
         self,
         recv: BinaryIO,
@@ -860,7 +865,17 @@ def request_configurationDone(self):
         response = self._send_recv(command_dict)
         if response:
             self.configuration_done_sent = True
+            stopped_on_entry = self.is_stopped
             self.request_threads()
+            if not stopped_on_entry:
+                # Drop the initial cached threads if we did not stop-on-entry.
+                # In VSCode, immediately following 'configurationDone', a
+                # 'threads' request is made to get the initial set of threads,
+                # specifically the main thread's id and name.
+                # We issue the threads request to mimic this pattern, but in
+                # our tests we don't want to cache the result unless the
+                # process is actually stopped.
+                self.threads = None
         return response

     def _process_stopped(self):

From e89e359313a036131e4926ce2a9a97b06f5993ce Mon Sep 17 00:00:00 2001
From: Prabhu Rajasekaran
Date: Mon, 17 Nov 2025 14:25:58 -0800
Subject: [PATCH 64/67] [libc][Github] Fix typo on build_type param (#168453)

There is an extra underscore in the build_type param introduced in
#167583. This PR fixes it.
--- .github/workflows/libc-fullbuild-tests.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/libc-fullbuild-tests.yml b/.github/workflows/libc-fullbuild-tests.yml index c5b7f606a115a..01fd895cce7e8 100644 --- a/.github/workflows/libc-fullbuild-tests.yml +++ b/.github/workflows/libc-fullbuild-tests.yml @@ -49,37 +49,37 @@ jobs: target: x86_64-unknown-uefi-llvm include_scudo: OFF - os: ubuntu-24.04 - build__type: MinSizeRel + build_type: MinSizeRel c_compiler: clang-22 cpp_compiler: clang++-22 target: armv6m-none-eabi include_scudo: OFF - os: ubuntu-24.04 - build__type: MinSizeRel + build_type: MinSizeRel c_compiler: clang-22 cpp_compiler: clang++-22 target: armv7m-none-eabi include_scudo: OFF - os: ubuntu-24.04 - build__type: MinSizeRel + build_type: MinSizeRel c_compiler: clang-22 cpp_compiler: clang++-22 target: armv7em-none-eabi include_scudo: OFF - os: ubuntu-24.04 - build__type: MinSizeRel + build_type: MinSizeRel c_compiler: clang-22 cpp_compiler: clang++-22 target: armv8m.main-none-eabi include_scudo: OFF - os: ubuntu-24.04 - build__type: MinSizeRel + build_type: MinSizeRel c_compiler: clang-22 cpp_compiler: clang++-22 target: armv8.1m.main-none-eabi include_scudo: OFF - os: ubuntu-24.04 - build__type: MinSizeRel + build_type: MinSizeRel c_compiler: clang-22 cpp_compiler: clang++-22 target: riscv32-unknown-elf From f0f53326c735b9f2cb52f5a63312f2d0b700d6cf Mon Sep 17 00:00:00 2001 From: Razvan Lupusoru Date: Mon, 17 Nov 2025 14:55:52 -0800 Subject: [PATCH 65/67] [mlir][acc] Add ACCImplicitRoutine pass for implicit `acc routine` (#168433) This change adds the ACCImplicitRoutine pass which implements the OpenACC specification for implicit routine directives (OpenACC 3.4 spec, section 2.15.1). According to the specification: "If no explicit routine directive applies to a procedure whose definition appears in the program unit being compiled, then the implementation applies an implicit routine directive to that procedure if any of the following conditions holds: The procedure is called or its address is accessed in a compute region." The pass automatically generates `acc.routine` operations for functions called within OpenACC compute constructs or within existing routine functions that do not already have explicit routine directives. It recursively applies implicit routine directives while avoiding infinite recursion when dependencies form cycles. Key features: - Walks through all OpenACC compute constructs (parallel, kernels, serial) to identify function calls - Creates implicit `acc.routine` operations for functions without explicit routine declarations - Recursively processes existing `acc.routine` operations to handle transitive dependencies - Avoids infinite recursion through proper tracking of processed routines - Respects device-type specific bind clauses to skip routines bound to different device types Requirements: - Function operations must implement `mlir::FunctionOpInterface` to be identified and associated with routine directives. - Call operations must implement `mlir::CallOpInterface` to detect function calls and traverse the call graph. 
- Optionally pre-register `acc::OpenACCSupport` if custom behavior is
needed for determining whether a symbol use is valid within GPU regions
(such as functions that are already considered for offloading even
without `acc routine` markings)

Co-authored-by: delaram-talaashrafi
---
 .../mlir/Dialect/OpenACC/Transforms/Passes.td |  38 +++
 .../OpenACC/Transforms/ACCImplicitRoutine.cpp | 237 ++++++++++++++++++
 .../Dialect/OpenACC/Transforms/CMakeLists.txt |   1 +
 3 files changed, 276 insertions(+)
 create mode 100644 mlir/lib/Dialect/OpenACC/Transforms/ACCImplicitRoutine.cpp

diff --git a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td
index 40ccd1fc6c1a0..970d9304d8289 100644
--- a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td
@@ -63,4 +63,42 @@ def ACCImplicitData : Pass<"acc-implicit-data", "mlir::ModuleOp"> {
   ];
 }

+def ACCImplicitRoutine : Pass<"acc-implicit-routine", "mlir::ModuleOp"> {
+  let summary = "Generate implicit acc routine for functions in acc regions";
+  let description = [{
+    This pass implements the implicit rules described in the OpenACC
+    specification for the `Routine Directive` (OpenACC 3.4 spec, section
+    2.15.1).
+
+    "If no explicit routine directive applies to a procedure whose definition
+    appears in the program unit being compiled, then the implementation applies
+    an implicit routine directive to that procedure if any of the following
+    conditions holds:
+    - The procedure is called or its address is accessed in a compute region."
+
+    The specification further states:
+    "When the implementation applies an implicit routine directive to a procedure,
+    it must recursively apply implicit routine directives to other procedures for
+    which the above rules specify relevant dependencies. Such dependencies can
+    form a cycle, so the implementation must take care to avoid infinite recursion."
+
+    This pass implements these requirements by:
+    1. Walking through all OpenACC compute constructs and functions already
+       marked with `acc routine` in the module and identifying function calls
+       within these regions.
+    2. Creating implicit `acc.routine` operations for functions that don't already
+       have routine declarations.
+    3. Recursively walking through all existing `acc routine` and creating
+       implicit routine operations for function calls within these routines,
+       while avoiding infinite recursion through proper tracking.
+  }];
+  let dependentDialects = ["mlir::acc::OpenACCDialect"];
+  let options = [
+    Option<"deviceType", "device-type", "mlir::acc::DeviceType",
+           "mlir::acc::DeviceType::None",
+           "Target device type for implicit routine generation. "
+           "Ensures that `acc routine` device_type clauses are "
+           "properly considered, not just default clauses.">
+  ];
+}
+
 #endif // MLIR_DIALECT_OPENACC_TRANSFORMS
diff --git a/mlir/lib/Dialect/OpenACC/Transforms/ACCImplicitRoutine.cpp b/mlir/lib/Dialect/OpenACC/Transforms/ACCImplicitRoutine.cpp
new file mode 100644
index 0000000000000..12efaf487a8ca
--- /dev/null
+++ b/mlir/lib/Dialect/OpenACC/Transforms/ACCImplicitRoutine.cpp
@@ -0,0 +1,237 @@
+//===- ACCImplicitRoutine.cpp - OpenACC Implicit Routine Transform -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements the implicit rules described in the OpenACC
+// specification for the `Routine Directive` (OpenACC 3.4 spec, section
+// 2.15.1).
+//
+// "If no explicit routine directive applies to a procedure whose definition
+// appears in the program unit being compiled, then the implementation applies
+// an implicit routine directive to that procedure if any of the following
+// conditions holds:
+// - The procedure is called or its address is accessed in a compute region."
+//
+// The specification further states:
+// "When the implementation applies an implicit routine directive to a
+// procedure, it must recursively apply implicit routine directives to other
+// procedures for which the above rules specify relevant dependencies. Such
+// dependencies can form a cycle, so the implementation must take care to avoid
+// infinite recursion."
+//
+// This pass implements these requirements by:
+// 1. Walking through all OpenACC compute constructs and functions already
+//    marked with `acc routine` in the module and identifying function calls
+//    within these regions.
+// 2. Creating implicit `acc.routine` operations for functions that don't
+//    already have routine declarations.
+// 3. Recursively walking through all existing `acc routine` and creating
+//    implicit routine operations for function calls within these routines,
+//    while avoiding infinite recursion through proper tracking.
+//
+// Requirements:
+// -------------
+// To use this pass in a pipeline, the following requirements must be met:
+//
+// 1. Operation Interface Implementation: Operations that define functions
+//    or call functions should implement `mlir::FunctionOpInterface` and
+//    `mlir::CallOpInterface` respectively.
+//
+// 2. Analysis Registration (Optional): If custom behavior is needed for
+//    determining if a symbol use is valid within GPU regions, the dialect
+//    should pre-register the `acc::OpenACCSupport` analysis.
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/OpenACC/Transforms/Passes.h"
+
+#include "mlir/Dialect/OpenACC/Analysis/OpenACCSupport.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Interfaces/CallInterfaces.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
+#include <queue>
+
+#define DEBUG_TYPE "acc-implicit-routine"
+
+namespace mlir {
+namespace acc {
+#define GEN_PASS_DEF_ACCIMPLICITROUTINE
+#include "mlir/Dialect/OpenACC/Transforms/Passes.h.inc"
+} // namespace acc
+} // namespace mlir
+
+namespace {
+
+using namespace mlir;
+
+class ACCImplicitRoutine
+    : public acc::impl::ACCImplicitRoutineBase<ACCImplicitRoutine> {
+private:
+  unsigned routineCounter = 0;
+  static constexpr llvm::StringRef accRoutinePrefix = "acc_routine_";
+
+  // Count existing routine operations and update the counter.
+  void initRoutineCounter(ModuleOp module) {
+    module.walk([&](acc::RoutineOp routineOp) { routineCounter++; });
+  }
+
+  // Returns true if the `acc routine` has a default bind clause or a bind
+  // clause for the given device type.
+  bool isACCRoutineBindDefaultOrDeviceType(acc::RoutineOp op,
+                                           acc::DeviceType deviceType) {
+    // Fast check to avoid device-type specific lookups.
+    if (!op.getBindIdName() && !op.getBindStrName())
+      return false;
+    return op.getBindNameValue().has_value() ||
+           op.getBindNameValue(deviceType).has_value();
+  }
+
+  // Generate a unique name for the routine and create the routine operation.
+  acc::RoutineOp createRoutineOp(OpBuilder &builder, Location loc,
+                                 FunctionOpInterface &callee) {
+    std::string routineName =
+        (accRoutinePrefix + std::to_string(routineCounter++)).str();
+    auto routineOp = acc::RoutineOp::create(
+        builder, loc,
+        /* sym_name=*/builder.getStringAttr(routineName),
+        /* func_name=*/
+        mlir::SymbolRefAttr::get(builder.getContext(),
+                                 builder.getStringAttr(callee.getName())),
+        /* bindIdName=*/nullptr,
+        /* bindStrName=*/nullptr,
+        /* bindIdNameDeviceType=*/nullptr,
+        /* bindStrNameDeviceType=*/nullptr,
+        /* worker=*/nullptr,
+        /* vector=*/nullptr,
+        /* seq=*/nullptr,
+        /* nohost=*/nullptr,
+        /* implicit=*/builder.getUnitAttr(),
+        /* gang=*/nullptr,
+        /* gangDim=*/nullptr,
+        /* gangDimDeviceType=*/nullptr);
+
+    // Assert that the callee does not already have a routine info attribute.
+    assert(!callee->hasAttr(acc::getRoutineInfoAttrName()) &&
+           "function is already associated with a routine");
+
+    callee->setAttr(
+        acc::getRoutineInfoAttrName(),
+        mlir::acc::RoutineInfoAttr::get(
+            builder.getContext(),
+            {mlir::SymbolRefAttr::get(builder.getContext(),
+                                      builder.getStringAttr(routineName))}));
+    return routineOp;
+  }
+
+  // Walk through a compute region looking for function calls.
+  void
+  implicitRoutineForCallsInComputeRegions(Operation *op, SymbolTable &symTab,
+                                          mlir::OpBuilder &builder,
+                                          acc::OpenACCSupport &accSupport) {
+    op->walk([&](CallOpInterface callOp) {
+      if (!callOp.getCallableForCallee())
+        return;
+
+      auto calleeSymbolRef =
+          dyn_cast<SymbolRefAttr>(callOp.getCallableForCallee());
+      // When the call is made through an SSA value, the callee is not a
+      // symbol. Skip it because we don't know the call target.
+      if (!calleeSymbolRef)
+        return;
+
+      auto callee = symTab.lookup<FunctionOpInterface>(
+          calleeSymbolRef.getLeafReference().str());
+      assert(callee && "callee function must be found in symbol table");
+      // If the callee is already a valid symbol for GPU regions, skip it.
+      if (accSupport.isValidSymbolUse(callOp.getOperation(), calleeSymbolRef))
+        return;
+      builder.setInsertionPoint(callee);
+      createRoutineOp(builder, callee.getLoc(), callee);
+    });
+  }
+
+  // Recursively handle calls within a routine operation.
+  void implicitRoutineForCallsInRoutine(acc::RoutineOp routineOp,
+                                        mlir::OpBuilder &builder,
+                                        acc::OpenACCSupport &accSupport,
+                                        acc::DeviceType targetDeviceType) {
+    // When a bind clause is used, the target differs from the function the
+    // `acc routine` is attached to. Skip this case to avoid implicitly and
+    // recursively marking calls that would not end up on the device.
+    if (isACCRoutineBindDefaultOrDeviceType(routineOp, targetDeviceType))
+      return;
+
+    SymbolTable symTab(routineOp->getParentOfType<ModuleOp>());
+    std::queue<acc::RoutineOp> routineQueue;
+    routineQueue.push(routineOp);
+    while (!routineQueue.empty()) {
+      auto currentRoutine = routineQueue.front();
+      routineQueue.pop();
+      auto func = symTab.lookup<FunctionOpInterface>(
+          currentRoutine.getFuncName().getLeafReference());
+      func.walk([&](CallOpInterface callOp) {
+        if (!callOp.getCallableForCallee())
+          return;
+
+        auto calleeSymbolRef =
+            dyn_cast<SymbolRefAttr>(callOp.getCallableForCallee());
+        // When the call is made through an SSA value, the callee is not a
+        // symbol. Skip it because we don't know the call target.
+        if (!calleeSymbolRef)
+          return;
+
+        auto callee = symTab.lookup<FunctionOpInterface>(
+            calleeSymbolRef.getLeafReference().str());
+        assert(callee && "callee function must be found in symbol table");
+        // If the callee is already a valid symbol for GPU regions, skip it.
+        if (accSupport.isValidSymbolUse(callOp.getOperation(),
+                                        calleeSymbolRef))
+          return;
+        builder.setInsertionPoint(callee);
+        auto newRoutineOp = createRoutineOp(builder, callee.getLoc(), callee);
+        routineQueue.push(newRoutineOp);
+      });
+    }
+  }
+
+public:
+  using ACCImplicitRoutineBase::ACCImplicitRoutineBase;
+
+  void runOnOperation() override {
+    auto module = getOperation();
+    mlir::OpBuilder builder(module.getContext());
+    SymbolTable symTab(module);
+    initRoutineCounter(module);
+
+    acc::OpenACCSupport &accSupport = getAnalysis<acc::OpenACCSupport>();
+
+    // Handle compute regions.
+    module.walk([&](Operation *op) {
+      if (isa<acc::ParallelOp, acc::KernelsOp, acc::SerialOp>(op))
+        implicitRoutineForCallsInComputeRegions(op, symTab, builder,
+                                                accSupport);
+    });
+
+    // Use the device type option from the pass options.
+    acc::DeviceType targetDeviceType = deviceType;
+
+    // Handle existing routines.
+    module.walk([&](acc::RoutineOp routineOp) {
+      implicitRoutineForCallsInRoutine(routineOp, builder, accSupport,
+                                       targetDeviceType);
+    });
+  }
+};
+
+} // namespace
diff --git a/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt b/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt
index f8fff5958f8c7..028af0362f26e 100644
--- a/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/OpenACC/Transforms/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_mlir_dialect_library(MLIROpenACCTransforms
   ACCImplicitData.cpp
+  ACCImplicitRoutine.cpp
   LegalizeDataValues.cpp

   ADDITIONAL_HEADER_DIRS

From 43dacd07f660064d4023342efb067f39fafc592f Mon Sep 17 00:00:00 2001
From: Sergei Barannikov
Date: Tue, 18 Nov 2025 01:58:26 +0300
Subject: [PATCH 66/67] [PowerPC] TableGen-erate SDNode descriptions (#168108)

This allows SDNodes to be validated against their expected type profiles
and reduces the number of changes required to add a new node.

The validation functionality has detected several issues; see
`PPCSelectionDAGInfo::verifyTargetNode()`.

Most of the nodes have a description in `*.td` files and were
successfully "imported". Those that don't have a description are listed
in the enum in `PPCSelectionDAGInfo.td`. These nodes are not validated.

Part of #119709.
Pull Request: https://github.com/llvm/llvm-project/pull/168108 --- llvm/lib/Target/PowerPC/CMakeLists.txt | 1 + llvm/lib/Target/PowerPC/PPCFastISel.cpp | 1 + llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 1 + llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 185 +----- llvm/lib/Target/PowerPC/PPCISelLowering.h | 578 ------------------ llvm/lib/Target/PowerPC/PPCInstrInfo.td | 301 ++++++++- llvm/lib/Target/PowerPC/PPCInstrP10.td | 21 +- llvm/lib/Target/PowerPC/PPCInstrVSX.td | 60 ++ .../Target/PowerPC/PPCSelectionDAGInfo.cpp | 66 +- llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h | 57 +- 10 files changed, 477 insertions(+), 794 deletions(-) diff --git a/llvm/lib/Target/PowerPC/CMakeLists.txt b/llvm/lib/Target/PowerPC/CMakeLists.txt index 2182039e0eef8..53d565013c4bc 100644 --- a/llvm/lib/Target/PowerPC/CMakeLists.txt +++ b/llvm/lib/Target/PowerPC/CMakeLists.txt @@ -11,6 +11,7 @@ tablegen(LLVM PPCGenFastISel.inc -gen-fast-isel) tablegen(LLVM PPCGenInstrInfo.inc -gen-instr-info) tablegen(LLVM PPCGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM PPCGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM PPCGenSDNodeInfo.inc -gen-sd-node-info) tablegen(LLVM PPCGenSubtargetInfo.inc -gen-subtarget) tablegen(LLVM PPCGenExegesis.inc -gen-exegesis) tablegen(LLVM PPCGenRegisterBank.inc -gen-register-bank) diff --git a/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/llvm/lib/Target/PowerPC/PPCFastISel.cpp index ea4e597d0fd7d..ca3fe18273ff5 100644 --- a/llvm/lib/Target/PowerPC/PPCFastISel.cpp +++ b/llvm/lib/Target/PowerPC/PPCFastISel.cpp @@ -17,6 +17,7 @@ #include "PPCCallingConv.h" #include "PPCISelLowering.h" #include "PPCMachineFunctionInfo.h" +#include "PPCSelectionDAGInfo.h" #include "PPCSubtarget.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/FastISel.h" diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 89165fa8f8fdb..dd537c204cec1 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -16,6 +16,7 @@ #include "PPC.h" #include "PPCISelLowering.h" #include "PPCMachineFunctionInfo.h" +#include "PPCSelectionDAGInfo.h" #include "PPCSubtarget.h" #include "PPCTargetMachine.h" #include "llvm/ADT/APInt.h" diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index f55336bafd251..220010c4d3d34 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -20,6 +20,7 @@ #include "PPCMachineFunctionInfo.h" #include "PPCPerfectShuffle.h" #include "PPCRegisterInfo.h" +#include "PPCSelectionDAGInfo.h" #include "PPCSubtarget.h" #include "PPCTargetMachine.h" #include "llvm/ADT/APFloat.h" @@ -1678,190 +1679,6 @@ bool PPCTargetLowering::shallExtractConstSplatVectorElementToStore( return false; } -const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { - switch ((PPCISD::NodeType)Opcode) { - case PPCISD::FIRST_NUMBER: break; - case PPCISD::FSEL: return "PPCISD::FSEL"; - case PPCISD::XSMAXC: return "PPCISD::XSMAXC"; - case PPCISD::XSMINC: return "PPCISD::XSMINC"; - case PPCISD::FCFID: return "PPCISD::FCFID"; - case PPCISD::FCFIDU: return "PPCISD::FCFIDU"; - case PPCISD::FCFIDS: return "PPCISD::FCFIDS"; - case PPCISD::FCFIDUS: return "PPCISD::FCFIDUS"; - case PPCISD::FCTIDZ: return "PPCISD::FCTIDZ"; - case PPCISD::FCTIWZ: return "PPCISD::FCTIWZ"; - case PPCISD::FCTIDUZ: return "PPCISD::FCTIDUZ"; - case PPCISD::FCTIWUZ: return "PPCISD::FCTIWUZ"; - case PPCISD::FRE: return "PPCISD::FRE"; 
- case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE"; - case PPCISD::FTSQRT: - return "PPCISD::FTSQRT"; - case PPCISD::FSQRT: - return "PPCISD::FSQRT"; - case PPCISD::STFIWX: return "PPCISD::STFIWX"; - case PPCISD::VPERM: return "PPCISD::VPERM"; - case PPCISD::XXSPLT: return "PPCISD::XXSPLT"; - case PPCISD::XXSPLTI_SP_TO_DP: - return "PPCISD::XXSPLTI_SP_TO_DP"; - case PPCISD::XXSPLTI32DX: - return "PPCISD::XXSPLTI32DX"; - case PPCISD::VECINSERT: return "PPCISD::VECINSERT"; - case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI"; - case PPCISD::XXPERM: - return "PPCISD::XXPERM"; - case PPCISD::VECSHL: return "PPCISD::VECSHL"; - case PPCISD::VSRQ: - return "PPCISD::VSRQ"; - case PPCISD::CMPB: return "PPCISD::CMPB"; - case PPCISD::Hi: return "PPCISD::Hi"; - case PPCISD::Lo: return "PPCISD::Lo"; - case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY"; - case PPCISD::ATOMIC_CMP_SWAP_8: return "PPCISD::ATOMIC_CMP_SWAP_8"; - case PPCISD::ATOMIC_CMP_SWAP_16: return "PPCISD::ATOMIC_CMP_SWAP_16"; - case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC"; - case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET"; - case PPCISD::PROBED_ALLOCA: return "PPCISD::PROBED_ALLOCA"; - case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg"; - case PPCISD::SRL: return "PPCISD::SRL"; - case PPCISD::SRA: return "PPCISD::SRA"; - case PPCISD::SHL: return "PPCISD::SHL"; - case PPCISD::SRA_ADDZE: return "PPCISD::SRA_ADDZE"; - case PPCISD::CALL: return "PPCISD::CALL"; - case PPCISD::CALL_NOP: return "PPCISD::CALL_NOP"; - case PPCISD::CALL_NOTOC: return "PPCISD::CALL_NOTOC"; - case PPCISD::CALL_RM: - return "PPCISD::CALL_RM"; - case PPCISD::CALL_NOP_RM: - return "PPCISD::CALL_NOP_RM"; - case PPCISD::CALL_NOTOC_RM: - return "PPCISD::CALL_NOTOC_RM"; - case PPCISD::MTCTR: return "PPCISD::MTCTR"; - case PPCISD::BCTRL: return "PPCISD::BCTRL"; - case PPCISD::BCTRL_LOAD_TOC: return "PPCISD::BCTRL_LOAD_TOC"; - case PPCISD::BCTRL_RM: - return "PPCISD::BCTRL_RM"; - case PPCISD::BCTRL_LOAD_TOC_RM: - return "PPCISD::BCTRL_LOAD_TOC_RM"; - case PPCISD::RET_GLUE: return "PPCISD::RET_GLUE"; - case PPCISD::READ_TIME_BASE: return "PPCISD::READ_TIME_BASE"; - case PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP"; - case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP"; - case PPCISD::MFOCRF: return "PPCISD::MFOCRF"; - case PPCISD::MFVSR: return "PPCISD::MFVSR"; - case PPCISD::MTVSRA: return "PPCISD::MTVSRA"; - case PPCISD::MTVSRZ: return "PPCISD::MTVSRZ"; - case PPCISD::SINT_VEC_TO_FP: return "PPCISD::SINT_VEC_TO_FP"; - case PPCISD::UINT_VEC_TO_FP: return "PPCISD::UINT_VEC_TO_FP"; - case PPCISD::SCALAR_TO_VECTOR_PERMUTED: - return "PPCISD::SCALAR_TO_VECTOR_PERMUTED"; - case PPCISD::ANDI_rec_1_EQ_BIT: - return "PPCISD::ANDI_rec_1_EQ_BIT"; - case PPCISD::ANDI_rec_1_GT_BIT: - return "PPCISD::ANDI_rec_1_GT_BIT"; - case PPCISD::VCMP: return "PPCISD::VCMP"; - case PPCISD::VCMP_rec: return "PPCISD::VCMP_rec"; - case PPCISD::LBRX: return "PPCISD::LBRX"; - case PPCISD::STBRX: return "PPCISD::STBRX"; - case PPCISD::LFIWAX: return "PPCISD::LFIWAX"; - case PPCISD::LFIWZX: return "PPCISD::LFIWZX"; - case PPCISD::LXSIZX: return "PPCISD::LXSIZX"; - case PPCISD::STXSIX: return "PPCISD::STXSIX"; - case PPCISD::VEXTS: return "PPCISD::VEXTS"; - case PPCISD::LXVD2X: return "PPCISD::LXVD2X"; - case PPCISD::STXVD2X: return "PPCISD::STXVD2X"; - case PPCISD::LOAD_VEC_BE: return "PPCISD::LOAD_VEC_BE"; - case PPCISD::STORE_VEC_BE: return "PPCISD::STORE_VEC_BE"; - case PPCISD::ST_VSR_SCAL_INT: - return "PPCISD::ST_VSR_SCAL_INT"; - case PPCISD::COND_BRANCH: 
return "PPCISD::COND_BRANCH"; - case PPCISD::BDNZ: return "PPCISD::BDNZ"; - case PPCISD::BDZ: return "PPCISD::BDZ"; - case PPCISD::MFFS: return "PPCISD::MFFS"; - case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ"; - case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN"; - case PPCISD::CR6SET: return "PPCISD::CR6SET"; - case PPCISD::CR6UNSET: return "PPCISD::CR6UNSET"; - case PPCISD::PPC32_GOT: return "PPCISD::PPC32_GOT"; - case PPCISD::PPC32_PICGOT: return "PPCISD::PPC32_PICGOT"; - case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA"; - case PPCISD::LD_GOT_TPREL_L: return "PPCISD::LD_GOT_TPREL_L"; - case PPCISD::ADD_TLS: return "PPCISD::ADD_TLS"; - case PPCISD::ADDIS_TLSGD_HA: return "PPCISD::ADDIS_TLSGD_HA"; - case PPCISD::ADDI_TLSGD_L: return "PPCISD::ADDI_TLSGD_L"; - case PPCISD::GET_TLS_ADDR: return "PPCISD::GET_TLS_ADDR"; - case PPCISD::GET_TLS_MOD_AIX: return "PPCISD::GET_TLS_MOD_AIX"; - case PPCISD::GET_TPOINTER: return "PPCISD::GET_TPOINTER"; - case PPCISD::ADDI_TLSGD_L_ADDR: return "PPCISD::ADDI_TLSGD_L_ADDR"; - case PPCISD::TLSGD_AIX: return "PPCISD::TLSGD_AIX"; - case PPCISD::TLSLD_AIX: return "PPCISD::TLSLD_AIX"; - case PPCISD::ADDIS_TLSLD_HA: return "PPCISD::ADDIS_TLSLD_HA"; - case PPCISD::ADDI_TLSLD_L: return "PPCISD::ADDI_TLSLD_L"; - case PPCISD::GET_TLSLD_ADDR: return "PPCISD::GET_TLSLD_ADDR"; - case PPCISD::ADDI_TLSLD_L_ADDR: return "PPCISD::ADDI_TLSLD_L_ADDR"; - case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA"; - case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L"; - case PPCISD::PADDI_DTPREL: - return "PPCISD::PADDI_DTPREL"; - case PPCISD::VADD_SPLAT: - return "PPCISD::VADD_SPLAT"; - case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD"; - case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN"; - case PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128"; - case PPCISD::BUILD_SPE64: return "PPCISD::BUILD_SPE64"; - case PPCISD::EXTRACT_SPE: return "PPCISD::EXTRACT_SPE"; - case PPCISD::EXTSWSLI: return "PPCISD::EXTSWSLI"; - case PPCISD::LD_VSX_LH: return "PPCISD::LD_VSX_LH"; - case PPCISD::FP_EXTEND_HALF: return "PPCISD::FP_EXTEND_HALF"; - case PPCISD::MAT_PCREL_ADDR: return "PPCISD::MAT_PCREL_ADDR"; - case PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR: - return "PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR"; - case PPCISD::TLS_LOCAL_EXEC_MAT_ADDR: - return "PPCISD::TLS_LOCAL_EXEC_MAT_ADDR"; - case PPCISD::ACC_BUILD: return "PPCISD::ACC_BUILD"; - case PPCISD::PAIR_BUILD: return "PPCISD::PAIR_BUILD"; - case PPCISD::EXTRACT_VSX_REG: return "PPCISD::EXTRACT_VSX_REG"; - case PPCISD::XXMFACC: return "PPCISD::XXMFACC"; - case PPCISD::LD_SPLAT: return "PPCISD::LD_SPLAT"; - case PPCISD::ZEXT_LD_SPLAT: return "PPCISD::ZEXT_LD_SPLAT"; - case PPCISD::SEXT_LD_SPLAT: return "PPCISD::SEXT_LD_SPLAT"; - case PPCISD::FNMSUB: return "PPCISD::FNMSUB"; - case PPCISD::STRICT_FADDRTZ: - return "PPCISD::STRICT_FADDRTZ"; - case PPCISD::STRICT_FCTIDZ: - return "PPCISD::STRICT_FCTIDZ"; - case PPCISD::STRICT_FCTIWZ: - return "PPCISD::STRICT_FCTIWZ"; - case PPCISD::STRICT_FCTIDUZ: - return "PPCISD::STRICT_FCTIDUZ"; - case PPCISD::STRICT_FCTIWUZ: - return "PPCISD::STRICT_FCTIWUZ"; - case PPCISD::STRICT_FCFID: - return "PPCISD::STRICT_FCFID"; - case PPCISD::STRICT_FCFIDU: - return "PPCISD::STRICT_FCFIDU"; - case PPCISD::STRICT_FCFIDS: - return "PPCISD::STRICT_FCFIDS"; - case PPCISD::STRICT_FCFIDUS: - return "PPCISD::STRICT_FCFIDUS"; - case PPCISD::LXVRZX: return "PPCISD::LXVRZX"; - case PPCISD::STORE_COND: - return "PPCISD::STORE_COND"; - case PPCISD::SETBC: - return "PPCISD::SETBC"; - case 
PPCISD::SETBCR: - return "PPCISD::SETBCR"; - case PPCISD::ADDC: - return "PPCISD::ADDC"; - case PPCISD::ADDE: - return "PPCISD::ADDE"; - case PPCISD::SUBC: - return "PPCISD::SUBC"; - case PPCISD::SUBE: - return "PPCISD::SUBE"; - } - return nullptr; -} - EVT PPCTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &C, EVT VT) const { if (!VT.isVector()) diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index d967018982734..680b529b4e2e5 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -34,580 +34,6 @@ namespace llvm { - namespace PPCISD { - - // When adding a NEW PPCISD node please add it to the correct position in - // the enum. The order of elements in this enum matters! - // Values that are added between FIRST_MEMORY_OPCODE and LAST_MEMORY_OPCODE - // are considered memory opcodes and are treated differently than other - // entries. - enum NodeType : unsigned { - // Start the numbering where the builtin ops and target ops leave off. - FIRST_NUMBER = ISD::BUILTIN_OP_END, - - /// FSEL - Traditional three-operand fsel node. - /// - FSEL, - - /// XSMAXC[DQ]P, XSMINC[DQ]P - C-type min/max instructions. - XSMAXC, - XSMINC, - - /// FCFID - The FCFID instruction, taking an f64 operand and producing - /// and f64 value containing the FP representation of the integer that - /// was temporarily in the f64 operand. - FCFID, - - /// Newer FCFID[US] integer-to-floating-point conversion instructions for - /// unsigned integers and single-precision outputs. - FCFIDU, - FCFIDS, - FCFIDUS, - - /// FCTI[D,W]Z - The FCTIDZ and FCTIWZ instructions, taking an f32 or f64 - /// operand, producing an f64 value containing the integer representation - /// of that FP value. - FCTIDZ, - FCTIWZ, - - /// Newer FCTI[D,W]UZ floating-point-to-integer conversion instructions for - /// unsigned integers with round toward zero. - FCTIDUZ, - FCTIWUZ, - - /// VEXTS, ByteWidth - takes an input in VSFRC and produces an output in - /// VSFRC that is sign-extended from ByteWidth to a 64-byte integer. - VEXTS, - - /// Reciprocal estimate instructions (unary FP ops). - FRE, - FRSQRTE, - - /// Test instruction for software square root. - FTSQRT, - - /// Square root instruction. - FSQRT, - - /// VPERM - The PPC VPERM Instruction. - /// - VPERM, - - /// XXSPLT - The PPC VSX splat instructions - /// - XXSPLT, - - /// XXSPLTI_SP_TO_DP - The PPC VSX splat instructions for immediates for - /// converting immediate single precision numbers to double precision - /// vector or scalar. - XXSPLTI_SP_TO_DP, - - /// XXSPLTI32DX - The PPC XXSPLTI32DX instruction. - /// - XXSPLTI32DX, - - /// VECINSERT - The PPC vector insert instruction - /// - VECINSERT, - - /// VECSHL - The PPC vector shift left instruction - /// - VECSHL, - - /// XXPERMDI - The PPC XXPERMDI instruction - /// - XXPERMDI, - XXPERM, - - /// The CMPB instruction (takes two operands of i32 or i64). - CMPB, - - /// Hi/Lo - These represent the high and low 16-bit parts of a global - /// address respectively. These nodes have two operands, the first of - /// which must be a TargetGlobalAddress, and the second of which must be a - /// Constant. Selected naively, these turn into 'lis G+C' and 'li G+C', - /// though these are usually folded into other nodes. - Hi, - Lo, - - /// The following two target-specific nodes are used for calls through - /// function pointers in the 64-bit SVR4 ABI. 
- - /// OPRC, CHAIN = DYNALLOC(CHAIN, NEGSIZE, FRAME_INDEX) - /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to - /// compute an allocation on the stack. - DYNALLOC, - - /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to - /// compute an offset from native SP to the address of the most recent - /// dynamic alloca. - DYNAREAOFFSET, - - /// To avoid stack clash, allocation is performed by block and each block is - /// probed. - PROBED_ALLOCA, - - /// The result of the mflr at function entry, used for PIC code. - GlobalBaseReg, - - /// These nodes represent PPC shifts. - /// - /// For scalar types, only the last `n + 1` bits of the shift amounts - /// are used, where n is log2(sizeof(element) * 8). See sld/slw, etc. - /// for exact behaviors. - /// - /// For vector types, only the last n bits are used. See vsld. - SRL, - SRA, - SHL, - - /// These nodes represent PPC arithmetic operations with carry. - ADDC, - ADDE, - SUBC, - SUBE, - - /// FNMSUB - Negated multiply-subtract instruction. - FNMSUB, - - /// EXTSWSLI = The PPC extswsli instruction, which does an extend-sign - /// word and shift left immediate. - EXTSWSLI, - - /// The combination of sra[wd]i and addze used to implemented signed - /// integer division by a power of 2. The first operand is the dividend, - /// and the second is the constant shift amount (representing the - /// divisor). - SRA_ADDZE, - - /// CALL - A direct function call. - /// CALL_NOP is a call with the special NOP which follows 64-bit - /// CALL_NOTOC the caller does not use the TOC. - /// SVR4 calls and 32-bit/64-bit AIX calls. - CALL, - CALL_NOP, - CALL_NOTOC, - - /// CHAIN,FLAG = MTCTR(VAL, CHAIN[, INFLAG]) - Directly corresponds to a - /// MTCTR instruction. - MTCTR, - - /// CHAIN,FLAG = BCTRL(CHAIN, INFLAG) - Directly corresponds to a - /// BCTRL instruction. - BCTRL, - - /// CHAIN,FLAG = BCTRL(CHAIN, ADDR, INFLAG) - The combination of a bctrl - /// instruction and the TOC reload required on 64-bit ELF, 32-bit AIX - /// and 64-bit AIX. - BCTRL_LOAD_TOC, - - /// The variants that implicitly define rounding mode for calls with - /// strictfp semantics. - CALL_RM, - CALL_NOP_RM, - CALL_NOTOC_RM, - BCTRL_RM, - BCTRL_LOAD_TOC_RM, - - /// Return with a glue operand, matched by 'blr' - RET_GLUE, - - /// R32 = MFOCRF(CRREG, INFLAG) - Represents the MFOCRF instruction. - /// This copies the bits corresponding to the specified CRREG into the - /// resultant GPR. Bits corresponding to other CR regs are undefined. - MFOCRF, - - /// Direct move from a VSX register to a GPR - MFVSR, - - /// Direct move from a GPR to a VSX register (algebraic) - MTVSRA, - - /// Direct move from a GPR to a VSX register (zero) - MTVSRZ, - - /// Direct move of 2 consecutive GPR to a VSX register. - BUILD_FP128, - - /// BUILD_SPE64 and EXTRACT_SPE are analogous to BUILD_PAIR and - /// EXTRACT_ELEMENT but take f64 arguments instead of i64, as i64 is - /// unsupported for this target. - /// Merge 2 GPRs to a single SPE register. - BUILD_SPE64, - - /// Extract SPE register component, second argument is high or low. - EXTRACT_SPE, - - /// Extract a subvector from signed integer vector and convert to FP. - /// It is primarily used to convert a (widened) illegal integer vector - /// type to a legal floating point vector type. - /// For example v2i32 -> widened to v4i32 -> v2f64 - SINT_VEC_TO_FP, - - /// Extract a subvector from unsigned integer vector and convert to FP. - /// As with SINT_VEC_TO_FP, used for converting illegal types. 
- UINT_VEC_TO_FP, - - /// PowerPC instructions that have SCALAR_TO_VECTOR semantics tend to - /// place the value into the least significant element of the most - /// significant doubleword in the vector. This is not element zero for - /// anything smaller than a doubleword on either endianness. This node has - /// the same semantics as SCALAR_TO_VECTOR except that the value remains in - /// the aforementioned location in the vector register. - SCALAR_TO_VECTOR_PERMUTED, - - // FIXME: Remove these once the ANDI glue bug is fixed: - /// i1 = ANDI_rec_1_[EQ|GT]_BIT(i32 or i64 x) - Represents the result of the - /// eq or gt bit of CR0 after executing andi. x, 1. This is used to - /// implement truncation of i32 or i64 to i1. - ANDI_rec_1_EQ_BIT, - ANDI_rec_1_GT_BIT, - - // READ_TIME_BASE - A read of the 64-bit time-base register on a 32-bit - // target (returns (Lo, Hi)). It takes a chain operand. - READ_TIME_BASE, - - // EH_SJLJ_SETJMP - SjLj exception handling setjmp. - EH_SJLJ_SETJMP, - - // EH_SJLJ_LONGJMP - SjLj exception handling longjmp. - EH_SJLJ_LONGJMP, - - /// RESVEC = VCMP(LHS, RHS, OPC) - Represents one of the altivec VCMP* - /// instructions. For lack of better number, we use the opcode number - /// encoding for the OPC field to identify the compare. For example, 838 - /// is VCMPGTSH. - VCMP, - - /// RESVEC, OUTFLAG = VCMP_rec(LHS, RHS, OPC) - Represents one of the - /// altivec VCMP*_rec instructions. For lack of better number, we use the - /// opcode number encoding for the OPC field to identify the compare. For - /// example, 838 is VCMPGTSH. - VCMP_rec, - - /// CHAIN = COND_BRANCH CHAIN, CRRC, OPC, DESTBB [, INFLAG] - This - /// corresponds to the COND_BRANCH pseudo instruction. CRRC is the - /// condition register to branch on, OPC is the branch opcode to use (e.g. - /// PPC::BLE), DESTBB is the destination block to branch to, and INFLAG is - /// an optional input flag argument. - COND_BRANCH, - - /// CHAIN = BDNZ CHAIN, DESTBB - These are used to create counter-based - /// loops. - BDNZ, - BDZ, - - /// F8RC = FADDRTZ F8RC, F8RC - This is an FADD done with rounding - /// towards zero. Used only as part of the long double-to-int - /// conversion sequence. - FADDRTZ, - - /// F8RC = MFFS - This moves the FPSCR (not modeled) into the register. - MFFS, - - /// TC_RETURN - A tail call return. - /// operand #0 chain - /// operand #1 callee (register or absolute) - /// operand #2 stack adjustment - /// operand #3 optional in flag - TC_RETURN, - - /// ch, gl = CR6[UN]SET ch, inglue - Toggle CR bit 6 for SVR4 vararg calls - CR6SET, - CR6UNSET, - - /// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by initial-exec TLS - /// for non-position independent code on PPC32. - PPC32_GOT, - - /// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by general dynamic and - /// local dynamic TLS and position indendepent code on PPC32. - PPC32_PICGOT, - - /// G8RC = ADDIS_GOT_TPREL_HA %x2, Symbol - Used by the initial-exec - /// TLS model, produces an ADDIS8 instruction that adds the GOT - /// base to sym\@got\@tprel\@ha. - ADDIS_GOT_TPREL_HA, - - /// G8RC = LD_GOT_TPREL_L Symbol, G8RReg - Used by the initial-exec - /// TLS model, produces a LD instruction with base register G8RReg - /// and offset sym\@got\@tprel\@l. This completes the addition that - /// finds the offset of "sym" relative to the thread pointer. 
- LD_GOT_TPREL_L, - - /// G8RC = ADD_TLS G8RReg, Symbol - Can be used by the initial-exec - /// and local-exec TLS models, produces an ADD instruction that adds - /// the contents of G8RReg to the thread pointer. Symbol contains a - /// relocation sym\@tls which is to be replaced by the thread pointer - /// and identifies to the linker that the instruction is part of a - /// TLS sequence. - ADD_TLS, - - /// G8RC = ADDIS_TLSGD_HA %x2, Symbol - For the general-dynamic TLS - /// model, produces an ADDIS8 instruction that adds the GOT base - /// register to sym\@got\@tlsgd\@ha. - ADDIS_TLSGD_HA, - - /// %x3 = ADDI_TLSGD_L G8RReg, Symbol - For the general-dynamic TLS - /// model, produces an ADDI8 instruction that adds G8RReg to - /// sym\@got\@tlsgd\@l and stores the result in X3. Hidden by - /// ADDIS_TLSGD_L_ADDR until after register assignment. - ADDI_TLSGD_L, - - /// %x3 = GET_TLS_ADDR %x3, Symbol - For the general-dynamic TLS - /// model, produces a call to __tls_get_addr(sym\@tlsgd). Hidden by - /// ADDIS_TLSGD_L_ADDR until after register assignment. - GET_TLS_ADDR, - - /// %x3 = GET_TPOINTER - Used for the local- and initial-exec TLS model on - /// 32-bit AIX, produces a call to .__get_tpointer to retrieve the thread - /// pointer. At the end of the call, the thread pointer is found in R3. - GET_TPOINTER, - - /// G8RC = ADDI_TLSGD_L_ADDR G8RReg, Symbol, Symbol - Op that - /// combines ADDI_TLSGD_L and GET_TLS_ADDR until expansion following - /// register assignment. - ADDI_TLSGD_L_ADDR, - - /// GPRC = TLSGD_AIX, TOC_ENTRY, TOC_ENTRY - /// G8RC = TLSGD_AIX, TOC_ENTRY, TOC_ENTRY - /// Op that combines two register copies of TOC entries - /// (region handle into R3 and variable offset into R4) followed by a - /// GET_TLS_ADDR node which will be expanded to a call to .__tls_get_addr. - /// This node is used in 64-bit mode as well (in which case the result is - /// G8RC and inputs are X3/X4). - TLSGD_AIX, - - /// %x3 = GET_TLS_MOD_AIX _$TLSML - For the AIX local-dynamic TLS model, - /// produces a call to .__tls_get_mod(_$TLSML\@ml). - GET_TLS_MOD_AIX, - - /// [GP|G8]RC = TLSLD_AIX, TOC_ENTRY(module handle) - /// Op that requires a single input of the module handle TOC entry in R3, - /// and generates a GET_TLS_MOD_AIX node which will be expanded into a call - /// to .__tls_get_mod. This node is used in both 32-bit and 64-bit modes. - /// The only difference is the register class. - TLSLD_AIX, - - /// G8RC = ADDIS_TLSLD_HA %x2, Symbol - For the local-dynamic TLS - /// model, produces an ADDIS8 instruction that adds the GOT base - /// register to sym\@got\@tlsld\@ha. - ADDIS_TLSLD_HA, - - /// %x3 = ADDI_TLSLD_L G8RReg, Symbol - For the local-dynamic TLS - /// model, produces an ADDI8 instruction that adds G8RReg to - /// sym\@got\@tlsld\@l and stores the result in X3. Hidden by - /// ADDIS_TLSLD_L_ADDR until after register assignment. - ADDI_TLSLD_L, - - /// %x3 = GET_TLSLD_ADDR %x3, Symbol - For the local-dynamic TLS - /// model, produces a call to __tls_get_addr(sym\@tlsld). Hidden by - /// ADDIS_TLSLD_L_ADDR until after register assignment. - GET_TLSLD_ADDR, - - /// G8RC = ADDI_TLSLD_L_ADDR G8RReg, Symbol, Symbol - Op that - /// combines ADDI_TLSLD_L and GET_TLSLD_ADDR until expansion - /// following register assignment. - ADDI_TLSLD_L_ADDR, - - /// G8RC = ADDIS_DTPREL_HA %x3, Symbol - For the local-dynamic TLS - /// model, produces an ADDIS8 instruction that adds X3 to - /// sym\@dtprel\@ha. 
- ADDIS_DTPREL_HA, - - /// G8RC = ADDI_DTPREL_L G8RReg, Symbol - For the local-dynamic TLS - /// model, produces an ADDI8 instruction that adds G8RReg to - /// sym\@got\@dtprel\@l. - ADDI_DTPREL_L, - - /// G8RC = PADDI_DTPREL %x3, Symbol - For the pc-rel based local-dynamic TLS - /// model, produces a PADDI8 instruction that adds X3 to sym\@dtprel. - PADDI_DTPREL, - - /// VRRC = VADD_SPLAT Elt, EltSize - Temporary node to be expanded - /// during instruction selection to optimize a BUILD_VECTOR into - /// operations on splats. This is necessary to avoid losing these - /// optimizations due to constant folding. - VADD_SPLAT, - - /// VSRC, CHAIN = XXSWAPD CHAIN, VSRC - Occurs only for little - /// endian. Maps to an xxswapd instruction that corrects an lxvd2x - /// or stxvd2x instruction. The chain is necessary because the - /// sequence replaces a load and needs to provide the same number - /// of outputs. - XXSWAPD, - - /// An SDNode for swaps that are not associated with any loads/stores - /// and thereby have no chain. - SWAP_NO_CHAIN, - - /// FP_EXTEND_HALF(VECTOR, IDX) - Custom extend upper (IDX=0) half or - /// lower (IDX=1) half of v4f32 to v2f64. - FP_EXTEND_HALF, - - /// MAT_PCREL_ADDR = Materialize a PC Relative address. This can be done - /// either through an add like PADDI or through a PC Relative load like - /// PLD. - MAT_PCREL_ADDR, - - /// TLS_DYNAMIC_MAT_PCREL_ADDR = Materialize a PC Relative address for - /// TLS global address when using dynamic access models. This can be done - /// through an add like PADDI. - TLS_DYNAMIC_MAT_PCREL_ADDR, - - /// TLS_LOCAL_EXEC_MAT_ADDR = Materialize an address for TLS global address - /// when using local exec access models, and when prefixed instructions are - /// available. This is used with ADD_TLS to produce an add like PADDI. - TLS_LOCAL_EXEC_MAT_ADDR, - - /// ACC_BUILD = Build an accumulator register from 4 VSX registers. - ACC_BUILD, - - /// PAIR_BUILD = Build a vector pair register from 2 VSX registers. - PAIR_BUILD, - - /// EXTRACT_VSX_REG = Extract one of the underlying vsx registers of - /// an accumulator or pair register. This node is needed because - /// EXTRACT_SUBVECTOR expects the input and output vectors to have the same - /// element type. - EXTRACT_VSX_REG, - - /// XXMFACC = This corresponds to the xxmfacc instruction. - XXMFACC, - - // Constrained conversion from floating point to int - FIRST_STRICTFP_OPCODE, - STRICT_FCTIDZ = FIRST_STRICTFP_OPCODE, - STRICT_FCTIWZ, - STRICT_FCTIDUZ, - STRICT_FCTIWUZ, - - /// Constrained integer-to-floating-point conversion instructions. - STRICT_FCFID, - STRICT_FCFIDU, - STRICT_FCFIDS, - STRICT_FCFIDUS, - - /// Constrained floating point add in round-to-zero mode. - STRICT_FADDRTZ, - LAST_STRICTFP_OPCODE = STRICT_FADDRTZ, - - /// SETBC - The ISA 3.1 (P10) SETBC instruction. - SETBC, - - /// SETBCR - The ISA 3.1 (P10) SETBCR instruction. - SETBCR, - - /// VSRQ - The ISA 3.1 (P10) Vector Shift right quadword instruction - VSRQ, - - // NOTE: The nodes below may require PC-Rel specific patterns if the - // address could be PC-Relative. When adding new nodes below, consider - // whether or not the address can be PC-Relative and add the corresponding - // PC-relative patterns and tests. - - /// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a - /// byte-swapping store instruction. It byte-swaps the low "Type" bits of - /// the GPRC input, then stores it through Ptr. Type can be either i16 or - /// i32. 
- FIRST_MEMORY_OPCODE, - STBRX = FIRST_MEMORY_OPCODE, - - /// GPRC, CHAIN = LBRX CHAIN, Ptr, Type - This is a - /// byte-swapping load instruction. It loads "Type" bits, byte swaps it, - /// then puts it in the bottom bits of the GPRC. TYPE can be either i16 - /// or i32. - LBRX, - - /// STFIWX - The STFIWX instruction. The first operand is an input token - /// chain, then an f64 value to store, then an address to store it to. - STFIWX, - - /// GPRC, CHAIN = LFIWAX CHAIN, Ptr - This is a floating-point - /// load which sign-extends from a 32-bit integer value into the - /// destination 64-bit register. - LFIWAX, - - /// GPRC, CHAIN = LFIWZX CHAIN, Ptr - This is a floating-point - /// load which zero-extends from a 32-bit integer value into the - /// destination 64-bit register. - LFIWZX, - - /// GPRC, CHAIN = LXSIZX, CHAIN, Ptr, ByteWidth - This is a load of an - /// integer smaller than 64 bits into a VSR. The integer is zero-extended. - /// This can be used for converting loaded integers to floating point. - LXSIZX, - - /// STXSIX - The STXSI[bh]X instruction. The first operand is an input - /// chain, then an f64 value to store, then an address to store it to, - /// followed by a byte-width for the store. - STXSIX, - - /// VSRC, CHAIN = LXVD2X_LE CHAIN, Ptr - Occurs only for little endian. - /// Maps directly to an lxvd2x instruction that will be followed by - /// an xxswapd. - LXVD2X, - - /// LXVRZX - Load VSX Vector Rightmost and Zero Extend - /// This node represents v1i128 BUILD_VECTOR of a zero extending load - /// instruction from to i128. - /// Allows utilization of the Load VSX Vector Rightmost Instructions. - LXVRZX, - - /// VSRC, CHAIN = LOAD_VEC_BE CHAIN, Ptr - Occurs only for little endian. - /// Maps directly to one of lxvd2x/lxvw4x/lxvh8x/lxvb16x depending on - /// the vector type to load vector in big-endian element order. - LOAD_VEC_BE, - - /// VSRC, CHAIN = LD_VSX_LH CHAIN, Ptr - This is a floating-point load of a - /// v2f32 value into the lower half of a VSR register. - LD_VSX_LH, - - /// VSRC, CHAIN = LD_SPLAT, CHAIN, Ptr - a splatting load memory - /// instructions such as LXVDSX, LXVWSX. - LD_SPLAT, - - /// VSRC, CHAIN = ZEXT_LD_SPLAT, CHAIN, Ptr - a splatting load memory - /// that zero-extends. - ZEXT_LD_SPLAT, - - /// VSRC, CHAIN = SEXT_LD_SPLAT, CHAIN, Ptr - a splatting load memory - /// that sign-extends. - SEXT_LD_SPLAT, - - /// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian. - /// Maps directly to an stxvd2x instruction that will be preceded by - /// an xxswapd. - STXVD2X, - - /// CHAIN = STORE_VEC_BE CHAIN, VSRC, Ptr - Occurs only for little endian. - /// Maps directly to one of stxvd2x/stxvw4x/stxvh8x/stxvb16x depending on - /// the vector type to store vector in big-endian element order. - STORE_VEC_BE, - - /// Store scalar integers from VSR. - ST_VSR_SCAL_INT, - - /// ATOMIC_CMP_SWAP - the exact same as the target-independent nodes - /// except they ensure that the compare input is zero-extended for - /// sub-word versions because the atomic loads zero-extend. - ATOMIC_CMP_SWAP_8, - ATOMIC_CMP_SWAP_16, - - /// CHAIN,Glue = STORE_COND CHAIN, GPR, Ptr - /// The store conditional instruction ST[BHWD]ARX that produces a glue - /// result to attach it to a conditional branch. - STORE_COND, - - /// GPRC = TOC_ENTRY GA, TOC - /// Loads the entry for GA from the TOC, where the TOC base is given by - /// the last operand. 
- TOC_ENTRY, - LAST_MEMORY_OPCODE = TOC_ENTRY, - }; - - } // end namespace PPCISD - /// Define some predicates that are used for node matching. namespace PPC { @@ -752,10 +178,6 @@ namespace llvm { explicit PPCTargetLowering(const PPCTargetMachine &TM, const PPCSubtarget &STI); - /// getTargetNodeName() - This method returns the name of a target specific - /// DAG node. - const char *getTargetNodeName(unsigned Opcode) const override; - bool isSelectSupported(SelectSupportKind Kind) const override { // PowerPC does not support scalar condition selects on vectors. return (Kind != SelectSupportKind::ScalarCondVectorVal); diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index f3998113ddd52..3ecc58c04e378 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -149,28 +149,49 @@ def SDT_PPCBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, [ def PPCfre : SDNode<"PPCISD::FRE", SDTFPUnaryOp, []>; def PPCfrsqrte: SDNode<"PPCISD::FRSQRTE", SDTFPUnaryOp, []>; + +// Square root instruction. def PPCfsqrt : SDNode<"PPCISD::FSQRT", SDTFPUnaryOp, []>; + +// Test instruction for software square root. def PPCftsqrt : SDNode<"PPCISD::FTSQRT", SDT_PPCFtsqrt,[]>; +// FCFID - The FCFID instruction, taking an f64 operand and producing +// an f64 value containing the FP representation of the integer that +// was temporarily in the f64 operand. def PPCfcfid : SDNode<"PPCISD::FCFID", SDTFPUnaryOp, []>; + +// Newer FCFID[US] integer-to-floating-point conversion instructions for +// unsigned integers and single-precision outputs. def PPCfcfidu : SDNode<"PPCISD::FCFIDU", SDTFPUnaryOp, []>; def PPCfcfids : SDNode<"PPCISD::FCFIDS", SDTFPRoundOp, []>; def PPCfcfidus: SDNode<"PPCISD::FCFIDUS", SDTFPRoundOp, []>; + +// FCTI[D,W]Z - The FCTIDZ and FCTIWZ instructions, taking an f32 or f64 +// operand, producing an f64 value containing the integer representation +// of that FP value. def PPCfctidz : SDNode<"PPCISD::FCTIDZ", SDTFPUnaryOp, []>; def PPCfctiwz : SDNode<"PPCISD::FCTIWZ", SDTFPUnaryOp, []>; + +// Newer FCTI[D,W]UZ floating-point-to-integer conversion instructions for +// unsigned integers with round toward zero. def PPCfctiduz: SDNode<"PPCISD::FCTIDUZ",SDTFPUnaryOp, []>; def PPCfctiwuz: SDNode<"PPCISD::FCTIWUZ",SDTFPUnaryOp, []>; +// VSRQ - The ISA 3.1 (P10) Vector Shift right quadword instruction def PPCvsrq: SDNode<"PPCISD::VSRQ", SDT_PPCVecShiftQuad, []>; -def PPCstrict_fcfid : SDNode<"PPCISD::STRICT_FCFID", - SDTFPUnaryOp, [SDNPHasChain]>; -def PPCstrict_fcfidu : SDNode<"PPCISD::STRICT_FCFIDU", - SDTFPUnaryOp, [SDNPHasChain]>; -def PPCstrict_fcfids : SDNode<"PPCISD::STRICT_FCFIDS", - SDTFPRoundOp, [SDNPHasChain]>; -def PPCstrict_fcfidus : SDNode<"PPCISD::STRICT_FCFIDUS", - SDTFPRoundOp, [SDNPHasChain]>; +// Constrained integer-to-floating-point conversion instructions. +let IsStrictFP = true in { + def PPCstrict_fcfid : SDNode<"PPCISD::STRICT_FCFID", + SDTFPUnaryOp, [SDNPHasChain]>; + def PPCstrict_fcfidu : SDNode<"PPCISD::STRICT_FCFIDU", + SDTFPUnaryOp, [SDNPHasChain]>; + def PPCstrict_fcfids : SDNode<"PPCISD::STRICT_FCFIDS", + SDTFPRoundOp, [SDNPHasChain]>; + def PPCstrict_fcfidus : SDNode<"PPCISD::STRICT_FCFIDUS", + SDTFPRoundOp, [SDNPHasChain]>; +} def PPCany_fcfid : PatFrags<(ops node:$op), [(PPCfcfid node:$op), @@ -185,28 +206,56 @@ def PPCany_fcfidus : PatFrags<(ops node:$op), [(PPCfcfidus node:$op), (PPCstrict_fcfidus node:$op)]>; +// Store scalar integers from VSR.
def PPCstore_scal_int_from_vsr: SDNode<"PPCISD::ST_VSR_SCAL_INT", SDT_PPCstore_scal_int_from_vsr, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +// STFIWX - The STFIWX instruction. The first operand is an input token +// chain, then an f64 value to store, then an address to store it to. def PPCstfiwx : SDNode<"PPCISD::STFIWX", SDT_PPCstfiwx, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +// GPRC, CHAIN = LFIWAX CHAIN, Ptr - This is a floating-point +// load which sign-extends from a 32-bit integer value into the +// destination 64-bit register. def PPClfiwax : SDNode<"PPCISD::LFIWAX", SDT_PPClfiwx, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// GPRC, CHAIN = LFIWZX CHAIN, Ptr - This is a floating-point +// load which zero-extends from a 32-bit integer value into the +// destination 64-bit register. def PPClfiwzx : SDNode<"PPCISD::LFIWZX", SDT_PPClfiwx, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// GPRC, CHAIN = LXSIZX, CHAIN, Ptr, ByteWidth - This is a load of an +// integer smaller than 64 bits into a VSR. The integer is zero-extended. +// This can be used for converting loaded integers to floating point. def PPClxsizx : SDNode<"PPCISD::LXSIZX", SDT_PPCLxsizx, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// STXSIX - The STXSI[bh]X instruction. The first operand is an input +// chain, then an f64 value to store, then an address to store it to, +// followed by a byte-width for the store. def PPCstxsix : SDNode<"PPCISD::STXSIX", SDT_PPCstxsix, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +// VEXTS, ByteWidth - takes an input in VSFRC and produces an output in +// VSFRC that is sign-extended from ByteWidth to a 64-bit integer. def PPCVexts : SDNode<"PPCISD::VEXTS", SDT_PPCVexts, []>; -// Extract FPSCR (not modeled at the DAG level). +// F8RC = MFFS - This moves the FPSCR (not modeled) into the register. def PPCmffs : SDNode<"PPCISD::MFFS", SDTypeProfile<1, 0, [SDTCisVT<0, f64>]>, [SDNPHasChain]>; -// Perform FADD in round-to-zero mode. +// F8RC = FADDRTZ F8RC, F8RC - This is an FADD done with rounding +// towards zero. Used only as part of the long double-to-int +// conversion sequence. def PPCfaddrtz: SDNode<"PPCISD::FADDRTZ", SDTFPBinOp, []>; + +// Constrained floating point add in round-to-zero mode. +let IsStrictFP = true in def PPCstrict_faddrtz: SDNode<"PPCISD::STRICT_FADDRTZ", SDTFPBinOp, [SDNPHasChain]>; @@ -214,72 +263,194 @@ def PPCany_faddrtz: PatFrags<(ops node:$lhs, node:$rhs), [(PPCfaddrtz node:$lhs, node:$rhs), (PPCstrict_faddrtz node:$lhs, node:$rhs)]>; +// FSEL - Traditional three-operand fsel node. def PPCfsel : SDNode<"PPCISD::FSEL", // Type constraint for fsel. SDTypeProfile<1, 3, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisFP<0>, SDTCisVT<1, f64>]>, []>; + +// XSMAXC[DQ]P, XSMINC[DQ]P - C-type min/max instructions. def PPCxsmaxc : SDNode<"PPCISD::XSMAXC", SDT_PPCFPMinMax, []>; def PPCxsminc : SDNode<"PPCISD::XSMINC", SDT_PPCFPMinMax, []>; + +// Hi/Lo - These represent the high and low 16-bit parts of a global +// address respectively. These nodes have two operands, the first of +// which must be a TargetGlobalAddress, and the second of which must be a +// Constant. Selected naively, these turn into 'lis G+C' and 'li G+C', +// though these are usually folded into other nodes. def PPChi : SDNode<"PPCISD::Hi", SDTIntBinOp, []>; def PPClo : SDNode<"PPCISD::Lo", SDTIntBinOp, []>; + +// GPRC = TOC_ENTRY GA, TOC +// Loads the entry for GA from the TOC, where the TOC base is given by +// the last operand.
def PPCtoc_entry: SDNode<"PPCISD::TOC_ENTRY", SDTIntBinOp, [SDNPMayLoad, SDNPMemOperand]>; +// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by initial-exec TLS +// for non-position independent code on PPC32. def PPCppc32GOT : SDNode<"PPCISD::PPC32_GOT", SDTIntLeaf, []>; +// G8RC = ADDIS_GOT_TPREL_HA %x2, Symbol - Used by the initial-exec +// TLS model, produces an ADDIS8 instruction that adds the GOT +// base to sym\@got\@tprel\@ha. def PPCaddisGotTprelHA : SDNode<"PPCISD::ADDIS_GOT_TPREL_HA", SDTIntBinOp>; + +// G8RC = LD_GOT_TPREL_L Symbol, G8RReg - Used by the initial-exec +// TLS model, produces a LD instruction with base register G8RReg +// and offset sym\@got\@tprel\@l. This completes the addition that +// finds the offset of "sym" relative to the thread pointer. def PPCldGotTprelL : SDNode<"PPCISD::LD_GOT_TPREL_L", SDTIntBinOp, [SDNPMayLoad]>; + +// G8RC = ADD_TLS G8RReg, Symbol - Can be used by the initial-exec +// and local-exec TLS models, produces an ADD instruction that adds +// the contents of G8RReg to the thread pointer. Symbol contains a +// relocation sym\@tls which is to be replaced by the thread pointer +// and identifies to the linker that the instruction is part of a +// TLS sequence. def PPCaddTls : SDNode<"PPCISD::ADD_TLS", SDTIntBinOp, []>; + +// G8RC = ADDIS_TLSGD_HA %x2, Symbol - For the general-dynamic TLS +// model, produces an ADDIS8 instruction that adds the GOT base +// register to sym\@got\@tlsgd\@ha. def PPCaddisTlsgdHA : SDNode<"PPCISD::ADDIS_TLSGD_HA", SDTIntBinOp>; + +// %x3 = ADDI_TLSGD_L G8RReg, Symbol - For the general-dynamic TLS +// model, produces an ADDI8 instruction that adds G8RReg to +// sym\@got\@tlsgd\@l and stores the result in X3. Hidden by +// ADDIS_TLSGD_L_ADDR until after register assignment. def PPCaddiTlsgdL : SDNode<"PPCISD::ADDI_TLSGD_L", SDTIntBinOp>; + +// %x3 = GET_TLS_ADDR %x3, Symbol - For the general-dynamic TLS +// model, produces a call to __tls_get_addr(sym\@tlsgd). Hidden by +// ADDIS_TLSGD_L_ADDR until after register assignment. def PPCgetTlsAddr : SDNode<"PPCISD::GET_TLS_ADDR", SDTIntBinOp>; + +// %x3 = GET_TLS_MOD_AIX _$TLSML - For the AIX local-dynamic TLS model, +// produces a call to .__tls_get_mod(_$TLSML\@ml). def PPCgetTlsMod : SDNode<"PPCISD::GET_TLS_MOD_AIX", SDTIntUnaryOp>; + +// %x3 = GET_TPOINTER - Used for the local- and initial-exec TLS model on +// 32-bit AIX, produces a call to .__get_tpointer to retrieve the thread +// pointer. At the end of the call, the thread pointer is found in R3. def PPCgetTpointer : SDNode<"PPCISD::GET_TPOINTER", SDTIntLeaf, []>; + +// G8RC = ADDI_TLSGD_L_ADDR G8RReg, Symbol, Symbol - Op that +// combines ADDI_TLSGD_L and GET_TLS_ADDR until expansion following +// register assignment. def PPCaddiTlsgdLAddr : SDNode<"PPCISD::ADDI_TLSGD_L_ADDR", SDTypeProfile<1, 3, [ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<0> ]>>; + +// GPRC = TLSGD_AIX, TOC_ENTRY, TOC_ENTRY +// G8RC = TLSGD_AIX, TOC_ENTRY, TOC_ENTRY +// Op that combines two register copies of TOC entries +// (region handle into R3 and variable offset into R4) followed by a +// GET_TLS_ADDR node which will be expanded to a call to .__tls_get_addr. +// This node is used in 64-bit mode as well (in which case the result is +// G8RC and inputs are X3/X4). 
def PPCTlsgdAIX : SDNode<"PPCISD::TLSGD_AIX", SDTIntBinOp>; + +// [GP|G8]RC = TLSLD_AIX, TOC_ENTRY(module handle) +// Op that requires a single input of the module handle TOC entry in R3, +// and generates a GET_TLS_MOD_AIX node which will be expanded into a call +// to .__tls_get_mod. This node is used in both 32-bit and 64-bit modes. +// The only difference is the register class. def PPCTlsldAIX : SDNode<"PPCISD::TLSLD_AIX", SDTIntUnaryOp>; + +// G8RC = ADDIS_TLSLD_HA %x2, Symbol - For the local-dynamic TLS +// model, produces an ADDIS8 instruction that adds the GOT base +// register to sym\@got\@tlsld\@ha. def PPCaddisTlsldHA : SDNode<"PPCISD::ADDIS_TLSLD_HA", SDTIntBinOp>; + +// %x3 = ADDI_TLSLD_L G8RReg, Symbol - For the local-dynamic TLS +// model, produces an ADDI8 instruction that adds G8RReg to +// sym\@got\@tlsld\@l and stores the result in X3. Hidden by +// ADDIS_TLSLD_L_ADDR until after register assignment. def PPCaddiTlsldL : SDNode<"PPCISD::ADDI_TLSLD_L", SDTIntBinOp>; + +// %x3 = GET_TLSLD_ADDR %x3, Symbol - For the local-dynamic TLS +// model, produces a call to __tls_get_addr(sym\@tlsld). Hidden by +// ADDIS_TLSLD_L_ADDR until after register assignment. def PPCgetTlsldAddr : SDNode<"PPCISD::GET_TLSLD_ADDR", SDTIntBinOp>; + +// G8RC = ADDI_TLSLD_L_ADDR G8RReg, Symbol, Symbol - Op that +// combines ADDI_TLSLD_L and GET_TLSLD_ADDR until expansion +// following register assignment. def PPCaddiTlsldLAddr : SDNode<"PPCISD::ADDI_TLSLD_L_ADDR", SDTypeProfile<1, 3, [ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<0> ]>>; + +// G8RC = ADDIS_DTPREL_HA %x3, Symbol - For the local-dynamic TLS +// model, produces an ADDIS8 instruction that adds X3 to +// sym\@dtprel\@ha. def PPCaddisDtprelHA : SDNode<"PPCISD::ADDIS_DTPREL_HA", SDTIntBinOp>; + +// G8RC = ADDI_DTPREL_L G8RReg, Symbol - For the local-dynamic TLS +// model, produces an ADDI8 instruction that adds G8RReg to +// sym\@got\@dtprel\@l. def PPCaddiDtprelL : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>; + +// G8RC = PADDI_DTPREL %x3, Symbol - For the pc-rel based local-dynamic TLS +// model, produces a PADDI8 instruction that adds X3 to sym\@dtprel. def PPCpaddiDtprel : SDNode<"PPCISD::PADDI_DTPREL", SDTIntBinOp>; +// VPERM - The PPC VPERM Instruction. def PPCvperm : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>; + +// XXSPLT - The PPC VSX splat instructions def PPCxxsplt : SDNode<"PPCISD::XXSPLT", SDT_PPCVecSplat, []>; + +// XXSPLTI_SP_TO_DP - The PPC VSX splat instructions for immediates for +// converting immediate single precision numbers to double precision +// vector or scalar. def PPCxxspltidp : SDNode<"PPCISD::XXSPLTI_SP_TO_DP", SDT_PPCSpToDp, []>; + +// VECINSERT - The PPC vector insert instruction def PPCvecinsert : SDNode<"PPCISD::VECINSERT", SDT_PPCVecInsert, []>; + +// XXPERMDI - The PPC XXPERMDI instruction def PPCxxpermdi : SDNode<"PPCISD::XXPERMDI", SDT_PPCxxpermdi, []>; + +// VECSHL - The PPC vector shift left instruction def PPCvecshl : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>; +// The CMPB instruction (takes two operands of i32 or i64). def PPCcmpb : SDNode<"PPCISD::CMPB", SDTIntBinOp, []>; // These nodes represent the 32-bit PPC shifts that operate on 6-bit shift // amounts. These nodes are generated by the multi-precision shift code. +// +// For scalar types, only the last `n + 1` bits of the shift amounts +// are used, where n is log2(sizeof(element) * 8). See sld/slw, etc. +// for exact behaviors. +// +// For vector types, only the last n bits are used. See vsld. 
def PPCsrl : SDNode<"PPCISD::SRL" , SDTIntShiftOp>; def PPCsra : SDNode<"PPCISD::SRA" , SDTIntShiftOp>; def PPCshl : SDNode<"PPCISD::SHL" , SDTIntShiftOp>; +// FNMSUB - Negated multiply-subtract instruction. def PPCfnmsub : SDNode<"PPCISD::FNMSUB" , SDTFPTernaryOp>; +// EXTSWSLI = The PPC extswsli instruction, which does an extend-sign +// word and shift left immediate. def PPCextswsli : SDNode<"PPCISD::EXTSWSLI" , SDT_PPCextswsli>; -def PPCstrict_fctidz : SDNode<"PPCISD::STRICT_FCTIDZ", - SDTFPUnaryOp, [SDNPHasChain]>; -def PPCstrict_fctiwz : SDNode<"PPCISD::STRICT_FCTIWZ", - SDTFPUnaryOp, [SDNPHasChain]>; -def PPCstrict_fctiduz : SDNode<"PPCISD::STRICT_FCTIDUZ", - SDTFPUnaryOp, [SDNPHasChain]>; -def PPCstrict_fctiwuz : SDNode<"PPCISD::STRICT_FCTIWUZ", +// Constrained conversion from floating point to int +let IsStrictFP = true in { + def PPCstrict_fctidz : SDNode<"PPCISD::STRICT_FCTIDZ", SDTFPUnaryOp, [SDNPHasChain]>; + def PPCstrict_fctiwz : SDNode<"PPCISD::STRICT_FCTIWZ", + SDTFPUnaryOp, [SDNPHasChain]>; + def PPCstrict_fctiduz : SDNode<"PPCISD::STRICT_FCTIDUZ", + SDTFPUnaryOp, [SDNPHasChain]>; + def PPCstrict_fctiwuz : SDNode<"PPCISD::STRICT_FCTIWUZ", + SDTFPUnaryOp, [SDNPHasChain]>; +} def PPCany_fctidz : PatFrags<(ops node:$op), [(PPCstrict_fctidz node:$op), @@ -294,19 +465,24 @@ def PPCany_fctiwuz : PatFrags<(ops node:$op), [(PPCstrict_fctiwuz node:$op), (PPCfctiwuz node:$op)]>; -// Move 2 i64 values into a VSX register +// Direct move of 2 consecutive GPR to a VSX register. def PPCbuild_fp128: SDNode<"PPCISD::BUILD_FP128", SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisSameSizeAs<1,2>, SDTCisSameAs<1,2>]>, []>; +// BUILD_SPE64 and EXTRACT_SPE are analogous to BUILD_PAIR and +// EXTRACT_ELEMENT but take f64 arguments instead of i64, as i64 is +// unsupported for this target. +// Merge 2 GPRs to a single SPE register. def PPCbuild_spe64: SDNode<"PPCISD::BUILD_SPE64", SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1,i32>, SDTCisVT<1,i32>]>, []>; +// Extract SPE register component, second argument is high or low. def PPCextract_spe : SDNode<"PPCISD::EXTRACT_SPE", SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, f64>, @@ -320,6 +496,11 @@ def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_PPCCallSeqEnd, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; def SDT_PPCCall : SDTypeProfile<0, -1, [SDTCisInt<0>]>; + +// CALL - A direct function call. +// CALL_NOP is a call with the special NOP which follows 64-bit +// CALL_NOTOC the caller does not use the TOC. +// SVR4 calls and 32-bit/64-bit AIX calls. def PPCcall : SDNode<"PPCISD::CALL", SDT_PPCCall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; @@ -329,17 +510,28 @@ def PPCcall_nop : SDNode<"PPCISD::CALL_NOP", SDT_PPCCall, def PPCcall_notoc : SDNode<"PPCISD::CALL_NOTOC", SDT_PPCCall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; + +// CHAIN,FLAG = MTCTR(VAL, CHAIN[, INFLAG]) - Directly corresponds to a +// MTCTR instruction. def PPCmtctr : SDNode<"PPCISD::MTCTR", SDT_PPCCall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +// CHAIN,FLAG = BCTRL(CHAIN, INFLAG) - Directly corresponds to a +// BCTRL instruction. def PPCbctrl : SDNode<"PPCISD::BCTRL", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; + +// CHAIN,FLAG = BCTRL(CHAIN, ADDR, INFLAG) - The combination of a bctrl +// instruction and the TOC reload required on 64-bit ELF, 32-bit AIX +// and 64-bit AIX. 
def PPCbctrl_load_toc : SDNode<"PPCISD::BCTRL_LOAD_TOC", SDTypeProfile<0, 1, []>, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; -// Call nodes for strictfp calls (that define RM). +// The variants that implicitly define rounding mode for calls with +// strictfp semantics. def PPCcall_rm : SDNode<"PPCISD::CALL_RM", SDT_PPCCall, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; @@ -357,42 +549,81 @@ def PPCbctrl_load_toc_rm : SDNode<"PPCISD::BCTRL_LOAD_TOC_RM", [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; +// Return with a glue operand, matched by 'blr' def PPCretglue : SDNode<"PPCISD::RET_GLUE", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +// TC_RETURN - A tail call return. +// operand #0 chain +// operand #1 callee (register or absolute) +// operand #2 stack adjustment +// operand #3 optional in flag def PPCtc_return : SDNode<"PPCISD::TC_RETURN", SDT_PPCTC_ret, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; +// EH_SJLJ_SETJMP - SjLj exception handling setjmp. def PPCeh_sjlj_setjmp : SDNode<"PPCISD::EH_SJLJ_SETJMP", SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisPtrTy<1>]>, [SDNPHasChain, SDNPSideEffect]>; + +// EH_SJLJ_LONGJMP - SjLj exception handling longjmp. def PPCeh_sjlj_longjmp : SDNode<"PPCISD::EH_SJLJ_LONGJMP", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, [SDNPHasChain, SDNPSideEffect]>; +// RESVEC = VCMP(LHS, RHS, OPC) - Represents one of the altivec VCMP* +// instructions. For lack of better number, we use the opcode number +// encoding for the OPC field to identify the compare. For example, 838 +// is VCMPGTSH. def PPCvcmp : SDNode<"PPCISD::VCMP" , SDT_PPCvcmp, []>; + +// RESVEC, OUTFLAG = VCMP_rec(LHS, RHS, OPC) - Represents one of the +// altivec VCMP*_rec instructions. For lack of better number, we use the +// opcode number encoding for the OPC field to identify the compare. For +// example, 838 is VCMPGTSH. def PPCvcmp_rec : SDNode<"PPCISD::VCMP_rec", SDT_PPCvcmp, [SDNPOutGlue]>; +// CHAIN = COND_BRANCH CHAIN, CRRC, OPC, DESTBB [, INFLAG] - This +// corresponds to the COND_BRANCH pseudo instruction. CRRC is the +// condition register to branch on, OPC is the branch opcode to use (e.g. +// PPC::BLE), DESTBB is the destination block to branch to, and INFLAG is +// an optional input flag argument. def PPCcondbranch : SDNode<"PPCISD::COND_BRANCH", SDT_PPCcondbr, [SDNPHasChain, SDNPOptInGlue]>; -// PPC-specific atomic operations. +// ATOMIC_CMP_SWAP - the exact same as the target-independent nodes +// except they ensure that the compare input is zero-extended for +// sub-word versions because the atomic loads zero-extend. def PPCatomicCmpSwap_8 : SDNode<"PPCISD::ATOMIC_CMP_SWAP_8", SDTAtomic3, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; def PPCatomicCmpSwap_16 : SDNode<"PPCISD::ATOMIC_CMP_SWAP_16", SDTAtomic3, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; + +// GPRC, CHAIN = LBRX CHAIN, Ptr, Type - This is a +// byte-swapping load instruction. It loads "Type" bits, byte swaps it, +// then puts it in the bottom bits of the GPRC. TYPE can be either i16 +// or i32. def PPClbrx : SDNode<"PPCISD::LBRX", SDT_PPClbrx, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a +// byte-swapping store instruction. It byte-swaps the low "Type" bits of +// the GPRC input, then stores it through Ptr. Type can be either i16 or +// i32. 
def PPCstbrx : SDNode<"PPCISD::STBRX", SDT_PPCstbrx, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +// CHAIN,Glue = STORE_COND CHAIN, GPR, Ptr +// The store conditional instruction ST[BHWD]ARX that produces a glue +// result to attach it to a conditional branch. def PPCStoreCond : SDNode<"PPCISD::STORE_COND", SDT_StoreCond, [SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPOutGlue]>; -// Instructions to set/unset CR bit 6 for SVR4 vararg calls +// ch, gl = CR6[UN]SET ch, inglue - Toggle CR bit 6 for SVR4 vararg calls def PPCcr6set : SDNode<"PPCISD::CR6SET", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; def PPCcr6unset : SDNode<"PPCISD::CR6UNSET", SDTNone, @@ -401,17 +632,44 @@ def PPCcr6unset : SDNode<"PPCISD::CR6UNSET", SDTNone, // Instructions to support dynamic alloca. def SDTDynOp : SDTypeProfile<1, 2, []>; def SDTDynAreaOp : SDTypeProfile<1, 1, []>; + +// The following two target-specific nodes are used for calls through +// function pointers in the 64-bit SVR4 ABI. + +// OPRC, CHAIN = DYNALLOC(CHAIN, NEGSIZE, FRAME_INDEX) +// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to +// compute an allocation on the stack. def PPCdynalloc : SDNode<"PPCISD::DYNALLOC", SDTDynOp, [SDNPHasChain]>; + +// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to +// compute an offset from native SP to the address of the most recent +// dynamic alloca. def PPCdynareaoffset : SDNode<"PPCISD::DYNAREAOFFSET", SDTDynAreaOp, [SDNPHasChain]>; + +// To avoid stack clash, allocation is performed by block and each block is +// probed. def PPCprobedalloca : SDNode<"PPCISD::PROBED_ALLOCA", SDTDynOp, [SDNPHasChain]>; // PC Relative Specific Nodes + +// MAT_PCREL_ADDR = Materialize a PC Relative address. This can be done +// either through an add like PADDI or through a PC Relative load like +// PLD. def PPCmatpcreladdr : SDNode<"PPCISD::MAT_PCREL_ADDR", SDTIntUnaryOp, []>; + +// TLS_DYNAMIC_MAT_PCREL_ADDR = Materialize a PC Relative address for +// TLS global address when using dynamic access models. This can be done +// through an add like PADDI. def PPCtlsdynamatpcreladdr : SDNode<"PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR", SDTIntUnaryOp, []>; + +// TLS_LOCAL_EXEC_MAT_ADDR = Materialize an address for TLS global address +// when using local exec access models, and when prefixed instructions are +// available. This is used with ADD_TLS to produce an add like PADDI. def PPCtlslocalexecmataddr : SDNode<"PPCISD::TLS_LOCAL_EXEC_MAT_ADDR", SDTIntUnaryOp, []>; +// These nodes represent PPC arithmetic operations with carry. def PPCaddc : SDNode<"PPCISD::ADDC", SDT_PPCBinaryArithWithFlagsOut, [SDNPCommutative]>; def PPCadde : SDNode<"PPCISD::ADDE", SDT_PPCBinaryArithWithFlagsInOut, @@ -2535,6 +2793,7 @@ defm FCPSGND : XForm_28r<63, 8, (outs f8rc:$RST), (ins f8rc:$RA, f8rc:$RB), // Reciprocal estimates. let mayRaiseFPException = 1 in { +// Reciprocal estimate instructions (unary FP ops). defm FRE : XForm_26r<63, 24, (outs f8rc:$RST), (ins f8rc:$RB), "fre", "$RST, $RB", IIC_FPGeneral, [(set f64:$RST, (PPCfre f64:$RB))]>; diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td index 2d8c633b9fef6..bd9a999237c09 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrP10.td +++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td @@ -83,15 +83,31 @@ def SDT_PPCsetbc : SDTypeProfile<1, 1, [ // ISA 3.1 specific PPCISD nodes. // +// XXSPLTI32DX - The PPC XXSPLTI32DX instruction. 
def PPCxxsplti32dx : SDNode<"PPCISD::XXSPLTI32DX", SDT_PPCSplat32, []>; + +// ACC_BUILD = Build an accumulator register from 4 VSX registers. def PPCAccBuild : SDNode<"PPCISD::ACC_BUILD", SDT_PPCAccBuild, []>; + +// PAIR_BUILD = Build a vector pair register from 2 VSX registers. def PPCPairBuild : SDNode<"PPCISD::PAIR_BUILD", SDT_PPCPairBuild, []>; + +// EXTRACT_VSX_REG = Extract one of the underlying vsx registers of +// an accumulator or pair register. This node is needed because +// EXTRACT_SUBVECTOR expects the input and output vectors to have the same +// element type. def PPCAccExtractVsx : SDNode<"PPCISD::EXTRACT_VSX_REG", SDT_PPCAccExtractVsx, []>; def PPCPairExtractVsx : SDNode<"PPCISD::EXTRACT_VSX_REG", SDT_PPCPairExtractVsx, []>; + +// XXMFACC = This corresponds to the xxmfacc instruction. def PPCxxmfacc : SDNode<"PPCISD::XXMFACC", SDT_PPCxxmfacc, []>; + +// SETBC - The ISA 3.1 (P10) SETBC instruction. def PPCsetbc : SDNode<"PPCISD::SETBC", SDT_PPCsetbc, []>; + +// SETBCR - The ISA 3.1 (P10) SETBCR instruction. def PPCsetbcr : SDNode<"PPCISD::SETBCR", SDT_PPCsetbc, []>; //===----------------------------------------------------------------------===// @@ -105,7 +121,10 @@ def SDT_PPCLXVRZX : SDTypeProfile<1, 2, [ SDTCisVT<0, v1i128>, SDTCisPtrTy<1>, SDTCisPtrTy<2> ]>; -// PPC Specific DAG Nodes. +// LXVRZX - Load VSX Vector Rightmost and Zero Extend +// This node represents v1i128 BUILD_VECTOR of a zero extending load +// instruction from <byte_type> to i128. +// Allows utilization of the Load VSX Vector Rightmost Instructions. def PPClxvrzx : SDNode<"PPCISD::LXVRZX", SDT_PPCLXVRZX, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index 885bed670e319..d72201df5b002 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -87,31 +87,91 @@ def SDT_PPCxxperm : SDTypeProfile<1, 3, [ SDTCisVT<0, v2f64>, SDTCisVT<1, v2f64>, SDTCisVT<2, v2f64>, SDTCisVT<3, v4i32>]>; //--------------------------- Custom PPC nodes -------------------------------// + +// VSRC, CHAIN = LXVD2X_LE CHAIN, Ptr - Occurs only for little endian. +// Maps directly to an lxvd2x instruction that will be followed by +// an xxswapd. def PPClxvd2x : SDNode<"PPCISD::LXVD2X", SDT_PPClxvd2x, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// CHAIN = STXVD2X CHAIN, VSRC, Ptr - Occurs only for little endian. +// Maps directly to an stxvd2x instruction that will be preceded by +// an xxswapd. def PPCstxvd2x : SDNode<"PPCISD::STXVD2X", SDT_PPCstxvd2x, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +// VSRC, CHAIN = LOAD_VEC_BE CHAIN, Ptr - Occurs only for little endian. +// Maps directly to one of lxvd2x/lxvw4x/lxvh8x/lxvb16x depending on +// the vector type to load vector in big-endian element order. def PPCld_vec_be : SDNode<"PPCISD::LOAD_VEC_BE", SDT_PPCld_vec_be, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// CHAIN = STORE_VEC_BE CHAIN, VSRC, Ptr - Occurs only for little endian. +// Maps directly to one of stxvd2x/stxvw4x/stxvh8x/stxvb16x depending on +// the vector type to store vector in big-endian element order. def PPCst_vec_be : SDNode<"PPCISD::STORE_VEC_BE", SDT_PPCst_vec_be, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +// VSRC, CHAIN = XXSWAPD CHAIN, VSRC - Occurs only for little +// endian. Maps to an xxswapd instruction that corrects an lxvd2x +// or stxvd2x instruction.
The chain is necessary because the +// sequence replaces a load and needs to provide the same number +// of outputs. def PPCxxswapd : SDNode<"PPCISD::XXSWAPD", SDT_PPCxxswapd, [SDNPHasChain]>; + +// Direct move from a VSX register to a GPR def PPCmfvsr : SDNode<"PPCISD::MFVSR", SDTUnaryOp, []>; + +// Direct move from a GPR to a VSX register (algebraic) def PPCmtvsra : SDNode<"PPCISD::MTVSRA", SDTUnaryOp, []>; + +// Direct move from a GPR to a VSX register (zero) def PPCmtvsrz : SDNode<"PPCISD::MTVSRZ", SDTUnaryOp, []>; + +// Extract a subvector from signed integer vector and convert to FP. +// It is primarily used to convert a (widened) illegal integer vector +// type to a legal floating point vector type. +// For example v2i32 -> widened to v4i32 -> v2f64 def PPCsvec2fp : SDNode<"PPCISD::SINT_VEC_TO_FP", SDTVecConv, []>; + +// Extract a subvector from unsigned integer vector and convert to FP. +// As with SINT_VEC_TO_FP, used for converting illegal types. def PPCuvec2fp: SDNode<"PPCISD::UINT_VEC_TO_FP", SDTVecConv, []>; + +// An SDNode for swaps that are not associated with any loads/stores +// and thereby have no chain. def PPCswapNoChain : SDNode<"PPCISD::SWAP_NO_CHAIN", SDT_PPCxxswapd>; +// FP_EXTEND_HALF(VECTOR, IDX) - Custom extend upper (IDX=0) half or +// lower (IDX=1) half of v4f32 to v2f64. def PPCfpexth : SDNode<"PPCISD::FP_EXTEND_HALF", SDT_PPCfpexth, []>; + +// VSRC, CHAIN = LD_VSX_LH CHAIN, Ptr - This is a floating-point load of a +// v2f32 value into the lower half of a VSR register. def PPCldvsxlh : SDNode<"PPCISD::LD_VSX_LH", SDT_PPCldvsxlh, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// VSRC, CHAIN = LD_SPLAT, CHAIN, Ptr - a splatting load memory +// instructions such as LXVDSX, LXVWSX. def PPCldsplat : SDNode<"PPCISD::LD_SPLAT", SDT_PPCldsplat, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// VSRC, CHAIN = ZEXT_LD_SPLAT, CHAIN, Ptr - a splatting load memory +// that zero-extends. def PPCzextldsplat : SDNode<"PPCISD::ZEXT_LD_SPLAT", SDT_PPCldsplat, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// VSRC, CHAIN = SEXT_LD_SPLAT, CHAIN, Ptr - a splatting load memory +// that sign-extends. def PPCsextldsplat : SDNode<"PPCISD::SEXT_LD_SPLAT", SDT_PPCldsplat, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +// PowerPC instructions that have SCALAR_TO_VECTOR semantics tend to +// place the value into the least significant element of the most +// significant doubleword in the vector. This is not element zero for +// anything smaller than a doubleword on either endianness. This node has +// the same semantics as SCALAR_TO_VECTOR except that the value remains in +// the aforementioned location in the vector register. 
def PPCSToV : SDNode<"PPCISD::SCALAR_TO_VECTOR_PERMUTED", SDTypeProfile<1, 1, []>, []>; diff --git a/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp b/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp index 93a4693c50168..80aa1122167df 100644 --- a/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp @@ -7,20 +7,72 @@ //===----------------------------------------------------------------------===// #include "PPCSelectionDAGInfo.h" -#include "PPCISelLowering.h" +#include "llvm/CodeGen/SelectionDAG.h" + +#define GET_SDNODE_DESC +#include "PPCGenSDNodeInfo.inc" using namespace llvm; +PPCSelectionDAGInfo::PPCSelectionDAGInfo() + : SelectionDAGGenTargetInfo(PPCGenSDNodeInfo) {} + PPCSelectionDAGInfo::~PPCSelectionDAGInfo() = default; -bool PPCSelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const { - return Opcode >= PPCISD::FIRST_MEMORY_OPCODE && - Opcode <= PPCISD::LAST_MEMORY_OPCODE; +const char *PPCSelectionDAGInfo::getTargetNodeName(unsigned Opcode) const { + switch (static_cast<PPCISD::NodeType>(Opcode)) { + case PPCISD::GlobalBaseReg: + return "PPCISD::GlobalBaseReg"; + case PPCISD::SRA_ADDZE: + return "PPCISD::SRA_ADDZE"; + case PPCISD::READ_TIME_BASE: + return "PPCISD::READ_TIME_BASE"; + case PPCISD::MFOCRF: + return "PPCISD::MFOCRF"; + case PPCISD::ANDI_rec_1_EQ_BIT: + return "PPCISD::ANDI_rec_1_EQ_BIT"; + case PPCISD::ANDI_rec_1_GT_BIT: + return "PPCISD::ANDI_rec_1_GT_BIT"; + case PPCISD::BDNZ: + return "PPCISD::BDNZ"; + case PPCISD::BDZ: + return "PPCISD::BDZ"; + case PPCISD::PPC32_PICGOT: + return "PPCISD::PPC32_PICGOT"; + case PPCISD::VADD_SPLAT: + return "PPCISD::VADD_SPLAT"; + } + + return SelectionDAGGenTargetInfo::getTargetNodeName(Opcode); } -bool PPCSelectionDAGInfo::isTargetStrictFPOpcode(unsigned Opcode) const { - return Opcode >= PPCISD::FIRST_STRICTFP_OPCODE && - Opcode <= PPCISD::LAST_STRICTFP_OPCODE; +void PPCSelectionDAGInfo::verifyTargetNode(const SelectionDAG &DAG, + const SDNode *N) const { + switch (N->getOpcode()) { + default: + break; + case PPCISD::DYNAREAOFFSET: + // invalid number of results; expected 2, got 1 + case PPCISD::TOC_ENTRY: + // invalid number of results; expected 1, got 2 + case PPCISD::STORE_COND: + // invalid number of results; expected 2, got 3 + case PPCISD::LD_SPLAT: + case PPCISD::SEXT_LD_SPLAT: + case PPCISD::ZEXT_LD_SPLAT: + // invalid number of operands; expected 2, got 3 + case PPCISD::ST_VSR_SCAL_INT: + // invalid number of operands; expected 4, got 5 + case PPCISD::XXPERM: + // operand #1 must have type v2f64, but has type v16i8 + case PPCISD::ACC_BUILD: + // operand #3 must have type v4i32, but has type v16i8 + case PPCISD::PAIR_BUILD: + // operand #1 must have type v4i32, but has type v16i8 + return; + } + + SelectionDAGGenTargetInfo::verifyTargetNode(DAG, N); } std::pair<SDValue, SDValue> PPCSelectionDAGInfo::EmitTargetCodeForMemcmp( diff --git a/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h b/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h index f962a7a5321aa..ffe8982ce1af4 100644 --- a/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h +++ b/llvm/lib/Target/PowerPC/PPCSelectionDAGInfo.h @@ -11,15 +11,66 @@ #include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#define GET_SDNODE_ENUM +#include "PPCGenSDNodeInfo.inc" + namespace llvm { +namespace PPCISD { + +enum NodeType : unsigned { + /// The result of the mflr at function entry, used for PIC code. + GlobalBaseReg = GENERATED_OPCODE_END, + + /// The combination of sra[wd]i and addze used to implement signed + /// integer division by a power of 2.
The first operand is the dividend, + /// and the second is the constant shift amount (representing the + /// divisor). + SRA_ADDZE, + + /// R32 = MFOCRF(CRREG, INFLAG) - Represents the MFOCRF instruction. + /// This copies the bits corresponding to the specified CRREG into the + /// resultant GPR. Bits corresponding to other CR regs are undefined. + MFOCRF, + + // FIXME: Remove these once the ANDI glue bug is fixed: + /// i1 = ANDI_rec_1_[EQ|GT]_BIT(i32 or i64 x) - Represents the result of the + /// eq or gt bit of CR0 after executing andi. x, 1. This is used to + /// implement truncation of i32 or i64 to i1. + ANDI_rec_1_EQ_BIT, + ANDI_rec_1_GT_BIT, + + // READ_TIME_BASE - A read of the 64-bit time-base register on a 32-bit + // target (returns (Lo, Hi)). It takes a chain operand. + READ_TIME_BASE, -class PPCSelectionDAGInfo : public SelectionDAGTargetInfo { + /// CHAIN = BDNZ CHAIN, DESTBB - These are used to create counter-based + /// loops. + BDNZ, + BDZ, + + /// GPRC = address of _GLOBAL_OFFSET_TABLE_. Used by general dynamic and + /// local dynamic TLS and position independent code on PPC32. + PPC32_PICGOT, + + /// VRRC = VADD_SPLAT Elt, EltSize - Temporary node to be expanded + /// during instruction selection to optimize a BUILD_VECTOR into + /// operations on splats. This is necessary to avoid losing these + /// optimizations due to constant folding. + VADD_SPLAT, +}; + +} // namespace PPCISD + +class PPCSelectionDAGInfo : public SelectionDAGGenTargetInfo { public: + PPCSelectionDAGInfo(); + ~PPCSelectionDAGInfo() override; - bool isTargetMemoryOpcode(unsigned Opcode) const override; + const char *getTargetNodeName(unsigned Opcode) const override; - bool isTargetStrictFPOpcode(unsigned Opcode) const override; + void verifyTargetNode(const SelectionDAG &DAG, + const SDNode *N) const override; std::pair<SDValue, SDValue> EmitTargetCodeForMemcmp(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, From 5cf5eb7714ea4d2a9a7775c1a054b9ea6556b78b Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Mon, 17 Nov 2025 15:00:36 -0800 Subject: [PATCH 67/67] [MemProf] Fixup edges for largest N cold contexts (#167599) We build the callsite graph by first adding nodes and edges for all allocation contexts, then matching the interior callsite nodes onto actual calls (IR or summary), which due to inlining may result in the generation of new nodes representing the inlined context sequence. We attempt to update the edges correctly during this process, but in the presence of recursion it is impossible to always get this right. Specifically, when creating new nodes for inlined sequences of stack ids on recursive cycles, we cannot always update the edges correctly, because the original ordering of the context has been lost. This PR introduces a mechanism, guarded by the -memprof-top-n-important= flag, to keep track of extra information for the largest N cold contexts. Another flag, -memprof-fixup-important (enabled by default), performs more expensive fixup of the edges for those largest N cold contexts, by saving and walking the original ordered list of stack ids from the context.
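To make the new bookkeeping concrete, here is a minimal standalone sketch of the top-N tracking described above (illustrative only, not the patch code: the actual implementation threads the map through addStackNodesForMIB and keys it by the context's total cold byte size; TopN and noteColdContext are hypothetical names standing in for the -memprof-top-n-important value and the inline logic):

  #include <cstdint>
  #include <map>

  static constexpr unsigned TopN = 10; // default of -memprof-top-n-important

  // std::map keeps its keys in ascending order, so begin() is always the
  // smallest total size recorded so far.
  static std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold;

  static void noteColdContext(uint64_t TotalCold, uint32_t ContextId) {
    if (TotalSizeToContextIdTopNCold.size() < TopN ||
        TotalCold > TotalSizeToContextIdTopNCold.begin()->first) {
      // Evict the smallest entry so the map never exceeds TopN entries.
      if (TotalSizeToContextIdTopNCold.size() == TopN)
        TotalSizeToContextIdTopNCold.erase(TotalSizeToContextIdTopNCold.begin());
      // A context of equal total size overwrites an existing entry, mirroring
      // the map insertion in the patch below.
      TotalSizeToContextIdTopNCold[TotalCold] = ContextId;
    }
  }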
--- .../IPO/MemProfContextDisambiguation.cpp | 262 ++++++++++++++++-- llvm/test/ThinLTO/X86/memprof-fixup.ll | 129 +++++++++ .../MemProfContextDisambiguation/fixup.ll | 105 +++++++ 3 files changed, 480 insertions(+), 16 deletions(-) create mode 100644 llvm/test/ThinLTO/X86/memprof-fixup.ll create mode 100644 llvm/test/Transforms/MemProfContextDisambiguation/fixup.ll diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index d35ae4730a9f3..0f4bc649df720 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -107,6 +107,10 @@ STATISTIC(MismatchedCloneAssignments, STATISTIC(TotalMergeInvokes, "Number of merge invocations for nodes"); STATISTIC(TotalMergeIters, "Number of merge iterations for nodes"); STATISTIC(MaxMergeIters, "Max merge iterations for nodes"); +STATISTIC(NumImportantContextIds, "Number of important context ids"); +STATISTIC(NumFixupEdgeIdsInserted, "Number of fixup edge ids inserted"); +STATISTIC(NumFixupEdgesAdded, "Number of fixup edges added"); +STATISTIC(NumFixedContexts, "Number of contexts with fixed edges"); static cl::opt<std::string> DotFilePathPrefix( "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, @@ -223,9 +227,18 @@ static cl::opt<bool> MemProfRequireDefinitionForPromotion( extern cl::opt<bool> MemProfReportHintedSizes; extern cl::opt<unsigned> MinClonedColdBytePercent; +cl::opt<unsigned> MemProfTopNImportant( + "memprof-top-n-important", cl::init(10), cl::Hidden, + cl::desc("Number of largest cold contexts to consider important")); + +cl::opt<bool> MemProfFixupImportant( + "memprof-fixup-important", cl::init(true), cl::Hidden, + cl::desc("Enables edge fixup for important contexts")); + } // namespace llvm namespace { + /// CRTP base for graphs built from either IR or ThinLTO summary index. /// /// The graph represents the call contexts in all memprof metadata on allocation @@ -581,17 +594,26 @@ class CallsiteContextGraph { /// Adds nodes for the given MIB stack ids. template <class NodeT, class IteratorT> - void addStackNodesForMIB(ContextNode *AllocNode, - CallStack<NodeT, IteratorT> &StackContext, - CallStack<NodeT, IteratorT> &CallsiteContext, - AllocationType AllocType, - ArrayRef<ContextTotalSize> ContextSizeInfo); + void addStackNodesForMIB( + ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext, + CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType, + ArrayRef<ContextTotalSize> ContextSizeInfo, + std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold); /// Matches all callsite metadata (or summary) to the nodes created for /// allocation memprof MIB metadata, synthesizing new nodes to reflect any /// inlining performed on those callsite instructions. void updateStackNodes(); + /// Optionally fix up edges for the N largest cold contexts to better enable + /// cloning. This is particularly helpful if the context includes recursion + /// as well as inlining, resulting in a single stack node for multiple stack + /// ids in the context. With recursion it is particularly difficult to get the + /// edge updates correct, as in the general case we have lost the original + /// stack id ordering for the context. Do more expensive fixup for the largest + /// contexts, controlled by MemProfTopNImportant and MemProfFixupImportant. + void fixupImportantContexts(); + /// Update graph to conservatively handle any callsite stack nodes that target /// multiple different callee target functions.
void handleCallsitesWithMultipleTargets(); @@ -658,7 +680,8 @@ class CallsiteContextGraph { void assignStackNodesPostOrder( ContextNode *Node, DenseSet<const ContextNode *> &Visited, DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls, - DenseMap<CallInfo, CallInfo> &CallToMatchingCall); + DenseMap<CallInfo, CallInfo> &CallToMatchingCall, + const DenseSet<uint32_t> &ImportantContextIds); /// Duplicates the given set of context ids, updating the provided /// map from each original id with the newly generated context ids, @@ -859,6 +882,50 @@ class CallsiteContextGraph { /// nodes. DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap; + /// Saves information for the contexts identified as important (the largest + /// cold contexts up to MemProfTopNImportant). + struct ImportantContextInfo { + // The original list of leaf-first stack ids corresponding to this context. + std::vector<uint64_t> StackIds; + // Max length of stack ids corresponding to a single stack ContextNode for + // this context (i.e. the max length of a key in StackIdsToNode below). + unsigned MaxLength = 0; + // Mapping of slices of the stack ids to the corresponding ContextNode + // (there can be multiple stack ids due to inlining). Populated when + // updating stack nodes while matching them to the IR or summary. + std::map<std::vector<uint64_t>, ContextNode *> StackIdsToNode; + }; + + // Map of important full context ids to information about each. + DenseMap<uint32_t, ImportantContextInfo> ImportantContextIdInfo; + + // For each important context id found in Node (if any), records the list of + // stack ids that corresponded to the given callsite Node. There can be more + // than one in the case of inlining. + void recordStackNode(std::vector<uint64_t> &StackIds, ContextNode *Node, + // We pass in the Node's context ids to avoid the + // overhead of computing them as the caller already has + // them in some cases. + const DenseSet<uint32_t> &NodeContextIds, + const DenseSet<uint32_t> &ImportantContextIds) { + if (!MemProfTopNImportant) { + assert(ImportantContextIds.empty()); + return; + } + DenseSet<uint32_t> Ids = + set_intersection(NodeContextIds, ImportantContextIds); + if (Ids.empty()) + return; + auto Size = StackIds.size(); + for (auto Id : Ids) { + auto &Entry = ImportantContextIdInfo[Id]; + Entry.StackIdsToNode[StackIds] = Node; + // Keep track of the max to simplify later analysis. + if (Size > Entry.MaxLength) + Entry.MaxLength = Size; + } + } + /// Maps to track the calls to their corresponding nodes in the graph. MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap; MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap; @@ -1353,7 +1420,8 @@ template <class NodeT, class IteratorT> void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB( ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext, CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType, - ArrayRef<ContextTotalSize> ContextSizeInfo) { + ArrayRef<ContextTotalSize> ContextSizeInfo, + std::map<uint64_t, uint32_t> &TotalSizeToContextIdTopNCold) { // Treating the hot alloc type as NotCold before the disambiguation for "hot" // is done. if (AllocType == AllocationType::Hot) @@ -1361,8 +1429,33 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB( ContextIdToAllocationType[++LastContextId] = AllocType; + bool IsImportant = false; if (!ContextSizeInfo.empty()) { auto &Entry = ContextIdToContextSizeInfos[LastContextId]; + // If this is a cold allocation, and we are collecting non-zero largest + // contexts, see if this is a candidate. + if (AllocType == AllocationType::Cold && MemProfTopNImportant > 0) { + uint64_t TotalCold = 0; + for (auto &CSI : ContextSizeInfo) + TotalCold += CSI.TotalSize; + // Record this context if either we haven't found the first top-n largest + // yet, or if it is larger than the smallest already recorded.
+ if (TotalSizeToContextIdTopNCold.size() < MemProfTopNImportant || + // Since TotalSizeToContextIdTopNCold is a std::map, it is implicitly + // sorted in ascending order of its key, which is the size. + TotalCold > TotalSizeToContextIdTopNCold.begin()->first) { + if (TotalSizeToContextIdTopNCold.size() == MemProfTopNImportant) { + // Remove the old one and its associated entries. + auto IdToRemove = TotalSizeToContextIdTopNCold.begin()->second; + TotalSizeToContextIdTopNCold.erase( + TotalSizeToContextIdTopNCold.begin()); + assert(ImportantContextIdInfo.count(IdToRemove)); + ImportantContextIdInfo.erase(IdToRemove); + } + TotalSizeToContextIdTopNCold[TotalCold] = LastContextId; + IsImportant = true; + } + } Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end()); } @@ -1381,6 +1474,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB( for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext); ContextIter != StackContext.end(); ++ContextIter) { auto StackId = getStackId(*ContextIter); + if (IsImportant) + ImportantContextIdInfo[LastContextId].StackIds.push_back(StackId); ContextNode *StackNode = getNodeForStackId(StackId); if (!StackNode) { StackNode = createNewNode(/*IsAllocation=*/false); @@ -1600,11 +1695,12 @@ static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node, template <typename DerivedCCG, typename FuncTy, typename CallTy> void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: - assignStackNodesPostOrder( - ContextNode *Node, DenseSet<const ContextNode *> &Visited, - DenseMap<uint64_t, std::vector<CallContextInfo>> - &StackIdToMatchingCalls, - DenseMap<CallInfo, CallInfo> &CallToMatchingCall) { + assignStackNodesPostOrder(ContextNode *Node, + DenseSet<const ContextNode *> &Visited, + DenseMap<uint64_t, std::vector<CallContextInfo>> + &StackIdToMatchingCalls, + DenseMap<CallInfo, CallInfo> &CallToMatchingCall, + const DenseSet<uint32_t> &ImportantContextIds) { auto Inserted = Visited.insert(Node); if (!Inserted.second) return; @@ -1620,7 +1716,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: continue; } assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls, - CallToMatchingCall); + CallToMatchingCall, ImportantContextIds); } // If this node's stack id is in the map, update the graph to contain new @@ -1648,6 +1744,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: Node->setCall(Call); NonAllocationCallToContextNodeMap[Call] = Node; NodeToCallingFunc[Node] = Func; + recordStackNode(Ids, Node, Node->getContextIds(), ImportantContextIds); return; } } @@ -1786,6 +1883,9 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: : CurNode->computeAllocType(); PrevNode = CurNode; } + + recordStackNode(Ids, NewNode, SavedContextIds, ImportantContextIds); + if (VerifyNodes) { checkNode(NewNode, /*CheckEdges=*/true); for (auto Id : Ids) { @@ -1798,6 +1898,122 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: } } +template <typename DerivedCCG, typename FuncTy, typename CallTy> +void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::fixupImportantContexts() { + if (ImportantContextIdInfo.empty()) + return; + + // Update statistics as we are done building this map at this point. + NumImportantContextIds = ImportantContextIdInfo.size(); + + if (!MemProfFixupImportant) + return; + + if (ExportToDot) + exportToDot("beforestackfixup"); + + // For each context we identified as important, walk through the saved context + // stack ids in order from leaf upwards, and make sure all edges are correct. + // These can be difficult to get right when updating the graph while mapping + // nodes onto summary or IR, especially when there is recursion. In + // particular, when we have created new nodes to reflect inlining, it is + // sometimes impossible to know exactly how to update the edges in the face of + // recursion, as we have lost the original ordering of the stack ids in the + // contexts.
+ // TODO: Consider only doing this if we detect the context has recursive + // cycles. + // + // I.e. assume we have a context with stack ids like: {A B A C A D E} + // and let's say A was inlined into B, C, and D. The original graph will have + // multiple recursive cycles through A. When we match the original context + // nodes onto the IR or summary, we will merge {A B} into one context node, + // {A C} onto another, and {A D} onto another. Looking at the stack sequence + // above, we should end up with a non-cyclic set of edges like: + // {AB} <- {AC} <- {AD} <- E. However, because we normally have lost the + // original ordering, we won't get the edges correct initially (it's + // impossible without the original ordering). Here we do the fixup (adding + // and removing edges where necessary) for this context. In the + // ImportantContextInfo struct in this case we should have a MaxLength = 2, + // and map entries for {A B}, {A C}, {A D}, and {E}. + for (auto &[CurContextId, Info] : ImportantContextIdInfo) { + if (Info.StackIdsToNode.empty()) + continue; + bool Changed = false; + ContextNode *PrevNode = nullptr; + ContextNode *CurNode = nullptr; + DenseSet<const ContextEdge *> VisitedEdges; + ArrayRef<uint64_t> AllStackIds(Info.StackIds); + // Try to identify what callsite ContextNode maps to which slice of the + // context's ordered stack ids. + for (unsigned I = 0; I < AllStackIds.size(); I++, PrevNode = CurNode) { + // We will do this greedily, trying up to MaxLength stack ids in a row, to + // see if we recorded a context node for that sequence. + auto Len = Info.MaxLength; + auto LenToEnd = AllStackIds.size() - I; + if (Len > LenToEnd) + Len = LenToEnd; + CurNode = nullptr; + // Try to find a recorded context node starting with the longest length + // recorded, and on down until we check for just a single stack id. + for (; Len > 0; Len--) { + // Get the slice of the original stack id sequence to check. + auto CheckStackIds = AllStackIds.slice(I, Len); + auto EntryIt = Info.StackIdsToNode.find(CheckStackIds); + if (EntryIt == Info.StackIdsToNode.end()) + continue; + CurNode = EntryIt->second; + // Skip forward so we don't try to look for the ones we just matched. + // We increment by Len - 1, because the outer for loop will increment I. + I += Len - 1; + break; + } + // Give up if we couldn't find a node. Since we need to clone from the + // leaf allocation upwards, there is no sense in doing any more fixup + // further up the context if we couldn't match part of the original stack + // context onto a callsite node. + if (!CurNode) + break; + // No edges to fix up until we have a pair of nodes that should be + // adjacent in the graph. + if (!PrevNode) + continue; + // See if we already have a call edge from CurNode to PrevNode. + auto *CurEdge = PrevNode->findEdgeFromCaller(CurNode); + if (CurEdge) { + // We already have an edge. Make sure it contains this context id. + if (CurEdge->getContextIds().insert(CurContextId).second) { + NumFixupEdgeIdsInserted++; + Changed = true; + } + } else { + // No edge exists - add one. + NumFixupEdgesAdded++; + DenseSet<uint32_t> ContextIds({CurContextId}); + auto AllocType = computeAllocType(ContextIds); + auto NewEdge = std::make_shared<ContextEdge>( + PrevNode, CurNode, AllocType, std::move(ContextIds)); + PrevNode->CallerEdges.push_back(NewEdge); + CurNode->CalleeEdges.push_back(NewEdge); + // Save the new edge for the below handling. + CurEdge = NewEdge.get(); + Changed = true; + } + VisitedEdges.insert(CurEdge); + // Now remove this context id from any other caller edges calling + // PrevNode.
+ for (auto &Edge : PrevNode->CallerEdges) { + // Skip the edge updated/created above and edges we have already + // visited (due to recursion). + if (Edge.get() != CurEdge && !VisitedEdges.contains(Edge.get())) + Edge->getContextIds().erase(CurContextId); + } + } + if (Changed) + NumFixedContexts++; + } +} + template <typename DerivedCCG, typename FuncTy, typename CallTy> void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() { // Map of stack id to all calls with that as the last (outermost caller) @@ -2043,9 +2259,14 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() { // nodes representing any inlining at interior callsites. Note we move the // associated context ids over to the new nodes. DenseSet<const ContextNode *> Visited; + DenseSet<uint32_t> ImportantContextIds(llvm::from_range, + ImportantContextIdInfo.keys()); for (auto &Entry : AllocationCallToContextNodeMap) assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls, - CallToMatchingCall); + CallToMatchingCall, ImportantContextIds); + + fixupImportantContexts(); + if (VerifyCCG) check(); } @@ -2155,6 +2376,10 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph( Module &M, llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) : Mod(M), OREGetter(OREGetter) { + // Map for keeping track of the largest cold contexts up to the number given + // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys + // must be sorted. + std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold; for (auto &F : M) { std::vector<CallInfo> CallsWithMetadata; for (auto &BB : F) { @@ -2191,7 +2416,8 @@ ModuleCallsiteContextGraph::ModuleCallsiteContextGraph( CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode); addStackNodesForMIB<MDNode, MDNode::op_iterator>( AllocNode, StackContext, CallsiteContext, - getMIBAllocType(MIBMD), ContextSizeInfo); + getMIBAllocType(MIBMD), ContextSizeInfo, + TotalSizeToContextIdTopNCold); } // If exporting the graph to dot and an allocation id of interest was // specified, record all the context ids for this allocation node. @@ -2241,6 +2467,10 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph( llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> isPrevailing) : Index(Index), isPrevailing(isPrevailing) { + // Map for keeping track of the largest cold contexts up to the number given + // by MemProfTopNImportant. Must be a std::map (not DenseMap) because keys + // must be sorted. + std::map<uint64_t, uint32_t> TotalSizeToContextIdTopNCold; for (auto &I : Index) { auto VI = Index.getValueInfo(I); for (auto &S : VI.getSummaryList()) { @@ -2288,7 +2518,7 @@ IndexCallsiteContextGraph::IndexCallsiteContextGraph( } addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>( AllocNode, StackContext, EmptyContext, MIB.AllocType, - ContextSizeInfo); + ContextSizeInfo, TotalSizeToContextIdTopNCold); I++; } // If exporting the graph to dot and an allocation id of interest was diff --git a/llvm/test/ThinLTO/X86/memprof-fixup.ll b/llvm/test/ThinLTO/X86/memprof-fixup.ll new file mode 100644 index 0000000000000..afed80fc562c1 --- /dev/null +++ b/llvm/test/ThinLTO/X86/memprof-fixup.ll @@ -0,0 +1,129 @@ +;; Test fixup of largest cold contexts. + +;; This case has multiple recursive cycles in the cold context, which can be +;; made non-recursive with the inlining in the code. + +;; -stats requires asserts +; REQUIRES: asserts + +;; Need context sizes in summary, so enable reporting. +; RUN: opt -thinlto-bc -memprof-report-hinted-sizes %s >%t.o + +;; First try disabling detection of the largest cold contexts. +;; We will not get any cloning.
+; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -supports-hot-cold-new \ +; RUN: -memprof-top-n-important=0 \ +; RUN: -r=%t.o,E,plx \ +; RUN: -r=%t.o,DB,plx \ +; RUN: -r=%t.o,CB,plx \ +; RUN: -r=%t.o,A,plx \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -stats \ +; RUN: -pass-remarks=memprof-context-disambiguation \ +; RUN: -o %t.out 2>&1 | FileCheck %s --implicit-check-not="created clone" \ +; RUN: --implicit-check-not="Number of cold static allocations" \ +; RUN: --implicit-check-not="Number of function clones" \ +; RUN: --implicit-check-not="Number of important context ids" \ +; RUN: --implicit-check-not="Number of fixup" + +;; Allow default detection of the largest cold contexts, but disable fixup. +;; We should find 1 important context, but still not get cloning. +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -supports-hot-cold-new \ +; RUN: -memprof-fixup-important=false \ +; RUN: -r=%t.o,E,plx \ +; RUN: -r=%t.o,DB,plx \ +; RUN: -r=%t.o,CB,plx \ +; RUN: -r=%t.o,A,plx \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -stats \ +; RUN: -pass-remarks=memprof-context-disambiguation \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=TOPN1-NOFIXUP \ +; RUN: --implicit-check-not="created clone" \ +; RUN: --implicit-check-not="Number of cold static allocations" \ +; RUN: --implicit-check-not="Number of function clones" \ +; RUN: --implicit-check-not="Number of fixup" + +; TOPN1-NOFIXUP: 1 memprof-context-disambiguation - Number of important context ids + +;; Allow default detection of largest cold contexts, fixup is enabled by default. +;; This case should get fixup and cloning. +; RUN: llvm-lto2 run %t.o -enable-memprof-context-disambiguation \ +; RUN: -supports-hot-cold-new \ +; RUN: -r=%t.o,E,plx \ +; RUN: -r=%t.o,DB,plx \ +; RUN: -r=%t.o,CB,plx \ +; RUN: -r=%t.o,A,plx \ +; RUN: -r=%t.o,main,plx \ +; RUN: -r=%t.o,_Znam, \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -stats \ +; RUN: -pass-remarks=memprof-context-disambiguation \ +; RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=TOPN1 + +; TOPN1: created clone E.memprof.1 +; TOPN1: call in clone E marked with memprof allocation attribute notcold +; TOPN1: call in clone E.memprof.1 marked with memprof allocation attribute cold +; TOPN1: created clone DB.memprof.1 +; TOPN1: call in clone DB.memprof.1 assigned to call function clone E.memprof.1 +; TOPN1: created clone CB.memprof.1 +; TOPN1: call in clone CB.memprof.1 assigned to call function clone DB.memprof.1 +; TOPN1: created clone A.memprof.1 +; TOPN1: call in clone A.memprof.1 assigned to call function clone CB.memprof.1 +; TOPN1: call in clone main assigned to call function clone A.memprof.1 + +; TOPN1: 1 memprof-context-disambiguation - Number of contexts with fixed edges +; TOPN1: 2 memprof-context-disambiguation - Number of fixup edges added +; TOPN1: 1 memprof-context-disambiguation - Number of important context ids + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @E() { +entry: + %call = tail call ptr @_Znam(i64 10), !memprof !7, !callsite !14 + ret void +} + +define void @DB() { +entry: + tail call void @E(), !callsite !17 + ret void +} + +define void @CB() { +entry: + tail call void @DB(), !callsite !22 + ret void +} + +define void @A() { +entry: + tail call void @CB(), !callsite !20 + ret 
void +} + +define i32 @main() { +entry: + tail call void @A(), !callsite !25 + tail call void @A(), !callsite !27 + ret i32 0 +} + +declare ptr @_Znam(i64) + +!7 = !{!8, !10} +!8 = !{!9, !"cold", !2} +!9 = !{i64 123, i64 234, i64 345, i64 234, i64 456, i64 234, i64 567, i64 678} +!2 = !{i64 12345, i64 200} +!10 = !{!11, !"notcold", !3} +!3 = !{i64 23456, i64 200} +!11 = !{i64 123, i64 234, i64 345, i64 234, i64 456, i64 234, i64 567, i64 789} +!14 = !{i64 123} +!17 = !{i64 234, i64 345} +!22 = !{i64 234, i64 456} +!20 = !{i64 234, i64 567} +!25 = !{i64 678} +!27 = !{i64 789} diff --git a/llvm/test/Transforms/MemProfContextDisambiguation/fixup.ll b/llvm/test/Transforms/MemProfContextDisambiguation/fixup.ll new file mode 100644 index 0000000000000..a08f89b5bbe97 --- /dev/null +++ b/llvm/test/Transforms/MemProfContextDisambiguation/fixup.ll @@ -0,0 +1,105 @@ +;; Test fixup of largest cold contexts. + +;; This case has multiple recursive cycles in the cold context, which can be +;; made non-recursive with the inlining in the code. + +;; -stats requires asserts +; REQUIRES: asserts + +;; First try disabling detection of the largest cold contexts. +;; We will not get any cloning. +; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \ +; RUN: -memprof-top-n-important=0 \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -stats \ +; RUN: -pass-remarks=memprof-context-disambiguation \ +; RUN: %s -S 2>&1 | FileCheck %s --implicit-check-not="created clone" \ +; RUN: --implicit-check-not="Number of cold static allocations" \ +; RUN: --implicit-check-not="Number of function clones" \ +; RUN: --implicit-check-not="Number of important context ids" \ +; RUN: --implicit-check-not="Number of fixup" + +;; Allow default detection of the largest cold contexts, but disable fixup. +;; We should find 1 important context, but still not get cloning. +; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \ +; RUN: -memprof-fixup-important=false \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -stats \ +; RUN: -pass-remarks=memprof-context-disambiguation \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=TOPN1-NOFIXUP \ +; RUN: --implicit-check-not="created clone" \ +; RUN: --implicit-check-not="Number of cold static allocations" \ +; RUN: --implicit-check-not="Number of function clones" \ +; RUN: --implicit-check-not="Number of fixup" + +; TOPN1-NOFIXUP: 1 memprof-context-disambiguation - Number of important context ids + +;; Allow default detection of largest cold contexts, fixup is enabled by default. +;; This case should get fixup and cloning.
+; RUN: opt -passes=memprof-context-disambiguation -supports-hot-cold-new \ +; RUN: -memprof-verify-ccg -memprof-verify-nodes -stats \ +; RUN: -pass-remarks=memprof-context-disambiguation \ +; RUN: %s -S 2>&1 | FileCheck %s --check-prefix=TOPN1 + +; TOPN1: created clone E.memprof.1 +; TOPN1: created clone DB.memprof.1 +; TOPN1: created clone CB.memprof.1 +; TOPN1: created clone A.memprof.1 +; TOPN1: call in clone main assigned to call function clone A.memprof.1 +; TOPN1: call in clone A.memprof.1 assigned to call function clone CB.memprof.1 +; TOPN1: call in clone CB.memprof.1 assigned to call function clone DB.memprof.1 +; TOPN1: call in clone DB.memprof.1 assigned to call function clone E.memprof.1 +; TOPN1: call in clone E.memprof.1 marked with memprof allocation attribute cold +; TOPN1: call in clone E marked with memprof allocation attribute notcold + +; TOPN1: 1 memprof-context-disambiguation - Number of contexts with fixed edges +; TOPN1: 2 memprof-context-disambiguation - Number of fixup edges added +; TOPN1: 1 memprof-context-disambiguation - Number of important context ids + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @E() { +entry: + %call = tail call ptr @_Znam(i64 10), !memprof !7, !callsite !14 + ret void +} + +define void @DB() { +entry: + tail call void @E(), !callsite !17 + ret void +} + +define void @CB() { +entry: + tail call void @DB(), !callsite !22 + ret void +} + +define void @A() { +entry: + tail call void @CB(), !callsite !20 + ret void +} + +define i32 @main() { +entry: + tail call void @A(), !callsite !25 + tail call void @A(), !callsite !27 + ret i32 0 +} + +declare ptr @_Znam(i64) + +!7 = !{!8, !10} +!8 = !{!9, !"cold", !2} +!9 = !{i64 123, i64 234, i64 345, i64 234, i64 456, i64 234, i64 567, i64 678} +!2 = !{i64 12345, i64 200} +!10 = !{!11, !"notcold", !3} +!3 = !{i64 23456, i64 200} +!11 = !{i64 123, i64 234, i64 345, i64 234, i64 456, i64 234, i64 567, i64 789} +!14 = !{i64 123} +!17 = !{i64 234, i64 345} +!22 = !{i64 234, i64 456} +!20 = !{i64 234, i64 567} +!25 = !{i64 678} +!27 = !{i64 789}
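As a usage sketch, the new knobs can be exercised directly on IR carrying memprof metadata (assuming an assertions-enabled build, since -stats requires asserts; input.ll, output.ll, and the value 20 are illustrative):

  opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
      -memprof-top-n-important=20 -memprof-fixup-important=true \
      -stats -S input.ll -o output.ll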