From a533c17f36ee042380ba0b4a3988a81782ae49e2 Mon Sep 17 00:00:00 2001 From: Michael Selehov Date: Sun, 19 Oct 2025 07:42:29 -0500 Subject: [PATCH 1/8] [AMDGPU] Add hot block register renaming pass This patch introduces a post-allocation register renaming optimization pass that reduces value density in hot basic blocks. The pass helps the post-RA scheduler avoid false WAW dependencies by moving local values to unused physical registers. The pass operates after greedy register allocation but before VirtRegRewriter. It identifies hot blocks (above a frequency threshold), calculates value density per physical register, and selectively moves local live ranges to free registers. Only 32-bit VGPR values that live entirely within a single basic block are moved, ensuring conservative behavior. Key features: - Respects tied operands and register allocation constraints - Honors occupancy-based VGPR limits to avoid spilling - Disabled by default (enable with -amdgpu-enable-hot-block-reg-renaming) - Includes comprehensive lit tests Performance results show up to 2% improvement on register-intensive kernels such as rocRAND MTGP32. 
--- llvm/lib/Target/AMDGPU/AMDGPU.h | 3 + .../AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp | 505 ++++++++++++++++++ .../AMDGPU/AMDGPUHotBlockRegisterRenaming.h | 34 ++ llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 227 ++++---- llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 + .../AMDGPU/hot-block-register-renaming.mir | 146 +++++ 7 files changed, 806 insertions(+), 111 deletions(-) create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.h create mode 100644 llvm/test/CodeGen/AMDGPU/hot-block-register-renaming.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 3a4f20a3bab34..5cd02635e90b4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -545,6 +545,9 @@ extern char &AMDGPUExportKernelRuntimeHandlesLegacyID; void initializeGCNNSAReassignLegacyPass(PassRegistry &); extern char &GCNNSAReassignID; +void initializeAMDGPUHotBlockRegisterRenamingLegacyPass(PassRegistry &); +extern char &AMDGPUHotBlockRegisterRenamingID; + void initializeGCNPreRALongBranchRegLegacyPass(PassRegistry &); extern char &GCNPreRALongBranchRegID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp new file mode 100644 index 0000000000000..bc95ee375d008 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp @@ -0,0 +1,505 @@ +//===-- AMDGPUHotBlockRegisterRenaming.cpp -------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Reduces value density in hot basic blocks by remapping local values +/// from overused physical registers to free physical registers. +/// +/// This gives the Post-RA scheduler more flexibility to reorder instructions +/// by reducing false dependencies created by register reuse. +/// +/// Algorithm: +/// 1. Sort basic blocks by frequency (hottest first) +/// 2. For each BB: +/// a. Calculate value density (count of local values per PhysReg) +/// b. Identify free PhysRegs (completely unused in this BB) +/// c. Iteratively move local values from dense to free registers +/// 3. VirtRegRewriter applies the updated VirtRegMap +/// +/// Constraints (conservative): +/// - Only move 32-bit VGPRs +/// - Only move local values (single segment, entirely within BB) +/// - Only move to completely free registers +/// - Skip values with allocation hints +/// - Skip reserved registers +/// +//===----------------------------------------------------------------------===// + +#include "AMDGPUHotBlockRegisterRenaming.h" +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/LiveRegMatrix.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-hot-block-reg-renaming" + +STATISTIC(NumBlocksProcessed, "Number of hot blocks processed"); +STATISTIC(NumValuesRemapped, "Number of values remapped to reduce density"); 
+STATISTIC(NumBlocksSkipped, + "Number of blocks skipped (no dense regs or no free regs)"); + +namespace { + +class AMDGPUHotBlockRegisterRenamingImpl { +public: + AMDGPUHotBlockRegisterRenamingImpl(VirtRegMap *VRM, LiveRegMatrix *LRM, + LiveIntervals *LIS, + MachineBlockFrequencyInfo *MBFI, + const GCNSubtarget *ST, + const SIMachineFunctionInfo &MFI) + : VRM(VRM), LRM(LRM), LIS(LIS), MBFI(MBFI), ST(ST), MFI(MFI) {} + + bool run(MachineFunction &MF); + +private: + VirtRegMap *VRM; + LiveRegMatrix *LRM; + LiveIntervals *LIS; + MachineBlockFrequencyInfo *MBFI; + const GCNSubtarget *ST; + const SIMachineFunctionInfo &MFI; + const SIRegisterInfo *TRI = nullptr; + MachineRegisterInfo *MRI = nullptr; + unsigned VGPRLimit = 0; // Register limit based on occupancy + + /// Cache of VirtRegs that cannot be moved (e.g. tied operands) + DenseSet UnmovableVRegs; + + /// Process a single basic block + bool processBasicBlock(MachineBasicBlock *MBB); + + /// Calculate value density map for a basic block + void calculateValueDensity(MachineBasicBlock *MBB, + DenseMap &ValueDensity); + + /// Find free physical registers in a basic block + void findFreeRegisters(MachineBasicBlock *MBB, + SmallVectorImpl &FreeRegs); + + /// Check if a segment is local to a basic block + bool isLocalSegment(const LiveInterval::Segment &Seg, SlotIndex BBStart, + SlotIndex BBEnd) const; + + /// Check if a register is suitable for our optimization + bool isSuitableRegister(MCRegister PhysReg) const; + + /// Check if a virtual register can be safely moved + bool canMoveValue(Register VirtReg, MCRegister CurrentPhysReg, + MCRegister TargetPhysReg, SlotIndex BBStart, + SlotIndex BBEnd); + + /// Try to move a value from DenseReg to FreeReg + bool tryMoveValue(MCRegister DenseReg, MCRegister FreeReg, + MachineBasicBlock *MBB, SlotIndex BBStart, SlotIndex BBEnd); +}; + +class AMDGPUHotBlockRegisterRenamingLegacy : public MachineFunctionPass { +public: + static char ID; + + 
AMDGPUHotBlockRegisterRenamingLegacy() : MachineFunctionPass(ID) { + initializeAMDGPUHotBlockRegisterRenamingLegacyPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { + return "AMDGPU Hot Block Register Renaming"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // end anonymous namespace + +char AMDGPUHotBlockRegisterRenamingLegacy::ID = 0; + +char &llvm::AMDGPUHotBlockRegisterRenamingID = + AMDGPUHotBlockRegisterRenamingLegacy::ID; + +INITIALIZE_PASS_BEGIN(AMDGPUHotBlockRegisterRenamingLegacy, DEBUG_TYPE, + "AMDGPU Hot Block Register Renaming", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(VirtRegMapWrapperLegacy) +INITIALIZE_PASS_DEPENDENCY(LiveRegMatrixWrapperLegacy) +INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfoWrapperPass) +INITIALIZE_PASS_END(AMDGPUHotBlockRegisterRenamingLegacy, DEBUG_TYPE, + "AMDGPU Hot Block Register Renaming", false, false) + +bool AMDGPUHotBlockRegisterRenamingLegacy::runOnMachineFunction( + MachineFunction &MF) { + VirtRegMap *VRM = &getAnalysis().getVRM(); + LiveRegMatrix *LRM = &getAnalysis().getLRM(); + LiveIntervals *LIS = &getAnalysis().getLIS(); + MachineBlockFrequencyInfo *MBFI = + &getAnalysis().getMBFI(); + + const GCNSubtarget *ST = &MF.getSubtarget(); + const SIMachineFunctionInfo &MFI = *MF.getInfo(); + + AMDGPUHotBlockRegisterRenamingImpl Impl(VRM, LRM, LIS, MBFI, ST, MFI); + return Impl.run(MF); +} + +bool AMDGPUHotBlockRegisterRenamingImpl::run(MachineFunction &MF) { + LLVM_DEBUG(dbgs() << "AMDGPUHotBlockRegisterRenaming: Processing " + << MF.getName() << "\n"); + + TRI = ST->getRegisterInfo(); + MRI = &MF.getRegInfo(); + + // Calculate VGPR limit based on occupancy + unsigned Occupancy = 
MFI.getOccupancy(); + VGPRLimit = ST->getMaxNumVGPRs(Occupancy, MFI.getDynamicVGPRBlockSize()); + + LLVM_DEBUG(dbgs() << " Occupancy: " << Occupancy + << ", VGPR Limit: " << VGPRLimit << "\n"); + + // Sort basic blocks by frequency (hottest first) + SmallVector SortedBBs; + for (MachineBasicBlock &MBB : MF) { + SortedBBs.push_back(&MBB); + } + + llvm::sort(SortedBBs, [this](MachineBasicBlock *A, MachineBasicBlock *B) { + return MBFI->getBlockFreq(A) > MBFI->getBlockFreq(B); + }); + + bool Changed = false; + for (MachineBasicBlock *MBB : SortedBBs) { + Changed |= processBasicBlock(MBB); + } + + return Changed; +} + +bool AMDGPUHotBlockRegisterRenamingImpl::processBasicBlock( + MachineBasicBlock *MBB) { + LLVM_DEBUG(dbgs() << " Processing BB#" << MBB->getNumber() << " (freq=" + << MBFI->getBlockFreq(MBB).getFrequency() << ")\n"); + + // Clear the unmovable cache for each BB (tied operands are BB-specific) + UnmovableVRegs.clear(); + + SlotIndex BBStart = LIS->getMBBStartIdx(MBB); + SlotIndex BBEnd = LIS->getMBBEndIdx(MBB); + + // Step 1: Calculate value density + DenseMap ValueDensity; + calculateValueDensity(MBB, ValueDensity); + + if (ValueDensity.empty()) { + LLVM_DEBUG(dbgs() << " No values found, skipping\n"); + ++NumBlocksSkipped; + return false; + } + + // Step 2: Find free registers + SmallVector FreeRegs; + findFreeRegisters(MBB, FreeRegs); + + if (FreeRegs.empty()) { + LLVM_DEBUG(dbgs() << " No free registers, skipping\n"); + ++NumBlocksSkipped; + return false; + } + + LLVM_DEBUG(dbgs() << " Found " << ValueDensity.size() + << " registers with values, " << FreeRegs.size() + << " free registers\n"); + + // Step 3: Create max heap of dense registers + auto Comparator = [&ValueDensity](MCRegister A, MCRegister B) { + return ValueDensity[A] < ValueDensity[B]; // max heap + }; + std::priority_queue, decltype(Comparator)> + DenseRegsHeap(Comparator); + + for (auto &Entry : ValueDensity) { + if (Entry.second > 1) { // Only interested in registers with density > 1 
+ DenseRegsHeap.push(Entry.first); + } + } + + if (DenseRegsHeap.empty()) { + LLVM_DEBUG( + dbgs() << " No dense registers (all density <= 1), skipping\n"); + ++NumBlocksSkipped; + return false; + } + + // Step 4: Iteratively move values + bool Changed = false; + size_t FreeRegIdx = 0; + + while (!DenseRegsHeap.empty() && FreeRegIdx < FreeRegs.size()) { + MCRegister DenseReg = DenseRegsHeap.top(); + DenseRegsHeap.pop(); + + MCRegister FreeReg = FreeRegs[FreeRegIdx++]; + + if (tryMoveValue(DenseReg, FreeReg, MBB, BBStart, BBEnd)) { + Changed = true; + ++NumValuesRemapped; + + // Update density + ValueDensity[DenseReg]--; + + // If still dense, put back in heap + if (ValueDensity[DenseReg] > 1) { + DenseRegsHeap.push(DenseReg); + } + } + } + + if (Changed) { + ++NumBlocksProcessed; + } else { + ++NumBlocksSkipped; + } + + return Changed; +} + +void AMDGPUHotBlockRegisterRenamingImpl::calculateValueDensity( + MachineBasicBlock *MBB, DenseMap &ValueDensity) { + SlotIndex BBStart = LIS->getMBBStartIdx(MBB); + SlotIndex BBEnd = LIS->getMBBEndIdx(MBB); + + // Iterate over VGPR_32 register class + const TargetRegisterClass *VGPR_32_RC = + TRI->getRegClass(AMDGPU::VGPR_32RegClassID); + + for (MCRegister PhysReg : *VGPR_32_RC) { + if (MRI->isReserved(PhysReg)) + continue; + + unsigned LocalValueCount = 0; + + // Access LiveIntervalUnion for this PhysReg + for (MCRegUnit Unit : TRI->regunits(PhysReg)) { + LiveIntervalUnion &LIU = LRM->getLiveUnions()[Unit]; + + for (LiveIntervalUnion::SegmentIter SI = LIU.begin(); SI.valid(); ++SI) { + SlotIndex SegStart = SI.start(); + SlotIndex SegEnd = SI.stop(); + + // Check if segment is entirely within this BB + if (SegStart >= BBStart && SegEnd < BBEnd) { + LocalValueCount++; + } + } + } + + if (LocalValueCount > 0) { + ValueDensity[PhysReg] = LocalValueCount; + } + } +} + +void AMDGPUHotBlockRegisterRenamingImpl::findFreeRegisters( + MachineBasicBlock *MBB, SmallVectorImpl &FreeRegs) { + SlotIndex BBStart = LIS->getMBBStartIdx(MBB); + 
SlotIndex BBEnd = LIS->getMBBEndIdx(MBB); + + const TargetRegisterClass *VGPR_32_RC = + TRI->getRegClass(AMDGPU::VGPR_32RegClassID); + + unsigned RegIdx = 0; + for (MCRegister PhysReg : *VGPR_32_RC) { + // Only consider registers up to VGPRLimit (based on occupancy) + if (RegIdx >= VGPRLimit) + break; + RegIdx++; + + if (MRI->isReserved(PhysReg)) + continue; + + bool IsFree = true; + + // Check all register units + for (MCRegUnit Unit : TRI->regunits(PhysReg)) { + LiveIntervalUnion &LIU = LRM->getLiveUnions()[Unit]; + + // Check if anything is live in this BB + LiveIntervalUnion::SegmentIter SI = LIU.find(BBStart); + if (SI.valid() && SI.start() < BBEnd) { + IsFree = false; + break; + } + } + + if (IsFree) { + FreeRegs.push_back(PhysReg); + } + } +} + +bool AMDGPUHotBlockRegisterRenamingImpl::canMoveValue(Register VirtReg, + MCRegister CurrentPhysReg, + MCRegister TargetPhysReg, + SlotIndex BBStart, + SlotIndex BBEnd) { + + // Check for tied operands + // A tied operand means the instruction requires source and destination to be + // the same physical register. Moving such a value would break this + // constraint. 
+ + LiveInterval &VirtRegLI = LIS->getInterval(VirtReg); + + for (const LiveRange::Segment &S : VirtRegLI) { + // Only check segments within this BB + if (S.start < BBStart || S.end > BBEnd) + continue; + + // Check if this segment starts at a tied def point + // (meaning it's the destination of a tied operand instruction) + MachineInstr *DefMI = LIS->getInstructionFromIndex(S.start); + if (!DefMI) + continue; + + for (const MachineOperand &MO : DefMI->operands()) { + if (MO.isReg() && MO.getReg() == VirtReg && MO.isDef() && MO.isTied()) { + LLVM_DEBUG(dbgs() << " Cannot move " << printReg(VirtReg, TRI) + << ": has tied def at " << S.start << " in " + << *DefMI); + return false; + } + } + } + + // Future checks can be added here: + // - Register class constraints + // - Special register restrictions + // - Architecture-specific constraints + + return true; +} + +bool AMDGPUHotBlockRegisterRenamingImpl::tryMoveValue(MCRegister DenseReg, + MCRegister FreeReg, + MachineBasicBlock *MBB, + SlotIndex BBStart, + SlotIndex BBEnd) { + // Find a movable local value in DenseReg + for (MCRegUnit Unit : TRI->regunits(DenseReg)) { + LiveIntervalUnion &LIU = LRM->getLiveUnions()[Unit]; + + for (LiveIntervalUnion::SegmentIter SI = LIU.begin(); SI.valid(); ++SI) { + Register VirtReg = SI.value()->reg(); + + // Check if this VirtReg is mapped to DenseReg + if (VRM->getPhys(VirtReg) != DenseReg) + continue; + + // Get the proper LiveInterval from LiveIntervals + LiveInterval &VirtRegLI = LIS->getInterval(VirtReg); + + // Check: segment is local (entirely within BB) + SlotIndex SegStart = SI.start(); + SlotIndex SegEnd = SI.stop(); + if (SegStart < BBStart || SegEnd >= BBEnd) + continue; + + // Check: LiveInterval has only one segment (conservative) + if (VirtRegLI.size() != 1) + continue; + + // Check: No subranges (conservative - avoid complex cases) + if (VirtRegLI.hasSubRanges()) + continue; + + // Check: No allocation hints + if (VRM->hasKnownPreference(VirtReg)) + continue; + + 
// Check: Cached unmovable VirtRegs + if (UnmovableVRegs.contains(VirtReg)) { + LLVM_DEBUG(dbgs() << " Skipping " << printReg(VirtReg, TRI) + << " (cached as unmovable)\n"); + continue; + } + + // Check: Can this value be safely moved? + if (!canMoveValue(VirtReg, DenseReg, FreeReg, BBStart, BBEnd)) { + // Cache the result to avoid checking again + UnmovableVRegs.insert(VirtReg); + continue; + } + + // This VirtReg is movable! Perform the remap + LLVM_DEBUG(dbgs() << " Moving " << printReg(VirtReg, TRI) << " from " + << printReg(DenseReg, TRI) << " to " + << printReg(FreeReg, TRI) << "\n"); + + // Safety check: must be assigned before unassign + if (!VRM->hasPhys(VirtReg)) { + LLVM_DEBUG( + dbgs() << " WARNING: VirtReg not assigned, skipping\n"); + continue; + } + + LRM->unassign(VirtRegLI); // Remove from LiveRegMatrix + LRM->assign(VirtRegLI, + FreeReg); // Assign to new physreg (updates VirtRegMap too) + + // Sanity check: verify VirtReg is now mapped to FreeReg + assert(VRM->getPhys(VirtReg) == FreeReg && + "VirtRegMap not updated correctly"); + + return true; // Successfully moved one value + } + } + + return false; // No movable value found +} + +PreservedAnalyses +AMDGPUHotBlockRegisterRenamingPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + VirtRegMap *VRM = &MFAM.getResult(MF); + LiveRegMatrix *LRM = &MFAM.getResult(MF); + LiveIntervals *LIS = &MFAM.getResult(MF); + MachineBlockFrequencyInfo *MBFI = + &MFAM.getResult(MF); + + const GCNSubtarget *ST = &MF.getSubtarget(); + const SIMachineFunctionInfo &MFI = *MF.getInfo(); + + AMDGPUHotBlockRegisterRenamingImpl Impl(VRM, LRM, LIS, MBFI, ST, MFI); + if (!Impl.run(MF)) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserveSet(); + return PA; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.h b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.h new file mode 100644 index 0000000000000..6dfdd1bec72ef --- /dev/null +++ 
b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.h @@ -0,0 +1,34 @@ +//===-- AMDGPUHotBlockRegisterRenaming.h -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Reduces value density in hot basic blocks by remapping local values +/// from overused physical registers to free physical registers. +/// +/// This gives the Post-RA scheduler more flexibility to reorder instructions +/// by reducing false dependencies created by register reuse. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUHOTBLOCKREGISTERRENAMING_H +#define LLVM_LIB_TARGET_AMDGPU_AMDGPUHOTBLOCKREGISTERRENAMING_H + +#include "llvm/CodeGen/MachinePassManager.h" + +namespace llvm { + +class AMDGPUHotBlockRegisterRenamingPass + : public PassInfoMixin { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUHOTBLOCKREGISTERRENAMING_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index a424c45b8af1f..382a923e4ff61 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -122,6 +122,7 @@ MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass() MACHINE_FUNCTION_PASS("amdgpu-preload-kern-arg-prolog", AMDGPUPreloadKernArgPrologPass()) MACHINE_FUNCTION_PASS("amdgpu-prepare-agpr-alloc", AMDGPUPrepareAGPRAllocPass()) MACHINE_FUNCTION_PASS("amdgpu-nsa-reassign", GCNNSAReassignPass()) +MACHINE_FUNCTION_PASS("amdgpu-hot-block-reg-renaming", AMDGPUHotBlockRegisterRenamingPass()) 
MACHINE_FUNCTION_PASS("amdgpu-wait-sgpr-hazards", AMDGPUWaitSGPRHazardsPass()) MACHINE_FUNCTION_PASS("gcn-create-vopd", GCNCreateVOPDPass()) MACHINE_FUNCTION_PASS("gcn-dpp-combine", GCNDPPCombinePass()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 9c9ae86b9cce8..7ad686a7d0e32 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -21,6 +21,7 @@ #include "AMDGPUCtorDtorLowering.h" #include "AMDGPUExportClustering.h" #include "AMDGPUExportKernelRuntimeHandles.h" +#include "AMDGPUHotBlockRegisterRenaming.h" #include "AMDGPUIGroupLP.h" #include "AMDGPUISelDAGToDAG.h" #include "AMDGPULowerVGPREncoding.h" @@ -169,13 +170,13 @@ class AMDGPUCodeGenPassBuilder class SGPRRegisterRegAlloc : public RegisterRegAllocBase { public: SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C) - : RegisterRegAllocBase(N, D, C) {} + : RegisterRegAllocBase(N, D, C) {} }; class VGPRRegisterRegAlloc : public RegisterRegAllocBase { public: VGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C) - : RegisterRegAllocBase(N, D, C) {} + : RegisterRegAllocBase(N, D, C) {} }; class WWMRegisterRegAlloc : public RegisterRegAllocBase { @@ -218,19 +219,21 @@ static llvm::once_flag InitializeDefaultVGPRRegisterAllocatorFlag; static llvm::once_flag InitializeDefaultWWMRegisterAllocatorFlag; static SGPRRegisterRegAlloc -defaultSGPRRegAlloc("default", - "pick SGPR register allocator based on -O option", - useDefaultRegisterAllocator); + defaultSGPRRegAlloc("default", + "pick SGPR register allocator based on -O option", + useDefaultRegisterAllocator); static cl::opt> -SGPRRegAlloc("sgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator), - cl::desc("Register allocator to use for SGPRs")); + SGPRRegAlloc("sgpr-regalloc", cl::Hidden, + cl::init(&useDefaultRegisterAllocator), + cl::desc("Register allocator to use for SGPRs")); static 
cl::opt> -VGPRRegAlloc("vgpr-regalloc", cl::Hidden, cl::init(&useDefaultRegisterAllocator), - cl::desc("Register allocator to use for VGPRs")); + VGPRRegAlloc("vgpr-regalloc", cl::Hidden, + cl::init(&useDefaultRegisterAllocator), + cl::desc("Register allocator to use for VGPRs")); static cl::opt> @@ -301,22 +304,25 @@ static FunctionPass *createFastWWMRegisterAllocator() { return createFastRegisterAllocator(onlyAllocateWWMRegs, false); } -static SGPRRegisterRegAlloc basicRegAllocSGPR( - "basic", "basic register allocator", createBasicSGPRRegisterAllocator); -static SGPRRegisterRegAlloc greedyRegAllocSGPR( - "greedy", "greedy register allocator", createGreedySGPRRegisterAllocator); - -static SGPRRegisterRegAlloc fastRegAllocSGPR( - "fast", "fast register allocator", createFastSGPRRegisterAllocator); +static SGPRRegisterRegAlloc basicRegAllocSGPR("basic", + "basic register allocator", + createBasicSGPRRegisterAllocator); +static SGPRRegisterRegAlloc + greedyRegAllocSGPR("greedy", "greedy register allocator", + createGreedySGPRRegisterAllocator); +static SGPRRegisterRegAlloc fastRegAllocSGPR("fast", "fast register allocator", + createFastSGPRRegisterAllocator); -static VGPRRegisterRegAlloc basicRegAllocVGPR( - "basic", "basic register allocator", createBasicVGPRRegisterAllocator); -static VGPRRegisterRegAlloc greedyRegAllocVGPR( - "greedy", "greedy register allocator", createGreedyVGPRRegisterAllocator); +static VGPRRegisterRegAlloc basicRegAllocVGPR("basic", + "basic register allocator", + createBasicVGPRRegisterAllocator); +static VGPRRegisterRegAlloc + greedyRegAllocVGPR("greedy", "greedy register allocator", + createGreedyVGPRRegisterAllocator); -static VGPRRegisterRegAlloc fastRegAllocVGPR( - "fast", "fast register allocator", createFastVGPRRegisterAllocator); +static VGPRRegisterRegAlloc fastRegAllocVGPR("fast", "fast register allocator", + createFastVGPRRegisterAllocator); static WWMRegisterRegAlloc basicRegAllocWWMReg("basic", "basic register allocator", @@ 
-334,14 +340,14 @@ static bool isLTOPreLink(ThinOrFullLTOPhase Phase) { } // anonymous namespace static cl::opt -EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden, - cl::desc("Run early if-conversion"), - cl::init(false)); + EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden, + cl::desc("Run early if-conversion"), + cl::init(false)); static cl::opt -OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden, - cl::desc("Run pre-RA exec mask optimizations"), - cl::init(true)); + OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden, + cl::desc("Run pre-RA exec mask optimizations"), + cl::init(true)); static cl::opt LowerCtorDtor("amdgpu-lower-global-ctor-dtor", @@ -349,32 +355,27 @@ static cl::opt cl::init(true), cl::Hidden); // Option to disable vectorizer for tests. -static cl::opt EnableLoadStoreVectorizer( - "amdgpu-load-store-vectorizer", - cl::desc("Enable load store vectorizer"), - cl::init(true), - cl::Hidden); +static cl::opt + EnableLoadStoreVectorizer("amdgpu-load-store-vectorizer", + cl::desc("Enable load store vectorizer"), + cl::init(true), cl::Hidden); // Option to control global loads scalarization -static cl::opt ScalarizeGlobal( - "amdgpu-scalarize-global-loads", - cl::desc("Enable global load scalarization"), - cl::init(true), - cl::Hidden); +static cl::opt + ScalarizeGlobal("amdgpu-scalarize-global-loads", + cl::desc("Enable global load scalarization"), + cl::init(true), cl::Hidden); // Option to run internalize pass. static cl::opt InternalizeSymbols( - "amdgpu-internalize-symbols", - cl::desc("Enable elimination of non-kernel functions and unused globals"), - cl::init(false), - cl::Hidden); + "amdgpu-internalize-symbols", + cl::desc("Enable elimination of non-kernel functions and unused globals"), + cl::init(false), cl::Hidden); // Option to inline all early. 
-static cl::opt EarlyInlineAll( - "amdgpu-early-inline-all", - cl::desc("Inline all functions early"), - cl::init(false), - cl::Hidden); +static cl::opt EarlyInlineAll("amdgpu-early-inline-all", + cl::desc("Inline all functions early"), + cl::init(false), cl::Hidden); static cl::opt RemoveIncompatibleFunctions( "amdgpu-enable-remove-incompatible-functions", cl::Hidden, @@ -382,39 +383,40 @@ static cl::opt RemoveIncompatibleFunctions( "use features not supported by the target GPU"), cl::init(true)); -static cl::opt EnableSDWAPeephole( - "amdgpu-sdwa-peephole", - cl::desc("Enable SDWA peepholer"), - cl::init(true)); +static cl::opt EnableSDWAPeephole("amdgpu-sdwa-peephole", + cl::desc("Enable SDWA peepholer"), + cl::init(true)); -static cl::opt EnableDPPCombine( - "amdgpu-dpp-combine", - cl::desc("Enable DPP combiner"), - cl::init(true)); +static cl::opt EnableDPPCombine("amdgpu-dpp-combine", + cl::desc("Enable DPP combiner"), + cl::init(true)); // Enable address space based alias analysis -static cl::opt EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, - cl::desc("Enable AMDGPU Alias Analysis"), - cl::init(true)); +static cl::opt + EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, + cl::desc("Enable AMDGPU Alias Analysis"), + cl::init(true)); // Enable lib calls simplifications -static cl::opt EnableLibCallSimplify( - "amdgpu-simplify-libcall", - cl::desc("Enable amdgpu library simplifications"), - cl::init(true), - cl::Hidden); +static cl::opt + EnableLibCallSimplify("amdgpu-simplify-libcall", + cl::desc("Enable amdgpu library simplifications"), + cl::init(true), cl::Hidden); static cl::opt EnableLowerKernelArguments( - "amdgpu-ir-lower-kernel-arguments", - cl::desc("Lower kernel argument loads in IR pass"), - cl::init(true), - cl::Hidden); + "amdgpu-ir-lower-kernel-arguments", + cl::desc("Lower kernel argument loads in IR pass"), cl::init(true), + cl::Hidden); static cl::opt EnableRegReassign( - "amdgpu-reassign-regs", - cl::desc("Enable register 
reassign optimizations on gfx10+"), - cl::init(true), - cl::Hidden); + "amdgpu-reassign-regs", + cl::desc("Enable register reassign optimizations on gfx10+"), + cl::init(true), cl::Hidden); + +static cl::opt EnableHotBlockRegRenaming( + "amdgpu-enable-hot-block-reg-renaming", + cl::desc("Enable hot block register renaming to reduce value density"), + cl::init(false), cl::Hidden); static cl::opt OptVGPRLiveRange( "amdgpu-opt-vgpr-liverange", @@ -432,11 +434,10 @@ static cl::opt AMDGPUAtomicOptimizerStrategy( clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer"))); // Enable Mode register optimization -static cl::opt EnableSIModeRegisterPass( - "amdgpu-mode-register", - cl::desc("Enable mode register pass"), - cl::init(true), - cl::Hidden); +static cl::opt + EnableSIModeRegisterPass("amdgpu-mode-register", + cl::desc("Enable mode register pass"), + cl::init(true), cl::Hidden); // Enable GFX11+ s_delay_alu insertion static cl::opt @@ -452,19 +453,16 @@ static cl::opt // Option is used in lit tests to prevent deadcoding of patterns inspected. 
static cl::opt -EnableDCEInRA("amdgpu-dce-in-ra", - cl::init(true), cl::Hidden, - cl::desc("Enable machine DCE inside regalloc")); + EnableDCEInRA("amdgpu-dce-in-ra", cl::init(true), cl::Hidden, + cl::desc("Enable machine DCE inside regalloc")); static cl::opt EnableSetWavePriority("amdgpu-set-wave-priority", cl::desc("Adjust wave priority"), cl::init(false), cl::Hidden); -static cl::opt EnableScalarIRPasses( - "amdgpu-scalar-ir-passes", - cl::desc("Enable scalar IR passes"), - cl::init(true), - cl::Hidden); +static cl::opt EnableScalarIRPasses("amdgpu-scalar-ir-passes", + cl::desc("Enable scalar IR passes"), + cl::init(true), cl::Hidden); static cl::opt EnableSwLowerLDS("amdgpu-enable-sw-lower-lds", @@ -477,10 +475,10 @@ static cl::opt EnableLowerModuleLDS( cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true), cl::Hidden); -static cl::opt EnablePreRAOptimizations( - "amdgpu-enable-pre-ra-optimizations", - cl::desc("Enable Pre-RA optimizations pass"), cl::init(true), - cl::Hidden); +static cl::opt + EnablePreRAOptimizations("amdgpu-enable-pre-ra-optimizations", + cl::desc("Enable Pre-RA optimizations pass"), + cl::init(true), cl::Hidden); static cl::opt EnablePromoteKernelArguments( "amdgpu-enable-promote-kernel-arguments", @@ -507,10 +505,10 @@ static cl::opt EnableRewritePartialRegUses( cl::desc("Enable rewrite partial reg uses pass"), cl::init(true), cl::Hidden); -static cl::opt EnableHipStdPar( - "amdgpu-enable-hipstdpar", - cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(false), - cl::Hidden); +static cl::opt + EnableHipStdPar("amdgpu-enable-hipstdpar", + cl::desc("Enable HIP Standard Parallelism Offload support"), + cl::init(false), cl::Hidden); static cl::opt EnableAMDGPUAttributor("amdgpu-attributor-enable", @@ -613,6 +611,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUPrintfRuntimeBindingPass(*PR); initializeAMDGPUResourceUsageAnalysisWrapperPassPass(*PR); 
initializeGCNNSAReassignLegacyPass(*PR); + initializeAMDGPUHotBlockRegisterRenamingLegacyPass(*PR); initializeGCNPreRAOptimizationsLegacyPass(*PR); initializeGCNPreRALongBranchRegLegacyPass(*PR); initializeGCNRewritePartialRegUsesLegacyPass(*PR); @@ -633,8 +632,8 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) { static ScheduleDAGInstrs * createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { const GCNSubtarget &ST = C->MF->getSubtarget(); - ScheduleDAGMILive *DAG = - new GCNScheduleDAGMILive(C, std::make_unique(C)); + ScheduleDAGMILive *DAG = new GCNScheduleDAGMILive( + C, std::make_unique(C)); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); if (ST.shouldClusterStores()) DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); @@ -697,14 +696,13 @@ createIterativeILPMachineScheduler(MachineSchedContext *C) { return DAG; } -static MachineSchedRegistry -SISchedRegistry("si", "Run SI's custom scheduler", - createSIMachineScheduler); +static MachineSchedRegistry SISchedRegistry("si", "Run SI's custom scheduler", + createSIMachineScheduler); static MachineSchedRegistry -GCNMaxOccupancySchedRegistry("gcn-max-occupancy", - "Run GCN scheduler to maximize occupancy", - createGCNMaxOccupancyMachineScheduler); + GCNMaxOccupancySchedRegistry("gcn-max-occupancy", + "Run GCN scheduler to maximize occupancy", + createGCNMaxOccupancyMachineScheduler); static MachineSchedRegistry GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp", @@ -961,7 +959,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { PB.registerFullLinkTimeOptimizationLastEPCallback( [this](ModulePassManager &PM, OptimizationLevel Level) { - // Promote kernel arguments to global address space for LLVM IR // generated by flang compiler FunctionPassManager FPM; @@ -1391,7 +1388,7 @@ void AMDGPUPassConfig::addIRPasses() { AAResults &AAR) { if (auto *WrapperPass = P.getAnalysisIfAvailable()) 
AAR.addAAResult(WrapperPass->getResult()); - })); + })); } if (TM.getTargetTriple().isAMDGCN()) { @@ -1651,6 +1648,10 @@ void GCNPassConfig::addOptimizedRegAlloc() { } bool GCNPassConfig::addPreRewrite() { + // Hot block register renaming to reduce value density + if (TM->getOptLevel() > CodeGenOptLevel::None && EnableHotBlockRegRenaming) + addPass(&AMDGPUHotBlockRegisterRenamingID); + if (EnableRegReassign) addPass(&GCNNSAReassignID); @@ -2009,8 +2010,8 @@ bool GCNTargetMachine::parseMachineFunctionInfo( AMDGPU::SGPR_32RegClass, MFI->ArgInfo.PrivateSegmentSize, 0, 0) || parseAndCheckArgument(YamlMFI.ArgInfo->LDSKernelId, - AMDGPU::SGPR_32RegClass, - MFI->ArgInfo.LDSKernelId, 0, 1) || + AMDGPU::SGPR_32RegClass, MFI->ArgInfo.LDSKernelId, + 0, 1) || parseAndCheckArgument(YamlMFI.ArgInfo->WorkGroupIDX, AMDGPU::SGPR_32RegClass, MFI->ArgInfo.WorkGroupIDX, 0, 1) || @@ -2033,14 +2034,14 @@ bool GCNTargetMachine::parseMachineFunctionInfo( AMDGPU::SReg_64RegClass, MFI->ArgInfo.ImplicitBufferPtr, 2, 0) || parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDX, - AMDGPU::VGPR_32RegClass, - MFI->ArgInfo.WorkItemIDX, 0, 0) || + AMDGPU::VGPR_32RegClass, MFI->ArgInfo.WorkItemIDX, + 0, 0) || parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDY, - AMDGPU::VGPR_32RegClass, - MFI->ArgInfo.WorkItemIDY, 0, 0) || + AMDGPU::VGPR_32RegClass, MFI->ArgInfo.WorkItemIDY, + 0, 0) || parseAndCheckArgument(YamlMFI.ArgInfo->WorkItemIDZ, - AMDGPU::VGPR_32RegClass, - MFI->ArgInfo.WorkItemIDZ, 0, 0))) + AMDGPU::VGPR_32RegClass, MFI->ArgInfo.WorkItemIDZ, + 0, 0))) return true; if (ST.hasIEEEMode()) @@ -2245,6 +2246,11 @@ Error AMDGPUCodeGenPassBuilder::addInstSelector(AddMachinePass &addPass) const { } void AMDGPUCodeGenPassBuilder::addPreRewrite(AddMachinePass &addPass) const { + // Hot block register renaming to reduce value density + if (TM.getOptLevel() > CodeGenOptLevel::None && EnableHotBlockRegRenaming) { + addPass(AMDGPUHotBlockRegisterRenamingPass()); + } + if (EnableRegReassign) { 
addPass(GCNNSAReassignPass()); } @@ -2347,7 +2353,6 @@ Error AMDGPUCodeGenPassBuilder::addRegAssignmentOptimized( // For allocating per-thread VGPRs. addPass(RAGreedyPass({onlyAllocateVGPRs, "vgpr"})); - addPreRewrite(addPass); addPass(VirtRegRewriterPass(true)); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 8c0f9d01a7d30..b2d45438021ce 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -63,6 +63,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUFrameLowering.cpp AMDGPUGlobalISelDivergenceLowering.cpp AMDGPUGlobalISelUtils.cpp + AMDGPUHotBlockRegisterRenaming.cpp AMDGPUHSAMetadataStreamer.cpp AMDGPUInsertDelayAlu.cpp AMDGPUInstCombineIntrinsic.cpp diff --git a/llvm/test/CodeGen/AMDGPU/hot-block-register-renaming.mir b/llvm/test/CodeGen/AMDGPU/hot-block-register-renaming.mir new file mode 100644 index 0000000000000..28c9c16f248d1 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/hot-block-register-renaming.mir @@ -0,0 +1,146 @@ +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-enable-hot-block-reg-renaming -verify-machineinstrs -run-pass=greedy,amdgpu-hot-block-reg-renaming,virtregrewriter -o - %s | FileCheck %s + +--- | + define amdgpu_kernel void @test_basic_move() { ret void } + define amdgpu_kernel void @test_tied_operand() { ret void } + define amdgpu_kernel void @test_no_free_registers() #0 { ret void } + + attributes #0 = { "amdgpu-num-vgpr"="8" } +... 
+ +--- +# Test 1: Verify that we correctly move a value when it's safe to do so +# Multiple values allocated to vgpr3, one should be moved to a free register +# CHECK-LABEL: name: test_basic_move +# CHECK: bb.1: +# CHECK-NOT: renamable $vgpr3 = V_ADD_F32_e64 0, renamable $vgpr0, 0, renamable $vgpr1 +# CHECK-NOT: renamable $vgpr3 = V_MUL_F32_e64 0, killed renamable $vgpr3, 0, renamable $vgpr2 +# CHECK: renamable $vgpr{{[4-9]|[1-5][0-9]}} = V_MUL_F32_e64 +name: test_basic_move +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' +registers: + - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' } + - { id: 1, class: vgpr_32, preferred-register: '$vgpr1' } + - { id: 2, class: vgpr_32, preferred-register: '$vgpr2' } + - { id: 3, class: vgpr_32, preferred-register: '$vgpr3' } + - { id: 4, class: vgpr_32, preferred-register: '$vgpr3' } + - { id: 5, class: vgpr_32, preferred-register: '$vgpr3' } + - { id: 6, class: vgpr_32, preferred-register: '$vgpr3' } + - { id: 7, class: vgpr_32, preferred-register: '$vgpr3' } +body: | + bb.0: + successors: %bb.1 + liveins: $vgpr0, $vgpr1, $vgpr2 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + + bb.1: + successors: %bb.1, %bb.2 + ; High value density: %3, %4, %5, %6, %7 all prefer vgpr3 + ; Some should be moved to free registers + %3:vgpr_32 = V_ADD_F32_e64 0, %0, 0, %1, 0, 0, implicit $mode, implicit $exec + %4:vgpr_32 = V_ADD_F32_e64 0, %3, 0, %2, 0, 0, implicit $mode, implicit $exec + %5:vgpr_32 = V_MUL_F32_e64 0, %3, 0, %2, 0, 0, implicit $mode, implicit $exec + %6:vgpr_32 = V_MUL_F32_e64 0, %3, 0, %1, 0, 0, implicit $mode, implicit $exec + %7:vgpr_32 = V_FMA_F32_e64 0, %3, 0, %2, 0, %4, 0, 0, implicit $mode, implicit $exec + S_CBRANCH_EXECNZ %bb.1, implicit $exec + + bb.2: + S_ENDPGM 0, implicit %7 + +... 
+--- +# Test 2: Verify that we do NOT move values with tied operands +# V_MAC_F32 has tied def-use, should not be moved +# CHECK-LABEL: name: test_tied_operand +# CHECK: bb.1: +# CHECK: renamable $vgpr3 = V_ADD_F32_e32 +# CHECK-NEXT: renamable $vgpr3 = V_MAC_F32_e32 +# CHECK-NEXT: renamable $vgpr3 = V_MUL_F32_e64 +name: test_tied_operand +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' +registers: + - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' } + - { id: 1, class: vgpr_32, preferred-register: '$vgpr1' } + - { id: 2, class: vgpr_32, preferred-register: '$vgpr3' } + - { id: 3, class: vgpr_32, preferred-register: '$vgpr3' } + - { id: 4, class: vgpr_32, preferred-register: '$vgpr3' } +body: | + bb.0: + successors: %bb.1 + liveins: $vgpr0, $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + + bb.1: + successors: %bb.1, %bb.2 + ; %2 and %3 both prefer vgpr3, but %3 has tied operand + %2:vgpr_32 = V_ADD_F32_e32 %1, %0, implicit $mode, implicit $exec + %3:vgpr_32 = V_MAC_F32_e32 %0, %1, %2, implicit $mode, implicit $exec + ; The pass should NOT move %3 because it has a tied def operand + %4:vgpr_32 = V_MUL_F32_e64 0, %3, 0, %0, 0, 0, implicit $mode, implicit $exec + S_CBRANCH_EXECNZ %bb.1, implicit $exec + + bb.2: + S_ENDPGM 0, implicit %4 + +... 
+--- +# Test 3: Verify that we do NOT move when there are no free registers +# With only 8 VGPRs available and all in use, no moves should happen +# CHECK-LABEL: name: test_no_free_registers +# CHECK: bb.1: +# CHECK: renamable $vgpr3 = V_ADD_F32_e64 +# CHECK: renamable $vgpr3 = V_MUL_F32_e64 +name: test_no_free_registers +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' +registers: + - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' } + - { id: 1, class: vgpr_32, preferred-register: '$vgpr1' } + - { id: 2, class: vgpr_32, preferred-register: '$vgpr2' } + - { id: 3, class: vgpr_32, preferred-register: '$vgpr3' } + - { id: 4, class: vgpr_32, preferred-register: '$vgpr4' } + - { id: 5, class: vgpr_32, preferred-register: '$vgpr5' } + - { id: 6, class: vgpr_32, preferred-register: '$vgpr6' } + - { id: 7, class: vgpr_32, preferred-register: '$vgpr7' } + - { id: 8, class: vgpr_32, preferred-register: '$vgpr3' } + - { id: 9, class: vgpr_32, preferred-register: '$vgpr3' } +body: | + bb.0: + successors: %bb.1 + liveins: $vgpr0, $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + ; Fill up vgpr2-7 to leave no free registers + %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %3:vgpr_32 = V_MOV_B32_e32 1, implicit $exec + %4:vgpr_32 = V_MOV_B32_e32 2, implicit $exec + %5:vgpr_32 = V_MOV_B32_e32 3, implicit $exec + %6:vgpr_32 = V_MOV_B32_e32 4, implicit $exec + %7:vgpr_32 = V_MOV_B32_e32 5, implicit $exec + + bb.1: + successors: %bb.1, %bb.2 + ; With all registers occupied, pass should not move values + %8:vgpr_32 = V_ADD_F32_e64 0, %0, 0, %1, 0, 0, implicit $mode, implicit $exec + %9:vgpr_32 = V_MUL_F32_e64 0, %8, 0, %1, 0, 0, implicit $mode, implicit $exec + S_CBRANCH_EXECNZ %bb.1, implicit $exec + + bb.2: + S_ENDPGM 0, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7, implicit %9 + +... 
+ From af1f538f1fcbb7e389a7211b66f3cc7ad16c7f53 Mon Sep 17 00:00:00 2001 From: Michael Selehov Date: Mon, 20 Oct 2025 06:52:42 -0500 Subject: [PATCH 2/8] [AMDGPU] Hot block register renaming: address PR review feedback - Rename canMoveValue to isVirtRegMovable for clarity - Add assertions to verify single-value precondition - Restore VRM->getPhys check: NOT redundant due to register aliasing (register units are shared between aliased registers like VGPR0 and VGPR0_VGPR1) - Improve tied operand check to verify tied source register compatibility --- .../AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp | 61 +++++++++++++++---- 1 file changed, 49 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp index bc95ee375d008..a6a7f1362626f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp @@ -104,9 +104,9 @@ class AMDGPUHotBlockRegisterRenamingImpl { bool isSuitableRegister(MCRegister PhysReg) const; /// Check if a virtual register can be safely moved - bool canMoveValue(Register VirtReg, MCRegister CurrentPhysReg, - MCRegister TargetPhysReg, SlotIndex BBStart, - SlotIndex BBEnd); + bool isVirtRegMovable(Register VirtReg, MCRegister CurrentPhysReg, + MCRegister TargetPhysReg, SlotIndex BBStart, + SlotIndex BBEnd); /// Try to move a value from DenseReg to FreeReg bool tryMoveValue(MCRegister DenseReg, MCRegister FreeReg, @@ -363,19 +363,30 @@ void AMDGPUHotBlockRegisterRenamingImpl::findFreeRegisters( } } -bool AMDGPUHotBlockRegisterRenamingImpl::canMoveValue(Register VirtReg, - MCRegister CurrentPhysReg, - MCRegister TargetPhysReg, - SlotIndex BBStart, - SlotIndex BBEnd) { +bool AMDGPUHotBlockRegisterRenamingImpl::isVirtRegMovable(Register VirtReg, + MCRegister CurrentPhysReg, + MCRegister TargetPhysReg, + SlotIndex BBStart, + SlotIndex BBEnd) { + + LiveInterval &VirtRegLI = 
LIS->getInterval(VirtReg); + + // Verify precondition: single value with single segment in BB + unsigned SegmentCount = 0; + for (const LiveRange::Segment &S : VirtRegLI) { + if (S.start >= BBStart && S.end <= BBEnd) + SegmentCount++; + } + assert(SegmentCount == 1 && + "isVirtRegMovable expects VirtReg with single segment in BB"); + assert(VirtRegLI.getNumValNums() == 1 && + "isVirtRegMovable expects VirtReg with single value"); // Check for tied operands // A tied operand means the instruction requires source and destination to be // the same physical register. Moving such a value would break this // constraint. - LiveInterval &VirtRegLI = LIS->getInterval(VirtReg); - for (const LiveRange::Segment &S : VirtRegLI) { // Only check segments within this BB if (S.start < BBStart || S.end > BBEnd) @@ -387,8 +398,31 @@ bool AMDGPUHotBlockRegisterRenamingImpl::canMoveValue(Register VirtReg, if (!DefMI) continue; - for (const MachineOperand &MO : DefMI->operands()) { + for (unsigned OpIdx = 0, E = DefMI->getNumOperands(); OpIdx < E; ++OpIdx) { + const MachineOperand &MO = DefMI->getOperand(OpIdx); if (MO.isReg() && MO.getReg() == VirtReg && MO.isDef() && MO.isTied()) { + // Found a tied def - need to check the source operand it's tied to + unsigned TiedIdx = DefMI->findTiedOperandIdx(OpIdx); + const MachineOperand &TiedMO = DefMI->getOperand(TiedIdx); + + // If the tied source is a register, verify it won't conflict + if (TiedMO.isReg()) { + Register TiedReg = TiedMO.getReg(); + if (TiedReg.isVirtual()) { + MCRegister TiedPhysReg = VRM->getPhys(TiedReg); + // Cannot move if it would violate the tied constraint + // (source and dest must be in same physical register) + if (TiedPhysReg != CurrentPhysReg) { + LLVM_DEBUG(dbgs() << " Cannot move " << printReg(VirtReg, TRI) + << ": tied to " << printReg(TiedReg, TRI) + << " which is in different PhysReg " + << printReg(TiedPhysReg, TRI) << " at " << S.start + << " in " << *DefMI); + return false; + } + } + } + LLVM_DEBUG(dbgs() 
<< " Cannot move " << printReg(VirtReg, TRI) << ": has tied def at " << S.start << " in " << *DefMI); @@ -418,6 +452,9 @@ bool AMDGPUHotBlockRegisterRenamingImpl::tryMoveValue(MCRegister DenseReg, Register VirtReg = SI.value()->reg(); // Check if this VirtReg is mapped to DenseReg + // NOTE: This is NOT redundant! We iterate per register unit, and units + // can be shared between aliased registers (e.g., VGPR0 and VGPR0_VGPR1). + // This check filters out VirtRegs mapped to aliased registers. if (VRM->getPhys(VirtReg) != DenseReg) continue; @@ -450,7 +487,7 @@ bool AMDGPUHotBlockRegisterRenamingImpl::tryMoveValue(MCRegister DenseReg, } // Check: Can this value be safely moved? - if (!canMoveValue(VirtReg, DenseReg, FreeReg, BBStart, BBEnd)) { + if (!isVirtRegMovable(VirtReg, DenseReg, FreeReg, BBStart, BBEnd)) { // Cache the result to avoid checking again UnmovableVRegs.insert(VirtReg); continue; From fdb2f21a30b558dfb6dc1f1412fc1b77c7cf3a14 Mon Sep 17 00:00:00 2001 From: Michael Selehov Date: Fri, 24 Oct 2025 06:05:38 -0500 Subject: [PATCH 3/8] [AMDGPU] enable Hot Block Register Renaming by default for CI/CT run This flips the default of -amdgpu-enable-hot-block-reg-renaming to true to exercise the pass across large CI/CT builds. This is a temporary enablement to flush out issues; users can still disable with -mllvm -amdgpu-enable-hot-block-reg-renaming=false. 
--- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 7ad686a7d0e32..5eba6f6b18a22 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -416,7 +416,7 @@ static cl::opt EnableRegReassign( static cl::opt EnableHotBlockRegRenaming( "amdgpu-enable-hot-block-reg-renaming", cl::desc("Enable hot block register renaming to reduce value density"), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); static cl::opt OptVGPRLiveRange( "amdgpu-opt-vgpr-liverange", From 695770a117db52bf24f40ed8c91a20248011b214 Mon Sep 17 00:00:00 2001 From: Michael Selehov Date: Mon, 27 Oct 2025 10:03:39 -0500 Subject: [PATCH 4/8] [AMDGPU] Fix Hot Block Register Renaming assertions on complex IR Fix two assertions discovered during CI/CT testing with rocBLAS kernels: 1. isVirtRegMovable() crashed on PHI nodes with multiple value definitions. Converted assertions to early-return checks, allowing the pass to skip unmovable registers instead of crashing on legitimate IR patterns. 2. tryMoveValue() assumed LiveIntervalUnion contains only virtual registers, but it can contain physical registers after allocation. Added isVirtual() check before calling VirtRegMap::getPhys() to prevent assertion failures. Both fixes improve robustness without affecting correctness or performance. 
--- .../AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp | 27 ++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp index a6a7f1362626f..820132df8e23c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp @@ -377,10 +377,21 @@ bool AMDGPUHotBlockRegisterRenamingImpl::isVirtRegMovable(Register VirtReg, if (S.start >= BBStart && S.end <= BBEnd) SegmentCount++; } - assert(SegmentCount == 1 && - "isVirtRegMovable expects VirtReg with single segment in BB"); - assert(VirtRegLI.getNumValNums() == 1 && - "isVirtRegMovable expects VirtReg with single value"); + + // Cannot move registers with multiple segments in BB (e.g., PHI nodes) + if (SegmentCount != 1) { + LLVM_DEBUG(dbgs() << " Cannot move " << printReg(VirtReg, TRI) + << ": has " << SegmentCount << " segments in BB\n"); + return false; + } + + // Cannot move registers with multiple definitions (e.g., from PHI merge) + if (VirtRegLI.getNumValNums() != 1) { + LLVM_DEBUG(dbgs() << " Cannot move " << printReg(VirtReg, TRI) + << ": has " << VirtRegLI.getNumValNums() + << " value definitions\n"); + return false; + } // Check for tied operands // A tied operand means the instruction requires source and destination to be @@ -451,6 +462,14 @@ bool AMDGPUHotBlockRegisterRenamingImpl::tryMoveValue(MCRegister DenseReg, for (LiveIntervalUnion::SegmentIter SI = LIU.begin(); SI.valid(); ++SI) { Register VirtReg = SI.value()->reg(); + // Skip physical registers (LiveIntervalUnion can contain both) + if (!VirtReg.isVirtual()) + continue; + + // Skip virtual registers that haven't been allocated yet + if (!VRM->hasPhys(VirtReg)) + continue; + // Check if this VirtReg is mapped to DenseReg // NOTE: This is NOT redundant! 
We iterate per register unit, and units // can be shared between aliased registers (e.g., VGPR0 and VGPR0_VGPR1). From 009025606cc636d9acb42594bd496804901d4c69 Mon Sep 17 00:00:00 2001 From: Michael Selehov Date: Thu, 30 Oct 2025 08:48:01 -0500 Subject: [PATCH 5/8] AMDGPU: Fix correctness issues in Hot Block Register Renaming pass Three critical correctness fixes for the Hot Block Register Renaming pass: Fix #0 (Kernel-Only): Restrict pass to kernel functions only. Post-RA passes cannot safely modify non-kernel functions because they have no mechanism to update RegMask operands in caller's call instructions, which would lead to inter-procedural register corruption. Fix #1a (Redefinitions): Check that target free register is not redefined by any instruction within the virtual register's live range. Without this check, moving a value to a register that gets overwritten mid-range causes segfaults. Fix #1b (Call Clobbers): Use LiveIntervals::checkRegMaskInterference() to verify that target register is not clobbered by any call instruction within the live range. Prevents incorrect register assignments across function calls. All fixes verified on aomp-complex test case (segfault fixed) and rocRAND MTGP32 kernel (117 values remapped, original optimization preserved). 
--- .../AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp | 85 +++++++++++++++++-- 1 file changed, 78 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp index 820132df8e23c..c4c16c56f17c8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp @@ -44,6 +44,7 @@ #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/IR/CallingConv.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -57,6 +58,8 @@ STATISTIC(NumBlocksProcessed, "Number of hot blocks processed"); STATISTIC(NumValuesRemapped, "Number of values remapped to reduce density"); STATISTIC(NumBlocksSkipped, "Number of blocks skipped (no dense regs or no free regs)"); +STATISTIC(NumNonKernelsSkipped, + "Number of non-kernel functions skipped for safety"); namespace { @@ -110,7 +113,8 @@ class AMDGPUHotBlockRegisterRenamingImpl { /// Try to move a value from DenseReg to FreeReg bool tryMoveValue(MCRegister DenseReg, MCRegister FreeReg, - MachineBasicBlock *MBB, SlotIndex BBStart, SlotIndex BBEnd); + MachineBasicBlock *MBB, SlotIndex BBStart, SlotIndex BBEnd, + const DenseMap> &PhysRegDefs); }; class AMDGPUHotBlockRegisterRenamingLegacy : public MachineFunctionPass { @@ -173,6 +177,17 @@ bool AMDGPUHotBlockRegisterRenamingImpl::run(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "AMDGPUHotBlockRegisterRenaming: Processing " << MF.getName() << "\n"); + // Fix #0: Skip non-kernel functions to avoid RegMask corruption issues. + // Post-RA pass cannot update RegMask operands in caller's call instructions, + // which would lead to incorrect assumptions about clobbered registers. 
+ CallingConv::ID CC = MF.getFunction().getCallingConv(); + if (CC != CallingConv::AMDGPU_KERNEL) { + LLVM_DEBUG(dbgs() << " Skipping non-kernel function (CC=" << CC + << "): Post-RA pass cannot safely modify callees\n"); + ++NumNonKernelsSkipped; + return false; + } + TRI = ST->getRegisterInfo(); MRI = &MF.getRegInfo(); @@ -236,6 +251,33 @@ bool AMDGPUHotBlockRegisterRenamingImpl::processBasicBlock( << " registers with values, " << FreeRegs.size() << " free registers\n"); + // Step 2a: Build PhysReg definitions cache (Fix #1a) + // Track all SlotIndexes where each physical register is defined + const TargetRegisterClass *VGPR_32_RC = + TRI->getRegClass(AMDGPU::VGPR_32RegClassID); + DenseMap> PhysRegDefs; + + for (MachineInstr &MI : *MBB) { + SlotIndex Idx = LIS->getInstructionIndex(MI); + for (const MachineOperand &MO : MI.operands()) { + if (MO.isReg() && MO.isDef() && MO.getReg().isPhysical()) { + MCRegister PhysReg = MO.getReg(); + if (VGPR_32_RC->contains(PhysReg)) { + PhysRegDefs[PhysReg].push_back(Idx); + // Also track superregs for aliasing + for (MCRegister Super : TRI->superregs(PhysReg)) { + PhysRegDefs[Super].push_back(Idx); + } + } + } + } + } + + LLVM_DEBUG({ + dbgs() << " Built PhysRegDefs cache: " << PhysRegDefs.size() + << " registers have definitions in this BB\n"; + }); + // Step 3: Create max heap of dense registers auto Comparator = [&ValueDensity](MCRegister A, MCRegister B) { return ValueDensity[A] < ValueDensity[B]; // max heap @@ -266,7 +308,7 @@ bool AMDGPUHotBlockRegisterRenamingImpl::processBasicBlock( MCRegister FreeReg = FreeRegs[FreeRegIdx++]; - if (tryMoveValue(DenseReg, FreeReg, MBB, BBStart, BBEnd)) { + if (tryMoveValue(DenseReg, FreeReg, MBB, BBStart, BBEnd, PhysRegDefs)) { Changed = true; ++NumValuesRemapped; @@ -450,11 +492,10 @@ bool AMDGPUHotBlockRegisterRenamingImpl::isVirtRegMovable(Register VirtReg, return true; } -bool AMDGPUHotBlockRegisterRenamingImpl::tryMoveValue(MCRegister DenseReg, - MCRegister FreeReg, - 
MachineBasicBlock *MBB, - SlotIndex BBStart, - SlotIndex BBEnd) { +bool AMDGPUHotBlockRegisterRenamingImpl::tryMoveValue( + MCRegister DenseReg, MCRegister FreeReg, MachineBasicBlock *MBB, + SlotIndex BBStart, SlotIndex BBEnd, + const DenseMap> &PhysRegDefs) { // Find a movable local value in DenseReg for (MCRegUnit Unit : TRI->regunits(DenseReg)) { LiveIntervalUnion &LIU = LRM->getLiveUnions()[Unit]; @@ -512,6 +553,36 @@ bool AMDGPUHotBlockRegisterRenamingImpl::tryMoveValue(MCRegister DenseReg, continue; } + // Fix #1a: Check that FreeReg is not redefined in VirtReg's live range + auto DefIt = PhysRegDefs.find(FreeReg); + if (DefIt != PhysRegDefs.end()) { + bool HasConflict = false; + for (SlotIndex DefIdx : DefIt->second) { + // Check if definition is strictly inside the live range (not at endpoints) + if (DefIdx > SegStart && DefIdx < SegEnd) { + LLVM_DEBUG(dbgs() << " Cannot move to " << printReg(FreeReg, TRI) + << ": redefined at " << DefIdx << " inside live range [" + << SegStart << ", " << SegEnd << ")\n"); + HasConflict = true; + break; + } + } + if (HasConflict) + continue; // Try next VirtReg + } + + // Fix #1b: Check that FreeReg is not clobbered by any call in the live range + BitVector UsableRegs; + if (LIS->checkRegMaskInterference(VirtRegLI, UsableRegs)) { + // checkRegMaskInterference returns true if LI crosses RegMask instructions + // UsableRegs now contains registers NOT clobbered by any RegMask + if (!UsableRegs.test(FreeReg)) { + LLVM_DEBUG(dbgs() << " Cannot move to " << printReg(FreeReg, TRI) + << ": clobbered by call RegMask in live range\n"); + continue; // Try next VirtReg + } + } + // This VirtReg is movable! 
Perform the remap LLVM_DEBUG(dbgs() << " Moving " << printReg(VirtReg, TRI) << " from " << printReg(DenseReg, TRI) << " to " From 6d8d689f206c6e56ec39f6ed63245dd71d3a53d2 Mon Sep 17 00:00:00 2001 From: Valery Pykhtin Date: Fri, 7 Nov 2025 14:49:59 +0000 Subject: [PATCH 6/8] Add validity check for LiveRegMatrix to prevent dangling pointers - Implemented `isValid()` method in LiveRegMatrix to verify that all LiveInterval pointers are valid. - Added assertion in RegAllocBase's `postOptimization()` to ensure no dangling pointers exist in LiveRegMatrix after spilling. --- llvm/include/llvm/CodeGen/LiveRegMatrix.h | 10 ++++++++ llvm/lib/CodeGen/LiveRegMatrix.cpp | 28 +++++++++++++++++++++++ llvm/lib/CodeGen/RegAllocBase.cpp | 7 ++++++ 3 files changed, 45 insertions(+) diff --git a/llvm/include/llvm/CodeGen/LiveRegMatrix.h b/llvm/include/llvm/CodeGen/LiveRegMatrix.h index 0bc243271bb73..3d1566927f3a0 100644 --- a/llvm/include/llvm/CodeGen/LiveRegMatrix.h +++ b/llvm/include/llvm/CodeGen/LiveRegMatrix.h @@ -168,6 +168,16 @@ class LiveRegMatrix { LiveIntervalUnion *getLiveUnions() { return &Matrix[0]; } Register getOneVReg(unsigned PhysReg) const; + + /// Verify that all LiveInterval pointers in the matrix are valid. + /// This checks that each LiveInterval referenced in LiveIntervalUnion + /// actually exists in LiveIntervals and is not a dangling pointer. + /// Returns true if the matrix is valid, false if dangling pointers are found. + /// This is primarily useful for debugging heap-use-after-free issues. + /// This method uses a lazy approach - it builds a set of valid LiveInterval + /// pointers on-demand and has zero runtime/memory overhead during normal + /// register allocation. 
+ bool isValid() const; }; class LiveRegMatrixWrapperLegacy : public MachineFunctionPass { diff --git a/llvm/lib/CodeGen/LiveRegMatrix.cpp b/llvm/lib/CodeGen/LiveRegMatrix.cpp index cfda262aac82d..65b47a08bd48e 100644 --- a/llvm/lib/CodeGen/LiveRegMatrix.cpp +++ b/llvm/lib/CodeGen/LiveRegMatrix.cpp @@ -12,11 +12,13 @@ #include "llvm/CodeGen/LiveRegMatrix.h" #include "RegisterCoalescer.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/LiveIntervalUnion.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/CodeGen/VirtRegMap.h" @@ -290,6 +292,32 @@ Register LiveRegMatrix::getOneVReg(unsigned PhysReg) const { return MCRegister::NoRegister; } +bool LiveRegMatrix::isValid() const { + // Build set of all valid LiveInterval pointers from LiveIntervals. 
+ DenseSet ValidIntervals; + for (unsigned RegIdx = 0, NumRegs = VRM->getRegInfo().getNumVirtRegs(); + RegIdx < NumRegs; ++RegIdx) { + Register VReg = Register::index2VirtReg(RegIdx); + // Only track assigned registers since unassigned ones won't be in Matrix + if (VRM->hasPhys(VReg) && LIS->hasInterval(VReg)) + ValidIntervals.insert(&LIS->getInterval(VReg)); + } + + // Now scan all LiveIntervalUnions in the matrix and verify each pointer + unsigned NumDanglingPointers = 0; + for (unsigned Unit = 0, NumUnits = Matrix.size(); Unit != NumUnits; ++Unit) { + for (const LiveInterval *LI : Matrix[Unit]) { + if (!ValidIntervals.contains(LI)) { + ++NumDanglingPointers; + dbgs() << "ERROR: LiveInterval pointer is not found in LiveIntervals:\n" + << " Register Unit: " << printRegUnit(Unit, TRI) << "\n" + << " LiveInterval pointer: " << LI << "\n"; + } + } + } + return NumDanglingPointers == 0; +} + AnalysisKey LiveRegMatrixAnalysis::Key; LiveRegMatrix LiveRegMatrixAnalysis::run(MachineFunction &MF, diff --git a/llvm/lib/CodeGen/RegAllocBase.cpp b/llvm/lib/CodeGen/RegAllocBase.cpp index 2400a1feea26e..f8e2daea8a340 100644 --- a/llvm/lib/CodeGen/RegAllocBase.cpp +++ b/llvm/lib/CodeGen/RegAllocBase.cpp @@ -155,6 +155,13 @@ void RegAllocBase::allocatePhysRegs() { void RegAllocBase::postOptimization() { spiller().postOptimization(); + + // Verify that LiveRegMatrix has no dangling pointers after spilling. + // This catches bugs where LiveIntervals are deleted but not removed from + // the LiveRegMatrix (e.g., LLVM bug #48911). + assert(Matrix->isValid() && + "LiveRegMatrix contains dangling pointers after postOptimization"); + for (auto *DeadInst : DeadRemats) { LIS->RemoveMachineInstrFromMaps(*DeadInst); DeadInst->eraseFromParent(); From b4d452c90370552f12ef729a1f215df83ff2bfb5 Mon Sep 17 00:00:00 2001 From: Valery Pykhtin Date: Fri, 7 Nov 2025 17:12:07 +0000 Subject: [PATCH 7/8] [regalloc] Fix dangling pointers left in LiveRegMatrix. 
--- llvm/include/llvm/CodeGen/LiveIntervalUnion.h | 4 ++++ llvm/include/llvm/CodeGen/LiveRegMatrix.h | 2 ++ llvm/lib/CodeGen/InlineSpiller.cpp | 22 ++++++++++++++++--- llvm/lib/CodeGen/LiveIntervalUnion.cpp | 13 +++++++++++ llvm/lib/CodeGen/LiveRegMatrix.cpp | 15 +++++++++++++ .../Target/AMDGPU/SIPreAllocateWWMRegs.cpp | 5 +++-- 6 files changed, 56 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/CodeGen/LiveIntervalUnion.h b/llvm/include/llvm/CodeGen/LiveIntervalUnion.h index cc0f2a45bb182..643f62fa235b1 100644 --- a/llvm/include/llvm/CodeGen/LiveIntervalUnion.h +++ b/llvm/include/llvm/CodeGen/LiveIntervalUnion.h @@ -93,6 +93,10 @@ class LiveIntervalUnion { // Remove a live virtual register's segments from this union. void extract(const LiveInterval &VirtReg, const LiveRange &Range); + // Remove all segments referencing VirtReg. This may be used if the register + // isn't used anymore. + void clear_all_segments_referencing(const LiveInterval &VirtReg); + // Remove all inserted virtual registers. void clear() { Segments.clear(); ++Tag; } diff --git a/llvm/include/llvm/CodeGen/LiveRegMatrix.h b/llvm/include/llvm/CodeGen/LiveRegMatrix.h index 3d1566927f3a0..14c653244fe16 100644 --- a/llvm/include/llvm/CodeGen/LiveRegMatrix.h +++ b/llvm/include/llvm/CodeGen/LiveRegMatrix.h @@ -135,6 +135,8 @@ class LiveRegMatrix { /// the assignment and updates VirtRegMap accordingly. void unassign(const LiveInterval &VirtReg); + void unassign(Register VirtReg); + /// Returns true if the given \p PhysReg has any live intervals assigned. 
 bool isPhysRegUsed(MCRegister PhysReg) const;
diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
index c3e0964594bd5..269c17d3dfbd4 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -86,6 +86,7 @@ class HoistSpillHelper : private LiveRangeEdit::Delegate {
   const TargetInstrInfo &TII;
   const TargetRegisterInfo &TRI;
   const MachineBlockFrequencyInfo &MBFI;
+  LiveRegMatrix &Matrix;
 
   InsertPointAnalysis IPA;
 
@@ -129,16 +130,17 @@ class HoistSpillHelper : private LiveRangeEdit::Delegate {
 
 public:
   HoistSpillHelper(const Spiller::RequiredAnalyses &Analyses,
-                   MachineFunction &mf, VirtRegMap &vrm)
+                   MachineFunction &mf, VirtRegMap &vrm, LiveRegMatrix &matrix)
       : MF(mf), LIS(Analyses.LIS), LSS(Analyses.LSS), MDT(Analyses.MDT),
         VRM(vrm), MRI(mf.getRegInfo()), TII(*mf.getSubtarget().getInstrInfo()),
         TRI(*mf.getSubtarget().getRegisterInfo()), MBFI(Analyses.MBFI),
-        IPA(LIS, mf.getNumBlockIDs()) {}
+        Matrix(matrix), IPA(LIS, mf.getNumBlockIDs()) {}
 
   void addToMergeableSpills(MachineInstr &Spill, int StackSlot,
                             Register Original);
   bool rmFromMergeableSpills(MachineInstr &Spill, int StackSlot);
   void hoistAllSpills();
+  bool LRE_CanEraseVirtReg(Register) override;
   void LRE_DidCloneVirtReg(Register, Register) override;
 };
 
@@ -191,7 +193,7 @@ class InlineSpiller : public Spiller {
       : MF(MF), LIS(Analyses.LIS), LSS(Analyses.LSS), VRM(VRM),
         MRI(MF.getRegInfo()), TII(*MF.getSubtarget().getInstrInfo()),
         TRI(*MF.getSubtarget().getRegisterInfo()), Matrix(Matrix),
-        HSpiller(Analyses, MF, VRM), VRAI(VRAI) {}
+        HSpiller(Analyses, MF, VRM, *Matrix), VRAI(VRAI) {}
 
   void spill(LiveRangeEdit &, AllocationOrder *Order = nullptr) override;
   ArrayRef<Register> getSpilledRegs() override { return RegsToSpill; }
@@ -1750,6 +1752,20 @@ void HoistSpillHelper::hoistAllSpills() {
   }
 }
 
+/// Called before a virtual register is erased from LiveIntervals.
+/// Forcibly remove the register from LiveRegMatrix before it's deleted,
+/// preventing dangling pointers.
+bool HoistSpillHelper::LRE_CanEraseVirtReg(Register VirtReg) {
+  // If this virtual register is assigned to a physical register,
+  // unassign it from LiveRegMatrix before the interval is deleted.
+  // Use the Register overload of unassign() instead of the LiveInterval
+  // one because the interval may already be empty or inconsistent.
+  if (VRM.hasPhys(VirtReg)) {
+    Matrix.unassign(VirtReg);
+  }
+  return true; // Allow deletion to proceed
+}
+
 /// For VirtReg clone, the \p New register should have the same physreg or
 /// stackslot as the \p old register.
 void HoistSpillHelper::LRE_DidCloneVirtReg(Register New, Register Old) {
diff --git a/llvm/lib/CodeGen/LiveIntervalUnion.cpp b/llvm/lib/CodeGen/LiveIntervalUnion.cpp
index eb547c5238432..f5643b9d2ca83 100644
--- a/llvm/lib/CodeGen/LiveIntervalUnion.cpp
+++ b/llvm/lib/CodeGen/LiveIntervalUnion.cpp
@@ -79,6 +79,19 @@ void LiveIntervalUnion::extract(const LiveInterval &VirtReg,
   }
 }
 
+void LiveIntervalUnion::clear_all_segments_referencing(
+    const LiveInterval &VirtReg) {
+  ++Tag;
+
+  // Remove all segments referencing VirtReg.
+  for (SegmentIter SegPos = Segments.begin(); SegPos.valid();) {
+    if (SegPos.value() == &VirtReg)
+      SegPos.erase();
+    else
+      ++SegPos;
+  }
+}
+
 void LiveIntervalUnion::print(raw_ostream &OS,
                               const TargetRegisterInfo *TRI) const {
   if (empty()) {
diff --git a/llvm/lib/CodeGen/LiveRegMatrix.cpp b/llvm/lib/CodeGen/LiveRegMatrix.cpp
index 65b47a08bd48e..a3d1d4561bef2 100644
--- a/llvm/lib/CodeGen/LiveRegMatrix.cpp
+++ b/llvm/lib/CodeGen/LiveRegMatrix.cpp
@@ -144,6 +144,21 @@ void LiveRegMatrix::unassign(const LiveInterval &VirtReg) {
   LLVM_DEBUG(dbgs() << '\n');
 }
 
+void LiveRegMatrix::unassign(Register VirtReg) {
+  Register PhysReg = VRM->getPhys(VirtReg);
+  LLVM_DEBUG(dbgs() << "unassigning " << printReg(VirtReg, TRI)
+                    << " from " << printReg(PhysReg, TRI) << ':');
+  VRM->clearVirt(VirtReg);
+
+  assert(LIS->hasInterval(VirtReg));
+  const LiveInterval &LI = LIS->getInterval(VirtReg);
+  for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
+    Matrix[Unit].clear_all_segments_referencing(LI);
+  }
+  ++NumUnassigned;
+  LLVM_DEBUG(dbgs() << '\n');
+}
+
 bool LiveRegMatrix::isPhysRegUsed(MCRegister PhysReg) const {
   for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
     if (!Matrix[Unit].empty())
diff --git a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index ecfaa5c70e9d3..e402068b93c3f 100644
--- a/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -153,10 +153,11 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
   for (unsigned Reg : RegsToRewrite) {
-    LIS->removeInterval(Reg);
-
     const Register PhysReg = VRM->getPhys(Reg);
     assert(PhysReg != 0);
+
+    Matrix->unassign(Reg);
+    LIS->removeInterval(Reg);
 
     MFI->reserveWWMRegister(PhysReg);
   }

From e3cb46236485f232dab30b89679fdee12811e631 Mon Sep 17 00:00:00 2001
From: Michael Selehov
Date: Tue, 18 Nov 2025 10:09:59 -0600
Subject: [PATCH 8/8] Fix for MCRegUnit

---
 llvm/lib/CodeGen/LiveRegMatrix.cpp                        | 5 +++--
 llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp | 8 +++++---
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/CodeGen/LiveRegMatrix.cpp b/llvm/lib/CodeGen/LiveRegMatrix.cpp
index a3d1d4561bef2..030d07ecd0850 100644
--- a/llvm/lib/CodeGen/LiveRegMatrix.cpp
+++ b/llvm/lib/CodeGen/LiveRegMatrix.cpp
@@ -321,11 +321,12 @@ bool LiveRegMatrix::isValid() const {
   // Now scan all LiveIntervalUnions in the matrix and verify each pointer
   unsigned NumDanglingPointers = 0;
   for (unsigned Unit = 0, NumUnits = Matrix.size(); Unit != NumUnits; ++Unit) {
-    for (const LiveInterval *LI : Matrix[Unit]) {
+    MCRegUnit RegUnit = static_cast<MCRegUnit>(Unit);
+    for (const LiveInterval *LI : Matrix[RegUnit]) {
       if (!ValidIntervals.contains(LI)) {
         ++NumDanglingPointers;
         dbgs() << "ERROR: LiveInterval pointer is not found in LiveIntervals:\n"
-               << "  Register Unit: " << printRegUnit(Unit, TRI) << "\n"
+               << "  Register Unit: " << printRegUnit(RegUnit, TRI) << "\n"
                << "  LiveInterval pointer: " << LI << "\n";
       }
     }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp
index c4c16c56f17c8..a6a8e711488fe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHotBlockRegisterRenaming.cpp
@@ -348,7 +348,8 @@ void AMDGPUHotBlockRegisterRenamingImpl::calculateValueDensity(
 
   // Access LiveIntervalUnion for this PhysReg
   for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
-    LiveIntervalUnion &LIU = LRM->getLiveUnions()[Unit];
+    LiveIntervalUnion &LIU =
+        LRM->getLiveUnions()[static_cast<unsigned>(Unit)];
 
     for (LiveIntervalUnion::SegmentIter SI = LIU.begin(); SI.valid(); ++SI) {
       SlotIndex SegStart = SI.start();
@@ -389,7 +390,8 @@ void AMDGPUHotBlockRegisterRenamingImpl::findFreeRegisters(
 
   // Check all register units
   for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
-    LiveIntervalUnion &LIU = LRM->getLiveUnions()[Unit];
+    LiveIntervalUnion &LIU =
+        LRM->getLiveUnions()[static_cast<unsigned>(Unit)];
 
     // Check if anything is live in this BB
     LiveIntervalUnion::SegmentIter SI = LIU.find(BBStart);
@@ -498,7 +500,7 @@ bool AMDGPUHotBlockRegisterRenamingImpl::tryMoveValue(
     const DenseMap> &PhysRegDefs) {
   // Find a movable local value in DenseReg
   for (MCRegUnit Unit : TRI->regunits(DenseReg)) {
-    LiveIntervalUnion &LIU = LRM->getLiveUnions()[Unit];
+    LiveIntervalUnion &LIU = LRM->getLiveUnions()[static_cast<unsigned>(Unit)];
 
     for (LiveIntervalUnion::SegmentIter SI = LIU.begin(); SI.valid(); ++SI) {
       Register VirtReg = SI.value()->reg();