[AMDGPU][SILowerSGPRSpills] Spill SGPRs to virtual VGPRs
Currently, the custom SGPR spill lowering pass spills
SGPRs into physical VGPR lanes, and the remaining VGPRs
are used by regalloc for vector-regclass allocation.
This imposes many restrictions: when there are not
enough free VGPRs, SGPR spilling fails and we are forced
to spill the leftovers into memory during PEI. The
custom spill handling during PEI has many edge cases and
breaks the compiler from time to time.

This patch implements spilling SGPRs into virtual VGPR
lanes. Since we now split the register allocation for
SGPRs and VGPRs, the virtual registers introduced for
the spill lanes would get allocated automatically in
the subsequent regalloc invocation for VGPRs.
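
The core of the new lowering, distilled as a minimal C++
sketch (hand-written for this summary from the pass code in
the diff below, not the verbatim patch): each SGPR spill
pseudo gets a lane in a virtual VGPR, and the pseudo is
rewritten against that virtual register so the later VGPR
regalloc run can assign it.

// Sketch only: per-spill lowering as done in SILowerSGPRSpills (simplified).
static void lowerOneSGPRSpill(MachineInstr &MI, const SIInstrInfo *TII,
                              const SIRegisterInfo *TRI, SlotIndexes *Indexes,
                              LiveIntervals *LIS) {
  MachineFunction &MF = *MI.getMF();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  // Frame index of the SGPR spill slot (stack ID TargetStackID::SGPRSpill).
  int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();

  // Reserve a lane in a *virtual* VGPR for this slot; unlike the old
  // physical-VGPR allocation, this always succeeds.
  if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) {
    // Rewrite the pseudo into lane writes/reads on the virtual VGPR.
    TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI, nullptr, Indexes, LIS);
  }
}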

Spilling to virtual registers is always successful,
even in high-pressure situations, and hence it avoids
most of the edge cases during PEI. We are now left with
only the custom SGPR spills during PEI for special
registers such as the frame pointer, which is an
unproblematic case.

By spilling CSRs into virtual VGPR lanes, we might end up
with broken CFI that could corrupt frame unwinding in the
debugger, causing either a crash or a terrible debugging
experience. This occurs when regalloc spills or splits the
live range of such a virtual VGPR: CFI entries would also
have to be inserted at these intermediate points to stay
correct, and that is not currently implemented in the
compiler. As a short-term fix, we continue to spill CSR
SGPRs into physical VGPR lanes so that the debugger can
compute the unwind information correctly.
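
As a concrete illustration of that short-term fix, the
per-spill decision in the pass looks roughly like the sketch
below (simplified from the diff; llvm::is_contained stands in
for the patch's std::find, and the trailing boolean selects
physical VGPR lanes, its parameter name paraphrased here).

// Sketch only: CSR SGPR spills keep physical VGPR lanes so their CFI stays
// static; every other SGPR spill goes to a virtual lane.
bool IsCalleeSaveSGPRSpill = llvm::is_contained(CalleeSavedFIs, FI);
if (IsCalleeSaveSGPRSpill) {
  // Physical lane: reserved VGPR, so the emitted CFI never has to move.
  FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI, /*physical lanes*/ true);
  TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI, nullptr, Indexes, LIS,
                                          /*physical lanes*/ true);
} else {
  // Virtual lane: regalloc may later spill or split it, which plain CFI
  // emission cannot describe yet.
  FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI);
  TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI, nullptr, Indexes, LIS);
}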

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D124196

Change-Id: I1180639b1211b05e439132f9fc978860577c9017
cdevadas committed Jan 27, 2023
1 parent 9729adc commit a29fe42
Showing 72 changed files with 5,144 additions and 4,786 deletions.
16 changes: 8 additions & 8 deletions llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -101,10 +101,10 @@ static void getVGPRSpillLaneOrTempRegister(
SGPR, PrologEpilogSGPRSaveRestoreInfo(
SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));

LLVM_DEBUG(
auto Spill = MFI->getPrologEpilogSGPRSpillToVGPRLanes(FI).front();
dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
<< printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';);
LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front();
dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
<< printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
<< '\n';);
} else {
// Remove dead <FI> index
MF.getFrameInfo().RemoveStackObject(FI);
@@ -289,7 +289,7 @@ class PrologEpilogSGPRSpillBuilder {

assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
ArrayRef<SIRegisterInfo::SpilledReg> Spill =
FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI);
FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
assert(Spill.size() == NumSubRegs);

for (unsigned I = 0; I < NumSubRegs; ++I) {
@@ -372,7 +372,7 @@ class PrologEpilogSGPRSpillBuilder {
void restoreFromVGPRLane(const int FI) {
assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
ArrayRef<SIRegisterInfo::SpilledReg> Spill =
FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI);
FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
assert(Spill.size() == NumSubRegs);

for (unsigned I = 0; I < NumSubRegs; ++I) {
@@ -1544,8 +1544,8 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
TRI->isAGPR(MRI, VReg))) {
// FIXME: change to enterBasicBlockEnd()
RS->enterBasicBlock(MBB);
RS->enterBasicBlockEnd(MBB);
RS->backward(MI);
TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
SpillFIs.set(FI);
continue;
151 changes: 131 additions & 20 deletions llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -22,6 +22,7 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/InitializePasses.h"
@@ -40,6 +41,7 @@ class SILowerSGPRSpills : public MachineFunctionPass {
const SIInstrInfo *TII = nullptr;
LiveIntervals *LIS = nullptr;
SlotIndexes *Indexes = nullptr;
MachineDominatorTree *MDT = nullptr;

// Save and Restore blocks of the current function. Typically there is a
// single save block, unless Windows EH funclets are involved.
@@ -52,14 +54,25 @@ class SILowerSGPRSpills : public MachineFunctionPass {
SILowerSGPRSpills() : MachineFunctionPass(ID) {}

void calculateSaveRestoreBlocks(MachineFunction &MF);
bool spillCalleeSavedRegs(MachineFunction &MF);
bool spillCalleeSavedRegs(MachineFunction &MF,
SmallVectorImpl<int> &CalleeSavedFIs);
void updateLaneVGPRDomInstr(
int FI, MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertPt,
DenseMap<Register, MachineBasicBlock::iterator> &LaneVGPRDomInstr);

bool runOnMachineFunction(MachineFunction &MF) override;

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineDominatorTree>();
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
}

MachineFunctionProperties getClearedProperties() const override {
return MachineFunctionProperties()
.set(MachineFunctionProperties::Property::IsSSA)
.set(MachineFunctionProperties::Property::NoVRegs);
}
};

} // end anonymous namespace
@@ -70,6 +83,7 @@ INITIALIZE_PASS_BEGIN(SILowerSGPRSpills, DEBUG_TYPE,
"SI lower SGPR spill instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(SILowerSGPRSpills, DEBUG_TYPE,
"SI lower SGPR spill instructions", false, false)

@@ -175,7 +189,8 @@ static void updateLiveness(MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI) {
EntryBB.sortUniqueLiveIns();
}

bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
bool SILowerSGPRSpills::spillCalleeSavedRegs(
MachineFunction &MF, SmallVectorImpl<int> &CalleeSavedFIs) {
MachineRegisterInfo &MRI = MF.getRegInfo();
const Function &F = MF.getFunction();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -214,6 +229,7 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
TRI->getSpillAlign(*RC), true);

CSI.push_back(CalleeSavedInfo(Reg, JunkFI));
CalleeSavedFIs.push_back(JunkFI);
}
}

@@ -226,6 +242,7 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC),
TRI->getSpillAlign(*RC), true);
CSI.push_back(CalleeSavedInfo(RetAddrReg, JunkFI));
CalleeSavedFIs.push_back(JunkFI);
}

if (!CSI.empty()) {
@@ -245,20 +262,71 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
return false;
}

void SILowerSGPRSpills::updateLaneVGPRDomInstr(
int FI, MachineBasicBlock *MBB, MachineBasicBlock::iterator InsertPt,
DenseMap<Register, MachineBasicBlock::iterator> &LaneVGPRDomInstr) {
// For the def of a virtual LaneVGPR to dominate all its uses, we should
// insert an IMPLICIT_DEF before the dominating spill. Switching to a
// depth-first order does not really help since the machine function can
// contain unstructured control flow post-SSA. Hence, for each virtual
// register, find the common dominator to get either the dominating spill
// or a block dominating all spills. Is there a better way to handle it?
SIMachineFunctionInfo *FuncInfo =
MBB->getParent()->getInfo<SIMachineFunctionInfo>();
ArrayRef<SIRegisterInfo::SpilledReg> VGPRSpills =
FuncInfo->getSGPRSpillToVirtualVGPRLanes(FI);
Register PrevLaneVGPR;
for (auto &Spill : VGPRSpills) {
if (PrevLaneVGPR == Spill.VGPR)
continue;

PrevLaneVGPR = Spill.VGPR;
auto I = LaneVGPRDomInstr.find(Spill.VGPR);
if (Spill.Lane == 0 && I == LaneVGPRDomInstr.end()) {
// Initially, use the spill instruction itself as the insertion point.
LaneVGPRDomInstr[Spill.VGPR] = InsertPt;
} else {
assert(I != LaneVGPRDomInstr.end());
auto PrevInsertPt = I->second;
MachineBasicBlock *DomMBB = PrevInsertPt->getParent();
if (DomMBB == MBB) {
// The insertion point was earlier selected, while lowering spills in a
// predecessor block, as the one just before this block's terminator.
// It should be changed if we now insert any new spill in this block.
if (MDT->dominates(&*InsertPt, &*PrevInsertPt))
I->second = InsertPt;

continue;
}

// Find the common dominator block between PrevInsertPt and the
// current spill.
DomMBB = MDT->findNearestCommonDominator(DomMBB, MBB);
if (DomMBB == MBB)
I->second = InsertPt;
else if (DomMBB != PrevInsertPt->getParent())
I->second = &(*DomMBB->getFirstTerminator());
}
}
}

bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();

LIS = getAnalysisIfAvailable<LiveIntervals>();
Indexes = getAnalysisIfAvailable<SlotIndexes>();
MDT = &getAnalysis<MachineDominatorTree>();

assert(SaveBlocks.empty() && RestoreBlocks.empty());

// First, expose any CSR SGPR spills. This is mostly the same as what PEI
// does, but somewhat simpler.
calculateSaveRestoreBlocks(MF);
bool HasCSRs = spillCalleeSavedRegs(MF);
SmallVector<int> CalleeSavedFIs;
bool HasCSRs = spillCalleeSavedRegs(MF, CalleeSavedFIs);

MachineFrameInfo &MFI = MF.getFrameInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -272,6 +340,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {

bool MadeChange = false;
bool NewReservedRegs = false;
bool SpilledToVirtVGPRLanes = false;

// TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be
// handled as SpilledToReg in regular PrologEpilogInserter.
@@ -287,30 +356,69 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
// To track the spill frame indices handled in this pass.
BitVector SpillFIs(MFI.getObjectIndexEnd(), false);

// To track the IMPLICIT_DEF insertion point for the lane VGPRs.
DenseMap<Register, MachineBasicBlock::iterator> LaneVGPRDomInstr;

for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
if (!TII->isSGPRSpill(MI))
continue;

int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) {
NewReservedRegs = true;
bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
MI, FI, nullptr, Indexes, LIS);
(void)Spilled;
assert(Spilled && "failed to spill SGPR to VGPR when allocated");
SpillFIs.set(FI);

bool IsCalleeSaveSGPRSpill =
std::find(CalleeSavedFIs.begin(), CalleeSavedFIs.end(), FI) !=
CalleeSavedFIs.end();
if (IsCalleeSaveSGPRSpill) {
// Spill callee-saved SGPRs into physical VGPR lanes.

// TODO: This is to keep the CFI static for efficient frame unwinding in
// the debugger. Spilling CSRs into virtual VGPR lanes requires regalloc
// to allocate the physical VGPRs, and that might cause intermediate
// spills/splits of such live ranges to make the allocation succeed. That
// would break the CFI encoding unless regalloc-aware CFI generation,
// which inserts new CFI entries along with the intermediate spills, is
// implemented. No such support currently exists in the LLVM compiler.
if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI, true)) {
NewReservedRegs = true;
bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
MI, FI, nullptr, Indexes, LIS, true);
(void)Spilled;
assert(Spilled &&
"failed to spill SGPR to physical VGPR lane when allocated");
}
} else {
MachineInstrSpan MIS(&MI, &MBB);
if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) {
bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
MI, FI, nullptr, Indexes, LIS);
(void)Spilled;
assert(Spilled &&
"failed to spill SGPR to virtual VGPR lane when allocated");
SpillFIs.set(FI);
updateLaneVGPRDomInstr(FI, &MBB, MIS.begin(), LaneVGPRDomInstr);
SpilledToVirtVGPRLanes = true;
}
}
}
}

// FIXME: Adding to live-ins redundant with reserving registers.
for (MachineBasicBlock &MBB : MF) {
for (auto Reg : FuncInfo->getSGPRSpillVGPRs())
MBB.addLiveIn(Reg);
MBB.sortUniqueLiveIns();
for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) {
auto InsertPt = LaneVGPRDomInstr[Reg];
// Insert the IMPLICIT_DEF at the identified points.
auto MIB =
BuildMI(*InsertPt->getParent(), *InsertPt, InsertPt->getDebugLoc(),
TII->get(AMDGPU::IMPLICIT_DEF), Reg);
FuncInfo->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
if (LIS) {
LIS->InsertMachineInstrInMaps(*MIB);
LIS->createAndComputeVirtRegInterval(Reg);
}
}

for (MachineBasicBlock &MBB : MF) {
// FIXME: The dead frame indices are replaced with a null register from
// the debug value instructions. We should instead, update it with the
// correct register value. But not sure the register value alone is
@@ -338,6 +446,10 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
// lane".
FuncInfo->removeDeadFrameIndices(MF, /*ResetSGPRSpillStackIDs*/ false);

MadeChange = true;
}

if (SpilledToVirtVGPRLanes) {
const TargetRegisterClass *RC =
ST.isWave32() ? &AMDGPU::SGPR_32RegClass : &AMDGPU::SGPR_64RegClass;
// Shift back the reserved SGPR for EXEC copy into the lowest range.
@@ -347,18 +459,17 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
if (UnusedLowSGPR && TRI->getHWRegIndex(UnusedLowSGPR) <
TRI->getHWRegIndex(FuncInfo->getSGPRForEXECCopy()))
FuncInfo->setSGPRForEXECCopy(UnusedLowSGPR);

MadeChange = true;
} else {
// No SGPR spills and hence there won't be any WWM spills/copies. Reset the
// SGPR reserved for EXEC copy.
// No SGPR spills to virtual VGPR lanes and hence there won't be any WWM
// spills/copies. Reset the SGPR reserved for EXEC copy.
FuncInfo->setSGPRForEXECCopy(AMDGPU::NoRegister);
}

SaveBlocks.clear();
RestoreBlocks.clear();

// Updated the reserved registers with any VGPRs added for SGPR spills.
// Updated the reserved registers with any physical VGPRs added for SGPR
// spills.
if (NewReservedRegs)
MRI.freezeReservedRegs(MF);
