73 changes: 52 additions & 21 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
Expand Down Expand Up @@ -4473,27 +4475,55 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
                                                MachineRegisterInfo &MRI,
                                                MachineIRBuilder &B) const {
  // With the trap handler disabled, or a trap-handler ABI other than AMDHSA,
  // there is nothing to trap into: lower to a wave terminate instead.
  if (!ST.isTrapHandlerEnabled() ||
      ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
    return legalizeTrap_ENDPGM(MI, MRI, B);

  // Pick the AMDHSA lowering by code-object ABI version: V2/V3 hand the queue
  // pointer to the trap handler in SGPR0/1; V4 can use a bare s_trap when the
  // target is able to retrieve the doorbell ID itself.
  if (const auto &&HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
    switch (HsaAbiVer.getValue()) {
    case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
    case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
      return legalizeTrap_AMDHSA_QUEUE_PTR(MI, MRI, B);
    case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
      return ST.supportsGetDoorbellID() ?
          legalizeTrap_AMDHSA(MI, MRI, B) :
          legalizeTrap_AMDHSA_QUEUE_PTR(MI, MRI, B);
    }
  }

  llvm_unreachable("Unknown trap handler");
}
// Lower a trap when no usable trap handler is available (handler disabled or
// non-AMDHSA ABI, see legalizeTrapIntrinsic): simply end the program for this
// wave. MRI is unused here; the parameter keeps the three trap-lowering
// helpers signature-compatible.
bool AMDGPULegalizerInfo::legalizeTrap_ENDPGM(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // s_endpgm with a 0 immediate operand.
  B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
  MI.eraseFromParent();
  return true;
}

// Lower a trap for AMDHSA targets that must hand the queue pointer to the
// trap handler (code object V2/V3, and V4 without doorbell-ID support).
bool AMDGPULegalizerInfo::legalizeTrap_AMDHSA_QUEUE_PTR(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  // Pass queue pointer to trap handler as input, and insert trap instruction
  // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
  Register LiveIn =
    MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
  // Legalization fails if the queue pointer argument cannot be materialized.
  if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
    return false;

  // The ABI expects the queue pointer in SGPR0/SGPR1.
  Register SGPR01(AMDGPU::SGPR0_SGPR1);
  B.buildCopy(SGPR01, LiveIn);
  B.buildInstr(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
      // Implicit use keeps the SGPR0/1 copy live up to the trap.
      .addReg(SGPR01, RegState::Implicit);

  MI.eraseFromParent();
  return true;
}

// Lower a trap for AMDHSA code object V4 on targets that can query the
// doorbell ID themselves (GCNSubtarget::supportsGetDoorbellID): a bare
// s_trap with the LLVM trap ID, no queue-pointer handoff required.
bool AMDGPULegalizerInfo::legalizeTrap_AMDHSA(
    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
  B.buildInstr(AMDGPU::S_TRAP)
      .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
  MI.eraseFromParent();
  return true;
}
Expand All @@ -4502,16 +4532,17 @@ bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
// Is non-HSA path or trap-handler disabled? then, report a warning
// accordingly
if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
!ST.isTrapHandlerEnabled()) {
if (!ST.isTrapHandlerEnabled() ||
ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
"debugtrap handler not supported",
MI.getDebugLoc(), DS_Warning);
LLVMContext &Ctx = B.getMF().getFunction().getContext();
Ctx.diagnose(NoTrap);
} else {
// Insert debug-trap instruction
B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
B.buildInstr(AMDGPU::S_TRAP)
.addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
}

MI.eraseFromParent();
Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,12 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {

bool legalizeTrapIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
bool legalizeTrap_ENDPGM(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
bool legalizeTrap_AMDHSA_QUEUE_PTR(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
bool legalizeTrap_AMDHSA(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
bool legalizeDebugTrapIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;

Expand Down
16 changes: 0 additions & 16 deletions llvm/lib/Target/AMDGPU/AMDGPUPTNote.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,22 +25,6 @@ const char SectionName[] = ".note";
const char NoteNameV2[] = "AMD";
const char NoteNameV3[] = "AMDGPU";

// TODO: Remove this file once we drop code object v2.
enum NoteType{
NT_AMDGPU_HSA_RESERVED_0 = 0,
NT_AMDGPU_HSA_CODE_OBJECT_VERSION = 1,
NT_AMDGPU_HSA_HSAIL = 2,
NT_AMDGPU_HSA_ISA = 3,
NT_AMDGPU_HSA_PRODUCER = 4,
NT_AMDGPU_HSA_PRODUCER_OPTIONS = 5,
NT_AMDGPU_HSA_EXTENSION = 6,
NT_AMDGPU_HSA_RESERVED_7 = 7,
NT_AMDGPU_HSA_RESERVED_8 = 8,
NT_AMDGPU_HSA_RESERVED_9 = 9,
NT_AMDGPU_HSA_HLDEBUG_DEBUG = 101,
NT_AMDGPU_HSA_HLDEBUG_TARGET = 102
};

}
}

Expand Down
35 changes: 14 additions & 21 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,19 @@

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;
Expand Down Expand Up @@ -85,9 +86,9 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
//
// Similarly we want enable-prt-strict-null to be on by default and not to
// unset everything else if it is disabled
TargetID.emplace(*this);

// Assuming ECC is enabled is the conservative default.
SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");
SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
Expand Down Expand Up @@ -140,20 +141,12 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,

HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

// Disable XNACK on targets where it is not enabled by default unless it is
// explicitly requested.
if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
ToggleFeature(AMDGPU::FeatureXNACK);
EnableXNACK = false;
}
TargetID->setTargetIDFromFeaturesString(FS);

// ECC is on by default, but turn it off if the hardware doesn't support it
// anyway. This matters for the gfx9 targets with d16 loads, but don't support
// ECC.
if (DoesNotSupportSRAMECC && EnableSRAMECC) {
ToggleFeature(AMDGPU::FeatureSRAMECC);
EnableSRAMECC = false;
}
LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
<< TargetID->getXnackSetting() << '\n');
LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
<< TargetID->getSramEccSetting() << '\n');

return *this;
}
Expand Down Expand Up @@ -198,8 +191,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
UnalignedAccessMode(false),

HasApertureRegs(false),
SupportsXNACK(false),
EnableXNACK(false),
DoesNotSupportXNACK(false),
EnableCuMode(false),
TrapHandler(false),

Expand Down Expand Up @@ -248,8 +241,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasMAIInsts(false),
HasPkFmacF16Inst(false),
HasAtomicFaddInsts(false),
SupportsSRAMECC(false),
EnableSRAMECC(false),
DoesNotSupportSRAMECC(false),
HasNoSdstCMPX(false),
HasVscnt(false),
HasGetWaveIdInst(false),
Expand Down
58 changes: 33 additions & 25 deletions llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -272,24 +272,16 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
using AMDGPUSubtarget::getMaxWavesPerEU;

public:
enum TrapHandlerAbi {
TrapHandlerAbiNone = 0,
TrapHandlerAbiHsa = 1
// Following 2 enums are documented at:
// - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
enum class TrapHandlerAbi {
NONE = 0x00,
AMDHSA = 0x01,
};

enum TrapID {
TrapIDHardwareReserved = 0,
TrapIDHSADebugTrap = 1,
TrapIDLLVMTrap = 2,
TrapIDLLVMDebugTrap = 3,
TrapIDDebugBreakpoint = 7,
TrapIDDebugReserved8 = 8,
TrapIDDebugReservedFE = 0xfe,
TrapIDDebugReservedFF = 0xff
};

enum TrapRegValues {
LLVMTrapHandlerRegValue = 1
enum class TrapID {
LLVMAMDHSATrap = 0x02,
LLVMAMDHSADebugTrap = 0x03,
};

private:
Expand All @@ -300,6 +292,8 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
std::unique_ptr<LegalizerInfo> Legalizer;
std::unique_ptr<RegisterBankInfo> RegBankInfo;

Optional<AMDGPU::IsaInfo::AMDGPUTargetID> TargetID;

protected:
// Basic subtarget description.
Triple TargetTriple;
Expand All @@ -320,8 +314,12 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
bool UnalignedBufferAccess;
bool UnalignedAccessMode;
bool HasApertureRegs;
bool SupportsXNACK;

// This should not be used directly. 'TargetID' tracks the dynamic settings
// for XNACK.
bool EnableXNACK;
bool DoesNotSupportXNACK;

bool EnableCuMode;
bool TrapHandler;

Expand Down Expand Up @@ -375,8 +373,12 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
bool HasMAIInsts;
bool HasPkFmacF16Inst;
bool HasAtomicFaddInsts;
bool SupportsSRAMECC;

// This should not be used directly. 'TargetID' tracks the dynamic settings
// for SRAMECC.
bool EnableSRAMECC;
bool DoesNotSupportSRAMECC;

bool HasNoSdstCMPX;
bool HasVscnt;
bool HasGetWaveIdInst;
Expand Down Expand Up @@ -469,6 +471,11 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
return RegBankInfo.get();
}

const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
assert(TargetID.hasValue() && "TargetID has not be initialized");
return *TargetID;
}

// Nothing implemented, just prevent crashes on use.
const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
return &TSInfo;
Expand Down Expand Up @@ -594,7 +601,12 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
}

TrapHandlerAbi getTrapHandlerAbi() const {
return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone;
return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
}

  // Whether the target can read its own doorbell ID, letting the V4 trap
  // lowering skip the queue-pointer handoff (see legalizeTrapIntrinsic).
  bool supportsGetDoorbellID() const {
    // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
    return getGeneration() >= GFX9;
  }

/// True if the offset field of DS instructions works as expected. On SI, the
Expand Down Expand Up @@ -723,7 +735,7 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
}

bool isXNACKEnabled() const {
return EnableXNACK;
return getTargetID().isXnackOnOrAny();
}

bool isCuModeEnabled() const {
Expand Down Expand Up @@ -786,7 +798,7 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
}

bool d16PreservesUnusedBits() const {
return hasD16LoadStore() && !isSRAMECCEnabled();
return hasD16LoadStore() && !getTargetID().isSramEccOnOrAny();
}

bool hasD16Images() const {
Expand Down Expand Up @@ -894,10 +906,6 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
return HasAtomicFaddInsts;
}

bool isSRAMECCEnabled() const {
return EnableSRAMECC;
}

bool hasNoSdstCMPX() const {
return HasNoSdstCMPX;
}
Expand Down
96 changes: 47 additions & 49 deletions llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1057,7 +1057,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
bool ParseDirectiveHSACodeObjectISA();
bool ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header);
bool ParseDirectiveAMDKernelCodeT();
bool subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo) const;
// TODO: Possibly make subtargetHasRegister const.
bool subtargetHasRegister(const MCRegisterInfo &MRI, unsigned RegNo);
bool ParseDirectiveAMDGPUHsaKernel();

bool ParseDirectiveISAVersion();
Expand Down Expand Up @@ -1136,7 +1137,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
// AsmParser::parseDirectiveSet() cannot be specialized for specific target.
AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
MCContext &Ctx = getContext();
if (ISA.Major >= 6 && isHsaAbiVersion3(&getSTI())) {
if (ISA.Major >= 6 && isHsaAbiVersion3Or4(&getSTI())) {
MCSymbol *Sym =
Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_number"));
Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx));
Expand All @@ -1153,18 +1154,14 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping"));
Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx));
}
if (ISA.Major >= 6 && isHsaAbiVersion3(&getSTI())) {
if (ISA.Major >= 6 && isHsaAbiVersion3Or4(&getSTI())) {
initializeGprCountSymbol(IS_VGPR);
initializeGprCountSymbol(IS_SGPR);
} else
KernelScope.initialize(getContext());
}
}

bool hasXNACK() const {
return AMDGPU::hasXNACK(getSTI());
}

bool hasMIMG_R128() const {
return AMDGPU::hasMIMG_R128(getSTI());
}
Expand Down Expand Up @@ -1408,6 +1405,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
void lex();

public:
void onBeginOfFile() override;

OperandMatchResultTy parseOptionalOperand(OperandVector &Operands);
OperandMatchResultTy parseOptionalOpr(OperandVector &Operands);

Expand Down Expand Up @@ -2498,7 +2497,7 @@ AMDGPUAsmParser::parseRegister(bool RestoreOnFailure) {
if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) {
return nullptr;
}
if (isHsaAbiVersion3(&getSTI())) {
if (isHsaAbiVersion3Or4(&getSTI())) {
if (!updateGprCountSymbols(RegKind, RegNum, RegWidth))
return nullptr;
} else
Expand Down Expand Up @@ -4000,22 +3999,16 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGCNTarget() {
if (getSTI().getTargetTriple().getArch() != Triple::amdgcn)
return TokError("directive only supported for amdgcn architecture");

std::string Target;

std::string TargetIDDirective;
SMLoc TargetStart = getTok().getLoc();
if (getParser().parseEscapedString(Target))
if (getParser().parseEscapedString(TargetIDDirective))
return true;
SMRange TargetRange = SMRange(TargetStart, getTok().getLoc());

std::string ExpectedTarget;
raw_string_ostream ExpectedTargetOS(ExpectedTarget);
IsaInfo::streamIsaVersion(&getSTI(), ExpectedTargetOS);

if (Target != ExpectedTargetOS.str())
return getParser().Error(TargetRange.Start, "target must match options",
SMRange TargetRange = SMRange(TargetStart, getTok().getLoc());
if (getTargetStreamer().getTargetID()->toString() != TargetIDDirective)
return getParser().Error(TargetRange.Start, "target id must match options",
TargetRange);

getTargetStreamer().EmitDirectiveAMDGCNTarget(Target);
return false;
}

Expand Down Expand Up @@ -4087,7 +4080,6 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
unsigned UserSGPRCount = 0;
bool ReserveVCC = true;
bool ReserveFlatScr = true;
bool ReserveXNACK = hasXNACK();
Optional<bool> EnableWavefrontSize32;

while (true) {
Expand Down Expand Up @@ -4231,7 +4223,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
IDRange);
if (!isUInt<1>(Val))
return OutOfRangeError(ValRange);
ReserveXNACK = Val;
if (Val != getTargetStreamer().getTargetID()->isXnackOnOrAny())
return getParser().Error(IDRange.Start, ".amdhsa_reserve_xnack_mask does not match target id",
IDRange);
} else if (ID == ".amdhsa_float_round_mode_32") {
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32, Val, ValRange);
Expand Down Expand Up @@ -4322,7 +4316,8 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
unsigned VGPRBlocks;
unsigned SGPRBlocks;
if (calculateGPRBlocks(getFeatureBits(), ReserveVCC, ReserveFlatScr,
ReserveXNACK, EnableWavefrontSize32, NextFreeVGPR,
getTargetStreamer().getTargetID()->isXnackOnOrAny(),
EnableWavefrontSize32, NextFreeVGPR,
VGPRRange, NextFreeSGPR, SGPRRange, VGPRBlocks,
SGPRBlocks))
return true;
Expand All @@ -4347,7 +4342,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {

getTargetStreamer().EmitAmdhsaKernelDescriptor(
getSTI(), KernelName, KD, NextFreeVGPR, NextFreeSGPR, ReserveVCC,
ReserveFlatScr, ReserveXNACK);
ReserveFlatScr);
return false;
}

Expand All @@ -4373,9 +4368,9 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
// targeted GPU.
if (getLexer().is(AsmToken::EndOfStatement)) {
AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
getTargetStreamer().EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor,
ISA.Stepping,
"AMD", "AMDGPU");
getTargetStreamer().EmitDirectiveHSACodeObjectISAV2(ISA.Major, ISA.Minor,
ISA.Stepping,
"AMD", "AMDGPU");
return false;
}

Expand Down Expand Up @@ -4409,8 +4404,8 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
ArchName = getLexer().getTok().getStringContents();
Lex();

getTargetStreamer().EmitDirectiveHSACodeObjectISA(Major, Minor, Stepping,
VendorName, ArchName);
getTargetStreamer().EmitDirectiveHSACodeObjectISAV2(Major, Minor, Stepping,
VendorName, ArchName);
return false;
}

Expand Down Expand Up @@ -4521,19 +4516,11 @@ bool AMDGPUAsmParser::ParseDirectiveISAVersion() {
"architectures");
}

auto ISAVersionStringFromASM = getLexer().getTok().getStringContents();
auto TargetIDDirective = getLexer().getTok().getStringContents();
if (getTargetStreamer().getTargetID()->toString() != TargetIDDirective)
return Error(getParser().getTok().getLoc(), "target id must match options");

std::string ISAVersionStringFromSTI;
raw_string_ostream ISAVersionStreamFromSTI(ISAVersionStringFromSTI);
IsaInfo::streamIsaVersion(&getSTI(), ISAVersionStreamFromSTI);

if (ISAVersionStringFromASM != ISAVersionStreamFromSTI.str()) {
return Error(getParser().getTok().getLoc(),
".amd_amdgpu_isa directive does not match triple and/or mcpu "
"arguments specified through the command line");
}

getTargetStreamer().EmitISAVersion(ISAVersionStreamFromSTI.str());
getTargetStreamer().EmitISAVersion();
Lex();

return false;
Expand All @@ -4543,7 +4530,7 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() {
const char *AssemblerDirectiveBegin;
const char *AssemblerDirectiveEnd;
std::tie(AssemblerDirectiveBegin, AssemblerDirectiveEnd) =
isHsaAbiVersion3(&getSTI())
isHsaAbiVersion3Or4(&getSTI())
? std::make_tuple(HSAMD::V3::AssemblerDirectiveBegin,
HSAMD::V3::AssemblerDirectiveEnd)
: std::make_tuple(HSAMD::AssemblerDirectiveBegin,
Expand All @@ -4560,7 +4547,7 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() {
HSAMetadataString))
return true;

if (isHsaAbiVersion3(&getSTI())) {
if (isHsaAbiVersion3Or4(&getSTI())) {
if (!getTargetStreamer().EmitHSAMetadataV3(HSAMetadataString))
return Error(getParser().getTok().getLoc(), "invalid HSA metadata");
} else {
Expand Down Expand Up @@ -4717,12 +4704,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPULDS() {
bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
StringRef IDVal = DirectiveID.getString();

if (isHsaAbiVersion3(&getSTI())) {
if (IDVal == ".amdgcn_target")
return ParseDirectiveAMDGCNTarget();

if (isHsaAbiVersion3Or4(&getSTI())) {
if (IDVal == ".amdhsa_kernel")
return ParseDirectiveAMDHSAKernel();
return ParseDirectiveAMDHSAKernel();

// TODO: Restructure/combine with PAL metadata directive.
if (IDVal == AMDGPU::HSAMD::V3::AssemblerDirectiveBegin)
Expand All @@ -4747,6 +4731,9 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
return ParseDirectiveHSAMetadata();
}

if (IDVal == ".amdgcn_target")
return ParseDirectiveAMDGCNTarget();

if (IDVal == ".amdgpu_lds")
return ParseDirectiveAMDGPULDS();

Expand All @@ -4760,7 +4747,7 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
}

bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
unsigned RegNo) const {
unsigned RegNo) {

for (MCRegAliasIterator R(AMDGPU::TTMP12_TTMP13_TTMP14_TTMP15, &MRI, true);
R.isValid(); ++R) {
Expand Down Expand Up @@ -4792,7 +4779,7 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
case AMDGPU::XNACK_MASK:
case AMDGPU::XNACK_MASK_LO:
case AMDGPU::XNACK_MASK_HI:
return !isCI() && !isSI() && !isGFX10() && hasXNACK();
return !isCI() && !isSI() && !isGFX10() && getTargetStreamer().getTargetID()->isXnackSupported();
case AMDGPU::SGPR_NULL:
return isGFX10();
default:
Expand Down Expand Up @@ -6742,6 +6729,17 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"abid", AMDGPUOperand::ImmTyABID, false, nullptr}
};

// Hook run before any statement of the input is parsed. Bails out early for
// r600; otherwise lazily initializes the target streamer's TargetID from the
// subtarget feature string, and for HSA code object V3/V4 emits the
// .amdgcn_target directive up front.
void AMDGPUAsmParser::onBeginOfFile() {
  if (getSTI().getTargetTriple().getArch() == Triple::r600)
    return;

  if (!getTargetStreamer().getTargetID())
    getTargetStreamer().initializeTargetID(getSTI(), getSTI().getFeatureString());

  if (isHsaAbiVersion3Or4(&getSTI()))
    getTargetStreamer().EmitDirectiveAMDGCNTarget();
}

OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) {

OperandMatchResultTy res = parseOptionalOpr(Operands);
Expand Down
234 changes: 186 additions & 48 deletions llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,20 @@ using namespace llvm::AMDGPU::HSAMD;
// AMDGPUTargetStreamer
//===----------------------------------------------------------------------===//

// Adjust an ISA version triple to its code object V2 name. For gfx9
// (Major 9, Minor 0) with an even stepping in {0, 2, 4, 6}, an XNACK-on
// target is renamed to the next (odd) stepping; all other versions are left
// untouched. Sramecc does not affect the V2 name but is accepted so callers
// can pass both dynamic settings uniformly.
static void convertIsaVersionV2(uint32_t &Major, uint32_t &Minor,
                                uint32_t &Stepping, bool Sramecc, bool Xnack) {
  const bool IsGfx90X = Major == 9 && Minor == 0;
  if (!IsGfx90X || !Xnack)
    return;
  const bool HasXnackVariant =
      Stepping == 0 || Stepping == 2 || Stepping == 4 || Stepping == 6;
  if (HasXnackVariant)
    ++Stepping;
}

bool AMDGPUTargetStreamer::EmitHSAMetadataV2(StringRef HSAMetadataString) {
HSAMD::Metadata HSAMetadata;
if (HSAMD::fromString(std::string(HSAMetadataString), HSAMetadata))
Expand Down Expand Up @@ -191,8 +205,8 @@ void AMDGPUTargetAsmStreamer::finish() {
getPALMetadata()->reset();
}

void AMDGPUTargetAsmStreamer::EmitDirectiveAMDGCNTarget(StringRef Target) {
OS << "\t.amdgcn_target \"" << Target << "\"\n";
void AMDGPUTargetAsmStreamer::EmitDirectiveAMDGCNTarget() {
OS << "\t.amdgcn_target \"" << getTargetID()->toString() << "\"\n";
}

void AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectVersion(
Expand All @@ -202,15 +216,14 @@ void AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectVersion(
}

void
AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major,
uint32_t Minor,
uint32_t Stepping,
StringRef VendorName,
StringRef ArchName) {
OS << "\t.hsa_code_object_isa " <<
Twine(Major) << "," << Twine(Minor) << "," << Twine(Stepping) <<
",\"" << VendorName << "\",\"" << ArchName << "\"\n";

AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectISAV2(uint32_t Major,
uint32_t Minor,
uint32_t Stepping,
StringRef VendorName,
StringRef ArchName) {
convertIsaVersionV2(Major, Minor, Stepping, TargetID->isSramEccOnOrAny(), TargetID->isXnackOnOrAny());
OS << "\t.hsa_code_object_isa " << Twine(Major) << "," << Twine(Minor) << ","
<< Twine(Stepping) << ",\"" << VendorName << "\",\"" << ArchName << "\"\n";
}

void
Expand All @@ -236,8 +249,8 @@ void AMDGPUTargetAsmStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
<< Alignment.value() << '\n';
}

bool AMDGPUTargetAsmStreamer::EmitISAVersion(StringRef IsaVersionString) {
OS << "\t.amd_amdgpu_isa \"" << IsaVersionString << "\"\n";
bool AMDGPUTargetAsmStreamer::EmitISAVersion() {
OS << "\t.amd_amdgpu_isa \"" << getTargetID()->toString() << "\"\n";
return true;
}

Expand Down Expand Up @@ -279,7 +292,7 @@ bool AMDGPUTargetAsmStreamer::EmitCodeEnd() {
void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KD, uint64_t NextVGPR, uint64_t NextSGPR,
bool ReserveVCC, bool ReserveFlatScr, bool ReserveXNACK) {
bool ReserveVCC, bool ReserveFlatScr) {
IsaVersion IVersion = getIsaVersion(STI.getCPU());

OS << "\t.amdhsa_kernel " << KernelName << '\n';
Expand Down Expand Up @@ -346,8 +359,20 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
OS << "\t\t.amdhsa_reserve_vcc " << ReserveVCC << '\n';
if (IVersion.Major >= 7 && !ReserveFlatScr)
OS << "\t\t.amdhsa_reserve_flat_scratch " << ReserveFlatScr << '\n';
if (IVersion.Major >= 8 && ReserveXNACK != hasXNACK(STI))
OS << "\t\t.amdhsa_reserve_xnack_mask " << ReserveXNACK << '\n';

if (const auto &&HsaAbiVer = getHsaAbiVersion(&STI)) {
switch (HsaAbiVer.getValue()) {
default:
break;
case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
break;
case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
if (getTargetID()->isXnackSupported())
OS << "\t\t.amdhsa_reserve_xnack_mask " << getTargetID()->isXnackOnOrAny() << '\n';
break;
}
}

PRINT_FIELD(OS, ".amdhsa_float_round_mode_32", KD,
compute_pgm_rsrc1,
Expand Down Expand Up @@ -416,23 +441,7 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(

// Construction only records the stream and subtarget; ELF header e_flags are
// computed late, in finish() via getEFlags(), so that dynamic target ID
// settings established during parsing are reflected in the header.
AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer(MCStreamer &S,
                                                 const MCSubtargetInfo &STI)
    : AMDGPUTargetStreamer(S), STI(STI), Streamer(S) {}

MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() {
return static_cast<MCELFStreamer &>(Streamer);
Expand All @@ -442,6 +451,9 @@ MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() {
// We use it for emitting the accumulated PAL metadata as a .note record.
// The PAL metadata is reset after it is emitted.
void AMDGPUTargetELFStreamer::finish() {
MCAssembler &MCA = getStreamer().getAssembler();
MCA.setELFHeaderEFlags(getEFlags());

std::string Blob;
const char *Vendor = getPALMetadata()->getVendor();
unsigned Type = getPALMetadata()->getType();
Expand All @@ -467,7 +479,7 @@ void AMDGPUTargetELFStreamer::EmitNote(
unsigned NoteFlags = 0;
// TODO Apparently, this is currently needed for OpenCL as mentioned in
// https://reviews.llvm.org/D74995
if (Os == Triple::AMDHSA)
if (STI.getTargetTriple().getOS() == Triple::AMDHSA)
NoteFlags = ELF::SHF_ALLOC;

S.PushSection();
Expand All @@ -483,33 +495,160 @@ void AMDGPUTargetELFStreamer::EmitNote(
S.PopSection();
}

void AMDGPUTargetELFStreamer::EmitDirectiveAMDGCNTarget(StringRef Target) {}
// Compute the ELF header e_flags for the current subtarget, dispatching on
// the target architecture (r600 vs amdgcn).
unsigned AMDGPUTargetELFStreamer::getEFlags() {
  switch (STI.getTargetTriple().getArch()) {
  default:
    llvm_unreachable("Unsupported Arch");
  case Triple::r600:
    return getEFlagsR600();
  case Triple::amdgcn:
    return getEFlagsAMDGCN();
  }
}

// e_flags for r600: only the machine ID, no feature bits.
unsigned AMDGPUTargetELFStreamer::getEFlagsR600() {
  assert(STI.getTargetTriple().getArch() == Triple::r600);

  return getElfMach(STI.getCPU());
}

// e_flags for amdgcn: dispatch on the target OS. The default case falls
// through to the unknown-OS handling on purpose — some tests carry
// unexpected OSes such as "mingw" (see TODO below), so we cannot assert.
unsigned AMDGPUTargetELFStreamer::getEFlagsAMDGCN() {
  assert(STI.getTargetTriple().getArch() == Triple::amdgcn);

  switch (STI.getTargetTriple().getOS()) {
  default:
    // TODO: Why are some tests have "mingw" listed as OS?
    // llvm_unreachable("Unsupported OS");
  case Triple::UnknownOS:
    return getEFlagsUnknownOS();
  case Triple::AMDHSA:
    return getEFlagsAMDHSA();
  case Triple::AMDPAL:
    return getEFlagsAMDPAL();
  case Triple::Mesa3D:
    return getEFlagsMesa3D();
  }
}

// e_flags when the OS is unknown: use the V3 flag layout. The assert stays
// disabled because some tests reach here with other OSes (see TODO).
unsigned AMDGPUTargetELFStreamer::getEFlagsUnknownOS() {
  // TODO: Why are some tests have "mingw" listed as OS?
  // assert(STI.getTargetTriple().getOS() == Triple::UnknownOS);

  return getEFlagsV3();
}

// e_flags for AMDHSA: choose the flag layout by HSA code-object ABI version.
// V2 and V3 share the V3 layout; V4 has its own four-state feature encoding.
unsigned AMDGPUTargetELFStreamer::getEFlagsAMDHSA() {
  assert(STI.getTargetTriple().getOS() == Triple::AMDHSA);

  if (const auto &&HsaAbiVer = getHsaAbiVersion(&STI)) {
    switch (HsaAbiVer.getValue()) {
    case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
    case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
      return getEFlagsV3();
    case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
      return getEFlagsV4();
    }
  }

  llvm_unreachable("HSA OS ABI Version identification must be defined");
}

// e_flags for AMDPAL: uses the V3 flag layout.
unsigned AMDGPUTargetELFStreamer::getEFlagsAMDPAL() {
  assert(STI.getTargetTriple().getOS() == Triple::AMDPAL);

  return getEFlagsV3();
}

// e_flags for Mesa3D: uses the V3 flag layout.
unsigned AMDGPUTargetELFStreamer::getEFlagsMesa3D() {
  assert(STI.getTargetTriple().getOS() == Triple::Mesa3D);

  return getEFlagsV3();
}

// V3 e_flags layout: machine ID plus single on/off feature bits. A feature
// bit is set when its dynamic setting is On or Any.
unsigned AMDGPUTargetELFStreamer::getEFlagsV3() {
  unsigned EFlagsV3 = 0;

  // mach.
  EFlagsV3 |= getElfMach(STI.getCPU());

  // xnack.
  if (getTargetID()->isXnackOnOrAny())
    EFlagsV3 |= ELF::EF_AMDGPU_FEATURE_XNACK_V3;
  // sramecc.
  if (getTargetID()->isSramEccOnOrAny())
    EFlagsV3 |= ELF::EF_AMDGPU_FEATURE_SRAMECC_V3;

  return EFlagsV3;
}

// V4 e_flags layout: machine ID plus a full four-state encoding
// (Unsupported/Any/Off/On) for each of XNACK and SRAMECC, taken from the
// dynamic target ID settings.
unsigned AMDGPUTargetELFStreamer::getEFlagsV4() {
  unsigned EFlagsV4 = 0;

  // mach.
  EFlagsV4 |= getElfMach(STI.getCPU());

  // xnack.
  switch (getTargetID()->getXnackSetting()) {
  case AMDGPU::IsaInfo::TargetIDSetting::Unsupported:
    EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_XNACK_UNSUPPORTED_V4;
    break;
  case AMDGPU::IsaInfo::TargetIDSetting::Any:
    EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_XNACK_ANY_V4;
    break;
  case AMDGPU::IsaInfo::TargetIDSetting::Off:
    EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_XNACK_OFF_V4;
    break;
  case AMDGPU::IsaInfo::TargetIDSetting::On:
    EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_XNACK_ON_V4;
    break;
  }
  // sramecc.
  switch (getTargetID()->getSramEccSetting()) {
  case AMDGPU::IsaInfo::TargetIDSetting::Unsupported:
    EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_SRAMECC_UNSUPPORTED_V4;
    break;
  case AMDGPU::IsaInfo::TargetIDSetting::Any:
    EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_SRAMECC_ANY_V4;
    break;
  case AMDGPU::IsaInfo::TargetIDSetting::Off:
    EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_SRAMECC_OFF_V4;
    break;
  case AMDGPU::IsaInfo::TargetIDSetting::On:
    EFlagsV4 |= ELF::EF_AMDGPU_FEATURE_SRAMECC_ON_V4;
    break;
  }

  return EFlagsV4;
}

// Nothing to do on the object path: the target ID information is encoded in
// the ELF header e_flags instead (see getEFlags / getEFlagsV3 / getEFlagsV4).
void AMDGPUTargetELFStreamer::EmitDirectiveAMDGCNTarget() {}

void AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion(
uint32_t Major, uint32_t Minor) {

EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(8, getContext()),
ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_VERSION, [&](MCELFStreamer &OS) {
ELF::NT_AMD_HSA_CODE_OBJECT_VERSION, [&](MCELFStreamer &OS) {
OS.emitInt32(Major);
OS.emitInt32(Minor);
});
}

void
AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major,
uint32_t Minor,
uint32_t Stepping,
StringRef VendorName,
StringRef ArchName) {
AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISAV2(uint32_t Major,
uint32_t Minor,
uint32_t Stepping,
StringRef VendorName,
StringRef ArchName) {
uint16_t VendorNameSize = VendorName.size() + 1;
uint16_t ArchNameSize = ArchName.size() + 1;

unsigned DescSZ = sizeof(VendorNameSize) + sizeof(ArchNameSize) +
sizeof(Major) + sizeof(Minor) + sizeof(Stepping) +
VendorNameSize + ArchNameSize;

convertIsaVersionV2(Major, Minor, Stepping, TargetID->isSramEccOnOrAny(), TargetID->isXnackOnOrAny());
EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(DescSZ, getContext()),
ElfNote::NT_AMDGPU_HSA_ISA, [&](MCELFStreamer &OS) {
ELF::NT_AMD_HSA_ISA_VERSION, [&](MCELFStreamer &OS) {
OS.emitInt16(VendorNameSize);
OS.emitInt16(ArchNameSize);
OS.emitInt32(Major);
Expand Down Expand Up @@ -557,7 +696,7 @@ void AMDGPUTargetELFStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
SymbolELF->setSize(MCConstantExpr::create(Size, getContext()));
}

bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) {
bool AMDGPUTargetELFStreamer::EmitISAVersion() {
// Create two labels to mark the beginning and end of the desc field
// and a MCExpr to calculate the size of the desc field.
auto &Context = getContext();
Expand All @@ -567,10 +706,10 @@ bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) {
MCSymbolRefExpr::create(DescEnd, Context),
MCSymbolRefExpr::create(DescBegin, Context), Context);

EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_AMDGPU_ISA,
EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_HSA_ISA_NAME,
[&](MCELFStreamer &OS) {
OS.emitLabel(DescBegin);
OS.emitBytes(IsaVersionString);
OS.emitBytes(getTargetID()->toString());
OS.emitLabel(DescEnd);
});
return true;
Expand Down Expand Up @@ -618,7 +757,7 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata(
MCSymbolRefExpr::create(DescEnd, Context),
MCSymbolRefExpr::create(DescBegin, Context), Context);

EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_AMDGPU_HSA_METADATA,
EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_HSA_METADATA,
[&](MCELFStreamer &OS) {
OS.emitLabel(DescBegin);
OS.emitBytes(HSAMetadataString);
Expand All @@ -642,8 +781,7 @@ bool AMDGPUTargetELFStreamer::EmitCodeEnd() {
void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
bool ReserveXNACK) {
uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) {
auto &Streamer = getStreamer();
auto &Context = Streamer.getContext();

Expand Down
78 changes: 55 additions & 23 deletions llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H

#include "AMDKernelCodeT.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDGPUPALMetadata.h"
#include "llvm/ADT/APInt.h"
#include "llvm/BinaryFormat/MsgPackDocument.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
Expand All @@ -32,22 +34,25 @@ class AMDGPUTargetStreamer : public MCTargetStreamer {
AMDGPUPALMetadata PALMetadata;

protected:
// TODO: Move HSAMetadataStream to AMDGPUTargetStreamer.
Optional<AMDGPU::IsaInfo::AMDGPUTargetID> TargetID;

MCContext &getContext() const { return Streamer.getContext(); }

public:
AMDGPUTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}

AMDGPUPALMetadata *getPALMetadata() { return &PALMetadata; }

virtual void EmitDirectiveAMDGCNTarget(StringRef Target) = 0;
virtual void EmitDirectiveAMDGCNTarget() = 0;

virtual void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
uint32_t Minor) = 0;

virtual void EmitDirectiveHSACodeObjectISA(uint32_t Major, uint32_t Minor,
uint32_t Stepping,
StringRef VendorName,
StringRef ArchName) = 0;
virtual void EmitDirectiveHSACodeObjectISAV2(uint32_t Major, uint32_t Minor,
uint32_t Stepping,
StringRef VendorName,
StringRef ArchName) = 0;

virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) = 0;

Expand All @@ -57,7 +62,7 @@ class AMDGPUTargetStreamer : public MCTargetStreamer {
Align Alignment) = 0;

/// \returns True on success, false on failure.
virtual bool EmitISAVersion(StringRef IsaVersionString) = 0;
virtual bool EmitISAVersion() = 0;

/// \returns True on success, false on failure.
virtual bool EmitHSAMetadataV2(StringRef HSAMetadataString);
Expand All @@ -83,11 +88,27 @@ class AMDGPUTargetStreamer : public MCTargetStreamer {
virtual void EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
bool ReserveXNACK) = 0;
uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) = 0;

static StringRef getArchNameFromElfMach(unsigned ElfMach);
static unsigned getElfMach(StringRef GPU);

const Optional<AMDGPU::IsaInfo::AMDGPUTargetID> &getTargetID() const {
return TargetID;
}
Optional<AMDGPU::IsaInfo::AMDGPUTargetID> &getTargetID() {
return TargetID;
}
void initializeTargetID(const MCSubtargetInfo &STI) {
assert(TargetID == None && "TargetID can only be initialized once");
TargetID.emplace(STI);
}
void initializeTargetID(const MCSubtargetInfo &STI, StringRef FeatureString) {
initializeTargetID(STI);

assert(getTargetID() != None && "TargetID is None");
getTargetID()->setTargetIDFromFeaturesString(FeatureString);
}
};

class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
Expand All @@ -97,14 +118,14 @@ class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {

void finish() override;

void EmitDirectiveAMDGCNTarget(StringRef Target) override;
void EmitDirectiveAMDGCNTarget() override;

void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
uint32_t Minor) override;

void EmitDirectiveHSACodeObjectISA(uint32_t Major, uint32_t Minor,
uint32_t Stepping, StringRef VendorName,
StringRef ArchName) override;
void EmitDirectiveHSACodeObjectISAV2(uint32_t Major, uint32_t Minor,
uint32_t Stepping, StringRef VendorName,
StringRef ArchName) override;

void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override;

Expand All @@ -113,7 +134,7 @@ class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, Align Alignment) override;

/// \returns True on success, false on failure.
bool EmitISAVersion(StringRef IsaVersionString) override;
bool EmitISAVersion() override;

/// \returns True on success, false on failure.
bool EmitHSAMetadata(msgpack::Document &HSAMetadata, bool Strict) override;
Expand All @@ -127,32 +148,44 @@ class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
void EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
bool ReserveXNACK) override;
uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) override;
};

class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
const MCSubtargetInfo &STI;
MCStreamer &Streamer;
Triple::OSType Os;

void EmitNote(StringRef Name, const MCExpr *DescSize, unsigned NoteType,
function_ref<void(MCELFStreamer &)> EmitDesc);

unsigned getEFlags();

unsigned getEFlagsR600();
unsigned getEFlagsAMDGCN();

unsigned getEFlagsUnknownOS();
unsigned getEFlagsAMDHSA();
unsigned getEFlagsAMDPAL();
unsigned getEFlagsMesa3D();

unsigned getEFlagsV3();
unsigned getEFlagsV4();

public:
AMDGPUTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI);

MCELFStreamer &getStreamer();

void finish() override;

void EmitDirectiveAMDGCNTarget(StringRef Target) override;
void EmitDirectiveAMDGCNTarget() override;

void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
uint32_t Minor) override;

void EmitDirectiveHSACodeObjectISA(uint32_t Major, uint32_t Minor,
uint32_t Stepping, StringRef VendorName,
StringRef ArchName) override;
void EmitDirectiveHSACodeObjectISAV2(uint32_t Major, uint32_t Minor,
uint32_t Stepping, StringRef VendorName,
StringRef ArchName) override;

void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override;

Expand All @@ -161,7 +194,7 @@ class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, Align Alignment) override;

/// \returns True on success, false on failure.
bool EmitISAVersion(StringRef IsaVersionString) override;
bool EmitISAVersion() override;

/// \returns True on success, false on failure.
bool EmitHSAMetadata(msgpack::Document &HSAMetadata, bool Strict) override;
Expand All @@ -175,8 +208,7 @@ class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
void EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
bool ReserveXNACK) override;
uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) override;
};

}
Expand Down
53 changes: 46 additions & 7 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5202,12 +5202,35 @@ SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
}

/// Lower a trap node to the sequence appropriate for the trap-handler ABI in
/// use: s_endpgm when no AMDHSA trap handler is available, otherwise the
/// HSA-ABI-version-specific trap lowering.
SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
  // Without an enabled AMDHSA trap handler there is nothing to call; simply
  // terminate the wave.
  const bool UseHsaTrap =
      Subtarget->isTrapHandlerEnabled() &&
      Subtarget->getTrapHandlerAbi() == GCNSubtarget::TrapHandlerAbi::AMDHSA;
  if (!UseHsaTrap)
    return lowerTRAP_ENDPGM(Op, DAG);

  auto AbiVer = AMDGPU::getHsaAbiVersion(Subtarget);
  if (AbiVer.hasValue()) {
    switch (AbiVer.getValue()) {
    case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
    case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
      // Pre-V4 code objects pass the queue pointer to the trap handler.
      return lowerTRAP_AMDHSA_QUEUE_PTR(Op, DAG);
    case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
      // V4 uses the doorbell ID when the subtarget can provide it; otherwise
      // fall back to the queue-pointer scheme.
      if (Subtarget->supportsGetDoorbellID())
        return lowerTRAP_AMDHSA(Op, DAG);
      return lowerTRAP_AMDHSA_QUEUE_PTR(Op, DAG);
    }
  }

  llvm_unreachable("Unknown trap handler");
}

/// Lower a trap by ending the program: emit an ENDPGM chained after the
/// incoming chain operand.
SDValue SITargetLowering::lowerTRAP_ENDPGM(
    SDValue Op, SelectionDAG &DAG) const {
  SDValue InChain = Op.getOperand(0);
  return DAG.getNode(AMDGPUISD::ENDPGM, SDLoc(Op), MVT::Other, InChain);
}

if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
!Subtarget->isTrapHandlerEnabled())
return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
SDValue SITargetLowering::lowerTRAP_AMDHSA_QUEUE_PTR(
SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Chain = Op.getOperand(0);

MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
Expand All @@ -5218,22 +5241,37 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
QueuePtr, SDValue());

uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
SDValue Ops[] = {
ToReg,
DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16),
DAG.getTargetConstant(TrapID, SL, MVT::i16),
SGPR01,
ToReg.getValue(1)
};
return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}

/// Lower a trap for the AMDHSA ABI: emit a TRAP node carrying the LLVM trap
/// ID immediate, chained after the incoming chain operand.
SDValue SITargetLowering::lowerTRAP_AMDHSA(
    SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  const uint64_t TrapID =
      static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
  SDValue Ops[] = {Op.getOperand(0),
                   DAG.getTargetConstant(TrapID, DL, MVT::i16)};
  return DAG.getNode(AMDGPUISD::TRAP, DL, MVT::Other, Ops);
}

SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Chain = Op.getOperand(0);
MachineFunction &MF = DAG.getMachineFunction();

if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
!Subtarget->isTrapHandlerEnabled()) {
if (!Subtarget->isTrapHandlerEnabled() ||
Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
"debugtrap handler not supported",
Op.getDebugLoc(),
Expand All @@ -5243,9 +5281,10 @@ SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
return Chain;
}

uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
SDValue Ops[] = {
Chain,
DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
DAG.getTargetConstant(TrapID, SL, MVT::i16)
};
return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
}
Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,9 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerTRAP_ENDPGM(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerTRAP_AMDHSA_QUEUE_PTR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerTRAP_AMDHSA(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const;

SDNode *adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
Expand Down
5 changes: 0 additions & 5 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -1367,11 +1367,6 @@ def DSTOMOD {
int NONE = 0;
}

def TRAPID{
int LLVM_TRAP = 2;
int LLVM_DEBUG_TRAP = 3;
}

def HWREG {
int MODE = 1;
int STATUS = 2;
Expand Down
234 changes: 216 additions & 18 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@

static llvm::cl::opt<unsigned> AmdhsaCodeObjectVersion(
"amdhsa-code-object-version", llvm::cl::Hidden,
llvm::cl::desc("AMDHSA Code Object Version"), llvm::cl::init(3));
llvm::cl::desc("AMDHSA Code Object Version"), llvm::cl::init(4),
llvm::cl::ZeroOrMore);

namespace {

Expand Down Expand Up @@ -117,8 +118,11 @@ Optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI) {
return ELF::ELFABIVERSION_AMDGPU_HSA_V2;
case 3:
return ELF::ELFABIVERSION_AMDGPU_HSA_V3;
case 4:
return ELF::ELFABIVERSION_AMDGPU_HSA_V4;
default:
return ELF::ELFABIVERSION_AMDGPU_HSA_V3;
report_fatal_error(Twine("Unsupported AMDHSA Code Object Version ") +
Twine(AmdhsaCodeObjectVersion));
}
}

Expand All @@ -134,6 +138,16 @@ bool isHsaAbiVersion3(const MCSubtargetInfo *STI) {
return false;
}

bool isHsaAbiVersion4(const MCSubtargetInfo *STI) {
if (const auto &&HsaAbiVer = getHsaAbiVersion(STI))
return HsaAbiVer.getValue() == ELF::ELFABIVERSION_AMDGPU_HSA_V4;
return false;
}

/// \returns True if the HSA OS ABI version for \p STI is either 3 or 4.
bool isHsaAbiVersion3Or4(const MCSubtargetInfo *STI) {
  if (isHsaAbiVersion3(STI))
    return true;
  return isHsaAbiVersion4(STI);
}

#define GET_MIMGBaseOpcodesTable_IMPL
#define GET_MIMGDimInfoTable_IMPL
#define GET_MIMGInfoTable_IMPL
Expand Down Expand Up @@ -267,25 +281,189 @@ int getMCOpcode(uint16_t Opcode, unsigned Gen) {

namespace IsaInfo {

void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream) {
auto TargetTriple = STI->getTargetTriple();
auto Version = getIsaVersion(STI->getCPU());
AMDGPUTargetID::AMDGPUTargetID(const MCSubtargetInfo &STI)
: STI(STI), XnackSetting(TargetIDSetting::Any),
SramEccSetting(TargetIDSetting::Any) {
if (!STI.getFeatureBits().test(FeatureSupportsXNACK))
XnackSetting = TargetIDSetting::Unsupported;
if (!STI.getFeatureBits().test(FeatureSupportsSRAMECC))
SramEccSetting = TargetIDSetting::Unsupported;
}

/// Derive the xnack/sramecc target ID settings from a subtarget feature
/// string \p FS.  An explicit "+feat"/"-feat" request maps to On/Off when the
/// processor supports the feature; an unsupported explicit request prints a
/// warning and leaves the setting at "Unsupported".  In the absence of the
/// target features the settings are left as-is, so we generate code that can
/// run in any environment.
void AMDGPUTargetID::setTargetIDFromFeaturesString(StringRef FS) {
  SubtargetFeatures Features(FS);
  Optional<bool> XnackRequested;
  Optional<bool> SramEccRequested;

  // Last occurrence of a feature wins, matching SubtargetFeatures semantics.
  for (const std::string &Feature : Features.getFeatures()) {
    if (Feature == "+xnack")
      XnackRequested = true;
    else if (Feature == "-xnack")
      XnackRequested = false;
    else if (Feature == "+sramecc")
      SramEccRequested = true;
    else if (Feature == "-sramecc")
      SramEccRequested = false;
  }

  // Apply an explicitly requested setting, or warn when the processor cannot
  // honor it.  The two features previously had copy-pasted, near-identical
  // branches; handling them in one place keeps the messages from drifting.
  auto ApplySetting = [](StringRef Name, Optional<bool> Requested,
                         bool Supported, TargetIDSetting &Setting) {
    if (!Requested.hasValue())
      return;
    if (Supported) {
      Setting = *Requested ? TargetIDSetting::On : TargetIDSetting::Off;
      return;
    }
    // A specific setting was requested for a GPU that does not support the
    // feature: emit a warning.  Setting remains "Unsupported".
    errs() << "warning: " << Name << " '" << (*Requested ? "On" : "Off")
           << "' was requested for a processor that does not support it!\n";
  };

  ApplySetting("xnack", XnackRequested, isXnackSupported(), XnackSetting);
  ApplySetting("sramecc", SramEccRequested, isSramEccSupported(),
               SramEccSetting);
}

/// Map one ':'-separated target ID component (e.g. "xnack+", "sramecc-") to
/// its setting, encoded by the component's trailing character.
static TargetIDSetting
getTargetIDSettingFromFeatureString(StringRef FeatureString) {
  if (FeatureString.endswith("+"))
    return TargetIDSetting::On;
  if (FeatureString.endswith("-"))
    return TargetIDSetting::Off;

  llvm_unreachable("Malformed feature string");
}

/// Parse a target ID string (':'-separated components, e.g.
/// "...:sramecc+:xnack-") and record any xnack/sramecc settings it carries.
void AMDGPUTargetID::setTargetIDFromTargetIDStream(StringRef TargetID) {
  SmallVector<StringRef, 3> Components;
  TargetID.split(Components, ':');

  for (StringRef Component : Components) {
    if (Component.startswith("xnack"))
      XnackSetting = getTargetIDSettingFromFeatureString(Component);
    else if (Component.startswith("sramecc"))
      SramEccSetting = getTargetIDSettingFromFeatureString(Component);
  }
}

std::string AMDGPUTargetID::toString() const {
std::string StringRep = "";
raw_string_ostream StreamRep(StringRep);

auto TargetTriple = STI.getTargetTriple();
auto Version = getIsaVersion(STI.getCPU());

Stream << TargetTriple.getArchName() << '-'
<< TargetTriple.getVendorName() << '-'
<< TargetTriple.getOSName() << '-'
<< TargetTriple.getEnvironmentName() << '-'
<< "gfx"
<< Version.Major
<< Version.Minor
<< Version.Stepping;
StreamRep << TargetTriple.getArchName() << '-'
<< TargetTriple.getVendorName() << '-'
<< TargetTriple.getOSName() << '-'
<< TargetTriple.getEnvironmentName() << '-';

if (hasXNACK(*STI))
Stream << "+xnack";
if (hasSRAMECC(*STI))
Stream << "+sram-ecc";
std::string Processor = "";
// TODO: Following else statement is present here because we used various
// alias names for GPUs up until GFX9 (e.g. 'fiji' is same as 'gfx803').
// Remove once all aliases are removed from GCNProcessors.td.
if (Version.Major >= 9)
Processor = STI.getCPU().str();
else
Processor = (Twine("gfx") + Twine(Version.Major) + Twine(Version.Minor) +
Twine(Version.Stepping))
.str();

std::string Features = "";
if (const auto &&HsaAbiVersion = getHsaAbiVersion(&STI)) {
switch (HsaAbiVersion.getValue()) {
case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
// Code object V2 only supported specific processors and had fixed
// settings for the XNACK.
if (Processor == "gfx700") {
} else if (Processor == "gfx701") {
} else if (Processor == "gfx702") {
} else if (Processor == "gfx703") {
} else if (Processor == "gfx704") {
} else if (Processor == "gfx801") {
if (!isXnackOnOrAny())
report_fatal_error(
"AMD GPU code object V2 does not support processor " + Processor +
" without XNACK");
} else if (Processor == "gfx802") {
} else if (Processor == "gfx803") {
} else if (Processor == "gfx810") {
if (!isXnackOnOrAny())
report_fatal_error(
"AMD GPU code object V2 does not support processor " + Processor +
" without XNACK");
} else if (Processor == "gfx900") {
if (isXnackOnOrAny())
Processor = "gfx901";
} else if (Processor == "gfx902") {
if (isXnackOnOrAny())
Processor = "gfx903";
} else if (Processor == "gfx904") {
if (isXnackOnOrAny())
Processor = "gfx905";
} else if (Processor == "gfx906") {
if (isXnackOnOrAny())
Processor = "gfx907";
} else {
report_fatal_error(
"AMD GPU code object V2 does not support processor " + Processor);
}
break;
case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
// xnack.
if (isXnackOnOrAny())
Features += "+xnack";
// In code object v2 and v3, "sramecc" feature was spelled with a
// hyphen ("sram-ecc").
if (isSramEccOnOrAny())
Features += "+sram-ecc";
break;
case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
// sramecc.
if (getSramEccSetting() == TargetIDSetting::Off)
Features += ":sramecc-";
else if (getSramEccSetting() == TargetIDSetting::On)
Features += ":sramecc+";
// xnack.
if (getXnackSetting() == TargetIDSetting::Off)
Features += ":xnack-";
else if (getXnackSetting() == TargetIDSetting::On)
Features += ":xnack+";
break;
default:
break;
}
}

StreamRep << Processor << Features;

Stream.flush();
StreamRep.flush();
return StringRep;
}

unsigned getWavefrontSize(const MCSubtargetInfo *STI) {
Expand Down Expand Up @@ -1629,4 +1807,24 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format,
}

} // namespace AMDGPU

/// Print the human-readable name of a target ID setting.
raw_ostream &operator<<(raw_ostream &OS,
                        const AMDGPU::IsaInfo::TargetIDSetting S) {
  switch (S) {
  case AMDGPU::IsaInfo::TargetIDSetting::Unsupported:
    return OS << "Unsupported";
  case AMDGPU::IsaInfo::TargetIDSetting::Any:
    return OS << "Any";
  case AMDGPU::IsaInfo::TargetIDSetting::Off:
    return OS << "Off";
  case AMDGPU::IsaInfo::TargetIDSetting::On:
    return OS << "On";
  }
  llvm_unreachable("Unknown AMDGPU::IsaInfo::TargetIDSetting");
}

} // namespace llvm
94 changes: 92 additions & 2 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetParser.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
#include <string>
#include <utility>
Expand All @@ -45,6 +46,12 @@ bool isHsaAbiVersion2(const MCSubtargetInfo *STI);
/// \returns True if HSA OS ABI Version identification is 3,
/// false otherwise.
bool isHsaAbiVersion3(const MCSubtargetInfo *STI);
/// \returns True if HSA OS ABI Version identification is 4,
/// false otherwise.
bool isHsaAbiVersion4(const MCSubtargetInfo *STI);
/// \returns True if HSA OS ABI Version identification is 3 or 4,
/// false otherwise.
bool isHsaAbiVersion3Or4(const MCSubtargetInfo *STI);

struct GcnBufferFormatInfo {
unsigned Format;
Expand All @@ -70,8 +77,87 @@ enum {
TRAP_NUM_SGPRS = 16
};

/// Streams isa version string for given subtarget \p STI into \p Stream.
void streamIsaVersion(const MCSubtargetInfo *STI, raw_ostream &Stream);
// State of one per-feature field (xnack, sramecc) in an AMDGPU target ID.
enum class TargetIDSetting {
  Unsupported,  // the processor does not support the feature at all
  Any,          // feature setting left unconstrained
  Off,          // feature explicitly disabled
  On            // feature explicitly enabled
};

class AMDGPUTargetID {
private:
const MCSubtargetInfo &STI;
TargetIDSetting XnackSetting;
TargetIDSetting SramEccSetting;

public:
explicit AMDGPUTargetID(const MCSubtargetInfo &STI);
~AMDGPUTargetID() = default;

/// \return True if the current xnack setting is not "Unsupported".
bool isXnackSupported() const {
return XnackSetting != TargetIDSetting::Unsupported;
}

/// \returns True if the current xnack setting is "On" or "Any".
bool isXnackOnOrAny() const {
return XnackSetting == TargetIDSetting::On ||
XnackSetting == TargetIDSetting::Any;
}

/// \returns True if current xnack setting is "On" or "Off",
/// false otherwise.
bool isXnackOnOrOff() const {
return getXnackSetting() == TargetIDSetting::On ||
getXnackSetting() == TargetIDSetting::Off;
}

/// \returns The current xnack TargetIDSetting, possible options are
/// "Unsupported", "Any", "Off", and "On".
TargetIDSetting getXnackSetting() const {
return XnackSetting;
}

/// Sets xnack setting to \p NewXnackSetting.
void setXnackSetting(TargetIDSetting NewXnackSetting) {
XnackSetting = NewXnackSetting;
}

/// \return True if the current sramecc setting is not "Unsupported".
bool isSramEccSupported() const {
return SramEccSetting != TargetIDSetting::Unsupported;
}

/// \returns True if the current sramecc setting is "On" or "Any".
bool isSramEccOnOrAny() const {
return SramEccSetting == TargetIDSetting::On ||
SramEccSetting == TargetIDSetting::Any;
}

/// \returns True if current sramecc setting is "On" or "Off",
/// false otherwise.
bool isSramEccOnOrOff() const {
return getSramEccSetting() == TargetIDSetting::On ||
getSramEccSetting() == TargetIDSetting::Off;
}

/// \returns The current sramecc TargetIDSetting, possible options are
/// "Unsupported", "Any", "Off", and "On".
TargetIDSetting getSramEccSetting() const {
return SramEccSetting;
}

/// Sets sramecc setting to \p NewSramEccSetting.
void setSramEccSetting(TargetIDSetting NewSramEccSetting) {
SramEccSetting = NewSramEccSetting;
}

void setTargetIDFromFeaturesString(StringRef FS);
void setTargetIDFromTargetIDStream(StringRef TargetID);

/// \returns String representation of an object.
std::string toString() const;
};

/// \returns Wavefront size for given subtarget \p STI.
unsigned getWavefrontSize(const MCSubtargetInfo *STI);
Expand Down Expand Up @@ -845,6 +931,10 @@ struct SIModeRegisterDefaults {
};

} // end namespace AMDGPU

raw_ostream &operator<<(raw_ostream &OS,
const AMDGPU::IsaInfo::TargetIDSetting S);

} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H
12 changes: 6 additions & 6 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ void AMDGPUPALMetadata::readFromIR(Module &M) {
}
return;
}
BlobType = ELF::NT_AMD_AMDGPU_PAL_METADATA;
BlobType = ELF::NT_AMD_PAL_METADATA;
NamedMD = M.getNamedMetadata("amdgpu.pal.metadata");
if (!NamedMD || !NamedMD->getNumOperands()) {
// Emit msgpack metadata by default
Expand Down Expand Up @@ -71,7 +71,7 @@ void AMDGPUPALMetadata::readFromIR(Module &M) {
// Metadata.
bool AMDGPUPALMetadata::setFromBlob(unsigned Type, StringRef Blob) {
BlobType = Type;
if (Type == ELF::NT_AMD_AMDGPU_PAL_METADATA)
if (Type == ELF::NT_AMD_PAL_METADATA)
return setFromLegacyBlob(Blob);
return setFromMsgPackBlob(Blob);
}
Expand Down Expand Up @@ -648,7 +648,7 @@ void AMDGPUPALMetadata::toString(std::string &String) {
// a .note record of the specified AMD type. Returns an empty blob if
// there is no PAL metadata,
void AMDGPUPALMetadata::toBlob(unsigned Type, std::string &Blob) {
if (Type == ELF::NT_AMD_AMDGPU_PAL_METADATA)
if (Type == ELF::NT_AMD_PAL_METADATA)
toLegacyBlob(Blob);
else if (Type)
toMsgPackBlob(Blob);
Expand Down Expand Up @@ -761,7 +761,7 @@ const char *AMDGPUPALMetadata::getVendor() const {
}

// Get .note record type of metadata blob to be emitted:
// ELF::NT_AMD_AMDGPU_PAL_METADATA (legacy key=val format), or
// ELF::NT_AMD_PAL_METADATA (legacy key=val format), or
// ELF::NT_AMDGPU_METADATA (MsgPack format), or
// 0 (no PAL metadata).
unsigned AMDGPUPALMetadata::getType() const {
Expand All @@ -770,12 +770,12 @@ unsigned AMDGPUPALMetadata::getType() const {

// Return whether the blob type is legacy PAL metadata.
bool AMDGPUPALMetadata::isLegacy() const {
return BlobType == ELF::NT_AMD_AMDGPU_PAL_METADATA;
return BlobType == ELF::NT_AMD_PAL_METADATA;
}

// Set legacy PAL metadata format.
void AMDGPUPALMetadata::setLegacy() {
BlobType = ELF::NT_AMD_AMDGPU_PAL_METADATA;
BlobType = ELF::NT_AMD_PAL_METADATA;
}

// Erase all PAL metadata.
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ class AMDGPUPALMetadata {
const char *getVendor() const;

// Get .note record type of metadata blob to be emitted:
// ELF::NT_AMD_AMDGPU_PAL_METADATA (legacy key=val format), or
// ELF::NT_AMD_PAL_METADATA (legacy key=val format), or
// ELF::NT_AMDGPU_METADATA (MsgPack format), or
// 0 (no PAL metadata).
unsigned getType() const;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s

; Check lowering of some large extractelement that use the stack
; instead of register indexing.
Expand Down
78 changes: 39 additions & 39 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll
Original file line number Diff line number Diff line change
Expand Up @@ -112,57 +112,57 @@ define i128 @extractelement_vgpr_v4i128_vgpr_idx(<4 x i128> addrspace(1)* %ptr,
; GFX9-LABEL: extractelement_vgpr_v4i128_vgpr_idx:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v16, 1, v2
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
; GFX9-NEXT: v_add_u32_e32 v17, 1, v16
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17
; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v16
; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v16
; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v2
; GFX9-NEXT: v_add_u32_e32 v3, 1, v2
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v2
; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v2
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_cndmask_b32_e64 v10, v2, v4, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16
; GFX9-NEXT: v_cndmask_b32_e64 v12, v8, v10, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v13, v9, v11, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cndmask_b32_e32 v4, v10, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v7, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v17
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v17
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v11, v13, v5, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v3
; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2
; GFX9-NEXT: v_cndmask_b32_e32 v16, v10, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v17, v11, v7, vcc
; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32
; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v17
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v3
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v17
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16
; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v3
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v17
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v17
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v3
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v3
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v13, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v14, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v15, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v14, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v14, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1672,7 +1672,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(double addrspace(1)* %out, i32
; GPRIDX-NEXT: is_ptr64 = 1
; GPRIDX-NEXT: is_dynamic_callstack = 0
; GPRIDX-NEXT: is_debug_enabled = 0
; GPRIDX-NEXT: is_xnack_enabled = 0
; GPRIDX-NEXT: is_xnack_enabled = 1
; GPRIDX-NEXT: workitem_private_segment_byte_size = 0
; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0
; GPRIDX-NEXT: gds_segment_byte_size = 0
Expand Down Expand Up @@ -2187,7 +2187,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(float addrspace(1)* %out, i32
; GPRIDX-NEXT: is_ptr64 = 1
; GPRIDX-NEXT: is_dynamic_callstack = 0
; GPRIDX-NEXT: is_debug_enabled = 0
; GPRIDX-NEXT: is_xnack_enabled = 0
; GPRIDX-NEXT: is_xnack_enabled = 1
; GPRIDX-NEXT: workitem_private_segment_byte_size = 0
; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0
; GPRIDX-NEXT: gds_segment_byte_size = 0
Expand Down Expand Up @@ -2363,7 +2363,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(double addrspace(1)* %out, i3
; GPRIDX-NEXT: is_ptr64 = 1
; GPRIDX-NEXT: is_dynamic_callstack = 0
; GPRIDX-NEXT: is_debug_enabled = 0
; GPRIDX-NEXT: is_xnack_enabled = 0
; GPRIDX-NEXT: is_xnack_enabled = 1
; GPRIDX-NEXT: workitem_private_segment_byte_size = 0
; GPRIDX-NEXT: workgroup_group_segment_byte_size = 0
; GPRIDX-NEXT: gds_segment_byte_size = 0
Expand Down
137 changes: 69 additions & 68 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
Original file line number Diff line number Diff line change
Expand Up @@ -14,29 +14,29 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: v_add_u32_e32 v31, 64, v16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_load_dwordx16 s[12:27], s[10:11], 0x0
; GCN-NEXT: s_load_dwordx16 s[36:51], s[10:11], 0x0
; GCN-NEXT: s_load_dwordx16 s[52:67], s[10:11], 0x40
; GCN-NEXT: s_load_dwordx16 s[36:51], s[10:11], 0x80
; GCN-NEXT: s_load_dwordx16 s[12:27], s[10:11], 0x80
; GCN-NEXT: v_add_u32_e32 v32, 0x44, v16
; GCN-NEXT: v_add_u32_e32 v33, 0x48, v16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s12
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_mov_b32_e32 v2, s14
; GCN-NEXT: v_mov_b32_e32 v3, s15
; GCN-NEXT: v_mov_b32_e32 v4, s16
; GCN-NEXT: v_mov_b32_e32 v5, s17
; GCN-NEXT: v_mov_b32_e32 v6, s18
; GCN-NEXT: v_mov_b32_e32 v7, s19
; GCN-NEXT: v_mov_b32_e32 v8, s20
; GCN-NEXT: v_mov_b32_e32 v9, s21
; GCN-NEXT: v_mov_b32_e32 v10, s22
; GCN-NEXT: v_mov_b32_e32 v11, s23
; GCN-NEXT: v_mov_b32_e32 v12, s24
; GCN-NEXT: v_mov_b32_e32 v13, s25
; GCN-NEXT: v_mov_b32_e32 v14, s26
; GCN-NEXT: v_mov_b32_e32 v15, s27
; GCN-NEXT: s_load_dwordx16 s[12:27], s[10:11], 0xc0
; GCN-NEXT: v_mov_b32_e32 v0, s36
; GCN-NEXT: v_mov_b32_e32 v1, s37
; GCN-NEXT: v_mov_b32_e32 v2, s38
; GCN-NEXT: v_mov_b32_e32 v3, s39
; GCN-NEXT: v_mov_b32_e32 v4, s40
; GCN-NEXT: v_mov_b32_e32 v5, s41
; GCN-NEXT: v_mov_b32_e32 v6, s42
; GCN-NEXT: v_mov_b32_e32 v7, s43
; GCN-NEXT: v_mov_b32_e32 v8, s44
; GCN-NEXT: v_mov_b32_e32 v9, s45
; GCN-NEXT: v_mov_b32_e32 v10, s46
; GCN-NEXT: v_mov_b32_e32 v11, s47
; GCN-NEXT: v_mov_b32_e32 v12, s48
; GCN-NEXT: v_mov_b32_e32 v13, s49
; GCN-NEXT: v_mov_b32_e32 v14, s50
; GCN-NEXT: v_mov_b32_e32 v15, s51
; GCN-NEXT: s_load_dwordx16 s[36:51], s[10:11], 0xc0
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:256
; GCN-NEXT: v_add_u32_e32 v0, 4, v16
; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
Expand Down Expand Up @@ -88,110 +88,110 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
; GCN-NEXT: v_add_u32_e32 v46, 0x7c, v16
; GCN-NEXT: v_mov_b32_e32 v1, s67
; GCN-NEXT: buffer_store_dword v1, v46, s[0:3], 0 offen
; GCN-NEXT: v_mov_b32_e32 v1, s12
; GCN-NEXT: v_add_u32_e32 v47, 0x80, v16
; GCN-NEXT: v_mov_b32_e32 v1, s36
; GCN-NEXT: buffer_store_dword v1, v47, s[0:3], 0 offen
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_add_u32_e32 v48, 0x84, v16
; GCN-NEXT: v_mov_b32_e32 v1, s37
; GCN-NEXT: buffer_store_dword v1, v48, s[0:3], 0 offen
; GCN-NEXT: v_mov_b32_e32 v1, s14
; GCN-NEXT: v_add_u32_e32 v49, 0x88, v16
; GCN-NEXT: v_mov_b32_e32 v1, s38
; GCN-NEXT: buffer_store_dword v1, v49, s[0:3], 0 offen
; GCN-NEXT: s_movk_i32 s11, 0x90
; GCN-NEXT: v_mov_b32_e32 v1, s15
; GCN-NEXT: v_add_u32_e32 v50, 0x8c, v16
; GCN-NEXT: v_mov_b32_e32 v1, s39
; GCN-NEXT: s_movk_i32 s11, 0x90
; GCN-NEXT: buffer_store_dword v1, v50, s[0:3], 0 offen
; GCN-NEXT: v_mov_b32_e32 v1, s16
; GCN-NEXT: v_add_u32_e32 v51, s11, v16
; GCN-NEXT: v_mov_b32_e32 v1, s40
; GCN-NEXT: buffer_store_dword v1, v51, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v52, 0x94, v16
; GCN-NEXT: v_mov_b32_e32 v1, s41
; GCN-NEXT: v_mov_b32_e32 v1, s17
; GCN-NEXT: buffer_store_dword v1, v52, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v53, 0x98, v16
; GCN-NEXT: v_mov_b32_e32 v1, s42
; GCN-NEXT: v_mov_b32_e32 v1, s18
; GCN-NEXT: buffer_store_dword v1, v53, s[0:3], 0 offen
; GCN-NEXT: s_movk_i32 s28, 0xa0
; GCN-NEXT: s_movk_i32 s12, 0xa0
; GCN-NEXT: v_add_u32_e32 v54, 0x9c, v16
; GCN-NEXT: v_mov_b32_e32 v1, s43
; GCN-NEXT: v_mov_b32_e32 v1, s19
; GCN-NEXT: buffer_store_dword v1, v54, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v55, s28, v16
; GCN-NEXT: v_mov_b32_e32 v1, s44
; GCN-NEXT: v_add_u32_e32 v55, s12, v16
; GCN-NEXT: v_mov_b32_e32 v1, s20
; GCN-NEXT: buffer_store_dword v1, v55, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v56, 0xa4, v16
; GCN-NEXT: v_mov_b32_e32 v1, s45
; GCN-NEXT: v_mov_b32_e32 v1, s21
; GCN-NEXT: buffer_store_dword v1, v56, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v57, 0xa8, v16
; GCN-NEXT: v_mov_b32_e32 v1, s46
; GCN-NEXT: v_mov_b32_e32 v1, s22
; GCN-NEXT: buffer_store_dword v1, v57, s[0:3], 0 offen
; GCN-NEXT: s_movk_i32 s29, 0xb0
; GCN-NEXT: s_movk_i32 s13, 0xb0
; GCN-NEXT: v_add_u32_e32 v58, 0xac, v16
; GCN-NEXT: v_mov_b32_e32 v1, s47
; GCN-NEXT: v_mov_b32_e32 v1, s23
; GCN-NEXT: buffer_store_dword v1, v58, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v59, s29, v16
; GCN-NEXT: v_mov_b32_e32 v1, s48
; GCN-NEXT: v_add_u32_e32 v59, s13, v16
; GCN-NEXT: v_mov_b32_e32 v1, s24
; GCN-NEXT: buffer_store_dword v1, v59, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v60, 0xb4, v16
; GCN-NEXT: v_mov_b32_e32 v1, s49
; GCN-NEXT: v_mov_b32_e32 v1, s25
; GCN-NEXT: buffer_store_dword v1, v60, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v61, 0xb8, v16
; GCN-NEXT: v_mov_b32_e32 v1, s50
; GCN-NEXT: v_mov_b32_e32 v1, s26
; GCN-NEXT: buffer_store_dword v1, v61, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v62, 0xbc, v16
; GCN-NEXT: v_mov_b32_e32 v1, s51
; GCN-NEXT: v_mov_b32_e32 v1, s27
; GCN-NEXT: buffer_store_dword v1, v62, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s12
; GCN-NEXT: v_add_u32_e32 v63, 0xc0, v16
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v1, s36
; GCN-NEXT: buffer_store_dword v1, v63, s[0:3], 0 offen
; GCN-NEXT: v_mov_b32_e32 v1, s13
; GCN-NEXT: v_add_u32_e32 v64, 0xc4, v16
; GCN-NEXT: v_mov_b32_e32 v1, s37
; GCN-NEXT: buffer_store_dword v1, v64, s[0:3], 0 offen
; GCN-NEXT: v_mov_b32_e32 v1, s14
; GCN-NEXT: v_add_u32_e32 v65, 0xc8, v16
; GCN-NEXT: v_mov_b32_e32 v1, s38
; GCN-NEXT: buffer_store_dword v1, v65, s[0:3], 0 offen
; GCN-NEXT: s_movk_i32 s12, 0xd0
; GCN-NEXT: s_movk_i32 s14, 0xd0
; GCN-NEXT: v_add_u32_e32 v66, 0xcc, v16
; GCN-NEXT: v_mov_b32_e32 v1, s15
; GCN-NEXT: v_mov_b32_e32 v1, s39
; GCN-NEXT: buffer_store_dword v1, v66, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v67, s12, v16
; GCN-NEXT: v_mov_b32_e32 v1, s16
; GCN-NEXT: v_add_u32_e32 v67, s14, v16
; GCN-NEXT: v_mov_b32_e32 v1, s40
; GCN-NEXT: buffer_store_dword v1, v67, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v68, 0xd4, v16
; GCN-NEXT: v_mov_b32_e32 v1, s17
; GCN-NEXT: v_mov_b32_e32 v1, s41
; GCN-NEXT: buffer_store_dword v1, v68, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v69, 0xd8, v16
; GCN-NEXT: v_mov_b32_e32 v1, s18
; GCN-NEXT: v_mov_b32_e32 v1, s42
; GCN-NEXT: buffer_store_dword v1, v69, s[0:3], 0 offen
; GCN-NEXT: s_movk_i32 s13, 0xe0
; GCN-NEXT: s_movk_i32 s15, 0xe0
; GCN-NEXT: v_add_u32_e32 v70, 0xdc, v16
; GCN-NEXT: v_mov_b32_e32 v1, s19
; GCN-NEXT: v_mov_b32_e32 v1, s43
; GCN-NEXT: buffer_store_dword v1, v70, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v71, s13, v16
; GCN-NEXT: v_mov_b32_e32 v1, s20
; GCN-NEXT: v_add_u32_e32 v71, s15, v16
; GCN-NEXT: v_mov_b32_e32 v1, s44
; GCN-NEXT: buffer_store_dword v1, v71, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v72, 0xe4, v16
; GCN-NEXT: v_mov_b32_e32 v1, s21
; GCN-NEXT: v_mov_b32_e32 v1, s45
; GCN-NEXT: buffer_store_dword v1, v72, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v73, 0xe8, v16
; GCN-NEXT: v_mov_b32_e32 v1, s22
; GCN-NEXT: v_mov_b32_e32 v1, s46
; GCN-NEXT: buffer_store_dword v1, v73, s[0:3], 0 offen
; GCN-NEXT: s_movk_i32 s14, 0xf0
; GCN-NEXT: s_movk_i32 s16, 0xf0
; GCN-NEXT: v_add_u32_e32 v74, 0xec, v16
; GCN-NEXT: v_mov_b32_e32 v1, s23
; GCN-NEXT: v_mov_b32_e32 v1, s47
; GCN-NEXT: buffer_store_dword v1, v74, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v75, s14, v16
; GCN-NEXT: v_mov_b32_e32 v1, s24
; GCN-NEXT: v_add_u32_e32 v75, s16, v16
; GCN-NEXT: v_mov_b32_e32 v1, s48
; GCN-NEXT: buffer_store_dword v1, v75, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v76, 0xf4, v16
; GCN-NEXT: v_mov_b32_e32 v1, s25
; GCN-NEXT: v_mov_b32_e32 v1, s49
; GCN-NEXT: s_and_b32 s7, s7, 63
; GCN-NEXT: buffer_store_dword v1, v76, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v77, 0xf8, v16
; GCN-NEXT: v_mov_b32_e32 v1, s26
; GCN-NEXT: v_mov_b32_e32 v1, s50
; GCN-NEXT: v_add_u32_e32 v17, 8, v16
; GCN-NEXT: buffer_store_dword v1, v77, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v78, 0xfc, v16
; GCN-NEXT: v_mov_b32_e32 v1, s27
; GCN-NEXT: v_mov_b32_e32 v1, s51
; GCN-NEXT: s_lshl_b32 s7, s7, 2
; GCN-NEXT: buffer_store_dword v2, v17, s[0:3], 0 offen
; GCN-NEXT: v_add_u32_e32 v18, 12, v16
Expand Down Expand Up @@ -225,6 +225,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
; GCN-NEXT: buffer_store_dword v15, v30, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_dword v2, v17, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v3, v18, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v4, v19, s[0:3], 0 offen
Expand Down Expand Up @@ -337,12 +338,12 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: s_add_u32 s4, s8, s28
; GCN-NEXT: s_add_u32 s4, s8, s12
; GCN-NEXT: global_store_dwordx4 v[0:1], v[36:39], off
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: s_add_u32 s4, s8, s29
; GCN-NEXT: s_add_u32 s4, s8, s13
; GCN-NEXT: global_store_dwordx4 v[0:1], v[40:43], off
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v0, s4
Expand All @@ -352,17 +353,17 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(<64 x i32> addrspace(1)* %out.
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: s_add_u32 s4, s8, s12
; GCN-NEXT: s_add_u32 s4, s8, s14
; GCN-NEXT: global_store_dwordx4 v[0:1], v[48:51], off
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: s_add_u32 s4, s8, s13
; GCN-NEXT: s_add_u32 s4, s8, s15
; GCN-NEXT: global_store_dwordx4 v[0:1], v[52:55], off
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: s_add_u32 s4, s8, s14
; GCN-NEXT: s_add_u32 s4, s8, s16
; GCN-NEXT: global_store_dwordx4 v[0:1], v[56:59], off
; GCN-NEXT: s_addc_u32 s5, s9, 0
; GCN-NEXT: v_mov_b32_e32 v0, s4
Expand Down
142 changes: 71 additions & 71 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2133,8 +2133,8 @@ define amdgpu_ps void @insertelement_s_v16i16_s_s(<16 x i16> addrspace(4)* inreg
define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) {
; GFX9-LABEL: insertelement_v_v16i16_s_s:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off
; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16
; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
; GFX9-NEXT: s_and_b32 s1, s3, 1
; GFX9-NEXT: s_lshr_b32 s12, s3, 1
; GFX9-NEXT: s_mov_b32 s0, 0xffff
Expand All @@ -2152,27 +2152,27 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr,
; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6
; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v11, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11]
; GFX9-NEXT: v_and_or_b32 v10, v1, s13, v0
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[10:11]
; GFX9-NEXT: v_and_or_b32 v12, v1, s13, v0
; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v10, s[12:13]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v10, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v10, s[2:3]
; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v10, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v10, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[10:11]
; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v12, s[12:13]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v12, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v12, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v10, 16
; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11]
; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
Expand Down Expand Up @@ -2846,8 +2846,8 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(<16 x i16> addrspace(4)* inreg
define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 %idx) {
; GFX9-LABEL: insertelement_v_v16i16_s_v:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16
; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v2
; GFX9-NEXT: v_and_b32_e32 v1, 1, v2
; GFX9-NEXT: s_mov_b32 s0, 0xffff
Expand All @@ -2865,26 +2865,26 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr,
; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v5, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v6, s[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v7, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v8, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11]
; GFX9-NEXT: v_and_or_b32 v11, v11, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13]
; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3]
; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[10:11]
; GFX9-NEXT: v_and_or_b32 v12, v3, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v12, s[12:13]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11]
; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v12, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v12, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v10, 16
; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11]
; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
Expand Down Expand Up @@ -2995,8 +2995,8 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* %ptr,
define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) {
; GFX9-LABEL: insertelement_v_v16i16_v_s:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off
; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16
; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
; GFX9-NEXT: s_and_b32 s1, s2, 1
; GFX9-NEXT: s_lshr_b32 s12, s2, 1
; GFX9-NEXT: s_lshl_b32 s1, s1, 4
Expand All @@ -3012,27 +3012,27 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr,
; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6
; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v11, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[10:11]
; GFX9-NEXT: v_and_or_b32 v11, v1, s13, v0
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[10:11]
; GFX9-NEXT: v_and_or_b32 v12, v1, s13, v0
; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13]
; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3]
; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v12, s[12:13]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11]
; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v12, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v12, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v10, 16
; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11]
; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
Expand Down Expand Up @@ -3143,8 +3143,8 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr,
define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) {
; GFX9-LABEL: insertelement_v_v16i16_v_v:
; GFX9: ; %bb.0:
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16
; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off
; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v3
; GFX9-NEXT: v_and_b32_e32 v1, 1, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1
Expand All @@ -3161,26 +3161,26 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr,
; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3]
; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[10:11]
; GFX9-NEXT: v_and_or_b32 v12, v3, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc
; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v12, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v12, s[12:13]
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc
; GFX9-NEXT: v_mov_b32_e32 v8, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3]
; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11]
; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v12, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v12, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v10, 16
; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9]
; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11]
; GFX9-NEXT: v_mov_b32_e32 v11, 0
; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off
; GFX9-NEXT: global_store_dwordx4 v[10:11], v[4:7], off
Expand Down
1 change: 1 addition & 0 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ define amdgpu_kernel void @v_insert_v64i32_37(<64 x i32> addrspace(1)* %ptr.in,
; GCN-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:16
; GCN-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:32
; GCN-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:48
; GCN-NEXT: s_nop 0
; GCN-NEXT: global_load_dwordx4 v[0:3], v64, s[0:1] offset:128
; GCN-NEXT: global_load_dwordx4 v[16:19], v64, s[0:1] offset:192
; GCN-NEXT: s_waitcnt vmcnt(7)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,6 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
; GFX6-LABEL: name: atomic_cmpxchg_s32_local
; GFX6: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX6: $m0 = S_MOV_B32 -1
; GFX6: [[DS_CMPST_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32 [[COPY]], [[COPY1]], [[COPY2]], 0, 0, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 3)
; GFX6: $vgpr0 = COPY [[DS_CMPST_RTN_B32_]]
; GFX7-LABEL: name: atomic_cmpxchg_s32_local
; GFX7: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
Expand All @@ -38,6 +30,14 @@ body: |
; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX9: [[DS_CMPST_RTN_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 0, 0, implicit $exec :: (load store seq_cst 4, addrspace 3)
; GFX9: $vgpr0 = COPY [[DS_CMPST_RTN_B32_gfx9_]]
; GFX6-LABEL: name: atomic_cmpxchg_s32_local
; GFX6: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX6: $m0 = S_MOV_B32 -1
; GFX6: [[DS_CMPST_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32 [[COPY]], [[COPY1]], [[COPY2]], 0, 0, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 3)
; GFX6: $vgpr0 = COPY [[DS_CMPST_RTN_B32_]]
%0:vgpr(p3) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s32) = COPY $vgpr2
Expand All @@ -55,16 +55,6 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1, $vgpr2
; GFX6-LABEL: name: atomic_cmpxchg_s32_local_gep4
; GFX6: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec
; GFX6: %4:vgpr_32, dead %6:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
; GFX6: $m0 = S_MOV_B32 -1
; GFX6: [[DS_CMPST_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32 %4, [[COPY1]], [[COPY2]], 0, 0, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 3)
; GFX6: $vgpr0 = COPY [[DS_CMPST_RTN_B32_]]
; GFX7-LABEL: name: atomic_cmpxchg_s32_local_gep4
; GFX7: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
Expand All @@ -80,6 +70,16 @@ body: |
; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX9: [[DS_CMPST_RTN_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 4, 0, implicit $exec :: (load store seq_cst 4, addrspace 3)
; GFX9: $vgpr0 = COPY [[DS_CMPST_RTN_B32_gfx9_]]
; GFX6-LABEL: name: atomic_cmpxchg_s32_local_gep4
; GFX6: liveins: $vgpr0, $vgpr1, $vgpr2
; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4, implicit $exec
; GFX6: %4:vgpr_32, dead %6:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
; GFX6: $m0 = S_MOV_B32 -1
; GFX6: [[DS_CMPST_RTN_B32_:%[0-9]+]]:vgpr_32 = DS_CMPST_RTN_B32 %4, [[COPY1]], [[COPY2]], 0, 0, implicit $m0, implicit $exec :: (load store seq_cst 4, addrspace 3)
; GFX6: $vgpr0 = COPY [[DS_CMPST_RTN_B32_]]
%0:vgpr(p3) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
%2:vgpr(s32) = COPY $vgpr2
Expand All @@ -99,14 +99,6 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4
; GFX6-LABEL: name: atomic_cmpxchg_s64_local
; GFX6: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4
; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
; GFX6: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4
; GFX6: $m0 = S_MOV_B32 -1
; GFX6: [[DS_CMPST_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64 [[COPY]], [[COPY1]], [[COPY2]], 0, 0, implicit $m0, implicit $exec :: (load store seq_cst 8, addrspace 3)
; GFX6: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_]]
; GFX7-LABEL: name: atomic_cmpxchg_s64_local
; GFX7: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4
; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
Expand All @@ -122,6 +114,14 @@ body: |
; GFX9: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4
; GFX9: [[DS_CMPST_RTN_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 3)
; GFX9: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_gfx9_]]
; GFX6-LABEL: name: atomic_cmpxchg_s64_local
; GFX6: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4
; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
; GFX6: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4
; GFX6: $m0 = S_MOV_B32 -1
; GFX6: [[DS_CMPST_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64 [[COPY]], [[COPY1]], [[COPY2]], 0, 0, implicit $m0, implicit $exec :: (load store seq_cst 8, addrspace 3)
; GFX6: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_]]
%0:vgpr(p3) = COPY $vgpr0
%1:vgpr(s64) = COPY $vgpr1_vgpr2
%2:vgpr(s64) = COPY $vgpr3_vgpr4
Expand All @@ -139,14 +139,6 @@ body: |
bb.0:
liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4
; GFX6-LABEL: name: atomic_cmpxchg_s64_local_gep4
; GFX6: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4
; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
; GFX6: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4
; GFX6: $m0 = S_MOV_B32 -1
; GFX6: [[DS_CMPST_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64 [[COPY]], [[COPY1]], [[COPY2]], 0, 0, implicit $m0, implicit $exec :: (load store seq_cst 8, addrspace 3)
; GFX6: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_]]
; GFX7-LABEL: name: atomic_cmpxchg_s64_local_gep4
; GFX7: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4
; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
Expand All @@ -162,6 +154,14 @@ body: |
; GFX9: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4
; GFX9: [[DS_CMPST_RTN_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64_gfx9 [[COPY]], [[COPY1]], [[COPY2]], 0, 0, implicit $exec :: (load store seq_cst 8, addrspace 3)
; GFX9: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_gfx9_]]
; GFX6-LABEL: name: atomic_cmpxchg_s64_local_gep4
; GFX6: liveins: $vgpr0, $vgpr1_vgpr2, $vgpr3_vgpr4
; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX6: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr1_vgpr2
; GFX6: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3_vgpr4
; GFX6: $m0 = S_MOV_B32 -1
; GFX6: [[DS_CMPST_RTN_B64_:%[0-9]+]]:vreg_64 = DS_CMPST_RTN_B64 [[COPY]], [[COPY1]], [[COPY2]], 0, 0, implicit $m0, implicit $exec :: (load store seq_cst 8, addrspace 3)
; GFX6: $vgpr0_vgpr1 = COPY [[DS_CMPST_RTN_B64_]]
%0:vgpr(p3) = COPY $vgpr0
%1:vgpr(s64) = COPY $vgpr1_vgpr2
%2:vgpr(s64) = COPY $vgpr3_vgpr4
Expand Down
Loading