Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions offload/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3453,6 +3453,16 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
// TODO: put them back in constructor
// readEnvVars();

// Retrieve the size of the group memory.
for (const auto *Pool : AllMemoryPools) {
if (Pool->isGroup()) {
if (auto Err = Pool->getAttr(HSA_AMD_MEMORY_POOL_INFO_SIZE,
MaxBlockSharedMemSize))
return Err;
break;
}
}

return Plugin::success();
}

Expand Down Expand Up @@ -4327,6 +4337,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (Status == HSA_STATUS_SUCCESS)
Info.add("Cacheline Size", TmpUInt);

Info.add("Max Shared Memory per Work Group", MaxBlockSharedMemSize, "bytes",
DeviceInfo::WORK_GROUP_LOCAL_MEM_SIZE);

Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt);
if (Status == HSA_STATUS_SUCCESS)
Info.add("Max Clock Freq", TmpUInt, "MHz",
Expand Down
7 changes: 7 additions & 0 deletions offload/plugins-nextgen/common/include/PluginInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -869,6 +869,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// Return the device's unique identifier as a C string. The pointer is the
/// one produced by DeviceUid.c_str().
const char *getDeviceUid() const {
  const char *Uid = DeviceUid.c_str();
  return Uid;
}

/// Report the total amount of shared memory (in bytes) that a single block
/// of any kernel can use; this is the current value of MaxBlockSharedMemSize.
size_t getMaxBlockSharedMemSize() const {
  return MaxBlockSharedMemSize;
}

/// Set the context of the device if needed, before calling device-specific
/// functions. Plugins may implement this function as a no-op if not needed.
/// Pure virtual: each concrete device type has to provide its own
/// implementation. The outcome is reported through the returned Error.
virtual Error setContext() = 0;
Expand Down Expand Up @@ -1461,6 +1465,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// Variable to enable kernel duration tracing.
BoolEnvar OMPX_KernelDurationTracing;

/// The total per-block native shared memory (in bytes) that a kernel may use.
/// Starts at zero and keeps that value until something assigns it; it is
/// exposed read-only through getMaxBlockSharedMemSize().
size_t MaxBlockSharedMemSize = 0;
private:
/// Return the kernel environment object for kernel \p Name.
Expected<KernelEnvironmentTy>
Expand Down Expand Up @@ -1579,6 +1585,7 @@ struct KernelRunRecordTy {
std::unordered_map<std::string, TuningMetadataTy> TuningData;
/// Internal representation for OMPT device (initialize & finalize)
std::atomic<bool> OmptInitialized;

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Extra new line

};

/// Class implementing common functionalities of offload plugins. Each plugin
Expand Down
12 changes: 8 additions & 4 deletions offload/plugins-nextgen/cuda/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,12 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Err;
HardwareParallelism = NumMuliprocessors * (MaxThreadsPerSM / WarpSize);

uint32_t MaxSharedMem;
if (auto Err = getDeviceAttr(
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, MaxSharedMem))
return Err;
MaxBlockSharedMemSize = MaxSharedMem;

return Plugin::success();
}

Expand Down Expand Up @@ -1092,10 +1098,8 @@ struct CUDADeviceTy : public GenericDeviceTy {
if (Res == CUDA_SUCCESS)
Info.add("Total Constant Memory", TmpInt, "bytes");

Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
TmpInt);
if (Res == CUDA_SUCCESS)
Info.add("Max Shared Memory per Block", TmpInt, "bytes");
Info.add("Max Shared Memory per Block", MaxBlockSharedMemSize, "bytes",
DeviceInfo::WORK_GROUP_LOCAL_MEM_SIZE);

Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, TmpInt);
if (Res == CUDA_SUCCESS)
Expand Down
4 changes: 0 additions & 4 deletions revert_patches.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,6 @@ d57230c7 [AMDGPU][MC] Disallow op_sel in some VOP3P dot instructions (#100485)
breaks build of ROCmValidationSuite
[C2y] Support WG14 N3457, the __COUNTER__ macro (#162662)
---
needs more integration offload.
[Offload] Add device info for shared memory (#167817)
---
breaks conformance/2.0/relationals/test_relationals relational_select_signed
"DAG: Allow select ptr combine for non-0 address spaces (#167909)"
---