From 53ee32449eda5d383bc238fe61dc985f4eba1ead Mon Sep 17 00:00:00 2001 From: Kevin Sala Penades Date: Thu, 13 Nov 2025 11:00:12 -0800 Subject: [PATCH] [Offload] Add device info for shared memory (#167817) --- offload/plugins-nextgen/amdgpu/src/rtl.cpp | 13 +++++++++++++ .../common/include/PluginInterface.h | 7 +++++++ offload/plugins-nextgen/cuda/src/rtl.cpp | 12 ++++++++---- revert_patches.txt | 4 ---- 4 files changed, 28 insertions(+), 8 deletions(-) diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index e74c34a06cae9..0388bbba4ee28 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -3453,6 +3453,16 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { // TODO: put them back in constructor // readEnvVars(); + // Retrieve the size of the group memory. + for (const auto *Pool : AllMemoryPools) { + if (Pool->isGroup()) { + if (auto Err = Pool->getAttr(HSA_AMD_MEMORY_POOL_INFO_SIZE, + MaxBlockSharedMemSize)) + return Err; + break; + } + } + return Plugin::success(); } @@ -4327,6 +4337,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { if (Status == HSA_STATUS_SUCCESS) Info.add("Cacheline Size", TmpUInt); + Info.add("Max Shared Memory per Work Group", MaxBlockSharedMemSize, "bytes", + DeviceInfo::WORK_GROUP_LOCAL_MEM_SIZE); + Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt); if (Status == HSA_STATUS_SUCCESS) Info.add("Max Clock Freq", TmpUInt, "MHz", diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 25be1f2264642..4fd404a21d630 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -869,6 +869,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// Get the unique identifier of the device. const char *getDeviceUid() const { return DeviceUid.c_str(); } + /// Get the total shared memory per block (in bytes) that can be used in any + /// kernel. + size_t getMaxBlockSharedMemSize() const { return MaxBlockSharedMemSize; } + /// Set the context of the device if needed, before calling device-specific /// functions. Plugins may implement this function as a no-op if not needed. virtual Error setContext() = 0; @@ -1461,6 +1465,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// Variable to enable kernel duration tracing. BoolEnvar OMPX_KernelDurationTracing; + /// The total per-block native shared memory that a kernel may use. + size_t MaxBlockSharedMemSize = 0; private: /// Return the kernel environment object for kernel \p Name. Expected @@ -1579,6 +1585,7 @@ struct KernelRunRecordTy { std::unordered_map TuningData; /// Internal representation for OMPT device (initialize & finalize) std::atomic OmptInitialized; + }; /// Class implementing common functionalities of offload plugins. Each plugin diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index 2d7fdaae6dab3..176be9bfcec81 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -382,6 +382,12 @@ struct CUDADeviceTy : public GenericDeviceTy { return Err; HardwareParallelism = NumMuliprocessors * (MaxThreadsPerSM / WarpSize); + uint32_t MaxSharedMem; + if (auto Err = getDeviceAttr( + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, MaxSharedMem)) + return Err; + MaxBlockSharedMemSize = MaxSharedMem; + return Plugin::success(); } @@ -1092,10 +1098,8 @@ struct CUDADeviceTy : public GenericDeviceTy { if (Res == CUDA_SUCCESS) Info.add("Total Constant Memory", TmpInt, "bytes"); - Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, - TmpInt); - if (Res == CUDA_SUCCESS) - Info.add("Max Shared Memory per Block", TmpInt, "bytes"); + Info.add("Max Shared Memory per Block", MaxBlockSharedMemSize, "bytes", + DeviceInfo::WORK_GROUP_LOCAL_MEM_SIZE); Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, TmpInt); if (Res == CUDA_SUCCESS) diff --git a/revert_patches.txt b/revert_patches.txt index 71326d5846d49..6b4ff3e89c089 100644 --- a/revert_patches.txt +++ b/revert_patches.txt @@ -5,10 +5,6 @@ d57230c7 [AMDGPU][MC] Disallow op_sel in some VOP3P dot instructions (#100485) breaks build of ROCmValidationSuite [C2y] Support WG14 N3457, the __COUNTER__ macro (#162662) --- -needs more integration offload. -[Offload] Add device info for shared memory (#167817) ---- breaks conformance/2.0/relationals/test_relationals relational_select_signed "DAG: Allow select ptr combine for non-0 address spaces (#167909)" --- -