From 53ee32449eda5d383bc238fe61dc985f4eba1ead Mon Sep 17 00:00:00 2001
From: Kevin Sala Penades <salapenades1@llnl.gov>
Date: Thu, 13 Nov 2025 11:00:12 -0800
Subject: [PATCH] [Offload] Add device info for shared memory (#167817)

---
 offload/plugins-nextgen/amdgpu/src/rtl.cpp          | 13 +++++++++++++
 .../common/include/PluginInterface.h                |  7 +++++++
 offload/plugins-nextgen/cuda/src/rtl.cpp            | 12 ++++++++----
 revert_patches.txt                                  |  4 ----
 4 files changed, 28 insertions(+), 8 deletions(-)
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index e74c34a06cae9..0388bbba4ee28 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -3453,6 +3453,16 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     // TODO: put them back in constructor
     //    readEnvVars();
 
+    // Record the first group memory pool's size as the shared memory limit.
+    for (const auto *Pool : AllMemoryPools) {
+      if (Pool->isGroup()) {
+        if (auto Err = Pool->getAttr(HSA_AMD_MEMORY_POOL_INFO_SIZE,
+                                     MaxBlockSharedMemSize))
+          return Err;
+        break;
+      }
+    }
+
     return Plugin::success();
   }
 
@@ -4327,6 +4337,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     if (Status == HSA_STATUS_SUCCESS)
       Info.add("Cacheline Size", TmpUInt);
 
+    Info.add("Max Shared Memory per Work Group", MaxBlockSharedMemSize, "bytes",
+             DeviceInfo::WORK_GROUP_LOCAL_MEM_SIZE);
+
     Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt);
     if (Status == HSA_STATUS_SUCCESS)
       Info.add("Max Clock Freq", TmpUInt, "MHz",
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 25be1f2264642..4fd404a21d630 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -869,6 +869,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   /// Get the unique identifier of the device.
   const char *getDeviceUid() const { return DeviceUid.c_str(); }
 
+  /// Get the maximum native shared memory per block (in bytes) that a kernel
+  /// may use. Returns zero if the plugin has not set the limit.
+  size_t getMaxBlockSharedMemSize() const { return MaxBlockSharedMemSize; }
+
   /// Set the context of the device if needed, before calling device-specific
   /// functions. Plugins may implement this function as a no-op if not needed.
   virtual Error setContext() = 0;
@@ -1461,6 +1465,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   /// Variable to enable kernel duration tracing.
   BoolEnvar OMPX_KernelDurationTracing;
 
+  /// Per-block native shared memory limit in bytes; 0 until a plugin sets it.
+  size_t MaxBlockSharedMemSize = 0;
 private:
   /// Return the kernel environment object for kernel \p Name.
   Expected<KernelEnvironmentTy>
@@ -1579,6 +1585,7 @@ struct KernelRunRecordTy {
   std::unordered_map<std::string, TuningMetadataTy> TuningData;
   /// Internal representation for OMPT device (initialize & finalize)
   std::atomic<bool> OmptInitialized;
+
 };
 
 /// Class implementing common functionalities of offload plugins. Each plugin
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 2d7fdaae6dab3..176be9bfcec81 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -382,6 +382,12 @@ struct CUDADeviceTy : public GenericDeviceTy {
       return Err;
     HardwareParallelism = NumMuliprocessors * (MaxThreadsPerSM / WarpSize);
 
+    uint32_t MaxSharedMem;
+    if (auto Err = getDeviceAttr(
+            CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, MaxSharedMem))
+      return Err;
+    MaxBlockSharedMemSize = MaxSharedMem;
+
     return Plugin::success();
   }
 
@@ -1092,10 +1098,8 @@ struct CUDADeviceTy : public GenericDeviceTy {
     if (Res == CUDA_SUCCESS)
       Info.add("Total Constant Memory", TmpInt, "bytes");
 
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
-                           TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Max Shared Memory per Block", TmpInt, "bytes");
+    Info.add("Max Shared Memory per Block", MaxBlockSharedMemSize, "bytes",
+             DeviceInfo::WORK_GROUP_LOCAL_MEM_SIZE);
 
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, TmpInt);
     if (Res == CUDA_SUCCESS)
diff --git a/revert_patches.txt b/revert_patches.txt
index 71326d5846d49..6b4ff3e89c089 100644
--- a/revert_patches.txt
+++ b/revert_patches.txt
@@ -5,10 +5,6 @@ d57230c7 [AMDGPU][MC] Disallow op_sel in some VOP3P dot instructions (#100485)
 breaks build of ROCmValidationSuite
 [C2y] Support WG14 N3457, the __COUNTER__ macro (#162662)
 ---
-needs more integration offload.
-[Offload] Add device info for shared memory (#167817)
----
 breaks conformance/2.0/relationals/test_relationals relational_select_signed
 "DAG: Allow select ptr combine for non-0 address spaces (#167909)"
 ---
-