From 5546e854c5c42aabbb975f931f3cafea4ce2bb45 Mon Sep 17 00:00:00 2001 From: nileshnegi Date: Tue, 19 May 2026 01:19:25 -0500 Subject: [PATCH] Fix closest-CPU-NUMA detection regression for GPU executors Restore the Linux-NUMA-indexed cpuAgents[] table that the GPU-to-closest-CPU lookup in CollectTopology() depends on. After #303, CPU agents were enumerated via hsa_iterate_agents + HSA_AGENT_INFO_NODE and stored at the HSA/KFD topology node id, which is not guaranteed to equal the Linux NUMA index (e.g. on systems where GPU agents are enumerated before CPU agents in the KFD topology). When the two diverged, cpuAgents[i] stayed default-initialised (handle = 0), the closestCpuAgent lookup never matched, and GetClosestCpuNumaToGpu() returned -1 for every GPU. That broke the topology display (all GPUs reported with no closest NUMA) and produced malformed transfer descriptors like "R0C-1" in any preset that consumes this info. Switch back to the allocation-based enumeration that preserves cpuAgents[i] == "agent owning Linux NUMA i" while still tolerating NUMA nodes restricted by the process's cpuset / mempolicy: AllocateMemory(MEM_CPU) has rejected such nodes since #303, so we skip any node whose allocation does not return ERR_NONE (or yields a null buffer) and leave its slot zero-initialised. Restricted NUMA nodes cannot service transfers anyway, so a zero-handle entry is the correct sentinel and is harmless to the handle comparison performed by the closest-CPU lookup. Also drop the explicit hsa_init() / hsa_shut_down() around the iteration: HIP already manages HSA lifetime for this process, and tearing it down between topology collection and the rest of the run is unnecessary. Verified on 8 x gfx950, 2 NUMA nodes: - ./TransferBench now prints the expected "NUMA 00 -> GPUs 0 1 2 3" / "NUMA 01 -> GPUs 4 5 6 7" mapping that pre-#303 (e8edacf) produced. 
Co-authored-by: Claude --- src/header/TransferBench.hpp | 53 +++++++++++++++++------------------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/src/header/TransferBench.hpp b/src/header/TransferBench.hpp index 8b635e2..8850ee2 100644 --- a/src/header/TransferBench.hpp +++ b/src/header/TransferBench.hpp @@ -7756,39 +7756,36 @@ static bool IsConfiguredGid(union ibv_gid const& gid) hsa_amd_pointer_info_t info; info.size = sizeof(info); - // Callback to process each agent - auto cpuAgentCallback = [](hsa_agent_t agent, void* data) -> hsa_status_t { - std::map* agents = static_cast*>(data); - - hsa_device_type_t deviceType; - hsa_status_t status = hsa_agent_get_info(agent, HSA_AGENT_INFO_DEVICE, &deviceType); - if (status != HSA_STATUS_SUCCESS) return status; - - if (deviceType == HSA_DEVICE_TYPE_CPU) { - uint32_t nodeId; - status = hsa_agent_get_info(agent, HSA_AGENT_INFO_NODE, &nodeId); - if (status == HSA_STATUS_SUCCESS) { - (*agents)[nodeId] = agent; - } else { - return status; - } - } - return HSA_STATUS_SUCCESS; - }; - - // Index CPU agents - hsa_init(); - std::map cpuAgentMap; - hsa_iterate_agents(cpuAgentCallback, &cpuAgentMap); - hsa_shut_down(); - + // Index CPU agents by Linux NUMA node. + // + // AllocateMemory({MEM_CPU, i}, ...) yields a buffer pinned on NUMA i, whose + // hsa_amd_pointer_info::agentOwner is the HSA CPU agent for that NUMA. This + // preserves the invariant `cpuAgents[i] == "agent for Linux NUMA i"` that the + // closest-NUMA-to-GPU detection below depends on. + // + // The previous hsa_iterate_agents + HSA_AGENT_INFO_NODE approach indexed by the + // HSA/KFD topology node id, which is *not* guaranteed to equal the Linux NUMA + // index (e.g. on systems with GPU agents enumerated before CPU agents). When + // they diverge, cpuAgents[i] is left default-initialised (handle=0) and the + // GPU-to-closest-CPU lookup fails, returning -1 for every GPU. 
+ // + // NUMA nodes restricted by the process's cpuset / mempolicy are skipped here + // (AllocateMemory rejects them since #303) without aborting the whole topology + // scan; cpuAgents[i] simply stays zero-initialised for restricted nodes, which + // is fine because those NUMAs cannot be used for transfers anyway. cpuAgents.clear(); int numCpus = numa_num_configured_nodes(); cpuAgents.resize(numCpus); for (int i = 0; i < numCpus; i++) { - if (cpuAgentMap.count(i)) { - cpuAgents[i] = cpuAgentMap[i]; + int32_t* tempBuffer = nullptr; + if (AllocateMemory({MEM_CPU, i}, 1024, (void**)&tempBuffer).errType != ERR_NONE + || tempBuffer == nullptr) { + continue; + } + if (hsa_amd_pointer_info(tempBuffer, &info, NULL, NULL, NULL) == HSA_STATUS_SUCCESS) { + cpuAgents[i] = info.agentOwner; } + DeallocateMemory(MEM_CPU, tempBuffer, 1024); } // Index GPU agents