From 2cb701f0502d5834bcd0132bf5bb60fd7f5aeaa2 Mon Sep 17 00:00:00 2001 From: Gilbert Lee Date: Fri, 17 Apr 2026 23:38:28 -0500 Subject: [PATCH 1/3] Adding a wallclock consistency detection preset --- CHANGELOG.md | 1 + src/client/Presets/Presets.hpp | 2 ++ src/header/TransferBench.hpp | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3951dadc..3525886c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ Documentation for TransferBench is available at - Added a new TB_WALLCLOCK_RATE that will override GPU GFX wallclock rate if it returns 0 (debug) - Adding new batched-DMA executor "B", which utilizes the hipMemcpyBatchAsync API introduced in HIP 7.1 / CUDA 12.8 - Added new bmasweep preset that compares DMA to batched DMA execution for parallel transfers to other GPUs +- Added new wallclock preset that compares wallclock counters across XCCs within a GPU ### Modified - DMA-BUF support enablement in CMake changed to ENABLE_DMA_BUF to be more similar to other compile-time options diff --git a/src/client/Presets/Presets.hpp b/src/client/Presets/Presets.hpp index 6b2dfd6a..c64a79d4 100644 --- a/src/client/Presets/Presets.hpp +++ b/src/client/Presets/Presets.hpp @@ -43,6 +43,7 @@ THE SOFTWARE. 
#include "Scaling.hpp" #include "Schmoo.hpp" #include "Sweep.hpp" +#include "WallClock.hpp" typedef int (*PresetFunc)(EnvVars& ev, size_t const numBytesPerTransfer, @@ -68,6 +69,7 @@ std::map> presetFuncMap = {"scaling", {ScalingPreset, "Run scaling test from one GPU to other devices"}}, {"schmoo", {SchmooPreset, "Scaling tests for local/remote read/write/copy"}}, {"sweep", {SweepPreset, "Ordered sweep through sets of Transfers"}}, + {"wallclock", {WallClockPreset, "Tests wallclock consistency across XCCs within a GPU"}}, }; void DisplayPresets() diff --git a/src/header/TransferBench.hpp b/src/header/TransferBench.hpp index e7cb1a04..de177866 100644 --- a/src/header/TransferBench.hpp +++ b/src/header/TransferBench.hpp @@ -7437,7 +7437,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) // Kernel macros #undef GetHwId -#undef GetXccId +//#undef GetXccId // Undefine helper macros #undef ERR_CHECK From 7aaa2c3a9c8ec9677952e7fbc0fc6279fc3f8abd Mon Sep 17 00:00:00 2001 From: Gilbert Lee Date: Sat, 18 Apr 2026 01:21:29 -0500 Subject: [PATCH 2/3] Forgot to add new Preset file --- src/client/Presets/WallClock.hpp | 234 +++++++++++++++++++++++++++++++ 1 file changed, 234 insertions(+) create mode 100644 src/client/Presets/WallClock.hpp diff --git a/src/client/Presets/WallClock.hpp b/src/client/Presets/WallClock.hpp new file mode 100644 index 00000000..7a81c1ec --- /dev/null +++ b/src/client/Presets/WallClock.hpp @@ -0,0 +1,234 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +__global__ void GetXccTimestamps(uint64_t* timestamps, volatile int* readyFlag) +{ + // Only first thread does any work + if (threadIdx.x != 0) return; + + // Threadblocks in first "row" handle timestamps + if (blockIdx.y == 0) { + + // Collect XCD for this + int xccId; + GetXccId(xccId); + + // All threadblocks wait for ready signal + while (*readyFlag == 0); + + // Collect timestamp and save to memory + auto w = GetTimestamp(); + timestamps[xccId] = w; + } else if (blockIdx.x == 0) { + + // Sleep for some number of cycles to ensure that other threadblocks are active + auto w = GetTimestamp(); + while (GetTimestamp() - w < 10000); + + // Signal start to the other threadblocks + *readyFlag = 1; + } +} + +#if defined(__NVCC__) +#define hipDeviceSynchronize cudaDeviceSynchronize +#define hipFree cudaFree +#define hipHostFree cudaFreeHost +#define hipHostMalloc cudaMallocHost +#define hipMalloc cudaMalloc +#define hipMemset cudaMemset +#endif + +int WallClockPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) +{ + // Check for single homogenous group + if (Utils::GetNumRankGroups() > 1) { + Utils::Print("[ERROR] wallclock preset can only be run across ranks that are homogenous\n"); + Utils::Print("[ERROR] Run ./TransferBench without any args to display topology information\n"); + Utils::Print("[ERROR] TB_NIC_FILTER may also be used to limit NIC visibility\n"); + return 1; + } + + int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX); + int numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus); + + // Print off env vars + if (Utils::RankDoesOutput()) { + if (!ev.hideEnv) { + if (!ev.outputToCsv) printf("[WallClock Related]\n"); + ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Limit to using %d GPUs (per rank)", numGpuDevices); + ev.Print("NUM_ITERATIONS" , ev.numIterations, "Number of iterations"); + ev.Print("NUM_WARMUPS" , ev.numWarmups, "Number of warmup iterations"); + 
ev.Print("SHOW_ITERATIONS", ev.showIterations, "Showing per iteration details. Set to 2 to see raw wallclock values"); + } + } + + // Check for env var consistency across ranks + IS_UNIFORM(numGpuDevices, "NUM_GPU_DEVICES"); + IS_UNIFORM(ev.numIterations, "NUM_ITERATIONS"); + IS_UNIFORM(ev.numWarmups, "NUM_WARMUPS"); + IS_UNIFORM(ev.showIterations, "SHOW_ITERATIONS"); + + if (numGpuDevices <= 0) { + Utils::Print("[ERROR] wallclock preset requires at least one GPU\n"); + return 1; + } + + // Collect local results + int numXccs = GetNumExecutorSubIndices({EXE_GPU_GFX, 0}); + + // Compute wall clock rate (based on GPU 0) + int wallClockKhz; +#if defined(__NVCC__) + wallClockKhz = 1000000; +#else + HIP_CALL(hipDeviceGetAttribute(&wallClockKhz, hipDeviceAttributeWallClockRate, 0)); +#endif + if (wallClockKhz == 0) wallClockKhz = 100000; + double uSecPerCycle = 1000.0 / wallClockKhz; + + Utils::Print("\nRunning %d iterations. Detected wall clock rate of %dKhz = %.2f usec per cycle\n\n", + ev.numIterations, wallClockKhz, uSecPerCycle); + + std::vector>> results(numGpuDevices, + std::vector>(ev.numIterations, + std::vector(numXccs, 0))); + for (int deviceId = 0; deviceId < numGpuDevices; deviceId++) { + HIP_CALL(hipSetDevice(deviceId)); + + uint64_t* timestamps; + int32_t* readyFlag; + + HIP_CALL(hipHostMalloc((void**)×tamps, numXccs * sizeof(uint64_t))); + HIP_CALL(hipMalloc((void**)&readyFlag, sizeof(int))); + + for (int i = -ev.numWarmups; i < ev.numIterations; i++) + { + HIP_CALL(hipMemset(readyFlag, 0, sizeof(int))); + HIP_CALL(hipDeviceSynchronize()); + GetXccTimestamps<<>>(timestamps, readyFlag); + HIP_CALL(hipDeviceSynchronize()); + if (i >= 0) { + memcpy(results[deviceId][i].data(), timestamps, numXccs * sizeof(uint64_t)); + } + } + + HIP_CALL(hipHostFree(timestamps)); + HIP_CALL(hipFree(readyFlag)); + } + + // Gather results and print + int numRanks = GetNumRanks(); + int myRank = GetRank(); + + // Prepare table of results + int numRows = 1 + numRanks * 
numGpuDevices * (ev.showIterations ? (ev.numIterations+1) : 1); + int numCols = 5 + (ev.showIterations ? numXccs : 0); + Utils::TableHelper table(numRows, numCols); + + for (int i = 0; i < numCols; i++) { + table.SetColAlignment(i, Utils::TableHelper::ALIGN_CENTER); + } + + // Prepare header row + int currRow = 0; + int currCol = 0; + table.Set(currRow, currCol++, "Rank"); + table.Set(currRow, currCol++, "GPU"); + table.Set(currRow, currCol++, "Iter"); + table.Set(currRow, currCol++, "Delta(cycles)"); + table.Set(currRow, currCol++, "Delta(usec)"); + if (ev.showIterations) { + for (int i = 0; i < numXccs; i++) { + table.Set(currRow, currCol++, " XCC %d ", i); + } + } + currRow++; + + double minDelta = std::numeric_limits::max(); + double maxDelta = std::numeric_limits::lowest(); + + for (int rank = 0; rank < numRanks; rank++) { + table.DrawRowBorder(currRow); + for (int deviceId = 0; deviceId < numGpuDevices; deviceId++) { + size_t totalCycles = 0; + std::vector timestamps(numXccs, 0); + + for (int iteration = 0; iteration < ev.numIterations; iteration++) { + if (rank == myRank) timestamps = results[deviceId][iteration]; + TransferBench::System::Get().Broadcast(rank, numXccs * sizeof(uint64_t), timestamps.data()); + + const auto [min,max] = std::minmax_element(timestamps.begin(), timestamps.end()); + + uint64_t cycles = (*max - *min); + totalCycles += cycles; + + if (ev.showIterations) { + currCol = 0; + table.Set(currRow, currCol++, "%d", rank); + table.Set(currRow, currCol++, "%d", deviceId); + table.Set(currRow, currCol++, "%d", iteration); + table.Set(currRow, currCol++, "%lu", cycles); + table.Set(currRow, currCol++, "%.2f", cycles * uSecPerCycle); + for (int i = 0; i < numXccs; i++) { + table.Set(currRow, currCol++, "%lu", timestamps[i] - (ev.showIterations > 1 ? 
0 : *min)); + } + currRow++; + } + } + + double avgCycles = totalCycles * 1.0 / ev.numIterations; + minDelta = std::min(minDelta, avgCycles); + maxDelta = std::max(maxDelta, avgCycles); + currCol = 0; + table.Set(currRow, currCol++, "%d", rank); + table.Set(currRow, currCol++, "%d", deviceId); + table.Set(currRow, currCol++, "AVG"); + table.Set(currRow, currCol++, "%.2f", avgCycles); + table.Set(currRow, currCol++, "%.2f", avgCycles * uSecPerCycle); + currRow++; + } + } + + table.PrintTable(ev.outputToCsv, ev.showBorders); + + Utils::Print("\n"); + Utils::Print("Minimum Delta detected: %.2f cycles (%.2f usec)\n", minDelta, minDelta * uSecPerCycle); + Utils::Print("Maximum Delta detected: %.2f cycles (%.2f usec)\n", maxDelta, maxDelta * uSecPerCycle); + + if (Utils::HasDuplicateHostname()) { + Utils::Print("[WARN] It is recommended to run TransferBench with one rank per host to avoid potential aliasing of executors\n"); + } + return 0; +} + +#if defined(__NVCC__) +#undef hipDeviceSynchronize +#undef hipFree +#undef hipHostFree +#undef hipHostMalloc +#undef hipMalloc +#undef hipMemset +#endif From 4ba3bcecace55a82dd23c30bf799e150482a7909 Mon Sep 17 00:00:00 2001 From: Gilbert Lee Date: Sat, 18 Apr 2026 19:10:02 -0500 Subject: [PATCH 3/3] Switching to Utils memory allocation/deallocation --- src/client/Presets/WallClock.hpp | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/client/Presets/WallClock.hpp b/src/client/Presets/WallClock.hpp index 7a81c1ec..d599b1c7 100644 --- a/src/client/Presets/WallClock.hpp +++ b/src/client/Presets/WallClock.hpp @@ -51,11 +51,8 @@ __global__ void GetXccTimestamps(uint64_t* timestamps, volatile int* readyFlag) #if defined(__NVCC__) #define hipDeviceSynchronize cudaDeviceSynchronize -#define hipFree cudaFree -#define hipHostFree cudaFreeHost -#define hipHostMalloc cudaMallocHost -#define hipMalloc cudaMalloc #define hipMemset cudaMemset +#define hipSetDevice cudaSetDevice #endif int 
WallClockPreset(EnvVars& ev, @@ -63,6 +60,10 @@ int WallClockPreset(EnvVars& ev, std::string const presetName, bool const bytesSpecified) { + // Gather results and print + int numRanks = GetNumRanks(); + int myRank = GetRank(); + // Check for single homogenous group if (Utils::GetNumRankGroups() > 1) { Utils::Print("[ERROR] wallclock preset can only be run across ranks that are homogenous\n"); @@ -121,8 +122,15 @@ int WallClockPreset(EnvVars& ev, uint64_t* timestamps; int32_t* readyFlag; - HIP_CALL(hipHostMalloc((void**)&timestamps, numXccs * sizeof(uint64_t))); - HIP_CALL(hipMalloc((void**)&readyFlag, sizeof(int))); + if (Utils::AllocateMemory({MEM_CPU_CLOSEST, deviceId}, numXccs * sizeof(uint64_t), (void**)&timestamps)) { + Utils::Print("[ERROR] Unable to allocate pinned host memory for storing timestamps for GPU device %d on rank %d\n", + deviceId, myRank); + return 1; + } + if (Utils::AllocateMemory({MEM_GPU, deviceId}, sizeof(int32_t), (void**)&readyFlag)) { + Utils::Print("[ERROR] Unable to allocate readyFlag on GPU device %d on rank %d\n", deviceId, myRank); + return 1; + } for (int i = -ev.numWarmups; i < ev.numIterations; i++) { @@ -135,14 +143,10 @@ int WallClockPreset(EnvVars& ev, } } - HIP_CALL(hipHostFree(timestamps)); - HIP_CALL(hipFree(readyFlag)); + Utils::DeallocateMemory(MEM_CPU_CLOSEST, timestamps, numXccs * sizeof(uint64_t)); + Utils::DeallocateMemory(MEM_GPU, readyFlag, sizeof(int32_t)); } - // Gather results and print - int numRanks = GetNumRanks(); - int myRank = GetRank(); - // Prepare table of results int numRows = 1 + numRanks * numGpuDevices * (ev.showIterations ? (ev.numIterations+1) : 1); int numCols = 5 + (ev.showIterations ? numXccs : 0); @@ -226,9 +230,5 @@ int WallClockPreset(EnvVars& ev, #if defined(__NVCC__) #undef hipDeviceSynchronize -#undef hipFree -#undef hipHostFree -#undef hipHostMalloc -#undef hipMalloc #undef hipMemset #endif