diff --git a/CHANGELOG.md b/CHANGELOG.md index 3951dadc..3525886c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ Documentation for TransferBench is available at - Added a new TB_WALLCLOCK_RATE that will override GPU GFX wallclock rate if it returns 0 (debug) - Adding new batched-DMA executor "B", which utilizes the hipMemcpyBatchAsync API introduced in HIP 7.1 / CUDA 12.8 - Added new bmasweep preset that compares DMA to batched DMA execution for parallel transfers to other GPUs +- Added new wallclock preset that compares wallclock counters across XCCs within a GPU ### Modified - DMA-BUF support enablement in CMake changed to ENABLE_DMA_BUF to be more similar to other compile-time options diff --git a/src/client/Presets/Presets.hpp b/src/client/Presets/Presets.hpp index 6b2dfd6a..c64a79d4 100644 --- a/src/client/Presets/Presets.hpp +++ b/src/client/Presets/Presets.hpp @@ -43,6 +43,7 @@ THE SOFTWARE. #include "Scaling.hpp" #include "Schmoo.hpp" #include "Sweep.hpp" +#include "WallClock.hpp" typedef int (*PresetFunc)(EnvVars& ev, size_t const numBytesPerTransfer, @@ -68,6 +69,7 @@ std::map<std::string, std::pair<PresetFunc, std::string>> presetFuncMap = {"scaling", {ScalingPreset, "Run scaling test from one GPU to other devices"}}, {"schmoo", {SchmooPreset, "Scaling tests for local/remote read/write/copy"}}, {"sweep", {SweepPreset, "Ordered sweep through sets of Transfers"}}, + {"wallclock", {WallClockPreset, "Tests wallclock consistency across XCCs within a GPU"}}, }; void DisplayPresets() diff --git a/src/client/Presets/WallClock.hpp b/src/client/Presets/WallClock.hpp new file mode 100644 index 00000000..d599b1c7 --- /dev/null +++ b/src/client/Presets/WallClock.hpp @@ -0,0 +1,234 @@ +/* +Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +__global__ void GetXccTimestamps(uint64_t* timestamps, volatile int* readyFlag) +{ + // Only first thread does any work + if (threadIdx.x != 0) return; + + // Threadblocks in first "row" handle timestamps + if (blockIdx.y == 0) { + + // Collect XCD for this + int xccId; + GetXccId(xccId); + + // All threadblocks wait for ready signal + while (*readyFlag == 0); + + // Collect timestamp and save to memory + auto w = GetTimestamp(); + timestamps[xccId] = w; + } else if (blockIdx.x == 0) { + + // Sleep for some number of cycles to ensure that other threadblocks are active + auto w = GetTimestamp(); + while (GetTimestamp() - w < 10000); + + // Signal start to the other threadblocks + *readyFlag = 1; + } +} + +#if defined(__NVCC__) +#define hipDeviceSynchronize cudaDeviceSynchronize +#define hipMemset cudaMemset +#define hipSetDevice cudaSetDevice +#endif + +int WallClockPreset(EnvVars& ev, + size_t const numBytesPerTransfer, + std::string const presetName, + bool const bytesSpecified) +{ + // Gather results and print + int numRanks = GetNumRanks(); + int myRank = GetRank(); + + // Check for single homogenous group + if (Utils::GetNumRankGroups() > 1) { + Utils::Print("[ERROR] wallclock preset can only be run across ranks that are homogenous\n"); + Utils::Print("[ERROR] Run ./TransferBench without any args to display topology information\n"); + Utils::Print("[ERROR] TB_NIC_FILTER may also be used to limit NIC visibility\n"); + return 1; + } + + int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX); + int numGpuDevices = EnvVars::GetEnvVar("NUM_GPU_DEVICES", numDetectedGpus); + + // Print off env vars + if (Utils::RankDoesOutput()) { + if (!ev.hideEnv) { + if (!ev.outputToCsv) printf("[WallClock Related]\n"); + ev.Print("NUM_GPU_DEVICES", numGpuDevices, "Limit to using %d GPUs (per rank)", numGpuDevices); + ev.Print("NUM_ITERATIONS" , ev.numIterations, "Number of iterations"); + ev.Print("NUM_WARMUPS" , ev.numWarmups, "Number of warmup iterations"); + 
ev.Print("SHOW_ITERATIONS", ev.showIterations, "Showing per iteration details. Set to 2 to see raw wallclock values"); + } + } + + // Check for env var consistency across ranks + IS_UNIFORM(numGpuDevices, "NUM_GPU_DEVICES"); + IS_UNIFORM(ev.numIterations, "NUM_ITERATIONS"); + IS_UNIFORM(ev.numWarmups, "NUM_WARMUPS"); + IS_UNIFORM(ev.showIterations, "SHOW_ITERATIONS"); + + if (numGpuDevices <= 0) { + Utils::Print("[ERROR] wallclock preset requires at least one GPU\n"); + return 1; + } + + // Collect local results + int numXccs = GetNumExecutorSubIndices({EXE_GPU_GFX, 0}); + + // Compute wall clock rate (based on GPU 0) + int wallClockKhz; +#if defined(__NVCC__) + wallClockKhz = 1000000; +#else + HIP_CALL(hipDeviceGetAttribute(&wallClockKhz, hipDeviceAttributeWallClockRate, 0)); +#endif + if (wallClockKhz == 0) wallClockKhz = 100000; + double uSecPerCycle = 1000.0 / wallClockKhz; + + Utils::Print("\nRunning %d iterations. Detected wall clock rate of %dKhz = %.2f usec per cycle\n\n", + ev.numIterations, wallClockKhz, uSecPerCycle); + + std::vector<std::vector<std::vector<uint64_t>>> results(numGpuDevices, + std::vector<std::vector<uint64_t>>(ev.numIterations, + std::vector<uint64_t>(numXccs, 0))); + for (int deviceId = 0; deviceId < numGpuDevices; deviceId++) { + HIP_CALL(hipSetDevice(deviceId)); + + uint64_t* timestamps; + int32_t* readyFlag; + + if (Utils::AllocateMemory({MEM_CPU_CLOSEST, deviceId}, numXccs * sizeof(uint64_t), (void**)&timestamps)) { + Utils::Print("[ERROR] Unable to allocate pinned host memory for storing timestamps for GPU device %d on rank %d\n", + deviceId, myRank); + return 1; + } + if (Utils::AllocateMemory({MEM_GPU, deviceId}, sizeof(int32_t), (void**)&readyFlag)) { + Utils::Print("[ERROR] Unable to allocate readyFlag on GPU device %d on rank %d\n", deviceId, myRank); + return 1; + } + + for (int i = -ev.numWarmups; i < ev.numIterations; i++) + { + HIP_CALL(hipMemset(readyFlag, 0, sizeof(int))); + HIP_CALL(hipDeviceSynchronize()); + GetXccTimestamps<<<dim3(numXccs, 2), 1>>>(timestamps, readyFlag); + HIP_CALL(hipDeviceSynchronize());
+ if (i >= 0) { + memcpy(results[deviceId][i].data(), timestamps, numXccs * sizeof(uint64_t)); + } + } + + Utils::DeallocateMemory(MEM_CPU_CLOSEST, timestamps, numXccs * sizeof(uint64_t)); + Utils::DeallocateMemory(MEM_GPU, readyFlag, sizeof(int32_t)); + } + + // Prepare table of results + int numRows = 1 + numRanks * numGpuDevices * (ev.showIterations ? (ev.numIterations+1) : 1); + int numCols = 5 + (ev.showIterations ? numXccs : 0); + Utils::TableHelper table(numRows, numCols); + + for (int i = 0; i < numCols; i++) { + table.SetColAlignment(i, Utils::TableHelper::ALIGN_CENTER); + } + + // Prepare header row + int currRow = 0; + int currCol = 0; + table.Set(currRow, currCol++, "Rank"); + table.Set(currRow, currCol++, "GPU"); + table.Set(currRow, currCol++, "Iter"); + table.Set(currRow, currCol++, "Delta(cycles)"); + table.Set(currRow, currCol++, "Delta(usec)"); + if (ev.showIterations) { + for (int i = 0; i < numXccs; i++) { + table.Set(currRow, currCol++, " XCC %d ", i); + } + } + currRow++; + + double minDelta = std::numeric_limits<double>::max(); + double maxDelta = std::numeric_limits<double>::lowest(); + + for (int rank = 0; rank < numRanks; rank++) { + table.DrawRowBorder(currRow); + for (int deviceId = 0; deviceId < numGpuDevices; deviceId++) { + size_t totalCycles = 0; + std::vector<uint64_t> timestamps(numXccs, 0); + + for (int iteration = 0; iteration < ev.numIterations; iteration++) { + if (rank == myRank) timestamps = results[deviceId][iteration]; + TransferBench::System::Get().Broadcast(rank, numXccs * sizeof(uint64_t), timestamps.data()); + + const auto [min,max] = std::minmax_element(timestamps.begin(), timestamps.end()); + + uint64_t cycles = (*max - *min); + totalCycles += cycles; + + if (ev.showIterations) { + currCol = 0; + table.Set(currRow, currCol++, "%d", rank); + table.Set(currRow, currCol++, "%d", deviceId); + table.Set(currRow, currCol++, "%d", iteration); + table.Set(currRow, currCol++, "%lu", cycles); + table.Set(currRow, currCol++, "%.2f", cycles *
uSecPerCycle); + for (int i = 0; i < numXccs; i++) { + table.Set(currRow, currCol++, "%lu", timestamps[i] - (ev.showIterations > 1 ? 0 : *min)); + } + currRow++; + } + } + + double avgCycles = totalCycles * 1.0 / ev.numIterations; + minDelta = std::min(minDelta, avgCycles); + maxDelta = std::max(maxDelta, avgCycles); + currCol = 0; + table.Set(currRow, currCol++, "%d", rank); + table.Set(currRow, currCol++, "%d", deviceId); + table.Set(currRow, currCol++, "AVG"); + table.Set(currRow, currCol++, "%.2f", avgCycles); + table.Set(currRow, currCol++, "%.2f", avgCycles * uSecPerCycle); + currRow++; + } + } + + table.PrintTable(ev.outputToCsv, ev.showBorders); + + Utils::Print("\n"); + Utils::Print("Minimum Delta detected: %.2f cycles (%.2f usec)\n", minDelta, minDelta * uSecPerCycle); + Utils::Print("Maximum Delta detected: %.2f cycles (%.2f usec)\n", maxDelta, maxDelta * uSecPerCycle); + + if (Utils::HasDuplicateHostname()) { + Utils::Print("[WARN] It is recommended to run TransferBench with one rank per host to avoid potential aliasing of executors\n"); + } + return 0; +} + +#if defined(__NVCC__) +#undef hipDeviceSynchronize +#undef hipMemset +#endif diff --git a/src/header/TransferBench.hpp b/src/header/TransferBench.hpp index e7cb1a04..de177866 100644 --- a/src/header/TransferBench.hpp +++ b/src/header/TransferBench.hpp @@ -7437,7 +7437,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) // Kernel macros #undef GetHwId -#undef GetXccId +//#undef GetXccId // Undefine helper macros #undef ERR_CHECK