From 95fa5646a42406d4ccd520115e08e5e746165a8b Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 6 Nov 2025 10:15:18 -0600 Subject: [PATCH] [Offload] Remove handling for device memory pool (#163629) Summary: This was a lot of code that was only used for upstream LLVM builds of AMDGPU offloading. We have a generic and fast `malloc` in `libc` now so just use that. Simplifies code, can be added back if we start providing alternate forms but I don't think there's a single use-case that would justify it yet. --- offload/include/Shared/Environment.h | 22 ---- offload/plugins-nextgen/amdgpu/src/rtl.cpp | 14 --- .../common/include/PluginInterface.h | 25 ++-- .../common/src/PluginInterface.cpp | 102 ++--------------- offload/plugins-nextgen/cuda/src/rtl.cpp | 6 +- offload/plugins-nextgen/host/src/rtl.cpp | 8 -- .../{offloading => libc}/malloc_parallel.c | 0 offload/test/mapping/lambda_mapping.cpp | 2 + offload/test/offloading/malloc.c | 2 +- openmp/device/include/Allocator.h | 6 - openmp/device/src/Allocator.cpp | 94 +++++++++------ openmp/device/src/Kernel.cpp | 1 - openmp/device/src/Misc.cpp | 4 +- openmp/device/src/State.cpp | 107 +++--------------- openmp/docs/design/Runtimes.rst | 1 - revert_patches.txt | 3 - 16 files changed, 101 insertions(+), 296 deletions(-) rename offload/test/{offloading => libc}/malloc_parallel.c (100%) diff --git a/offload/include/Shared/Environment.h b/offload/include/Shared/Environment.h index 2a283bd6fa4ed..79e45fd8e082d 100644 --- a/offload/include/Shared/Environment.h +++ b/offload/include/Shared/Environment.h @@ -21,7 +21,6 @@ enum class DeviceDebugKind : uint32_t { Assertion = 1U << 0, FunctionTracing = 1U << 1, CommonIssues = 1U << 2, - AllocationTracker = 1U << 3, PGODump = 1U << 4, }; @@ -36,27 +35,6 @@ struct DeviceEnvironmentTy { uint64_t HardwareParallelism; }; -struct DeviceMemoryPoolTy { - void *Ptr; - uint64_t Size; -}; - -struct DeviceMemoryPoolTrackingTy { - uint64_t NumAllocations; - uint64_t AllocationTotal; - uint64_t AllocationMin; - uint64_t AllocationMax; - - void combine(DeviceMemoryPoolTrackingTy &Other) { - NumAllocations += Other.NumAllocations; - AllocationTotal += Other.AllocationTotal; - AllocationMin = AllocationMin > Other.AllocationMin ? Other.AllocationMin - : AllocationMin; - AllocationMax = AllocationMax < Other.AllocationMax ? Other.AllocationMax - : AllocationMax; - } -}; - // NOTE: Please don't change the order of those members as their indices are // used in the middle end. Always add the new data member at the end. // Different from KernelEnvironmentTy below, this structure contains members diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index bae3de7c402ba..061d4fd0ad7e7 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -4560,17 +4560,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Plugin::success(); } - Error getDeviceHeapSize(uint64_t &Value) override { - Value = DeviceMemoryPoolSize; - return Plugin::success(); - } - Error setDeviceHeapSize(uint64_t Value) override { - for (DeviceImageTy *Image : LoadedImages) - if (auto Err = setupDeviceMemoryPool(Plugin, *Image, Value)) - return Err; - DeviceMemoryPoolSize = Value; - return Plugin::success(); - } Error getDeviceMemorySize(uint64_t &Value) override { for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) { @@ -5023,9 +5012,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { /// Pointer to the preallocated device memory pool void *PreAllocatedDeviceMemoryPool; - /// The current size of the global device memory pool (managed by us). - uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512MB */; - /// The current size of the stack that will be used in cases where it could /// not be statically determined. /// Default: 1024, in conformity to hipLimitStackSize. diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 021ab736941bf..25be1f2264642 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -894,10 +894,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy { Error unloadBinary(DeviceImageTy *Image); virtual Error unloadBinaryImpl(DeviceImageTy *Image) = 0; - /// Setup the global device memory pool, if the plugin requires one. - Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image, - uint64_t PoolSize); - // Setup the RPC server for this device if needed. This may not run on some // plugins like the CPU targets. By default, it will not be executed so it is // up to the target to override this using the shouldSetupRPCServer function. @@ -1229,6 +1225,16 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// Allocate and construct a kernel object. virtual Expected constructKernel(const char *Name) = 0; + virtual bool hasDeviceHeapSize() { return false; } + virtual Error getDeviceHeapSize(uint64_t &V) { + return Plugin::error(error::ErrorCode::UNSUPPORTED, + "%s not supported by platform", __func__); + } + virtual Error setDeviceHeapSize(uint64_t V) { + return Plugin::error(error::ErrorCode::UNSUPPORTED, + "%s not supported by platform", __func__); + } + /// Returns true if current plugin architecture is an APU /// and unified_shared_memory was not requested by the program. bool useAutoZeroCopy(); @@ -1363,12 +1369,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// plugin can implement the setters as no-op and setting the output /// value to zero for the getters. virtual Error setDeviceStackSize(uint64_t V) = 0; - virtual Error getDeviceHeapSize(uint64_t &V) = 0; - virtual Error setDeviceHeapSize(uint64_t V) = 0; - - /// Indicate whether the device should setup the global device memory pool. If - /// false is return the value on the device will be uninitialized. - virtual bool shouldSetupDeviceMemoryPool() const { return true; } /// Indicate whether or not the device should setup the RPC server. This is /// only necessary for unhosted targets like the GPU. @@ -1466,9 +1466,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy { Expected getKernelEnvironmentForKernel(StringRef Name, DeviceImageTy &Image); - DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0}; - DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0}; - bool IsFastReductionEnabled = false; }; @@ -1580,6 +1577,8 @@ struct KernelRunRecordTy { uint32_t RunLimiter = ThreadCandidate.size() * CUMultiplierCandidate.size(); // Used for keeping track of the metatdata used in tuning for each kernel. std::unordered_map TuningData; + /// Internal representation for OMPT device (initialize & finalize) + std::atomic OmptInitialized; }; /// Class implementing common functionalities of offload plugins. Each plugin diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index d2d0a50a7c1d5..1c42db54f3828 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -899,13 +899,15 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) { return StackSizeEnvarOrErr.takeError(); OMPX_TargetStackSize = std::move(*StackSizeEnvarOrErr); - auto HeapSizeEnvarOrErr = UInt64Envar::create( - "LIBOMPTARGET_HEAP_SIZE", - [this](uint64_t &V) -> Error { return getDeviceHeapSize(V); }, - [this](uint64_t V) -> Error { return setDeviceHeapSize(V); }); - if (!HeapSizeEnvarOrErr) - return HeapSizeEnvarOrErr.takeError(); - OMPX_TargetHeapSize = std::move(*HeapSizeEnvarOrErr); + if (hasDeviceHeapSize()) { + auto HeapSizeEnvarOrErr = UInt64Envar::create( + "LIBOMPTARGET_HEAP_SIZE", + [this](uint64_t &V) -> Error { return getDeviceHeapSize(V); }, + [this](uint64_t V) -> Error { return setDeviceHeapSize(V); }); + if (!HeapSizeEnvarOrErr) + return HeapSizeEnvarOrErr.takeError(); + OMPX_TargetHeapSize = std::move(*HeapSizeEnvarOrErr); + } // Update the maximum number of teams and threads after the device // initialization sets the corresponding hardware limit. @@ -938,19 +940,6 @@ Error GenericDeviceTy::unloadBinary(DeviceImageTy *Image) { if (auto Err = callGlobalDestructors(Plugin, *Image)) return Err; - if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) { - GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler(); - DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0}; - GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker", - sizeof(DeviceMemoryPoolTrackingTy), - &ImageDeviceMemoryPoolTracking); - if (auto Err = - GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal)) { - consumeError(std::move(Err)); - } - DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking); - } - GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image); if (!ProfOrErr) @@ -976,22 +965,6 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { return Err; LoadedImages.clear(); - if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) { - // TODO: Write this by default into a file. - printf("\n\n|-----------------------\n" - "| Device memory tracker:\n" - "|-----------------------\n" - "| #Allocations: %lu\n" - "| Byes allocated: %lu\n" - "| Minimal allocation: %lu\n" - "| Maximal allocation: %lu\n" - "|-----------------------\n\n\n", - DeviceMemoryPoolTracking.NumAllocations, - DeviceMemoryPoolTracking.AllocationTotal, - DeviceMemoryPoolTracking.AllocationMin, - DeviceMemoryPoolTracking.AllocationMax); - } - // Delete the memory manager before deinitializing the device. Otherwise, // we may delete device allocations after the device is deinitialized. if (MemoryManager) @@ -1048,18 +1021,6 @@ Expected GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, // Add the image to list. LoadedImages.push_back(Image); - // Setup the global device memory pool if needed. - if (!Plugin.getRecordReplay().isReplaying() && - shouldSetupDeviceMemoryPool()) { - uint64_t HeapSize; - auto SizeOrErr = getDeviceHeapSize(HeapSize); - if (SizeOrErr) { - REPORT("No global device memory pool due to error: %s\n", - toString(std::move(SizeOrErr)).data()); - } else if (auto Err = setupDeviceMemoryPool(Plugin, *Image, HeapSize)) - return std::move(Err); - } - if (auto Err = setupRPCServer(Plugin, *Image)) return std::move(Err); @@ -1074,51 +1035,6 @@ Expected GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, return Image; } -Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin, - DeviceImageTy &Image, - uint64_t PoolSize) { - // Free the old pool, if any. - if (DeviceMemoryPool.Ptr) { - if (auto Err = dataDelete(DeviceMemoryPool.Ptr, - TargetAllocTy::TARGET_ALLOC_DEVICE)) - return Err; - } - - DeviceMemoryPool.Size = PoolSize; - auto AllocOrErr = dataAlloc(PoolSize, /*HostPtr=*/nullptr, - TargetAllocTy::TARGET_ALLOC_DEVICE); - if (AllocOrErr) { - DeviceMemoryPool.Ptr = *AllocOrErr; - } else { - auto Err = AllocOrErr.takeError(); - REPORT("Failure to allocate device memory for global memory pool: %s\n", - toString(std::move(Err)).data()); - DeviceMemoryPool.Ptr = nullptr; - DeviceMemoryPool.Size = 0; - } - - // Create the metainfo of the device environment global. - GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler(); - if (!GHandler.isSymbolInImage(*this, Image, - "__omp_rtl_device_memory_pool_tracker")) { - DP("Skip the memory pool as there is no tracker symbol in the image."); - return Error::success(); - } - - GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker", - sizeof(DeviceMemoryPoolTrackingTy), - &DeviceMemoryPoolTracking); - if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrackerGlobal)) - return Err; - - // Create the metainfo of the device environment global. - GlobalTy DevEnvGlobal("__omp_rtl_device_memory_pool", - sizeof(DeviceMemoryPoolTy), &DeviceMemoryPool); - - // Write device environment values to the device. - return GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal); -} - Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin, DeviceImageTy &Image) { // The plugin either does not need an RPC server or it is unavailable. diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index 0c9fbd01f8576..2d7fdaae6dab3 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -1238,11 +1238,6 @@ struct CUDADeviceTy : public GenericDeviceTy { return Info; } - virtual bool shouldSetupDeviceMemoryPool() const override { - /// We use the CUDA malloc for now. - return false; - } - /// Getters and setters for stack and heap sizes. Error getDeviceStackSize(uint64_t &Value) override { return getCtxLimit(CU_LIMIT_STACK_SIZE, Value); @@ -1253,6 +1248,7 @@ struct CUDADeviceTy : public GenericDeviceTy { Error getDeviceHeapSize(uint64_t &Value) override { return getCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value); } + bool hasDeviceHeapSize() override { return true; } Error setDeviceHeapSize(uint64_t Value) override { return setCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value); } diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp index 5758b634a7456..75f85287e86dc 100644 --- a/offload/plugins-nextgen/host/src/rtl.cpp +++ b/offload/plugins-nextgen/host/src/rtl.cpp @@ -393,9 +393,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy { return Info; } - /// This plugin should not setup the device environment or memory pool. - virtual bool shouldSetupDeviceMemoryPool() const override { return false; }; - /// Getters and setters for stack size and heap size not relevant. Error getDeviceStackSize(uint64_t &Value) override { Value = 0; @@ -404,11 +401,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy { Error setDeviceStackSize(uint64_t Value) override { return Plugin::success(); } - Error getDeviceHeapSize(uint64_t &Value) override { - Value = 0; - return Plugin::success(); - } - Error setDeviceHeapSize(uint64_t Value) override { return Plugin::success(); } private: /// Grid values for Generic ELF64 plugins. diff --git a/offload/test/offloading/malloc_parallel.c b/offload/test/libc/malloc_parallel.c similarity index 100% rename from offload/test/offloading/malloc_parallel.c rename to offload/test/libc/malloc_parallel.c diff --git a/offload/test/mapping/lambda_mapping.cpp b/offload/test/mapping/lambda_mapping.cpp index 8f55f50efc7d6..b9579bc0c2a91 100644 --- a/offload/test/mapping/lambda_mapping.cpp +++ b/offload/test/mapping/lambda_mapping.cpp @@ -5,6 +5,8 @@ // RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic // RUN: %libomptarget-compileoptxx-run-and-check-generic +// REQUIRES: libc + #include template diff --git a/offload/test/offloading/malloc.c b/offload/test/offloading/malloc.c index 7b98e1f1110e5..04e72561d3127 100644 --- a/offload/test/offloading/malloc.c +++ b/offload/test/offloading/malloc.c @@ -10,7 +10,7 @@ int main() { int Threads = 64; int Teams = 10; - // Allocate ~55MB on the device. + // Allocate ~160 KiB on the device. #pragma omp target map(from : DP) DP = (long unsigned *)malloc(sizeof(long unsigned) * N * Threads * Teams); diff --git a/openmp/device/include/Allocator.h b/openmp/device/include/Allocator.h index dc4d029ed75f3..507ec6327126a 100644 --- a/openmp/device/include/Allocator.h +++ b/openmp/device/include/Allocator.h @@ -14,18 +14,12 @@ #include "DeviceTypes.h" -// Forward declaration. -struct KernelEnvironmentTy; - namespace ompx { namespace allocator { static uint64_t constexpr ALIGNMENT = 16; -/// Initialize the allocator according to \p KernelEnvironment -void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment); - /// Allocate \p Size bytes. [[gnu::alloc_size(1), gnu::assume_aligned(ALIGNMENT), gnu::malloc]] void * alloc(uint64_t Size); diff --git a/openmp/device/src/Allocator.cpp b/openmp/device/src/Allocator.cpp index aac2a6005158e..b6e8f06bc164f 100644 --- a/openmp/device/src/Allocator.cpp +++ b/openmp/device/src/Allocator.cpp @@ -16,44 +16,59 @@ #include "DeviceUtils.h" #include "Mapping.h" #include "Synchronization.h" +#include "Platform.h" using namespace ompx; +using namespace allocator; + +// Provide a default implementation of malloc / free for AMDGPU platforms built +// without 'libc' support. +extern "C" { + +[[gnu::noinline]] uint64_t __asan_malloc_impl(uint64_t bufsz, uint64_t pc); +[[gnu::noinline]] void __asan_free_impl(uint64_t ptr, uint64_t pc); +[[gnu::noinline]] uint64_t __ockl_dm_alloc(uint64_t bufsz); +[[gnu::noinline]] void __ockl_dm_dealloc(uint64_t ptr); + +#ifdef __AMDGPU__ +[[gnu::noinline]] void *__alt_libc_malloc(size_t sz); +[[gnu::noinline]] void __alt_libc_free(void *ptr); + +[[gnu::noinline]] uint64_t __ockl_devmem_request(uint64_t addr, uint64_t size) { + if (size) { // allocation request + [[clang::noinline]] return (uint64_t)__alt_libc_malloc((size_t)size); + } else { // free request + [[clang::noinline]] __alt_libc_free((void *)addr); + return 0; + } +} +#endif + +#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC) +[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); } +[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); } +#else +[[gnu::leaf]] void *malloc(size_t Size); +[[gnu::leaf]] void free(void *Ptr); +#endif +} -[[gnu::used, gnu::retain, gnu::weak, - gnu::visibility( - "protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool; -[[gnu::used, gnu::retain, gnu::weak, - gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy - __omp_rtl_device_memory_pool_tracker; +static constexpr uint64_t MEMORY_SIZE = /* 1 MiB */ 1024 * 1024; +alignas(ALIGNMENT) static uint8_t Memory[MEMORY_SIZE] = {0}; -/// Stateless bump allocator that uses the __omp_rtl_device_memory_pool -/// directly. +// Fallback bump pointer interface for platforms without a functioning +// allocator. struct BumpAllocatorTy final { + uint64_t Offset = 0; void *alloc(uint64_t Size) { Size = utils::roundUp(Size, uint64_t(allocator::ALIGNMENT)); - if (config::isDebugMode(DeviceDebugKind::AllocationTracker)) { - atomic::add(&__omp_rtl_device_memory_pool_tracker.NumAllocations, 1, - atomic::seq_cst); - atomic::add(&__omp_rtl_device_memory_pool_tracker.AllocationTotal, Size, - atomic::seq_cst); - atomic::min(&__omp_rtl_device_memory_pool_tracker.AllocationMin, Size, - atomic::seq_cst); - atomic::max(&__omp_rtl_device_memory_pool_tracker.AllocationMax, Size, - atomic::seq_cst); - } - - uint64_t *Data = - reinterpret_cast(&__omp_rtl_device_memory_pool.Ptr); - uint64_t End = - reinterpret_cast(Data) + __omp_rtl_device_memory_pool.Size; - - uint64_t OldData = atomic::add(Data, Size, atomic::seq_cst); - if (OldData + Size > End) + uint64_t OldData = atomic::add(&Offset, Size, atomic::seq_cst); + if (OldData + Size >= MEMORY_SIZE) __builtin_trap(); - return reinterpret_cast(OldData); + return &Memory[OldData]; } void free(void *) {} @@ -65,13 +80,26 @@ BumpAllocatorTy BumpAllocator; /// ///{ -void allocator::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) { - // TODO: Check KernelEnvironment for an allocator choice as soon as we have - // more than one. +void *allocator::alloc(uint64_t Size) { +#if defined(__AMDGPU__) && defined(SANITIZER_AMDGPU) + return reinterpret_cast( + __asan_malloc_impl(Size, uint64_t(__builtin_return_address(0)))); +#elif defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC) + return reinterpret_cast(__ockl_dm_alloc(Size)); +#else + return ::malloc(Size); +#endif } -void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); } - -void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); } +void allocator::free(void *Ptr) { +#if defined(__AMDGPU__) && defined(SANITIZER_AMDGPU) + __asan_free_impl(reinterpret_cast(Ptr), + uint64_t(__builtin_return_address(0))); +#elif defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC) + __ockl_dm_dealloc(reinterpret_cast(Ptr)); +#else + ::free(Size); +#endif +} ///} diff --git a/openmp/device/src/Kernel.cpp b/openmp/device/src/Kernel.cpp index e5535d082bfeb..6526a97dd58d6 100644 --- a/openmp/device/src/Kernel.cpp +++ b/openmp/device/src/Kernel.cpp @@ -41,7 +41,6 @@ inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, synchronize::init(IsSPMD); mapping::init(IsSPMD); state::init(IsSPMD, KernelEnvironment, KernelLaunchEnvironment); - allocator::init(IsSPMD, KernelEnvironment); workshare::init(IsSPMD); } diff --git a/openmp/device/src/Misc.cpp b/openmp/device/src/Misc.cpp index 4f9c77f9eefaa..8250dc40015cc 100644 --- a/openmp/device/src/Misc.cpp +++ b/openmp/device/src/Misc.cpp @@ -100,7 +100,7 @@ void *omp_alloc(size_t size, omp_allocator_handle_t allocator) { case omp_const_mem_alloc: case omp_high_bw_mem_alloc: case omp_low_lat_mem_alloc: - return malloc(size); + return ompx::allocator::alloc(size); default: return nullptr; } @@ -113,7 +113,7 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) { case omp_const_mem_alloc: case omp_high_bw_mem_alloc: case omp_low_lat_mem_alloc: - free(ptr); + ompx::allocator::free(ptr); return; case omp_null_allocator: default: diff --git a/openmp/device/src/State.cpp b/openmp/device/src/State.cpp index 20fdf3c0be753..8617452f7db34 100644 --- a/openmp/device/src/State.cpp +++ b/openmp/device/src/State.cpp @@ -55,98 +55,15 @@ using namespace ompx; namespace { -/// Malloc/Free API implementation -/// AMDGCN does not expose a malloc/free API, while -/// NVPTX does. FOr this reason, the order of the following malloc/free -/// variant declarations and definitions is important and should not be changed - -/// AMDGCN implementations of the shuffle sync idiom -/// -///{ - -// global_allocate uses ockl_dm_alloc/asan_malloc_impl to manage a global memory -// heap -__attribute__((noinline)) extern "C" uint64_t __ockl_dm_alloc(uint64_t bufsz); -__attribute__((noinline)) extern "C" void __ockl_dm_dealloc(uint64_t ptr); -#if SANITIZER_AMDGPU -__attribute__((noinline)) extern "C" uint64_t __asan_malloc_impl(uint64_t bufsz, - uint64_t pc); -__attribute__((noinline)) extern "C" void __asan_free_impl(uint64_t ptr, - uint64_t pc); -#endif -#ifdef __AMDGPU__ -extern "C" { -__attribute__((noinline)) uint64_t __ockl_devmem_request(uint64_t addr, - uint64_t size) { - if (size) { // allocation request - [[clang::noinline]] return (uint64_t)__alt_libc_malloc((size_t)size); - } else { // free request - [[clang::noinline]] __alt_libc_free((void *)addr); - return 0; - } -} - -__attribute__((noinline)) void *internal_malloc(uint64_t Size) { -#if SANITIZER_AMDGPU - uint64_t ptr = - __asan_malloc_impl(Size, (uint64_t)__builtin_return_address(0)); - return (void *)ptr; -#else - [[clang::noinline]] return (void *)__ockl_dm_alloc(Size); -#endif -} - -__attribute__((noinline)) void internal_free(void *Ptr) { -#if SANITIZER_AMDGPU - __asan_free_impl((uint64_t)Ptr, (uint64_t)__builtin_return_address(0)); -#else - [[clang::noinline]] __ockl_dm_dealloc((uint64_t)Ptr); -#endif -} -} -#endif - -extern "C" { -#ifdef __AMDGCN__ -#ifdef USE_BUMP_ALLOCATOR -[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); } -[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); } -#else -void *malloc(size_t Size) { return internal_malloc(Size); } -void free(void *Ptr) { internal_free(Ptr); } -#endif -#else -#ifdef USE_BUMP_ALLOCATOR - -[[gnu::weak, gnu::leaf]] void *malloc(size_t Size); -[[gnu::weak, gnu::leaf]] void free(void *Ptr); -#else -__attribute__((leaf)) void *malloc(size_t Size); -__attribute__((leaf)) void free(void *Ptr); -#endif -#endif -} // extern "C" - -///} -/// NVPTX implementations of internal mallocs -/// -///{ -#ifdef __NVPTX__ -extern "C" { -void *internal_malloc(uint64_t Size) { return malloc(Size); } - -void internal_free(void *Ptr) { free(Ptr); } -} -#endif - /// A "smart" stack in shared memory. /// -/// The stack exposes a malloc/free interface but works like a stack internally. -/// In fact, it is a separate stack *per warp*. That means, each warp must push -/// and pop symmetrically or this breaks, badly. The implementation will (aim -/// to) detect non-lock-step warps and fallback to malloc/free. The same will -/// happen if a warp runs out of memory. The master warp in generic memory is -/// special and is given more memory than the rest. +/// The stack exposes a malloc/free interface but works like a stack +/// internally. In fact, it is a separate stack *per warp*. That means, each +/// warp must push and pop symmetrically or this breaks, badly. The +/// implementation will (aim to) detect non-lock-step warps and fallback to +/// malloc/free. The same will happen if a warp runs out of memory. The +/// master warp in generic memory is special and is given more memory than +/// the rest. /// struct SharedMemorySmartStackTy { /// Initialize the stack. Must be called by all threads. @@ -246,13 +163,13 @@ void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) { } void *memory::allocGlobal(uint64_t Bytes, const char *Reason) { - void *Ptr = malloc(Bytes); + void *Ptr = allocator::alloc(Bytes); if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr) printf("nullptr returned by malloc!\n"); return Ptr; } -void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); } +void memory::freeGlobal(void *Ptr, const char *Reason) { allocator::free(Ptr); } ///} @@ -558,6 +475,8 @@ void __kmpc_get_shared_variables(void ***GlobalArgs) { } extern "C" { -__attribute__((leaf)) void *__kmpc_impl_malloc(uint64_t t) { return malloc(t); } -__attribute__((leaf)) void __kmpc_impl_free(void *ptr) { free(ptr); } +__attribute__((leaf)) void *__kmpc_impl_malloc(uint64_t t) { + return allocator::alloc(t); +} +__attribute__((leaf)) void __kmpc_impl_free(void *ptr) { allocator::free(ptr); } } diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst index 64cb1952a5813..4e4a2617759d4 100644 --- a/openmp/docs/design/Runtimes.rst +++ b/openmp/docs/design/Runtimes.rst @@ -1513,5 +1513,4 @@ debugging features are supported. * Enable debugging assertions in the device. ``0x01`` * Enable diagnosing common problems during offloading . ``0x4`` - * Enable device malloc statistics (amdgpu only). ``0x8`` * Dump device PGO counters (only if PGO on GPU is enabled). ``0x10`` diff --git a/revert_patches.txt b/revert_patches.txt index b98f337e0d7ac..9e465ba90ae6a 100644 --- a/revert_patches.txt +++ b/revert_patches.txt @@ -5,6 +5,3 @@ d57230c7 [AMDGPU][MC] Disallow op_sel in some VOP3P dot instructions (#100485) breaks build of ROCmValidationSuite [C2y] Support WG14 N3457, the __COUNTER__ macro (#162662) --- -breaks openmp smoke and openmpapps -[Offload] Remove handling for device memory pool (#163629) ----