From 95fa5646a42406d4ccd520115e08e5e746165a8b Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn@outlook.com>
Date: Thu, 6 Nov 2025 10:15:18 -0600
Subject: [PATCH] [Offload] Remove handling for device memory pool (#163629)

Summary:
This was a lot of code that was only used for upstream LLVM builds of
AMDGPU offloading. We have a generic and fast `malloc` in `libc` now so
just use that. Simplifies code, can be added back if we start providing
alternate forms but I don't think there's a single use-case that would
justify it yet.
---
 offload/include/Shared/Environment.h          |  22 ----
 offload/plugins-nextgen/amdgpu/src/rtl.cpp    |  14 ---
 .../common/include/PluginInterface.h          |  25 ++--
 .../common/src/PluginInterface.cpp            | 102 ++---------------
 offload/plugins-nextgen/cuda/src/rtl.cpp      |   6 +-
 offload/plugins-nextgen/host/src/rtl.cpp      |   8 --
 .../{offloading => libc}/malloc_parallel.c    |   0
 offload/test/mapping/lambda_mapping.cpp       |   2 +
 offload/test/offloading/malloc.c              |   2 +-
 openmp/device/include/Allocator.h             |   6 -
 openmp/device/src/Allocator.cpp               |  94 +++++++++------
 openmp/device/src/Kernel.cpp                  |   1 -
 openmp/device/src/Misc.cpp                    |   4 +-
 openmp/device/src/State.cpp                   | 107 +++---------------
 openmp/docs/design/Runtimes.rst               |   1 -
 revert_patches.txt                            |   3 -
 16 files changed, 101 insertions(+), 296 deletions(-)
 rename offload/test/{offloading => libc}/malloc_parallel.c (100%)

diff --git a/offload/include/Shared/Environment.h b/offload/include/Shared/Environment.h
index 2a283bd6fa4ed..79e45fd8e082d 100644
--- a/offload/include/Shared/Environment.h
+++ b/offload/include/Shared/Environment.h
@@ -21,7 +21,6 @@ enum class DeviceDebugKind : uint32_t {
   Assertion = 1U << 0,
   FunctionTracing = 1U << 1,
   CommonIssues = 1U << 2,
-  AllocationTracker = 1U << 3,
   PGODump = 1U << 4,
 };
 
@@ -36,27 +35,6 @@ struct DeviceEnvironmentTy {
   uint64_t HardwareParallelism;
 };
 
-struct DeviceMemoryPoolTy {
-  void *Ptr;
-  uint64_t Size;
-};
-
-struct DeviceMemoryPoolTrackingTy {
-  uint64_t NumAllocations;
-  uint64_t AllocationTotal;
-  uint64_t AllocationMin;
-  uint64_t AllocationMax;
-
-  void combine(DeviceMemoryPoolTrackingTy &Other) {
-    NumAllocations += Other.NumAllocations;
-    AllocationTotal += Other.AllocationTotal;
-    AllocationMin = AllocationMin > Other.AllocationMin ? Other.AllocationMin
-                                                        : AllocationMin;
-    AllocationMax = AllocationMax < Other.AllocationMax ? Other.AllocationMax
-                                                        : AllocationMax;
-  }
-};
-
 // NOTE: Please don't change the order of those members as their indices are
 // used in the middle end. Always add the new data member at the end.
 // Different from KernelEnvironmentTy below, this structure contains members
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index bae3de7c402ba..061d4fd0ad7e7 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -4560,17 +4560,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
     return Plugin::success();
   }
-  Error getDeviceHeapSize(uint64_t &Value) override {
-    Value = DeviceMemoryPoolSize;
-    return Plugin::success();
-  }
-  Error setDeviceHeapSize(uint64_t Value) override {
-    for (DeviceImageTy *Image : LoadedImages)
-      if (auto Err = setupDeviceMemoryPool(Plugin, *Image, Value))
-        return Err;
-    DeviceMemoryPoolSize = Value;
-    return Plugin::success();
-  }
 
   Error getDeviceMemorySize(uint64_t &Value) override {
     for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
@@ -5023,9 +5012,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// Pointer to the preallocated device memory pool
   void *PreAllocatedDeviceMemoryPool;
 
-  /// The current size of the global device memory pool (managed by us).
-  uint64_t DeviceMemoryPoolSize = 1L << 29L /* 512MB */;
-
   /// The current size of the stack that will be used in cases where it could
   /// not be statically determined.
   /// Default: 1024, in conformity to hipLimitStackSize.
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 021ab736941bf..25be1f2264642 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -894,10 +894,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   Error unloadBinary(DeviceImageTy *Image);
   virtual Error unloadBinaryImpl(DeviceImageTy *Image) = 0;
 
-  /// Setup the global device memory pool, if the plugin requires one.
-  Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image,
-                              uint64_t PoolSize);
-
   // Setup the RPC server for this device if needed. This may not run on some
   // plugins like the CPU targets. By default, it will not be executed so it is
   // up to the target to override this using the shouldSetupRPCServer function.
@@ -1229,6 +1225,16 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   /// Allocate and construct a kernel object.
   virtual Expected<GenericKernelTy &> constructKernel(const char *Name) = 0;
 
+  virtual bool hasDeviceHeapSize() { return false; }
+  virtual Error getDeviceHeapSize(uint64_t &V) {
+    return Plugin::error(error::ErrorCode::UNSUPPORTED,
+                         "%s not supported by platform", __func__);
+  }
+  virtual Error setDeviceHeapSize(uint64_t V) {
+    return Plugin::error(error::ErrorCode::UNSUPPORTED,
+                         "%s not supported by platform", __func__);
+  }
+
   /// Returns true if current plugin architecture is an APU
   /// and unified_shared_memory was not requested by the program.
   bool useAutoZeroCopy();
@@ -1363,12 +1369,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   /// plugin can implement the setters as no-op and setting the output
   /// value to zero for the getters.
   virtual Error setDeviceStackSize(uint64_t V) = 0;
-  virtual Error getDeviceHeapSize(uint64_t &V) = 0;
-  virtual Error setDeviceHeapSize(uint64_t V) = 0;
-
-  /// Indicate whether the device should setup the global device memory pool. If
-  /// false is return the value on the device will be uninitialized.
-  virtual bool shouldSetupDeviceMemoryPool() const { return true; }
 
   /// Indicate whether or not the device should setup the RPC server. This is
   /// only necessary for unhosted targets like the GPU.
@@ -1466,9 +1466,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   Expected<KernelEnvironmentTy>
   getKernelEnvironmentForKernel(StringRef Name, DeviceImageTy &Image);
 
-  DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
-  DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
-
   bool IsFastReductionEnabled = false;
 };
 
@@ -1580,6 +1577,8 @@ struct KernelRunRecordTy {
   uint32_t RunLimiter = ThreadCandidate.size() * CUMultiplierCandidate.size();
   // Used for keeping track of the metatdata used in tuning for each kernel.
   std::unordered_map<std::string, TuningMetadataTy> TuningData;
+  /// Internal representation for OMPT device (initialize & finalize)
+  std::atomic<bool> OmptInitialized;
 };
 
 /// Class implementing common functionalities of offload plugins. Each plugin
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index d2d0a50a7c1d5..1c42db54f3828 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -899,13 +899,15 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
     return StackSizeEnvarOrErr.takeError();
   OMPX_TargetStackSize = std::move(*StackSizeEnvarOrErr);
 
-  auto HeapSizeEnvarOrErr = UInt64Envar::create(
-      "LIBOMPTARGET_HEAP_SIZE",
-      [this](uint64_t &V) -> Error { return getDeviceHeapSize(V); },
-      [this](uint64_t V) -> Error { return setDeviceHeapSize(V); });
-  if (!HeapSizeEnvarOrErr)
-    return HeapSizeEnvarOrErr.takeError();
-  OMPX_TargetHeapSize = std::move(*HeapSizeEnvarOrErr);
+  if (hasDeviceHeapSize()) {
+    auto HeapSizeEnvarOrErr = UInt64Envar::create(
+        "LIBOMPTARGET_HEAP_SIZE",
+        [this](uint64_t &V) -> Error { return getDeviceHeapSize(V); },
+        [this](uint64_t V) -> Error { return setDeviceHeapSize(V); });
+    if (!HeapSizeEnvarOrErr)
+      return HeapSizeEnvarOrErr.takeError();
+    OMPX_TargetHeapSize = std::move(*HeapSizeEnvarOrErr);
+  }
 
   // Update the maximum number of teams and threads after the device
   // initialization sets the corresponding hardware limit.
@@ -938,19 +940,6 @@ Error GenericDeviceTy::unloadBinary(DeviceImageTy *Image) {
   if (auto Err = callGlobalDestructors(Plugin, *Image))
     return Err;
 
-  if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
-    GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
-    DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0};
-    GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
-                           sizeof(DeviceMemoryPoolTrackingTy),
-                           &ImageDeviceMemoryPoolTracking);
-    if (auto Err =
-            GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal)) {
-      consumeError(std::move(Err));
-    }
-    DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking);
-  }
-
   GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
   auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image);
   if (!ProfOrErr)
@@ -976,22 +965,6 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
       return Err;
   LoadedImages.clear();
 
-  if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
-    // TODO: Write this by default into a file.
-    printf("\n\n|-----------------------\n"
-           "| Device memory tracker:\n"
-           "|-----------------------\n"
-           "| #Allocations: %lu\n"
-           "| Byes allocated: %lu\n"
-           "| Minimal allocation: %lu\n"
-           "| Maximal allocation: %lu\n"
-           "|-----------------------\n\n\n",
-           DeviceMemoryPoolTracking.NumAllocations,
-           DeviceMemoryPoolTracking.AllocationTotal,
-           DeviceMemoryPoolTracking.AllocationMin,
-           DeviceMemoryPoolTracking.AllocationMax);
-  }
-
   // Delete the memory manager before deinitializing the device. Otherwise,
   // we may delete device allocations after the device is deinitialized.
   if (MemoryManager)
@@ -1048,18 +1021,6 @@ Expected<DeviceImageTy *> GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
   // Add the image to list.
   LoadedImages.push_back(Image);
 
-  // Setup the global device memory pool if needed.
-  if (!Plugin.getRecordReplay().isReplaying() &&
-      shouldSetupDeviceMemoryPool()) {
-    uint64_t HeapSize;
-    auto SizeOrErr = getDeviceHeapSize(HeapSize);
-    if (SizeOrErr) {
-      REPORT("No global device memory pool due to error: %s\n",
-             toString(std::move(SizeOrErr)).data());
-    } else if (auto Err = setupDeviceMemoryPool(Plugin, *Image, HeapSize))
-      return std::move(Err);
-  }
-
   if (auto Err = setupRPCServer(Plugin, *Image))
     return std::move(Err);
 
@@ -1074,51 +1035,6 @@ Expected<DeviceImageTy *> GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
   return Image;
 }
 
-Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin,
-                                             DeviceImageTy &Image,
-                                             uint64_t PoolSize) {
-  // Free the old pool, if any.
-  if (DeviceMemoryPool.Ptr) {
-    if (auto Err = dataDelete(DeviceMemoryPool.Ptr,
-                              TargetAllocTy::TARGET_ALLOC_DEVICE))
-      return Err;
-  }
-
-  DeviceMemoryPool.Size = PoolSize;
-  auto AllocOrErr = dataAlloc(PoolSize, /*HostPtr=*/nullptr,
-                              TargetAllocTy::TARGET_ALLOC_DEVICE);
-  if (AllocOrErr) {
-    DeviceMemoryPool.Ptr = *AllocOrErr;
-  } else {
-    auto Err = AllocOrErr.takeError();
-    REPORT("Failure to allocate device memory for global memory pool: %s\n",
-           toString(std::move(Err)).data());
-    DeviceMemoryPool.Ptr = nullptr;
-    DeviceMemoryPool.Size = 0;
-  }
-
-  // Create the metainfo of the device environment global.
-  GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
-  if (!GHandler.isSymbolInImage(*this, Image,
-                                "__omp_rtl_device_memory_pool_tracker")) {
-    DP("Skip the memory pool as there is no tracker symbol in the image.");
-    return Error::success();
-  }
-
-  GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
-                         sizeof(DeviceMemoryPoolTrackingTy),
-                         &DeviceMemoryPoolTracking);
-  if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrackerGlobal))
-    return Err;
-
-  // Create the metainfo of the device environment global.
-  GlobalTy DevEnvGlobal("__omp_rtl_device_memory_pool",
-                        sizeof(DeviceMemoryPoolTy), &DeviceMemoryPool);
-
-  // Write device environment values to the device.
-  return GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal);
-}
-
 Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin,
                                       DeviceImageTy &Image) {
   // The plugin either does not need an RPC server or it is unavailable.
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 0c9fbd01f8576..2d7fdaae6dab3 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -1238,11 +1238,6 @@ struct CUDADeviceTy : public GenericDeviceTy {
     return Info;
   }
 
-  virtual bool shouldSetupDeviceMemoryPool() const override {
-    /// We use the CUDA malloc for now.
-    return false;
-  }
-
   /// Getters and setters for stack and heap sizes.
   Error getDeviceStackSize(uint64_t &Value) override {
     return getCtxLimit(CU_LIMIT_STACK_SIZE, Value);
@@ -1253,6 +1248,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
   Error getDeviceHeapSize(uint64_t &Value) override {
     return getCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
   }
+  bool hasDeviceHeapSize() override { return true; }
   Error setDeviceHeapSize(uint64_t Value) override {
     return setCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
   }
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 5758b634a7456..75f85287e86dc 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -393,9 +393,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
     return Info;
   }
 
-  /// This plugin should not setup the device environment or memory pool.
-  virtual bool shouldSetupDeviceMemoryPool() const override { return false; };
-
   /// Getters and setters for stack size and heap size not relevant.
   Error getDeviceStackSize(uint64_t &Value) override {
     Value = 0;
@@ -404,11 +401,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
   Error setDeviceStackSize(uint64_t Value) override {
     return Plugin::success();
   }
-  Error getDeviceHeapSize(uint64_t &Value) override {
-    Value = 0;
-    return Plugin::success();
-  }
-  Error setDeviceHeapSize(uint64_t Value) override { return Plugin::success(); }
 
 private:
   /// Grid values for Generic ELF64 plugins.
diff --git a/offload/test/offloading/malloc_parallel.c b/offload/test/libc/malloc_parallel.c
similarity index 100%
rename from offload/test/offloading/malloc_parallel.c
rename to offload/test/libc/malloc_parallel.c
diff --git a/offload/test/mapping/lambda_mapping.cpp b/offload/test/mapping/lambda_mapping.cpp
index 8f55f50efc7d6..b9579bc0c2a91 100644
--- a/offload/test/mapping/lambda_mapping.cpp
+++ b/offload/test/mapping/lambda_mapping.cpp
@@ -5,6 +5,8 @@
 // RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
 // RUN: %libomptarget-compileoptxx-run-and-check-generic
 
+// REQUIRES: libc
+
 #include <iostream>
 
 template <typename LOOP_BODY>
diff --git a/offload/test/offloading/malloc.c b/offload/test/offloading/malloc.c
index 7b98e1f1110e5..04e72561d3127 100644
--- a/offload/test/offloading/malloc.c
+++ b/offload/test/offloading/malloc.c
@@ -10,7 +10,7 @@ int main() {
   int Threads = 64;
   int Teams = 10;
 
-  // Allocate ~55MB on the device.
+  // Allocate ~160 KiB on the device.
 #pragma omp target map(from : DP)
   DP = (long unsigned *)malloc(sizeof(long unsigned) * N * Threads * Teams);
 
diff --git a/openmp/device/include/Allocator.h b/openmp/device/include/Allocator.h
index dc4d029ed75f3..507ec6327126a 100644
--- a/openmp/device/include/Allocator.h
+++ b/openmp/device/include/Allocator.h
@@ -14,18 +14,12 @@
 
 #include "DeviceTypes.h"
 
-// Forward declaration.
-struct KernelEnvironmentTy;
-
 namespace ompx {
 
 namespace allocator {
 
 static uint64_t constexpr ALIGNMENT = 16;
 
-/// Initialize the allocator according to \p KernelEnvironment
-void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment);
-
 /// Allocate \p Size bytes.
 [[gnu::alloc_size(1), gnu::assume_aligned(ALIGNMENT), gnu::malloc]] void *
 alloc(uint64_t Size);
diff --git a/openmp/device/src/Allocator.cpp b/openmp/device/src/Allocator.cpp
index aac2a6005158e..b6e8f06bc164f 100644
--- a/openmp/device/src/Allocator.cpp
+++ b/openmp/device/src/Allocator.cpp
@@ -16,44 +16,59 @@
 #include "DeviceUtils.h"
 #include "Mapping.h"
 #include "Synchronization.h"
+#include "Platform.h"
 
 using namespace ompx;
+using namespace allocator;
+
+// Provide a default implementation of malloc / free for AMDGPU platforms built
+// without 'libc' support.
+extern "C" {
+
+[[gnu::noinline]] uint64_t __asan_malloc_impl(uint64_t bufsz, uint64_t pc);
+[[gnu::noinline]] void __asan_free_impl(uint64_t ptr, uint64_t pc);
+[[gnu::noinline]] uint64_t __ockl_dm_alloc(uint64_t bufsz);
+[[gnu::noinline]] void __ockl_dm_dealloc(uint64_t ptr);
+
+#ifdef __AMDGPU__
+[[gnu::noinline]] void *__alt_libc_malloc(size_t sz);
+[[gnu::noinline]] void __alt_libc_free(void *ptr);
+
+[[gnu::noinline]] uint64_t __ockl_devmem_request(uint64_t addr, uint64_t size) {
+  if (size) { // allocation request
+    [[clang::noinline]] return (uint64_t)__alt_libc_malloc((size_t)size);
+  } else { // free request
+    [[clang::noinline]] __alt_libc_free((void *)addr);
+    return 0;
+  }
+}
+#endif
+
+#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
+[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
+[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
+#else
+[[gnu::leaf]] void *malloc(size_t Size);
+[[gnu::leaf]] void free(void *Ptr);
+#endif
+}
 
-[[gnu::used, gnu::retain, gnu::weak,
-  gnu::visibility(
-      "protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
-[[gnu::used, gnu::retain, gnu::weak,
-  gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy
-    __omp_rtl_device_memory_pool_tracker;
+static constexpr uint64_t MEMORY_SIZE = /* 1 MiB */ 1024 * 1024;
+alignas(ALIGNMENT) static uint8_t Memory[MEMORY_SIZE] = {0};
 
-/// Stateless bump allocator that uses the __omp_rtl_device_memory_pool
-/// directly.
+// Fallback bump pointer interface for platforms without a functioning
+// allocator.
 struct BumpAllocatorTy final {
+  uint64_t Offset = 0;
 
   void *alloc(uint64_t Size) {
     Size = utils::roundUp(Size, uint64_t(allocator::ALIGNMENT));
 
-    if (config::isDebugMode(DeviceDebugKind::AllocationTracker)) {
-      atomic::add(&__omp_rtl_device_memory_pool_tracker.NumAllocations, 1,
-                  atomic::seq_cst);
-      atomic::add(&__omp_rtl_device_memory_pool_tracker.AllocationTotal, Size,
-                  atomic::seq_cst);
-      atomic::min(&__omp_rtl_device_memory_pool_tracker.AllocationMin, Size,
-                  atomic::seq_cst);
-      atomic::max(&__omp_rtl_device_memory_pool_tracker.AllocationMax, Size,
-                  atomic::seq_cst);
-    }
-
-    uint64_t *Data =
-        reinterpret_cast<uint64_t *>(&__omp_rtl_device_memory_pool.Ptr);
-    uint64_t End =
-        reinterpret_cast<uint64_t>(Data) + __omp_rtl_device_memory_pool.Size;
-
-    uint64_t OldData = atomic::add(Data, Size, atomic::seq_cst);
-    if (OldData + Size > End)
+    uint64_t OldData = atomic::add(&Offset, Size, atomic::seq_cst);
+    if (OldData + Size >= MEMORY_SIZE)
       __builtin_trap();
 
-    return reinterpret_cast<void *>(OldData);
+    return &Memory[OldData];
   }
 
   void free(void *) {}
@@ -65,13 +80,26 @@ BumpAllocatorTy BumpAllocator;
 ///
 ///{
 
-void allocator::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) {
-  // TODO: Check KernelEnvironment for an allocator choice as soon as we have
-  // more than one.
+void *allocator::alloc(uint64_t Size) {
+#if defined(__AMDGPU__) && defined(SANITIZER_AMDGPU)
+  return reinterpret_cast<void *>(
+      __asan_malloc_impl(Size, uint64_t(__builtin_return_address(0))));
+#elif defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
+  return reinterpret_cast<void *>(__ockl_dm_alloc(Size));
+#else
+  return ::malloc(Size);
+#endif
 }
 
-void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); }
-
-void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); }
+void allocator::free(void *Ptr) {
+#if defined(__AMDGPU__) && defined(SANITIZER_AMDGPU)
+  __asan_free_impl(reinterpret_cast<uint64_t>(Ptr),
+                   uint64_t(__builtin_return_address(0)));
+#elif defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
+  __ockl_dm_dealloc(reinterpret_cast<uint64_t>(Ptr));
+#else
+  ::free(Size);
+#endif
+}
 
 ///}
diff --git a/openmp/device/src/Kernel.cpp b/openmp/device/src/Kernel.cpp
index e5535d082bfeb..6526a97dd58d6 100644
--- a/openmp/device/src/Kernel.cpp
+++ b/openmp/device/src/Kernel.cpp
@@ -41,7 +41,6 @@ inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
   synchronize::init(IsSPMD);
   mapping::init(IsSPMD);
   state::init(IsSPMD, KernelEnvironment, KernelLaunchEnvironment);
-  allocator::init(IsSPMD, KernelEnvironment);
   workshare::init(IsSPMD);
 }
 
diff --git a/openmp/device/src/Misc.cpp b/openmp/device/src/Misc.cpp
index 4f9c77f9eefaa..8250dc40015cc 100644
--- a/openmp/device/src/Misc.cpp
+++ b/openmp/device/src/Misc.cpp
@@ -100,7 +100,7 @@ void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
   case omp_const_mem_alloc:
   case omp_high_bw_mem_alloc:
   case omp_low_lat_mem_alloc:
-    return malloc(size);
+    return ompx::allocator::alloc(size);
   default:
     return nullptr;
   }
@@ -113,7 +113,7 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) {
   case omp_const_mem_alloc:
   case omp_high_bw_mem_alloc:
   case omp_low_lat_mem_alloc:
-    free(ptr);
+    ompx::allocator::free(ptr);
     return;
   case omp_null_allocator:
   default:
diff --git a/openmp/device/src/State.cpp b/openmp/device/src/State.cpp
index 20fdf3c0be753..8617452f7db34 100644
--- a/openmp/device/src/State.cpp
+++ b/openmp/device/src/State.cpp
@@ -55,98 +55,15 @@ using namespace ompx;
 
 namespace {
 
-/// Malloc/Free API implementation
-/// AMDGCN does not expose a malloc/free API, while
-/// NVPTX does. FOr this reason, the order of the following malloc/free
-/// variant declarations and definitions is important and should not be changed
-
-/// AMDGCN implementations of the shuffle sync idiom
-///
-///{
-
-// global_allocate uses ockl_dm_alloc/asan_malloc_impl to manage a global memory
-// heap
-__attribute__((noinline)) extern "C" uint64_t __ockl_dm_alloc(uint64_t bufsz);
-__attribute__((noinline)) extern "C" void __ockl_dm_dealloc(uint64_t ptr);
-#if SANITIZER_AMDGPU
-__attribute__((noinline)) extern "C" uint64_t __asan_malloc_impl(uint64_t bufsz,
-                                                                 uint64_t pc);
-__attribute__((noinline)) extern "C" void __asan_free_impl(uint64_t ptr,
-                                                           uint64_t pc);
-#endif
-#ifdef __AMDGPU__
-extern "C" {
-__attribute__((noinline)) uint64_t __ockl_devmem_request(uint64_t addr,
-                                                         uint64_t size) {
-  if (size) { // allocation request
-    [[clang::noinline]] return (uint64_t)__alt_libc_malloc((size_t)size);
-  } else { // free request
-    [[clang::noinline]] __alt_libc_free((void *)addr);
-    return 0;
-  }
-}
-
-__attribute__((noinline)) void *internal_malloc(uint64_t Size) {
-#if SANITIZER_AMDGPU
-  uint64_t ptr =
-      __asan_malloc_impl(Size, (uint64_t)__builtin_return_address(0));
-  return (void *)ptr;
-#else
-  [[clang::noinline]] return (void *)__ockl_dm_alloc(Size);
-#endif
-}
-
-__attribute__((noinline)) void internal_free(void *Ptr) {
-#if SANITIZER_AMDGPU
-  __asan_free_impl((uint64_t)Ptr, (uint64_t)__builtin_return_address(0));
-#else
-  [[clang::noinline]] __ockl_dm_dealloc((uint64_t)Ptr);
-#endif
-}
-}
-#endif
-
-extern "C" {
-#ifdef __AMDGCN__
-#ifdef USE_BUMP_ALLOCATOR
-[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
-[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
-#else
-void *malloc(size_t Size) { return internal_malloc(Size); }
-void free(void *Ptr) { internal_free(Ptr); }
-#endif
-#else
-#ifdef USE_BUMP_ALLOCATOR
-
-[[gnu::weak, gnu::leaf]] void *malloc(size_t Size);
-[[gnu::weak, gnu::leaf]] void free(void *Ptr);
-#else
-__attribute__((leaf)) void *malloc(size_t Size);
-__attribute__((leaf)) void free(void *Ptr);
-#endif
-#endif
-} // extern "C"
-
-///}
-/// NVPTX implementations of internal mallocs
-///
-///{
-#ifdef __NVPTX__
-extern "C" {
-void *internal_malloc(uint64_t Size) { return malloc(Size); }
-
-void internal_free(void *Ptr) { free(Ptr); }
-}
-#endif
-
 /// A "smart" stack in shared memory.
 ///
-/// The stack exposes a malloc/free interface but works like a stack internally.
-/// In fact, it is a separate stack *per warp*. That means, each warp must push
-/// and pop symmetrically or this breaks, badly. The implementation will (aim
-/// to) detect non-lock-step warps and fallback to malloc/free. The same will
-/// happen if a warp runs out of memory. The master warp in generic memory is
-/// special and is given more memory than the rest.
+/// The stack exposes a malloc/free interface but works like a stack
+/// internally. In fact, it is a separate stack *per warp*. That means, each
+/// warp must push and pop symmetrically or this breaks, badly. The
+/// implementation will (aim to) detect non-lock-step warps and fallback to
+/// malloc/free. The same will happen if a warp runs out of memory. The
+/// master warp in generic memory is special and is given more memory than
+/// the rest.
 ///
 struct SharedMemorySmartStackTy {
   /// Initialize the stack. Must be called by all threads.
@@ -246,13 +163,13 @@ void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
 }
 
 void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
-  void *Ptr = malloc(Bytes);
+  void *Ptr = allocator::alloc(Bytes);
   if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr)
     printf("nullptr returned by malloc!\n");
   return Ptr;
 }
 
-void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }
+void memory::freeGlobal(void *Ptr, const char *Reason) { allocator::free(Ptr); }
 
 ///}
 
@@ -558,6 +475,8 @@ void __kmpc_get_shared_variables(void ***GlobalArgs) {
 }
 
 extern "C" {
-__attribute__((leaf)) void *__kmpc_impl_malloc(uint64_t t) { return malloc(t); }
-__attribute__((leaf)) void __kmpc_impl_free(void *ptr) { free(ptr); }
+__attribute__((leaf)) void *__kmpc_impl_malloc(uint64_t t) {
+  return allocator::alloc(t);
+}
+__attribute__((leaf)) void __kmpc_impl_free(void *ptr) { allocator::free(ptr); }
 }
diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst
index 64cb1952a5813..4e4a2617759d4 100644
--- a/openmp/docs/design/Runtimes.rst
+++ b/openmp/docs/design/Runtimes.rst
@@ -1513,5 +1513,4 @@ debugging features are supported.
 
     * Enable debugging assertions in the device. ``0x01``
     * Enable diagnosing common problems during offloading . ``0x4``
-    * Enable device malloc statistics (amdgpu only). ``0x8``
     * Dump device PGO counters (only if PGO on GPU is enabled). ``0x10``
diff --git a/revert_patches.txt b/revert_patches.txt
index b98f337e0d7ac..9e465ba90ae6a 100644
--- a/revert_patches.txt
+++ b/revert_patches.txt
@@ -5,6 +5,3 @@ d57230c7 [AMDGPU][MC] Disallow op_sel in some VOP3P dot instructions (#100485)
 breaks build of ROCmValidationSuite
 [C2y] Support WG14 N3457, the __COUNTER__ macro (#162662)
 ---
-breaks openmp smoke and openmpapps
-[Offload] Remove handling for device memory pool (#163629)
----