#!/bin/bash
# SPDX-License-Identifier: MIT
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Build lib${name}.so from ${name}.cu with nvcc.
# Usage: ./build_cuda.sh            (defaults to -arch=sm_90)
#        GPU_ARCH=sm_80 ./build_cuda.sh

# Fail fast on errors, unset variables, and pipeline failures.
set -euo pipefail

name="finegrained_allocator"

# Target GPU architecture; overridable so the script is not pinned to Hopper.
arch="${GPU_ARCH:-sm_90}"

# Warnings forwarded to the host compiler (GCC/Clang).
basic_warnings="-Xcompiler=-Wall -Xcompiler=-Wextra"

strict_warnings="-Xcompiler=-Wshadow \
  -Xcompiler=-Wnon-virtual-dtor \
  -Xcompiler=-Wold-style-cast \
  -Xcompiler=-Wcast-align \
  -Xcompiler=-Woverloaded-virtual \
  -Xcompiler=-Wconversion \
  -Xcompiler=-Wsign-conversion \
  -Xcompiler=-Wnull-dereference \
  -Xcompiler=-Wdouble-promotion \
  -Xcompiler=-Wformat=2"

# NVCC supports -std=c++17 directly.
std_flags="-std=c++17"

# Position-independent code, shared-library output.
output_flags="-Xcompiler=-fPIC -shared -o lib${name}.so"

# NOTE: the $*_flags variables are intentionally unquoted so that each
# option word-splits into a separate nvcc argument.
nvcc -arch="${arch}" $basic_warnings $strict_warnings $std_flags $output_flags "${name}.cu"
# SPDX-License-Identifier: MIT
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.

"""Thin ctypes bindings over the CUDA runtime (libcudart) used by Iris.

Covers the small surface the project needs: error checking, device
selection/queries, IPC memory-handle exchange, and raw device allocation.
"""

import ctypes
import numpy as np
import sys

rt_path = "libcudart.so"
cuda_runtime = ctypes.cdll.LoadLibrary(rt_path)

# cudaGetErrorString returns a `const char *`; declare it once so every
# error path decodes a real C string instead of an int.
cuda_runtime.cudaGetErrorString.restype = ctypes.c_char_p


def cuda_try(err):
    """Raise RuntimeError if ``err`` (a cudaError_t) is not cudaSuccess (0)."""
    if err != 0:
        error_string = cuda_runtime.cudaGetErrorString(ctypes.c_int(err)).decode("utf-8")
        raise RuntimeError(f"cuda error code {err}: {error_string}")


class cudaIpcMemHandle_t(ctypes.Structure):
    # NOTE(review): CUDA's CUDA_IPC_HANDLE_SIZE is 64 bytes; this struct is
    # declared as 128 to match the 128-byte handle buffers exchanged by the
    # callers of open_ipc_handle — confirm against the producing side before
    # changing either size.
    _fields_ = [("internal", ctypes.c_byte * 128)]


def open_ipc_handle(ipc_handle_data, rank):
    """Open a peer process's device allocation from its serialized IPC handle.

    Args:
        ipc_handle_data: 128-element uint8 numpy array holding the raw handle
            bytes produced by :func:`get_ipc_handle` on the owning process.
        rank: unused; kept for API symmetry with the calling code.

    Returns:
        The mapped device pointer as a plain int (``c_void_p.value``).

    Raises:
        TypeError / ValueError: if ``ipc_handle_data`` is not a 128-byte
            uint8 ndarray.
        RuntimeError: if cudaIpcOpenMemHandle fails.
    """
    if not isinstance(ipc_handle_data, np.ndarray):
        raise TypeError("ipc_handle_data must be a numpy.ndarray of dtype uint8 with 128 elements")
    if ipc_handle_data.dtype != np.uint8 or ipc_handle_data.size != 128:
        raise ValueError("ipc_handle_data must be a 128-element uint8 numpy array")

    cuda_runtime.cudaIpcOpenMemHandle.argtypes = [
        ctypes.POINTER(ctypes.c_void_p),
        cudaIpcMemHandle_t,
        ctypes.c_uint,
    ]

    # Single copy of the handle bytes into the struct (the previous
    # tobytes -> c_char buffer -> string buffer -> memmove chain copied the
    # same 128 bytes three times).
    ipc_handle_struct = cudaIpcMemHandle_t.from_buffer_copy(ipc_handle_data.tobytes())

    ptr = ctypes.c_void_p()
    cudaIpcMemLazyEnablePeerAccess = ctypes.c_uint(1)  # cudaIpcMemLazyEnablePeerAccess == 0x1
    cuda_try(
        cuda_runtime.cudaIpcOpenMemHandle(
            ctypes.byref(ptr),
            ipc_handle_struct,
            cudaIpcMemLazyEnablePeerAccess,
        )
    )
    return ptr.value


def get_ipc_handle(ptr, rank):
    """Return a cudaIpcMemHandle_t for device pointer ``ptr``.

    ``rank`` is unused; kept for API symmetry with open_ipc_handle.
    """
    ipc_handle = cudaIpcMemHandle_t()
    cuda_try(cuda_runtime.cudaIpcGetMemHandle(ctypes.byref(ipc_handle), ptr))
    return ipc_handle


def count_devices():
    """Return the number of visible CUDA devices."""
    device_count = ctypes.c_int()
    cuda_try(cuda_runtime.cudaGetDeviceCount(ctypes.byref(device_count)))
    return device_count.value


def set_device(gpu_id):
    """Make ``gpu_id`` the current device for this host thread."""
    cuda_try(cuda_runtime.cudaSetDevice(gpu_id))


def get_device_id():
    """Return the ordinal of the current device."""
    device_id = ctypes.c_int()
    cuda_try(cuda_runtime.cudaGetDevice(ctypes.byref(device_id)))
    return device_id.value


def get_cu_count(device_id=None):
    """Return the multiprocessor (SM/CU) count of ``device_id``.

    Defaults to the current device when ``device_id`` is None.
    """
    if device_id is None:
        device_id = get_device_id()

    # cudaDevAttrMultiProcessorCount == 16 in the cudaDeviceAttr enum.
    cudaDeviceAttributeMultiprocessorCount = 16
    cu_count = ctypes.c_int()
    cuda_try(
        cuda_runtime.cudaDeviceGetAttribute(ctypes.byref(cu_count), cudaDeviceAttributeMultiprocessorCount, device_id)
    )
    return cu_count.value


# Starting ROCm 6.5
# def get_xcc_count(device_id=None):
#     if device_id is None:
#         device_id = get_device()
#
#     cudaDeviceAttributeNumberOfXccs = ??
#     xcc_count = ctypes.c_int()
#
#     cuda_try(cuda_runtime.cudaDeviceGetAttribute(
#         ctypes.byref(xcc_count),
#         cudaDeviceAttributeNumberOfXccs,
#         device_id
#     ))
#
#     return xcc_count


def get_wall_clock_rate(device_id):
    """Return device attribute 36 for ``device_id`` (a clock rate in kHz).

    NOTE(review): in CUDA's cudaDeviceAttr enum, 36 is
    cudaDevAttrMemoryClockRate, not a "wall clock" rate (the name appears to
    come from HIP's hipDeviceAttributeWallClockRate port) — confirm which
    clock the callers actually want before changing the id.
    """
    cudaDevAttrMemoryClockRate = 36
    wall_clock_rate = ctypes.c_int()
    status = cuda_runtime.cudaDeviceGetAttribute(ctypes.byref(wall_clock_rate), cudaDevAttrMemoryClockRate, device_id)
    cuda_try(status)
    return wall_clock_rate.value


def malloc_fine_grained(size):
    """Allocate ``size`` bytes of device memory.

    On this CUDA backend there is no separate fine-grained pool; this is an
    alias for :func:`cuda_malloc` (presumably the distinction only exists on
    the HIP side).
    """
    return cuda_malloc(size)


def cuda_malloc(size):
    """Allocate ``size`` bytes of device memory; return a ctypes.c_void_p.

    Raises RuntimeError on allocation failure.
    """
    ptr = ctypes.c_void_p()
    # cudaMalloc takes a size_t. Without the explicit cast ctypes converts a
    # Python int to a C int, silently truncating allocations > 2 GiB.
    cuda_try(cuda_runtime.cudaMalloc(ctypes.byref(ptr), ctypes.c_size_t(size)))
    return ptr


def cuda_free(ptr):
    """Free a device pointer previously returned by cuda_malloc."""
    cuda_try(cuda_runtime.cudaFree(ptr))