From 502a986d6b35adb20d2ac8b5a536c684e1b66359 Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Fri, 3 Oct 2025 02:34:27 +0000 Subject: [PATCH] debug feature PYTORCH_CUDA_MEMORY_CACHING_MEMSET_ZEROS A user may call torch.empty() when they intended torch.zeros(). This debugging flag helps triage such cases by setting all allocations from the CUDACachingAllocator to zeros. The behavior is enabled by setting either PYTORCH_CUDA_MEMORY_CACHING_MEMSET_ZEROS or, for ROCm builds, PYTORCH_HIP_MEMORY_CACHING_MEMSET_ZEROS in the environment. --- c10/cuda/CUDACachingAllocator.cpp | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index 4a1e4654f9207..052a5201d53e8 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -3261,6 +3261,15 @@ class DeviceCachingAllocator { } }; +static bool zeroAllocations() { + static auto has_cuda_env = + c10::utils::check_env("PYTORCH_CUDA_MEMORY_CACHING_MEMSET_ZEROS") == true; + static auto has_rocm_env = + c10::utils::check_env("PYTORCH_HIP_MEMORY_CACHING_MEMSET_ZEROS") == true; + static bool zeros = has_cuda_env || has_rocm_env; + return zeros; +} + // Returns whether to force all allocations to bypass the caching allocator and // go straight to cudaMalloc. This setting is useful when debugging GPU memory // errors, since the caching allocator foils cuda-memcheck. 
@@ -3652,6 +3661,10 @@ class NativeCachingAllocator : public CUDAAllocator { TORCH_SDT_WITH_SEMAPHORE(malloc, devPtr, device, size, stream.id()); } + if (zeroAllocations()) { + C10_CUDA_CHECK(cudaMemsetAsync(devPtr, 0, size, stream)); + } + return {devPtr, devPtr, deleteFunc, Device(DeviceType::CUDA, device)}; } DeleterFnPtr raw_deleter() const override { @@ -3734,6 +3747,12 @@ class NativeCachingAllocator : public CUDAAllocator { C10_CUDA_CHECK(c10::cuda::GetDevice(&device)); malloc(&r, device, nbytes, cuda::getCurrentCUDAStream(device)); } + if (zeroAllocations()) { + c10::DeviceIndex device = 0; + C10_CUDA_CHECK(c10::cuda::GetDevice(&device)); + C10_CUDA_CHECK( + cudaMemsetAsync(r, 0, nbytes, cuda::getCurrentCUDAStream(device))); + } return r; } @@ -3749,6 +3768,9 @@ class NativeCachingAllocator : public CUDAAllocator { C10_CUDA_CHECK(c10::cuda::GetDevice(&device)); malloc(&r, device, nbytes, stream); } + if (zeroAllocations()) { + C10_CUDA_CHECK(cudaMemsetAsync(r, 0, nbytes, stream)); + } return r; }