In [None]:
# RuntimeError: use_libuv was requested but PyTorch was build without libuv support
# https://docs.libuv.org/en/v1.x/
# conda install -c conda-forge libuv -y

In [None]:
# Need to test
# pip install --no-cache-dir torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121
# pip install --force-reinstall --no-cache-dir torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121
#torchrun --standalone --nproc_per_node=1 Mask_RCNN_CUDA_DDP_v1.py --run-mode ddp --batch-size 4 --num-workers 4 --train-epochs 5 --train-ckpt-folder train_v0

#set MASTER_ADDR=127.0.0.1
#set MASTER_PORT=29501
#torchrun --nproc_per_node=1 Mask_RCNN_CUDA_DDP_v1.py --run-mode ddp --batch-size 4 --num-workers 4 --train-epochs 5 --train-ckpt-folder train_v0

#pip install --force-reinstall --no-cache-dir "numpy<2"


In [None]:
import os
#os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


## |||||||||||||||||||||||||||||||||||||||||

# Test GPU

In [1]:
import torch
import time

# --- sanity check ---
assert torch.cuda.is_available(), "CUDA is not available"

device = "cuda"
dtype = torch.float16   # 2 bytes per element

# Target allocation size in MB. Deliberately small (10 MB) for a quick smoke
# test; raise it if the allocation is hard to spot in nvidia-smi.
target_mb = 10
# How long to keep the buffer alive; used for both the message and the sleep
# so the two can never disagree.
hold_seconds = 30

bytes_per_elem = torch.tensor([], dtype=dtype).element_size()
num_elements = (target_mb * 1024 * 1024) // bytes_per_elem

print(f"Allocating ~{target_mb} MB on GPU...")

x = torch.empty(num_elements, device=device, dtype=dtype)
try:
    torch.cuda.synchronize()
    print("Allocation successful.")
    print(f"Check nvidia-smi now. Holding memory for {hold_seconds} seconds...")

    time.sleep(hold_seconds)
finally:
    # Release the buffer even when the cell is interrupted during the sleep;
    # otherwise `x` would stay alive in the kernel namespace and keep the
    # memory allocated.
    del x
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    print("VRAM released.")


Allocating ~10 MB on GPU...
Allocation successful.
Check nvidia-smi now. Holding memory for 30 seconds...
VRAM released.


## FP16 raw CUDA memory allocation test (no cuDNN)

In [2]:
import torch
import time

# --- sanity check ---
assert torch.cuda.is_available(), "CUDA is not available"

device = "cuda"
dtype = torch.float16   # 2 bytes per element
hold_seconds = 3        # shared by the message and the sleep below

# The element size depends only on dtype, so compute it once outside the loop.
bytes_per_elem = torch.tensor([], dtype=dtype).element_size()

# Sweep the allocation target over 200, 400, 600 and 800 MB.
for target_mb in range(200, 1000, 200):
    num_elements = (target_mb * 1024 * 1024) // bytes_per_elem

    print(f"Allocating ~{target_mb} MB on GPU...")

    x = torch.empty(num_elements, device=device, dtype=dtype)
    try:
        torch.cuda.synchronize()
        print("Allocation successful.")
        print(f"Check nvidia-smi now. Holding memory for {hold_seconds} seconds...")

        time.sleep(hold_seconds)
    finally:
        # Always free the buffer before the next (larger) allocation, even if
        # the cell is interrupted, so a stale tensor never lingers in the kernel.
        del x
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
        print("VRAM released.")


Allocating ~200 MB on GPU...
Allocation successful.
Check nvidia-smi now. Holding memory for 3 seconds...
VRAM released.
Allocating ~400 MB on GPU...
Allocation successful.
Check nvidia-smi now. Holding memory for 3 seconds...
VRAM released.
Allocating ~600 MB on GPU...
Allocation successful.
Check nvidia-smi now. Holding memory for 3 seconds...
VRAM released.
Allocating ~800 MB on GPU...
Allocation successful.
Check nvidia-smi now. Holding memory for 3 seconds...
VRAM released.


## cuDNN benchmark enabled: FP16 convolution memory stress test

In [3]:
import torch
import torch.nn as nn
import time
import math

# --- sanity check ---
assert torch.cuda.is_available(), "CUDA not available"

device = "cuda"
dtype = torch.float16
torch.backends.cudnn.benchmark = True  # allow workspace growth

# Loop-invariant tensor dimensions, defined once and shared by the conv layer
# and the size computation so the two cannot drift apart.
C = 64  # channels (conv input and output)
N = 8   # batch size

# Fixed conv layer (small weights, large activations)
conv = nn.Conv2d(
    in_channels=C,
    out_channels=C,
    kernel_size=3,
    padding=1,
    bias=False
).to(device=device, dtype=dtype)

bytes_per = torch.tensor([], dtype=dtype).element_size()

print(f"GPU: {torch.cuda.get_device_name(0)}")
print("Starting conv memory test...\n")

# NOTE: range(500, 1000, 500) yields a single target (500 MB); use a smaller
# step to sweep several sizes.
for target_mb in range(500, 1000, 500):
    # We want roughly target_mb used by the input activations:
    # activation_bytes ≈ N * C * H * W * bytes_per
    target_bytes = target_mb * 1024 * 1024

    # Number of H*W positions available per (image, channel) plane.
    elems_per_image = target_bytes // (N * C * bytes_per)

    side = int(math.sqrt(elems_per_image))
    H = W = max((side // 16) * 16, 16)  # round down to a multiple of 16, minimum 16

    print(f"Allocating ~{target_mb} MB with tensor [{N},{C},{H},{W}]")

    x = torch.empty((N, C, H, W), device=device, dtype=dtype)
    y = None  # bound up front so the cleanup below works even if conv() raises
    try:
        # Run a few convolutions to force cuDNN workspace allocation.
        # The 3x3 / padding=1 conv keeps H and W, so y has the same shape as x.
        for _ in range(5):
            y = conv(x)

        torch.cuda.synchronize()
        print("  Conv OK, holding memory for 3s...")
        time.sleep(3)
    finally:
        # Drop both tensors whether or not the convolution succeeded.
        del x, y
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
        print("  Released.\n")

print("Done.")


GPU: Tesla K80
Starting conv memory test...

Allocating ~500 MB with tensor [8,64,704,704]
  Conv OK, holding memory for 3s...
  Released.

Done.


## cuDNN benchmark enabled: FP32 convolution memory stress test

In [4]:
import torch
import torch.nn as nn
import time
import math

assert torch.cuda.is_available()

device, dtype = "cuda", torch.float32   # FP32 now
torch.backends.cudnn.benchmark = True

# 3x3 convolution, 64 -> 64 channels, "same" padding, no bias term.
conv = nn.Conv2d(64, 64, 3, padding=1, bias=False).to(device=device, dtype=dtype)

# Size in bytes of a single element of the chosen dtype.
bytes_per = torch.empty(0, dtype=dtype).element_size()

print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"dtype={dtype}, bytes/elem={bytes_per}\n")

MIB = 1024 * 1024
N, C = 4, 64  # smaller batch for FP32

# Sweep the input-activation target from 500 MB to 900 MB in 100 MB steps.
for target_mb in range(500, 1000, 100):
    # Spatial positions available per (image, channel) plane for this target.
    plane_elems = (target_mb * MIB) // (N * C * bytes_per)

    # Square side length, rounded down to a multiple of 16 (never below 16).
    H = W = max(16 * (int(math.sqrt(plane_elems)) // 16), 16)

    print(f"Allocating ~{target_mb} MB with [{N},{C},{H},{W}] FP32")

    x = torch.empty((N, C, H, W), device=device, dtype=dtype)

    # Three forward passes through the conv layer.
    for _ in range(3):
        y = conv(x)

    torch.cuda.synchronize()
    print("  Conv OK, holding 3s...")
    time.sleep(3)

    # Drop both tensors and return cached blocks before the next size.
    del x, y
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    print("  Released.\n")

print("Done.")


GPU: Tesla K80
dtype=torch.float32, bytes/elem=4

Allocating ~500 MB with [4,64,704,704] FP32
  Conv OK, holding 3s...
  Released.

Allocating ~600 MB with [4,64,768,768] FP32
  Conv OK, holding 3s...
  Released.

Allocating ~700 MB with [4,64,832,832] FP32
  Conv OK, holding 3s...
  Released.

Allocating ~800 MB with [4,64,896,896] FP32
  Conv OK, holding 3s...
  Released.

Allocating ~900 MB with [4,64,960,960] FP32
  Conv OK, holding 3s...
  Released.

Done.


## cuDNN enabled (benchmark off, deterministic): FP32 convolution memory stress test

In [5]:
import torch
import torch.nn as nn
import time
import math

assert torch.cuda.is_available()
## cuDNN stays ENABLED here - 32 bit data.
# Only autotuning (benchmark) is switched off and deterministic algorithms are
# requested. To bypass cuDNN entirely, set torch.backends.cudnn.enabled = False.
# IMPORTANT in notebooks: restart kernel before running this if you got illegal access before.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.enabled = True

device = "cuda"
dtype = torch.float32

# Loop-invariant tensor dimensions, shared by the conv layer and the size maths.
C = 64  # channels (conv input and output)
N = 4   # batch size

conv = nn.Conv2d(C, C, 3, padding=1, bias=False).to(device=device, dtype=dtype)

bytes_per = torch.tensor([], dtype=dtype).element_size()
print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"dtype={dtype}, bytes/elem={bytes_per}\n")

# Sweep the input-activation target from 500 MB to 900 MB in 100 MB steps.
for target_mb in range(500, 1000, 100):
    target_bytes = target_mb * 1024 * 1024
    # Number of H*W positions available per (image, channel) plane.
    elems_per_image = target_bytes // (N * C * bytes_per)
    side = int(math.sqrt(elems_per_image))
    H = W = max((side // 16) * 16, 16)  # round down to a multiple of 16, minimum 16

    print(f"Allocating ~{target_mb} MB with [{N},{C},{H},{W}] FP32")
    x = torch.empty((N, C, H, W), device=device, dtype=dtype)
    y = None  # bound up front so the cleanup below works even if conv() raises
    try:
        for _ in range(3):
            y = conv(x)

        torch.cuda.synchronize()
        print("  Conv OK, holding 3s...")
        time.sleep(3)
    finally:
        # Drop both tensors whether or not the convolution succeeded.
        del x, y
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
        print("  Released.\n")

print("Done.")


GPU: Tesla K80
dtype=torch.float32, bytes/elem=4

Allocating ~500 MB with [4,64,704,704] FP32
  Conv OK, holding 3s...
  Released.

Allocating ~600 MB with [4,64,768,768] FP32
  Conv OK, holding 3s...
  Released.

Allocating ~700 MB with [4,64,832,832] FP32
  Conv OK, holding 3s...
  Released.

Allocating ~800 MB with [4,64,896,896] FP32
  Conv OK, holding 3s...
  Released.

Allocating ~900 MB with [4,64,960,960] FP32
  Conv OK, holding 3s...
  Released.

Done.


## cuDNN disabled: native PyTorch conv2d (FP32)

In [6]:
import torch
import torch.nn.functional as F
import time
import math

assert torch.cuda.is_available()

# Key line: turn the cuDNN backend off for every convolution that follows.
torch.backends.cudnn.enabled = False

device, dtype = "cuda", torch.float32

# Convolution kernel laid out as [out_ch, in_ch, kH, kW].
w = torch.empty(64, 64, 3, 3, device=device, dtype=dtype)
torch.nn.init.kaiming_uniform_(w, a=math.sqrt(5))

# Single fixed-size input batch.
N, C, H, W = 4, 64, 704, 704
x = torch.empty(N, C, H, W, device=device, dtype=dtype)

# Three forward passes (bias and stride are left at their defaults: None and 1).
for _ in range(3):
    y = F.conv2d(x, w, padding=1)

torch.cuda.synchronize()
print("conv2d OK (cuDNN OFF). Holding 3s...")
time.sleep(3)


conv2d OK (cuDNN OFF). Holding 3s...


## cuDNN-off FP32 convolution stress test for legacy GPUs

In [7]:
import torch
import torch.nn.functional as F
import time
import math

assert torch.cuda.is_available()

torch.backends.cudnn.enabled = False  # key for K80 stability

device, dtype = "cuda", torch.float32
bytes_per = torch.empty(0, dtype=dtype).element_size()

# Fixed conv weights, laid out as [out_ch, in_ch, kH, kW].
w = torch.empty(64, 64, 3, 3, device=device, dtype=dtype)
torch.nn.init.kaiming_uniform_(w, a=math.sqrt(5))

print(f"GPU: {torch.cuda.get_device_name(0)}")
print("Starting FP32 conv memory test (cuDNN OFF)\n")

MIB = 1024 * 1024
N, C = 4, 64

# Sweep the input-activation target from 50 MB to 950 MB in 50 MB steps.
for target_mb in range(50, 1000, 50):
    # Spatial positions per (image, channel) plane, clamped to at least 1
    # before taking the square root.
    plane_elems = max((target_mb * MIB) // (N * C * bytes_per), 1)

    # Square side length, rounded down to a multiple of 16 (never below 16).
    H = W = max(16 * (int(math.sqrt(plane_elems)) // 16), 16)

    print(f"Alloc ~{target_mb} MB with x=[{N},{C},{H},{W}] FP32")

    x = torch.empty((N, C, H, W), device=device, dtype=dtype)

    # Run a few convs so the tensors are actually used.
    for _ in range(3):
        y = F.conv2d(x, w, padding=1)

    torch.cuda.synchronize()
    print("  OK, holding 2s...")
    time.sleep(2)

    # Drop both tensors and return cached blocks before the next size.
    del x, y
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    print("  Released.\n")

print("Done.")


GPU: Tesla K80
Starting FP32 conv memory test (cuDNN OFF)

Alloc ~50 MB with x=[4,64,224,224] FP32
  OK, holding 2s...
  Released.

Alloc ~100 MB with x=[4,64,320,320] FP32
  OK, holding 2s...
  Released.

Alloc ~150 MB with x=[4,64,384,384] FP32
  OK, holding 2s...
  Released.

Alloc ~200 MB with x=[4,64,448,448] FP32
  OK, holding 2s...
  Released.

Alloc ~250 MB with x=[4,64,496,496] FP32
  OK, holding 2s...
  Released.

Alloc ~300 MB with x=[4,64,544,544] FP32
  OK, holding 2s...
  Released.

Alloc ~350 MB with x=[4,64,592,592] FP32
  OK, holding 2s...
  Released.

Alloc ~400 MB with x=[4,64,640,640] FP32
  OK, holding 2s...
  Released.

Alloc ~450 MB with x=[4,64,672,672] FP32
  OK, holding 2s...
  Released.

Alloc ~500 MB with x=[4,64,704,704] FP32
  OK, holding 2s...
  Released.

Alloc ~550 MB with x=[4,64,736,736] FP32
  OK, holding 2s...
  Released.

Alloc ~600 MB with x=[4,64,768,768] FP32
  OK, holding 2s...
  Released.

Alloc ~650 MB with x=[4,64,800,800] FP32
  OK, holding

## Hardware and driver versions

In [8]:
import torch
# Print, one per line: the PyTorch version, the CUDA version PyTorch was built
# with, and the cuDNN version PyTorch reports.
for version in (torch.__version__, torch.version.cuda, torch.backends.cudnn.version()):
    print(version)


1.12.0+cu102
10.2
7605


In [9]:
import torch

# Query availability once, and only ask for the device name when a CUDA device
# is actually usable: torch.cuda.get_device_name(0) raises otherwise, which
# would crash this diagnostic cell in exactly the case it is meant to report.
cuda_ok = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if cuda_ok else "N/A (CUDA not available)"

print("CUDA available:", cuda_ok)
print("CUDA version (PyTorch):", torch.version.cuda)
print("GPU:", gpu_name)


CUDA available: True
CUDA version (PyTorch): 10.2
GPU: Tesla K80


In [10]:
# IPython shell escape: ask nvidia-smi for the driver version and name of each
# GPU, printed as one headerless CSV row ("driver_version, name") per device.
!nvidia-smi --query-gpu=driver_version,name --format=csv,noheader


470.256.02, Tesla K80
470.256.02, Tesla K80
470.256.02, Tesla K80
470.256.02, Tesla K80
470.256.02, Tesla K80
470.256.02, Tesla K80
470.256.02, Tesla K80
470.256.02, Tesla K80


In [11]:
# IPython shell escape: print the version of the nvcc compiler found on PATH.
# NOTE(review): this is the system CUDA toolkit and need not match
# torch.version.cuda — confirm which version is relevant for the issue at hand.
!nvcc --version


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


In [12]:
import torch

print("=== GPU / CUDA / cuDNN info ===")
!nvidia-smi --query-gpu=name,driver_version --format=csv,noheader

print("\nPyTorch:", torch.__version__)
print("CUDA (PyTorch):", torch.version.cuda)
print("cuDNN:", torch.backends.cudnn.version())
print("CUDA available:", torch.cuda.is_available())


=== GPU / CUDA / cuDNN info ===
Tesla K80, 470.256.02
Tesla K80, 470.256.02
Tesla K80, 470.256.02
Tesla K80, 470.256.02
Tesla K80, 470.256.02
Tesla K80, 470.256.02
Tesla K80, 470.256.02
Tesla K80, 470.256.02

PyTorch: 1.12.0+cu102
CUDA (PyTorch): 10.2
cuDNN: 7605
CUDA available: True


In [13]:
import os
# Report the process ID of the process running this cell.
current_pid = os.getpid()
print(f"PID: {current_pid}")

PID: 124998


# Kill the kernel process

In [None]:
import os, signal
# Forcefully terminate the process running this cell (the notebook kernel).
# All in-memory state is lost and the kernel must be restarted afterwards.
# signal.SIGKILL exists only on Unix; fall back to SIGTERM elsewhere so the
# cell does not fail with AttributeError before it can kill anything.
os.kill(os.getpid(), getattr(signal, "SIGKILL", signal.SIGTERM))
