#### Useful checks for CUDA GPU processing

In [None]:
import torch

# Fetch the properties object for CUDA device 0 and show the names of its
# public (non-underscore) attributes in alphabetical order.
props = torch.cuda.get_device_properties(0)
public_attrs = [attr for attr in dir(props) if not attr.startswith('_')]
public_attrs.sort()
print(public_attrs)

compute capability: (8, 6)
['L2_cache_size', 'gcnArchName', 'is_integrated', 'is_multi_gpu_board', 'major', 'max_threads_per_multi_processor', 'minor', 'multi_processor_count', 'name', 'pci_bus_id', 'pci_device_id', 'pci_domain_id', 'regs_per_multiprocessor', 'shared_memory_per_block', 'shared_memory_per_block_optin', 'shared_memory_per_multiprocessor', 'total_memory', 'uuid', 'warp_size']


In [None]:
# Full properties details
# NOTE: `props` is the device-0 properties object created in the previous
# cell; run that cell first.
print("Compute capability, major and minor:", torch.cuda.get_device_capability(0))
print("Name:", props.name)
# total_memory is scaled by 1024**3 to produce the GB figure shown.
print("Total memory (GB):", props.total_memory / 1024**3)
print("Multi-processor count:", props.multi_processor_count)
# getattr with an "n/a" default keeps the cell running when a build's
# properties object does not expose one of these attributes.
print("Max threads per multi-processor:", getattr(props, "max_threads_per_multi_processor", "n/a"))
print("Max threads per block (approx):", getattr(props, "max_threads_per_block", "n/a"))

# Verify mixed precision support
print("CUDA available:", torch.cuda.is_available())
print("torch.version.cuda:", torch.version.cuda)
# Probe the device-agnostic torch.amp namespace rather than the deprecated
# torch.cuda.amp alias. The nested getattr means a missing namespace prints
# False instead of raising AttributeError.
print("AMP available (autocast):", hasattr(getattr(torch, "amp", None), "autocast"))

Compute capability, major and minor: (8, 6)
Name: NVIDIA GeForce RTX 3090
Total memory (GB): 23.99951171875
Multi-processor count: 82
Max threads per multi-processor: 1536
Max threads per block (approx): n/a
CUDA available: True
torch.version.cuda: 12.6
AMP available (autocast): True


#### Testing BFloat16 support

In [8]:
# Smoke-test bfloat16 arithmetic on the GPU: allocate a small bf16 tensor on
# the CUDA device and run one elementwise add.
try:
    x = torch.randn(4, device='cuda', dtype=torch.bfloat16)
    y = x + x
    # CUDA kernels are launched asynchronously. Wait for the add to finish so
    # that a device-side failure is raised here, inside the try block, and not
    # at some later unrelated call.
    torch.cuda.synchronize()
    print("bfloat16 ops OK")
except Exception as e:
    # The broad catch is deliberate: this is a probe that reports the failure
    # instead of halting the notebook.
    print("bfloat16 ops failed:", e)

bfloat16 ops OK
