In [1]:
import os
import time
import torch

In [2]:
# Restrict this process to physical GPUs 0 and 1.
# NOTE(review): presumably this has to run before the first CUDA call in the
# kernel to take effect -- confirm if cells are ever reordered.
os.environ.update({'CUDA_VISIBLE_DEVICES': '0,1'})

In [3]:
# Report what this process can see.
print('cuda available:', torch.cuda.is_available())
print('cuda device count:', torch.cuda.device_count())
# current_device() initialises CUDA and raises when no GPU is usable, so it is
# only queried once availability is confirmed; otherwise None is reported and
# the rest of the cell still runs.
if torch.cuda.is_available():
    print('cuda current device:', torch.cuda.current_device())
else:
    print('cuda current device:', None)

# Device handles reused by the following cells. Creating a torch.device does
# not touch the hardware, so these lines succeed even without a GPU.
cpu = torch.device('cpu')
cuda = torch.device('cuda')     # default cuda device
cuda0 = torch.device('cuda:0')  # first cuda device
cuda1 = torch.device('cuda:1')  # second cuda device - will cause errors later on if only one cuda device visible

cuda available: True
cuda device count: 2
cuda current device: 0


In [4]:
# Demonstrates how tensors end up on a particular GPU: an explicit device=
# argument, .cuda() / .to(), and the torch.cuda.device context manager, which
# changes the GPU that the index-less 'cuda' device and a bare .cuda() resolve to.
# Uses the cpu/cuda/cuda0 handles defined in an earlier cell.
x = torch.tensor([1., 2.], device=cuda0)
# x.device is device(type='cuda', index=0)
y = torch.tensor([1., 2.]).cuda()
# y.device is device(type='cuda', index=0)

print(x.device, y.device)

with torch.cuda.device(1): # only works if at least two cuda devices are visible
    # allocates a tensor on GPU 1
    a = torch.tensor([1., 2.], device=cuda)

    # transfers a tensor from CPU to GPU 1
    b = torch.tensor([1., 2.]).cuda()
    # a.device and b.device are device(type='cuda', index=1)

    print('a', a.device)
    print('b', b.device)

    # You can also use ``Tensor.to`` to transfer a tensor:
    b2 = torch.tensor([1., 2.]).to(device=cuda)
    # b.device and b2.device are device(type='cuda', index=1)

    print('b2', b2.device)

    # operands on GPU 1 give a result on GPU 1
    c = a + b
    # c.device is device(type='cuda', index=1)

    print('c', c.device)

    # x and y were created on GPU 0 before entering the context; their sum
    # stays on GPU 0 even though GPU 1 is the selected device here
    z = x + y
    # z.device is device(type='cuda', index=0)

    print('z', z.device)
    
    # even within a context, you can specify the device
    # (or give a GPU index to the .cuda call)
    d = torch.randn(2, device=cuda0)
    e = torch.randn(2).to(cuda0)
    f = torch.randn(2).cuda(cuda0)
    # d.device, e.device, and f.device are all device(type='cuda', index=0)

    print('d', d.device)
    print('e', e.device)
    print('f', f.device)

cuda:0 cuda:0
a cuda:1
b cuda:1
b2 cuda:1
c cuda:1
z cuda:0
d cuda:0
e cuda:0
f cuda:0


In [5]:
def time_copies(src, dst, repeats=100):
    """Time ``repeats`` blocking copies of ``src`` onto device ``dst``.

    Args:
        src: tensor to copy; it is never modified.
        dst: target ``torch.device`` (CPU or CUDA).
        repeats: number of copies to perform back to back.

    Returns:
        Total elapsed seconds for all ``repeats`` copies, as a float.
    """
    # Every CUDA device taking part in the transfer. Each one is synchronised
    # explicitly instead of relying on whichever device happens to be current.
    cuda_endpoints = [d for d in (src.device, torch.device(dst)) if d.type == 'cuda']

    def sync():
        for endpoint in cuda_endpoints:
            torch.cuda.synchronize(endpoint)

    # Drain work that is still queued (e.g. the fill kernels that created the
    # tensors) so it is not charged to the first copy.
    sync()
    # perf_counter is monotonic and high resolution; time.time() can jump when
    # the system clock is adjusted.
    start = time.perf_counter()
    for _ in range(repeats):
        # Keep a reference until the copy has completed, as the original loop did.
        copied = src.to(dst)
        sync()
    return time.perf_counter() - start


print('peer access cuda:0 --> cuda:1', torch.cuda.can_device_access_peer(cuda0, cuda1))
print('peer access cuda:1 --> cuda:0', torch.cuda.can_device_access_peer(cuda1, cuda0))

# 32768 x 32768 int32 elements = 4 GiB per tensor.
BENCH_SHAPE = (32 * 1024, 32 * 1024)
tensor0 = torch.zeros(BENCH_SHAPE, dtype=torch.int, device=cuda0)
tensor1 = torch.zeros(BENCH_SHAPE, dtype=torch.int, device=cuda1)

# One row per measured direction: (label, source tensor, destination device).
for label, src, dst in (
    ('cuda:0 --> cuda:1', tensor0, cuda1),
    ('cuda:1 --> cuda:0', tensor1, cuda0),
    ('cuda:0 --> cpu', tensor0, cpu),
    ('cuda:1 --> cpu', tensor1, cpu),
):
    # Reported figure is the total for all copies, not the per-copy time.
    print(f'copy tensor {label}', f'{time_copies(src, dst):.4f} seconds')

peer access cuda:0 --> cuda:1 False
peer access cuda:1 --> cuda:0 False
copy tensor cuda:0 --> cuda:1 32.9526 seconds
copy tensor cuda:1 --> cuda:0 32.4832 seconds
copy tensor cuda:0 --> cpu 174.5688 seconds
copy tensor cuda:1 --> cpu 177.9368 seconds
