# Pylops - NCCLBackend

### Author: M.Ravasi

In [1]:
# !conda install -c conda-forge nccl

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import cupy as cp
import cupy.cuda.nccl as nccl
import multiprocessing

from cupy import testing


## nccl.NcclCommunicator.initAll

Works on single process

In [3]:
"""
# Number of GPUs to use
n_gpus = 2  # Modify this based on your setup

# Initialize CUDA devices
devices = [cp.cuda.Device(i) for i in range(n_gpus)]

# Initialize arrays on each device
arrays = []
for i, d in enumerate(devices):
    with cp.cuda.Device(d):
        array = (i + 1) * cp.array([1, 2, 3, 4], dtype=cp.float32)
        print(array, array.device, array + array)
        arrays.append(array)

# Create a list of device IDs
device_ids = [d.id for d in devices]

# Initialize NCCL communicators for all devices
comms = nccl.NcclCommunicator.initAll(n_gpus)

# Perform all-reduce operation
for i, d in enumerate(devices):
    with d:
        comms[i].allReduce(arrays[i].data.ptr, arrays[i].data.ptr, arrays[i].size, nccl.NCCL_FLOAT32, nccl.NCCL_SUM, cp.cuda.Stream.null.ptr)

# Synchronize to ensure all operations are complete
for d in devices:
    with d:
        cp.cuda.Stream.null.synchronize()

# Verify the result
for i, d in enumerate(devices):
    with d:
        print(f"Device {i}: {arrays[i]}")
"""

'\n# Number of GPUs to use\nn_gpus = 2  # Modify this based on your setup\n\n# Initialize CUDA devices\ndevices = [cp.cuda.Device(i) for i in range(n_gpus)]\n\n# Initialize arrays on each device\narrays = []\nfor i, d in enumerate(devices):\n    with cp.cuda.Device(d):\n        array = (i + 1) * cp.array([1, 2, 3, 4], dtype=cp.float32)\n        print(array, array.device, array + array)\n        arrays.append(array)\n\n# Create a list of device IDs\ndevice_ids = [d.id for d in devices]\n\n# Initialize NCCL communicators for all devices\ncomms = nccl.NcclCommunicator.initAll(n_gpus)\n\n# Perform all-reduce operation\nfor i, d in enumerate(devices):\n    with d:\n        comms[i].allReduce(arrays[i].data.ptr, arrays[i].data.ptr, arrays[i].size, nccl.NCCL_FLOAT32, nccl.NCCL_SUM, cp.cuda.Stream.null.ptr)\n\n# Synchronize to ensure all operations are complete\nfor d in devices:\n    with d:\n        cp.cuda.Stream.null.synchronize()\n\n# Verify the result\nfor i, d in enumerate(devices):\n  

## nccl.NcclCommunicator

Does not work, may need MPI/Multiprocess

In [4]:
"""
# Number of GPUs to use
n_gpus = 2  # Modify this based on your setup

# Initialize CUDA devices
devices = [cp.cuda.Device(i) for i in range(n_gpus)]
print(devices)

# Initialize arrays on each device
arrays = []
for d in devices:
    with cp.cuda.Device(d):
        array = cp.array([1, 2, 3, 4], dtype=cp.float32)
        print(array, array.device, array + array)
        arrays.append(array)

# Generate a unique NCCL ID
comm_id = nccl.get_unique_id()

# Initialize NCCL communicators
comms = []
for i, d in enumerate(devices):
    with d:
        print(d)
        comms.append(nccl.NcclCommunicator(n_gpus, comm_id, i))

# Synchronize all devices before starting the operation
for d in devices:
    with d:
        cp.cuda.Stream.null.synchronize()

        
# Perform all-reduce operation
for i, d in enumerate(devices):
    with d:
        comms[i].allReduce(arrays[i].data.ptr, arrays[i].data.ptr, arrays[i].size, nccl.NCCL_FLOAT32, nccl.NCCL_SUM, cp.cuda.Stream.null.ptr)

# Verify the result
for i, d in enumerate(devices):
    with d:
        print(f"Device {i}: {arrays[i]}")

"""

'\n# Number of GPUs to use\nn_gpus = 2  # Modify this based on your setup\n\n# Initialize CUDA devices\ndevices = [cp.cuda.Device(i) for i in range(n_gpus)]\nprint(devices)\n\n# Initialize arrays on each device\narrays = []\nfor d in devices:\n    with cp.cuda.Device(d):\n        array = cp.array([1, 2, 3, 4], dtype=cp.float32)\n        print(array, array.device, array + array)\n        arrays.append(array)\n\n# Generate a unique NCCL ID\ncomm_id = nccl.get_unique_id()\n\n# Initialize NCCL communicators\ncomms = []\nfor i, d in enumerate(devices):\n    with d:\n        print(d)\n        comms.append(nccl.NcclCommunicator(n_gpus, comm_id, i))\n\n# Synchronize all devices before starting the operation\nfor d in devices:\n    with d:\n        cp.cuda.Stream.null.synchronize()\n\n        \n# Perform all-reduce operation\nfor i, d in enumerate(devices):\n    with d:\n        comms[i].allReduce(arrays[i].data.ptr, arrays[i].data.ptr, arrays[i].size, nccl.NCCL_FLOAT32, nccl.NCCL_SUM, cp.cuda.

In [5]:
def f(n_devices, device, comm_id, rank):
    device.use()
    comm = nccl.NcclCommunicator(n_devices, comm_id, rank)
    x = cp.zeros((2, 3, 4), dtype='float32')
    comm.broadcast(
        x.data.ptr, x.data.ptr, x.size, nccl.NCCL_FLOAT, 0,
        cp.cuda.Stream.null.ptr)

    e = cp.ones((2, 3, 4), dtype='float32')
    testing.assert_allclose(x, e)
    device.synchronize()

    print('Rank {} successfully finished.'.format(rank))

multiprocessing.set_start_method('spawn', force=True)

n_devices = 2
devices = [cp.cuda.Device(i) for i in range(n_devices)]

comm_id = nccl.get_unique_id()

ps = []
for i in range(1, n_devices):
    p = multiprocessing.Process(
        target=f, args=(n_devices, devices[i], comm_id, i))
    p.start()
    ps.append(p)

device = devices[0]
device.use()
comm = nccl.NcclCommunicator(n_devices, comm_id, 0)
x = cp.ones((2, 3, 4), dtype='float32')
comm.broadcast(
    x.data.ptr, x.data.ptr, x.size, nccl.NCCL_FLOAT, 0,
    cp.cuda.Stream.null.ptr)

for p in ps:
    p.join()

print('Rank 0 successfully finished.')

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/home/ravasim/miniconda3_v23/envs/pylops_cupy_mpi4py_3090/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/home/ravasim/miniconda3_v23/envs/pylops_cupy_mpi4py_3090/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'f' on <module '__main__' (built-in)>
