# Multi GPU simulations

Below we explore how CUDA-Q can seamlessly use multiple GPUs today and, in the future, multiple QPUs.

1. Scale qubit count to access second and third GPU
2. Distribute a collection of parameter sets (`x_train`) across multiple GPUs asynchronously
3. Distribute the terms of a given Hamiltonian across multiple GPUs
4. Execute different kernels on different GPUs

In [10]:
import cudaq
from cudaq import spin
import numpy as np

# Simulation target selection. The commented-out calls are alternative targets
# kept for quick switching while experimenting; swap one in for the active call
# below (see the CUDA-Q target documentation for what each one provides).
# cudaq.set_target('nvidia')
# cudaq.set_target('nvidia-mgpu')
# cudaq.set_target('qpp-cpu')
# 'nvidia-mqpu' is the target this notebook runs on: later cells address
# individual devices through the `qpu_id` argument of `cudaq.observe_async`.
cudaq.set_target('nvidia-mqpu')

# Scaling qubit count to go beyond single GPU memory requirements

In [11]:
# Problem size: one 20-qubit circuit evaluated for 1000 parameter sets.
n_qubits = 20
n_samples = 1000

# Observable whose expectation value is measured (built with spin.z on index 0).
h = spin.z(0)

# One angle per qubit for each of the three rotation layers below.
n_parameters = 3 * n_qubits
parameters = np.random.default_rng(13).uniform(
    low=0, high=1, size=(n_samples, n_parameters))
# Seeds NumPy's legacy global RNG. `parameters` above is drawn from its own
# seeded Generator and does not depend on this call.
np.random.seed(1)

# Parameterized kernel that takes a flat list of rotation angles.
kernel, params = cudaq.make_kernel(list)
qubits = kernel.qalloc(n_qubits)
qubits_list = list(range(n_qubits))

# Rotation layers applied in the order rx -> ry -> rz. Layer k reads the
# angles at indices k*n_qubits ... (k+1)*n_qubits - 1, one per qubit.
for layer, rotate in enumerate((kernel.rx, kernel.ry, kernel.rz)):
    offset = layer * n_qubits
    for q in qubits_list:
        rotate(params[q + offset], qubits[q])

# Controlled-Z on the neighbouring pairs (0, 1), (2, 3), ...
for even in range(0, n_qubits - 1, 2):
    kernel.cz(qubits[even], qubits[even + 1])

# Disabled alternative kept from the original notebook:
# exp_vals = cudaq.observe_n(kernel, h, parameters)

import time

# Baseline: evaluate the parameter sets one after another with the blocking
# cudaq.observe call. perf_counter() is a monotonic, high-resolution clock, so
# the measured interval cannot be distorted by system clock adjustments the
# way a time.time() difference can.
t0 = time.perf_counter()
exp_vals = [cudaq.observe(kernel, h, sample) for sample in parameters]
print("Single GPU/MIG execution time:", time.perf_counter() - t0)

Single GPU/MIG execution time: 10.56591248512268


# Asynchronous data collection via batching x_train

In [12]:
import subprocess

# Ask the active CUDA-Q target how many QPUs it exposes instead of counting
# 'UUID' occurrences in `nvidia-smi -L` output. That count covers every device
# line the tool prints, whether or not the target can use it, and it requires
# nvidia-smi to be on PATH. The value is used below as the number of batches
# and as the upper bound for `qpu_id`, so it must match what the target offers.
num_qpus = cudaq.get_target().num_qpus()
print("The number of GPUs used for distributed QPU simulations:", num_qpus)

The number of GPUs used for distributed QPU simulations: 2


In [13]:
print(parameters.shape)

# One batch of parameter sets per QPU. np.array_split accepts a sample count
# that is not an exact multiple of num_qpus (batch sizes then differ by at most
# one row), whereas np.split raises ValueError in that case. For evenly
# divisible counts both produce the same batches.
xi = np.array_split(parameters, num_qpus)

print(len(xi))

print(xi[0].shape)

(1000, 60)
2
(500, 60)


In [14]:
# Submit every observation asynchronously: all rows of batch i go to QPU i.
# perf_counter() is used because it is monotonic, unlike time.time().
t0 = time.perf_counter()
asyncresults = [
    cudaq.observe_async(kernel, h, sample, qpu_id=qpu)
    for qpu, batch in enumerate(xi)
    for sample in batch
]

# Resolve the handles in submission order and keep only the expectation values.
expvals = [res.get().expectation() for res in asyncresults]
print("Execution time using", num_qpus, "GPUs/MIGs:", time.perf_counter() - t0)

Execution time using 2 GPUs/MIGs: 5.320219039916992


# Asynchronous data collection via batching Hamiltonian terms

In [15]:
# Smaller circuit this time, paired with an observable made of many terms.
n_qubits = 10
n_terms = 1000

# Parameterized ansatz kernel taking a flat list of rotation angles.
kernel, params = cudaq.make_kernel(list)
qubits = kernel.qalloc(n_qubits)
qubits_list = list(range(n_qubits))

# Rotation layers applied in the order rx -> ry -> rz. Layer k reads the
# angles at indices k*n_qubits ... (k+1)*n_qubits - 1, one per qubit.
for layer, rotate in enumerate((kernel.rx, kernel.ry, kernel.rz)):
    offset = layer * n_qubits
    for q in qubits_list:
        rotate(params[q + offset], qubits[q])

# Controlled-Z on the neighbouring pairs (0, 1), (2, 3), ...
for even in range(0, n_qubits - 1, 2):
    kernel.cz(qubits[even], qubits[even + 1])

# Random Hamiltonian requested with n_terms = 1000 (1e3) terms.
hamiltonian = cudaq.SpinOperator.random(n_qubits, n_terms)

# A single random parameter vector, one angle per rotation gate.
n_parameters = 3 * n_qubits
parameters = np.random.default_rng(13).uniform(
    low=-1., high=1., size=n_parameters)
# Seeds NumPy's legacy global RNG. `parameters` above is drawn from its own
# seeded Generator and does not depend on this call.
np.random.seed(1)


# Baseline: a plain cudaq.observe call over the whole Hamiltonian.
# perf_counter() is monotonic, so the interval is immune to wall-clock jumps.
t0 = time.perf_counter()
expectation = cudaq.observe(kernel, hamiltonian, parameters)  # Single GPU.
print("Single GPU/MIG execution time:", time.perf_counter() - t0)

print("The expectation value =", round(expectation.expectation(), 2))

Single GPU/MIG execution time: 0.24846696853637695
The expectation value = 0.16


In [16]:
# Same observation, now with execution=cudaq.parallel.thread so the work is
# spread across the available devices on this node.
# perf_counter() is monotonic, so the interval is immune to wall-clock jumps.
t0 = time.perf_counter()
expectation = cudaq.observe(
    kernel,
    hamiltonian,
    parameters,
    execution=cudaq.parallel.thread,  # Single node, multi-GPU.
)
print("Execution time using", num_qpus, "GPUs/MIGs:", time.perf_counter() - t0)

print("The expectation value =", round(expectation.expectation(), 2))

Execution time using 2 GPUs/MIGs: 0.11322236061096191
The expectation value = 0.16


# Different kernels being executed at the same time

In [17]:
# Select the multi-QPU target for this section.
cudaq.set_target('nvidia-mqpu')

# Problem size: two 10-qubit kernels, each evaluated for 500 parameter sets.
n_qubits = 10
n_samples = 500
h = spin.z(0)

# One rotation angle per qubit.
n_parameters = n_qubits
parameters = np.random.default_rng(13).uniform(
    low=0, high=1, size=(n_samples, n_parameters))
# Seeds NumPy's legacy global RNG. `parameters` above is drawn from its own
# seeded Generator and does not depend on this call.
np.random.seed(1)

# Kernel 1: a single layer of rx rotations, one angle per qubit.
kernel1, params = cudaq.make_kernel(list)
qubits = kernel1.qalloc(n_qubits)
for q in range(n_qubits):
    kernel1.rx(params[q], qubits[q])

# Kernel 2: same layout with ry rotations. `params` and `qubits` are rebound
# here and refer to kernel2's handles from this point on.
kernel2, params = cudaq.make_kernel(list)
qubits = kernel2.qalloc(n_qubits)
for q in range(n_qubits):
    kernel2.ry(params[q], qubits[q])


In [18]:
# Launch both kernels for every parameter set without blocking: kernel1 is
# sent to QPU 0 and kernel2 to QPU 1. Both lists hold the asynchronous handles
# returned by observe_async, not numbers.
exp_vals1 = [cudaq.observe_async(kernel1, h, sample, qpu_id=0) for sample in parameters]
exp_vals2 = [cudaq.observe_async(kernel2, h, sample, qpu_id=1) for sample in parameters]

# Resolve every handle so the cell only finishes once the submitted work is
# done and the expectation values are available to later cells.
expectations1 = [res.get().expectation() for res in exp_vals1]
expectations2 = [res.get().expectation() for res in exp_vals2]