Skip to content

Commit

Permalink
Restore batching memory control functionality to LightningGPU (#564)
Browse files Browse the repository at this point in the history
* Restore batching memory control functionality to LightningGPU

* Auto update version

* Trigger CI

* Update serialised data returned to include offsets for Hamiltonians

* Fix branch path

* Fix setup.py install for ease of dev

* Fix tests

* Revert copy to move for Jac batching

* Update Changelog

* Revert move to copy for now

* Temporarily silence Pylint

* Fix formatting

---------

Co-authored-by: Dev version update bot <github-actions[bot]@users.noreply.github.com>
Co-authored-by: Lee O'Riordan <loriordan@lbl.gov>
  • Loading branch information
3 people committed Nov 24, 2023
1 parent 655d537 commit 028ad9b
Show file tree
Hide file tree
Showing 10 changed files with 120 additions and 44 deletions.
5 changes: 4 additions & 1 deletion .github/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@

### Bug fixes

* Revert single-node multi-GPU batching behaviour to match https://github.com/PennyLaneAI/pennylane-lightning-gpu/pull/27.
[(#564)](https://github.com/PennyLaneAI/pennylane-lightning/pull/564)

* Move deprecated `stateprep` `QuantumScript` argument into the operation list in `mpitests/test_adjoint_jacobian.py`.
[(#540)](https://github.com/PennyLaneAI/pennylane-lightning/pull/540)

Expand All @@ -50,7 +53,7 @@

This release contains contributions from (in alphabetical order):

Isaac De Vlugt, Vincent Michaud-Rioux, Shuli Shu
Isaac De Vlugt, Vincent Michaud-Rioux, Lee James O'Riordan, Shuli Shu

---

Expand Down
25 changes: 23 additions & 2 deletions pennylane_lightning/core/_serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,12 @@ class QuantumScriptSerializer:
"""

# pylint: disable=import-outside-toplevel, too-many-instance-attributes, c-extension-no-member
def __init__(self, device_name, use_csingle: bool = False, use_mpi: bool = False):
def __init__(
self, device_name, use_csingle: bool = False, use_mpi: bool = False, split_obs: bool = False
):
self.use_csingle = use_csingle
self.device_name = device_name
self.split_obs = split_obs
if device_name == "lightning.qubit":
try:
import pennylane_lightning.lightning_qubit_ops as lightning_ops
Expand Down Expand Up @@ -189,6 +192,10 @@ def _tensor_ob(self, observable, wires_map: dict):
def _hamiltonian(self, observable, wires_map: dict):
coeffs = np.array(unwrap(observable.coeffs)).astype(self.rtype)
terms = [self._ob(t, wires_map) for t in observable.ops]

if self.split_obs:
return [self.hamiltonian_obs([c], [t]) for (c, t) in zip(coeffs, terms)]

return self.hamiltonian_obs(coeffs, terms)

def _sparse_hamiltonian(self, observable, wires_map: dict):
Expand Down Expand Up @@ -240,6 +247,9 @@ def _pauli_sentence(self, observable, wires_map: dict):
pwords, coeffs = zip(*observable.items())
terms = [self._pauli_word(pw, wires_map) for pw in pwords]
coeffs = np.array(coeffs).astype(self.rtype)

if self.split_obs:
return [self.hamiltonian_obs([c], [t]) for (c, t) in zip(coeffs, terms)]
return self.hamiltonian_obs(coeffs, terms)

# pylint: disable=protected-access
Expand Down Expand Up @@ -269,7 +279,18 @@ def serialize_observables(self, tape: QuantumTape, wires_map: dict) -> List:
the C++ backend
"""

return [self._ob(observable, wires_map) for observable in tape.observables]
serialized_obs = []
offset_indices = [0]

for observable in tape.observables:
ser_ob = self._ob(observable, wires_map)
if isinstance(ser_ob, list):
serialized_obs.extend(ser_ob)
offset_indices.append(offset_indices[-1] + len(ser_ob))
else:
serialized_obs.append(ser_ob)
offset_indices.append(offset_indices[-1] + 1)
return serialized_obs, offset_indices

def serialize_ops(
self, tape: QuantumTape, wires_map: dict
Expand Down
2 changes: 1 addition & 1 deletion pennylane_lightning/core/_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@
Version number (major.minor.patch[-label])
"""

__version__ = "0.34.0-dev9"
__version__ = "0.34.0-dev10"
14 changes: 9 additions & 5 deletions pennylane_lightning/core/lightning_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,15 +254,18 @@ def _get_basis_state_index(self, state, wires):
basis_states = qml.math.convert_like(basis_states, state)
return int(qml.math.dot(state, basis_states))

# pylint: disable=too-many-function-args, assignment-from-no-return
def _process_jacobian_tape(self, tape, starting_state, use_device_state, use_mpi: bool = False):
# pylint: disable=too-many-function-args, assignment-from-no-return, too-many-arguments
def _process_jacobian_tape(
self, tape, starting_state, use_device_state, use_mpi: bool = False, split_obs: bool = False
):
state_vector = self._init_process_jacobian_tape(tape, starting_state, use_device_state)

obs_serialized = QuantumScriptSerializer(
self.short_name, self.use_csingle, use_mpi
obs_serialized, obs_idx_offsets = QuantumScriptSerializer(
self.short_name, self.use_csingle, use_mpi, split_obs
).serialize_observables(tape, self.wire_map)

ops_serialized, use_sp = QuantumScriptSerializer(
self.short_name, self.use_csingle, use_mpi
self.short_name, self.use_csingle, use_mpi, split_obs
).serialize_ops(tape, self.wire_map)

ops_serialized = self.create_ops_list(*ops_serialized)
Expand Down Expand Up @@ -300,6 +303,7 @@ def _process_jacobian_tape(self, tape, starting_state, use_device_state, use_mpi
"tp_shift": tp_shift,
"record_tp_rows": record_tp_rows,
"all_params": all_params,
"obs_idx_offsets": obs_idx_offsets,
}

# pylint: disable=unnecessary-pass
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ class AdjointJacobian final
threads.reserve(num_gpus);

// Hold results of threaded GPU executions
std::vector<std::future<std::vector<PrecisionT>>> futures;
std::vector<std::future<std::vector<PrecisionT>>> jac_futures;

// Iterate over the chunked observables, and submit the Jacobian task
// for execution
Expand All @@ -150,7 +150,7 @@ class AdjointJacobian final
std::ceil((obs.size() * (i + 1) / num_chunks) - 1));

std::promise<std::vector<PrecisionT>> jac_subset_promise;
futures.emplace_back(jac_subset_promise.get_future());
jac_futures.emplace_back(jac_subset_promise.get_future());

auto adj_lambda =
[&](std::promise<std::vector<PrecisionT>> j_promise,
Expand Down Expand Up @@ -195,11 +195,11 @@ class AdjointJacobian final

/// Ensure the new local jacs are inserted and
/// overwrite the 0 jacs values before returning
for (std::size_t i = 0; i < futures.size(); i++) {
for (std::size_t i = 0; i < jac_futures.size(); i++) {
const auto first = static_cast<std::size_t>(
std::ceil(obs.size() * i / num_chunks));

auto jac_chunk = futures[i].get();
auto jac_chunk = jac_futures[i].get();
for (std::size_t j = 0; j < jac_chunk.size(); j++) {
std::copy(jac_chunk.begin(), jac_chunk.end(),
jac.begin() + first * tp_size);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
#include <future>
#include <omp.h>
#include <span>
#include <thread>
#include <variant>

#include "AdjointJacobianBase.hpp"
Expand Down Expand Up @@ -102,8 +101,6 @@ class AdjointJacobianMPI final

/**
* @brief Batches the adjoint_jacobian method over the available GPUs.
* Explicitly forbids OMP_NUM_THREADS>1 to avoid issues with std::thread
* contention and state access issues.
*
* @param jac Preallocated vector for Jacobian data results.
* @param jd JacobianData represents the QuantumTape to differentiate.
Expand Down Expand Up @@ -334,4 +331,4 @@ class AdjointJacobianMPI final
}
};

} // namespace Pennylane::LightningGPU::Algorithms
} // namespace Pennylane::LightningGPU::Algorithms
74 changes: 60 additions & 14 deletions pennylane_lightning/lightning_gpu/lightning_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -644,6 +644,7 @@ def _init_process_jacobian_tape(self, tape, starting_state, use_device_state):
self.apply(tape.operations)
return self._gpu_state

# pylint: disable=too-many-branches
def adjoint_jacobian(self, tape, starting_state=None, use_device_state=False):
"""Implements the adjoint method outlined in
`Jones and Gacon <https://arxiv.org/abs/2009.02823>`__ to differentiate an input tape.
Expand Down Expand Up @@ -673,7 +674,7 @@ def adjoint_jacobian(self, tape, starting_state=None, use_device_state=False):
self._check_adjdiff_supported_operations(tape.operations)

processed_data = self._process_jacobian_tape(
tape, starting_state, use_device_state, self._mpi
tape, starting_state, use_device_state, self._mpi, self._batch_obs
)

if not processed_data: # training_params is empty
Expand All @@ -691,24 +692,69 @@ def adjoint_jacobian(self, tape, starting_state=None, use_device_state=False):
"""
adjoint_jacobian = _adj_dtype(self.use_csingle, self._mpi)()

if self._batch_obs:
adjoint_jacobian = adjoint_jacobian.batched
if self._batch_obs: # Batching of Measurements
if not self._mpi: # Single-node path, controlled batching over available GPUs
num_obs = len(processed_data["obs_serialized"])
batch_size = (
num_obs
if isinstance(self._batch_obs, bool)
else self._batch_obs * self._dp.getTotalDevices()
)
jac = []
for chunk in range(0, num_obs, batch_size):
obs_chunk = processed_data["obs_serialized"][chunk : chunk + batch_size]
jac_chunk = adjoint_jacobian.batched(
self._gpu_state,
obs_chunk,
processed_data["ops_serialized"],
trainable_params,
)
jac.extend(jac_chunk)
else: # MPI path, restrict memory per known GPUs
jac = adjoint_jacobian.batched(
self._gpu_state,
processed_data["obs_serialized"],
processed_data["ops_serialized"],
trainable_params,
)

jac = adjoint_jacobian(
processed_data["state_vector"],
processed_data["obs_serialized"],
processed_data["ops_serialized"],
trainable_params,
)
else:
jac = adjoint_jacobian(
self._gpu_state,
processed_data["obs_serialized"],
processed_data["ops_serialized"],
trainable_params,
)

jac = np.array(jac) # only for parameters differentiable with the adjoint method
jac = jac.reshape(-1, len(trainable_params))
jac_r = np.zeros((jac.shape[0], processed_data["all_params"]))

jac_r[:, processed_data["record_tp_rows"]] = jac
jac_r = np.zeros((len(tape.observables), processed_data["all_params"]))
if not self._batch_obs:
jac_r[:, processed_data["record_tp_rows"]] = jac
else:
# Reduce over decomposed expval(H), if required.
for idx in range(len(processed_data["obs_idx_offsets"][0:-1])):
if (
processed_data["obs_idx_offsets"][idx + 1]
- processed_data["obs_idx_offsets"][idx]
) > 1:
jac_r[idx, :] = np.sum(
jac[
processed_data["obs_idx_offsets"][idx] : processed_data[
"obs_idx_offsets"
][idx + 1],
:,
],
axis=0,
)
else:
jac_r[idx, :] = jac[
processed_data["obs_idx_offsets"][idx] : processed_data[
"obs_idx_offsets"
][idx + 1],
:,
]

if hasattr(qml, "active_return"):
return self._adjoint_jacobian_processing(jac_r) if qml.active_return() else jac_r
return self._adjoint_jacobian_processing(jac_r)

# pylint: disable=inconsistent-return-statements, line-too-long, missing-function-docstring
Expand Down
1 change: 0 additions & 1 deletion tests/test_adjoint_jacobian.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,6 @@ def test_multiple_rx_gradient_expval_hamiltonian(self, tol, dev):
[-np.sin(params[0]) * np.cos(params[2]), 0, -np.cos(params[0]) * np.sin(params[2])]
)
)

assert np.allclose(dev_jacobian, expected_jacobian, atol=tol, rtol=0)

qubit_ops = [getattr(qml, name) for name in qml.ops._qubit__ops__]
Expand Down
18 changes: 9 additions & 9 deletions tests/test_serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def test_tensor_non_tensor_return(self, use_csingle):
tensor_prod_obs = TensorProdObsC64 if use_csingle else TensorProdObsC128
named_obs = NamedObsC64 if use_csingle else NamedObsC128

s = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
s, _ = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
tape, self.wires_dict
)

Expand All @@ -147,7 +147,7 @@ def test_hermitian_return(self, use_csingle):
hermitian_obs = HermitianObsC64 if use_csingle else HermitianObsC128
c_dtype = np.complex64 if use_csingle else np.complex128

s = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
s, _ = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
tape, self.wires_dict
)
s_expected = hermitian_obs(
Expand All @@ -168,7 +168,7 @@ def test_hermitian_tensor_return(self, use_csingle):
c_dtype = np.complex64 if use_csingle else np.complex128
tensor_prod_obs = TensorProdObsC64 if use_csingle else TensorProdObsC128
hermitian_obs = HermitianObsC64 if use_csingle else HermitianObsC128
s = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
s, _ = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
tape, self.wires_dict
)

Expand All @@ -192,7 +192,7 @@ def test_mixed_tensor_return(self, use_csingle):
hermitian_obs = HermitianObsC64 if use_csingle else HermitianObsC128
named_obs = NamedObsC64 if use_csingle else NamedObsC128

s = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
s, _ = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
tape, self.wires_dict
)

Expand Down Expand Up @@ -225,7 +225,7 @@ def test_hamiltonian_return(self, use_csingle):
r_dtype = np.float32 if use_csingle else np.float64
c_dtype = np.complex64 if use_csingle else np.complex128

s = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
s, _ = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
tape, self.wires_dict
)

Expand Down Expand Up @@ -267,7 +267,7 @@ def test_hamiltonian_tensor_return(self, use_csingle):
r_dtype = np.float32 if use_csingle else np.float64
c_dtype = np.complex64 if use_csingle else np.complex128

s = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
s, _ = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
tape, self.wires_dict
)

Expand Down Expand Up @@ -322,7 +322,7 @@ def test_hamiltonian_mix_return(self, use_csingle):
r_dtype = np.float32 if use_csingle else np.float64
c_dtype = np.complex64 if use_csingle else np.complex128

s = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
s, _ = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
tape, self.wires_dict
)

Expand Down Expand Up @@ -374,7 +374,7 @@ def test_hamiltonian_mix_return(self, use_csingle):
def test_op_arithmetic_uses_hamiltonian(self, use_csingle, obs, coeffs, terms):
"""Tests that an arithmetic obs with a PauliRep serializes as a Hamiltonian."""
tape = qml.tape.QuantumTape(measurements=[qml.expval(obs)])
res = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
res, _ = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
tape, self.wires_dict
)
assert len(res) == 1
Expand All @@ -400,7 +400,7 @@ def test_op_arithmetic_uses_hamiltonian(self, use_csingle, obs, coeffs, terms):
def test_multi_wire_identity(self, use_csingle):
"""Tests that multi-wire Identity does not fail serialization."""
tape = qml.tape.QuantumTape(measurements=[qml.expval(qml.Identity(wires=[1, 2]))])
res = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
res, _ = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
tape, self.wires_dict
)
assert len(res) == 1
Expand Down
12 changes: 9 additions & 3 deletions tests/test_serialize_chunk_obs.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,18 @@ class TestSerializeObs:
def test_chunk_obs(self, use_csingle, obs_chunk):
"""Test chunking of observable array"""
with qml.tape.QuantumTape() as tape:
qml.expval(
0.5 * qml.PauliX(0) @ qml.PauliZ(1)
+ 0.7 * qml.PauliZ(0) @ qml.PauliX(1)
+ 1.2 * qml.PauliY(0) @ qml.PauliY(1)
)
qml.expval(qml.PauliZ(0) @ qml.PauliX(1))
qml.expval(qml.PauliY(wires=1))
qml.expval(qml.PauliX(0) @ qml.Hermitian([[0, 1], [1, 0]], wires=3) @ qml.Hadamard(2))
qml.expval(qml.Hermitian(qml.PauliZ.compute_matrix(), wires=0) @ qml.Identity(1))
s = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
tape, self.wires_dict
)
s, offsets = QuantumScriptSerializer(
device_name, use_csingle, split_obs=True
).serialize_observables(tape, self.wires_dict)
obtained_chunks = pennylane_lightning.core.lightning_base._chunk_iterable(s, obs_chunk)
assert len(list(obtained_chunks)) == int(np.ceil(len(s) / obs_chunk))
assert [0, 3, 4, 5, 6, 7] == offsets

0 comments on commit 028ad9b

Please sign in to comment.