diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md index 0bc27c089..ca65c98f5 100644 --- a/.github/CHANGELOG.md +++ b/.github/CHANGELOG.md @@ -40,6 +40,9 @@ ### Bug fixes +* Revert single-node multi-GPU batching behaviour to match https://github.com/PennyLaneAI/pennylane-lightning-gpu/pull/27. + [(#564)](https://github.com/PennyLaneAI/pennylane-lightning/pull/564) + * Move deprecated `stateprep` `QuantumScript` argument into the operation list in `mpitests/test_adjoint_jacobian.py`. [(#540)] (https://github.com/PennyLaneAI/pennylane-lightning/pull/540) @@ -50,7 +53,7 @@ This release contains contributions from (in alphabetical order): -Isaac De Vlugt, Vincent Michaud-Rioux, Shuli Shu +Isaac De Vlugt, Vincent Michaud-Rioux, Lee James O'Riordan, Shuli Shu --- diff --git a/pennylane_lightning/core/_serialize.py b/pennylane_lightning/core/_serialize.py index 4dc30a7ec..4fc71fa79 100644 --- a/pennylane_lightning/core/_serialize.py +++ b/pennylane_lightning/core/_serialize.py @@ -52,9 +52,12 @@ class QuantumScriptSerializer: """ # pylint: disable=import-outside-toplevel, too-many-instance-attributes, c-extension-no-member - def __init__(self, device_name, use_csingle: bool = False, use_mpi: bool = False): + def __init__( + self, device_name, use_csingle: bool = False, use_mpi: bool = False, split_obs: bool = False + ): self.use_csingle = use_csingle self.device_name = device_name + self.split_obs = split_obs if device_name == "lightning.qubit": try: import pennylane_lightning.lightning_qubit_ops as lightning_ops @@ -189,6 +192,10 @@ def _tensor_ob(self, observable, wires_map: dict): def _hamiltonian(self, observable, wires_map: dict): coeffs = np.array(unwrap(observable.coeffs)).astype(self.rtype) terms = [self._ob(t, wires_map) for t in observable.ops] + + if self.split_obs: + return [self.hamiltonian_obs([c], [t]) for (c, t) in zip(coeffs, terms)] + return self.hamiltonian_obs(coeffs, terms) def _sparse_hamiltonian(self, observable, wires_map: dict): @@ -240,6 +247,9 @@ def _pauli_sentence(self, observable, wires_map: dict): pwords, coeffs = zip(*observable.items()) terms = [self._pauli_word(pw, wires_map) for pw in pwords] coeffs = np.array(coeffs).astype(self.rtype) + + if self.split_obs: + return [self.hamiltonian_obs([c], [t]) for (c, t) in zip(coeffs, terms)] return self.hamiltonian_obs(coeffs, terms) # pylint: disable=protected-access @@ -269,7 +279,18 @@ def serialize_observables(self, tape: QuantumTape, wires_map: dict) -> List: the C++ backend """ - return [self._ob(observable, wires_map) for observable in tape.observables] + serialized_obs = [] + offset_indices = [0] + + for observable in tape.observables: + ser_ob = self._ob(observable, wires_map) + if isinstance(ser_ob, list): + serialized_obs.extend(ser_ob) + offset_indices.append(offset_indices[-1] + len(ser_ob)) + else: + serialized_obs.append(ser_ob) + offset_indices.append(offset_indices[-1] + 1) + return serialized_obs, offset_indices def serialize_ops( self, tape: QuantumTape, wires_map: dict diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py index e3668fdf7..1038a32aa 100644 --- a/pennylane_lightning/core/_version.py +++ b/pennylane_lightning/core/_version.py @@ -16,4 +16,4 @@ Version number (major.minor.patch[-label]) """ -__version__ = "0.34.0-dev9" +__version__ = "0.34.0-dev10" diff --git a/pennylane_lightning/core/lightning_base.py b/pennylane_lightning/core/lightning_base.py index 163f7cdeb..ac4e064e2 100644 --- a/pennylane_lightning/core/lightning_base.py +++ b/pennylane_lightning/core/lightning_base.py @@ -254,15 +254,18 @@ def _get_basis_state_index(self, state, wires): basis_states = qml.math.convert_like(basis_states, state) return int(qml.math.dot(state, basis_states)) - # pylint: disable=too-many-function-args, assignment-from-no-return - def _process_jacobian_tape(self, tape, starting_state, use_device_state, use_mpi: bool = False): + # pylint: disable=too-many-function-args, assignment-from-no-return, too-many-arguments + def _process_jacobian_tape( + self, tape, starting_state, use_device_state, use_mpi: bool = False, split_obs: bool = False + ): state_vector = self._init_process_jacobian_tape(tape, starting_state, use_device_state) - obs_serialized = QuantumScriptSerializer( - self.short_name, self.use_csingle, use_mpi + obs_serialized, obs_idx_offsets = QuantumScriptSerializer( + self.short_name, self.use_csingle, use_mpi, split_obs ).serialize_observables(tape, self.wire_map) + ops_serialized, use_sp = QuantumScriptSerializer( - self.short_name, self.use_csingle, use_mpi + self.short_name, self.use_csingle, use_mpi, split_obs ).serialize_ops(tape, self.wire_map) ops_serialized = self.create_ops_list(*ops_serialized) @@ -300,6 +303,7 @@ def _process_jacobian_tape(self, tape, starting_state, use_device_state, use_mpi "tp_shift": tp_shift, "record_tp_rows": record_tp_rows, "all_params": all_params, + "obs_idx_offsets": obs_idx_offsets, } # pylint: disable=unnecessary-pass diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/algorithms/AdjointJacobianGPU.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/algorithms/AdjointJacobianGPU.hpp index 391e165a3..350fab6f6 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/algorithms/AdjointJacobianGPU.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/algorithms/AdjointJacobianGPU.hpp @@ -139,7 +139,7 @@ class AdjointJacobian final threads.reserve(num_gpus); // Hold results of threaded GPU executions - std::vector>> futures; + std::vector>> jac_futures; // Iterate over the chunked observables, and submit the Jacobian task // for execution @@ -150,7 +150,7 @@ class AdjointJacobian final std::ceil((obs.size() * (i + 1) / num_chunks) - 1)); std::promise> jac_subset_promise; - futures.emplace_back(jac_subset_promise.get_future()); + jac_futures.emplace_back(jac_subset_promise.get_future()); auto adj_lambda = [&](std::promise> j_promise, @@ -195,11 +195,11 @@ class AdjointJacobian final /// Ensure the new local jacs are inserted and /// overwrite the 0 jacs values before returning - for (std::size_t i = 0; i < futures.size(); i++) { + for (std::size_t i = 0; i < jac_futures.size(); i++) { const auto first = static_cast( std::ceil(obs.size() * i / num_chunks)); - auto jac_chunk = futures[i].get(); + auto jac_chunk = jac_futures[i].get(); for (std::size_t j = 0; j < jac_chunk.size(); j++) { std::copy(jac_chunk.begin(), jac_chunk.end(), jac.begin() + first * tp_size); diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/algorithms/AdjointJacobianGPUMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/algorithms/AdjointJacobianGPUMPI.hpp index 419b2221c..5d42c5f41 100644 --- a/pennylane_lightning/core/src/simulators/lightning_gpu/algorithms/AdjointJacobianGPUMPI.hpp +++ b/pennylane_lightning/core/src/simulators/lightning_gpu/algorithms/AdjointJacobianGPUMPI.hpp @@ -21,7 +21,6 @@ #include #include #include -#include #include #include "AdjointJacobianBase.hpp" @@ -102,8 +101,6 @@ class AdjointJacobianMPI final /** * @brief Batches the adjoint_jacobian method over the available GPUs. - * Explicitly forbids OMP_NUM_THREADS>1 to avoid issues with std::thread - * contention and state access issues. * * @param jac Preallocated vector for Jacobian data results. * @param jd JacobianData represents the QuantumTape to differentiate. @@ -334,4 +331,4 @@ class AdjointJacobianMPI final } }; -} // namespace Pennylane::LightningGPU::Algorithms \ No newline at end of file +} // namespace Pennylane::LightningGPU::Algorithms diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py index 177275ec1..e7e76bcfb 100644 --- a/pennylane_lightning/lightning_gpu/lightning_gpu.py +++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py @@ -644,6 +644,7 @@ def _init_process_jacobian_tape(self, tape, starting_state, use_device_state): self.apply(tape.operations) return self._gpu_state + # pylint: disable=too-many-branches def adjoint_jacobian(self, tape, starting_state=None, use_device_state=False): """Implements the adjoint method outlined in `Jones and Gacon `__ to differentiate an input tape. @@ -673,7 +674,7 @@ def adjoint_jacobian(self, tape, starting_state=None, use_device_state=False): self._check_adjdiff_supported_operations(tape.operations) processed_data = self._process_jacobian_tape( - tape, starting_state, use_device_state, self._mpi + tape, starting_state, use_device_state, self._mpi, self._batch_obs ) if not processed_data: # training_params is empty @@ -691,24 +692,69 @@ def adjoint_jacobian(self, tape, starting_state=None, use_device_state=False): """ adjoint_jacobian = _adj_dtype(self.use_csingle, self._mpi)() - if self._batch_obs: - adjoint_jacobian = adjoint_jacobian.batched + if self._batch_obs: # Batching of Measurements + if not self._mpi: # Single-node path, controlled batching over available GPUs + num_obs = len(processed_data["obs_serialized"]) + batch_size = ( + num_obs + if isinstance(self._batch_obs, bool) + else self._batch_obs * self._dp.getTotalDevices() + ) + jac = [] + for chunk in range(0, num_obs, batch_size): + obs_chunk = processed_data["obs_serialized"][chunk : chunk + batch_size] + jac_chunk = adjoint_jacobian.batched( + self._gpu_state, + obs_chunk, + processed_data["ops_serialized"], + trainable_params, + ) + jac.extend(jac_chunk) + else: # MPI path, restrict memory per known GPUs + jac = adjoint_jacobian.batched( + self._gpu_state, + processed_data["obs_serialized"], + processed_data["ops_serialized"], + trainable_params, + ) - jac = adjoint_jacobian( - processed_data["state_vector"], - processed_data["obs_serialized"], - processed_data["ops_serialized"], - trainable_params, - ) + else: + jac = adjoint_jacobian( + self._gpu_state, + processed_data["obs_serialized"], + processed_data["ops_serialized"], + trainable_params, + ) jac = np.array(jac) # only for parameters differentiable with the adjoint method jac = jac.reshape(-1, len(trainable_params)) - jac_r = np.zeros((jac.shape[0], processed_data["all_params"])) - - jac_r[:, processed_data["record_tp_rows"]] = jac + jac_r = np.zeros((len(tape.observables), processed_data["all_params"])) + if not self._batch_obs: + jac_r[:, processed_data["record_tp_rows"]] = jac + else: + # Reduce over decomposed expval(H), if required. + for idx in range(len(processed_data["obs_idx_offsets"][0:-1])): + if ( + processed_data["obs_idx_offsets"][idx + 1] + - processed_data["obs_idx_offsets"][idx] + ) > 1: + jac_r[idx, :] = np.sum( + jac[ + processed_data["obs_idx_offsets"][idx] : processed_data[ + "obs_idx_offsets" + ][idx + 1], + :, + ], + axis=0, + ) + else: + jac_r[idx, :] = jac[ + processed_data["obs_idx_offsets"][idx] : processed_data[ + "obs_idx_offsets" + ][idx + 1], + :, + ] - if hasattr(qml, "active_return"): - return self._adjoint_jacobian_processing(jac_r) if qml.active_return() else jac_r return self._adjoint_jacobian_processing(jac_r) # pylint: disable=inconsistent-return-statements, line-too-long, missing-function-docstring diff --git a/tests/test_adjoint_jacobian.py b/tests/test_adjoint_jacobian.py index c68e925eb..2fb080610 100644 --- a/tests/test_adjoint_jacobian.py +++ b/tests/test_adjoint_jacobian.py @@ -357,7 +357,6 @@ def test_multiple_rx_gradient_expval_hamiltonian(self, tol, dev): [-np.sin(params[0]) * np.cos(params[2]), 0, -np.cos(params[0]) * np.sin(params[2])] ) ) - assert np.allclose(dev_jacobian, expected_jacobian, atol=tol, rtol=0) qubit_ops = [getattr(qml, name) for name in qml.ops._qubit__ops__] diff --git a/tests/test_serialize.py b/tests/test_serialize.py index ca3861019..78a0f2bcd 100644 --- a/tests/test_serialize.py +++ b/tests/test_serialize.py @@ -127,7 +127,7 @@ def test_tensor_non_tensor_return(self, use_csingle): tensor_prod_obs = TensorProdObsC64 if use_csingle else TensorProdObsC128 named_obs = NamedObsC64 if use_csingle else NamedObsC128 - s = QuantumScriptSerializer(device_name, use_csingle).serialize_observables( + s, _ = QuantumScriptSerializer(device_name, use_csingle).serialize_observables( tape, self.wires_dict ) @@ -147,7 +147,7 @@ def test_hermitian_return(self, use_csingle): hermitian_obs = HermitianObsC64 if use_csingle else HermitianObsC128 c_dtype = np.complex64 if use_csingle else np.complex128 - s = QuantumScriptSerializer(device_name, use_csingle).serialize_observables( + s, _ = QuantumScriptSerializer(device_name, use_csingle).serialize_observables( tape, self.wires_dict ) s_expected = hermitian_obs( @@ -168,7 +168,7 @@ def test_hermitian_tensor_return(self, use_csingle): c_dtype = np.complex64 if use_csingle else np.complex128 tensor_prod_obs = TensorProdObsC64 if use_csingle else TensorProdObsC128 hermitian_obs = HermitianObsC64 if use_csingle else HermitianObsC128 - s = QuantumScriptSerializer(device_name, use_csingle).serialize_observables( + s, _ = QuantumScriptSerializer(device_name, use_csingle).serialize_observables( tape, self.wires_dict ) @@ -192,7 +192,7 @@ def test_mixed_tensor_return(self, use_csingle): hermitian_obs = HermitianObsC64 if use_csingle else HermitianObsC128 named_obs = NamedObsC64 if use_csingle else NamedObsC128 - s = QuantumScriptSerializer(device_name, use_csingle).serialize_observables( + s, _ = QuantumScriptSerializer(device_name, use_csingle).serialize_observables( tape, self.wires_dict ) @@ -225,7 +225,7 @@ def test_hamiltonian_return(self, use_csingle): r_dtype = np.float32 if use_csingle else np.float64 c_dtype = np.complex64 if use_csingle else np.complex128 - s = QuantumScriptSerializer(device_name, use_csingle).serialize_observables( + s, _ = QuantumScriptSerializer(device_name, use_csingle).serialize_observables( tape, self.wires_dict ) @@ -267,7 +267,7 @@ def test_hamiltonian_tensor_return(self, use_csingle): r_dtype = np.float32 if use_csingle else np.float64 c_dtype = np.complex64 if use_csingle else np.complex128 - s = QuantumScriptSerializer(device_name, use_csingle).serialize_observables( + s, _ = QuantumScriptSerializer(device_name, use_csingle).serialize_observables( tape, self.wires_dict ) @@ -322,7 +322,7 @@ def test_hamiltonian_mix_return(self, use_csingle): r_dtype = np.float32 if use_csingle else np.float64 c_dtype = np.complex64 if use_csingle else np.complex128 - s = QuantumScriptSerializer(device_name, use_csingle).serialize_observables( + s, _ = QuantumScriptSerializer(device_name, use_csingle).serialize_observables( tape, self.wires_dict ) @@ -374,7 +374,7 @@ def test_hamiltonian_mix_return(self, use_csingle): def test_op_arithmetic_uses_hamiltonian(self, use_csingle, obs, coeffs, terms): """Tests that an arithmetic obs with a PauliRep serializes as a Hamiltonian.""" tape = qml.tape.QuantumTape(measurements=[qml.expval(obs)]) - res = QuantumScriptSerializer(device_name, use_csingle).serialize_observables( + res, _ = QuantumScriptSerializer(device_name, use_csingle).serialize_observables( tape, self.wires_dict ) assert len(res) == 1 @@ -400,7 +400,7 @@ def test_op_arithmetic_uses_hamiltonian(self, use_csingle, obs, coeffs, terms): def test_multi_wire_identity(self, use_csingle): """Tests that multi-wire Identity does not fail serialization.""" tape = qml.tape.QuantumTape(measurements=[qml.expval(qml.Identity(wires=[1, 2]))]) - res = QuantumScriptSerializer(device_name, use_csingle).serialize_observables( + res, _ = QuantumScriptSerializer(device_name, use_csingle).serialize_observables( tape, self.wires_dict ) assert len(res) == 1 diff --git a/tests/test_serialize_chunk_obs.py b/tests/test_serialize_chunk_obs.py index 017900c2a..ec70b998c 100644 --- a/tests/test_serialize_chunk_obs.py +++ b/tests/test_serialize_chunk_obs.py @@ -37,12 +37,18 @@ class TestSerializeObs: def test_chunk_obs(self, use_csingle, obs_chunk): """Test chunking of observable array""" with qml.tape.QuantumTape() as tape: + qml.expval( + 0.5 * qml.PauliX(0) @ qml.PauliZ(1) + + 0.7 * qml.PauliZ(0) @ qml.PauliX(1) + + 1.2 * qml.PauliY(0) @ qml.PauliY(1) + ) qml.expval(qml.PauliZ(0) @ qml.PauliX(1)) qml.expval(qml.PauliY(wires=1)) qml.expval(qml.PauliX(0) @ qml.Hermitian([[0, 1], [1, 0]], wires=3) @ qml.Hadamard(2)) qml.expval(qml.Hermitian(qml.PauliZ.compute_matrix(), wires=0) @ qml.Identity(1)) - s = QuantumScriptSerializer(device_name, use_csingle).serialize_observables( - tape, self.wires_dict - ) + s, offsets = QuantumScriptSerializer( + device_name, use_csingle, split_obs=True + ).serialize_observables(tape, self.wires_dict) obtained_chunks = pennylane_lightning.core.lightning_base._chunk_iterable(s, obs_chunk) assert len(list(obtained_chunks)) == int(np.ceil(len(s) / obs_chunk)) + assert [0, 3, 4, 5, 6, 7] == offsets