Restore batching memory control functionality to LightningGPU (#564)

* Restore batching memory control functionality to LightningGPU
* Auto update version
* Trigger CI
* Update serialised data returned to include offsets for Hamiltonians
* Fix branch path
* Fix setup.py install for ease of dev
* Fix tests
* Revert copy to move for Jac batching
* Update Changelog
* Revert move to copy for now
* Temporarily silence Pylint
* Fix formatting

---------

Co-authored-by: Dev version update bot <github-actions[bot]@users.noreply.github.com>
Co-authored-by: Lee O'Riordan <loriordan@lbl.gov>
PennyLaneAI · Nov 24, 2023 · 028ad9b · 028ad9b
1 parent 655d537
commit 028ad9b
Show file tree

Hide file tree

Showing 10 changed files with 120 additions and 44 deletions.
diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
@@ -40,6 +40,9 @@
 
 ### Bug fixes
 
+* Revert single-node multi-GPU batching behaviour to match https://github.com/PennyLaneAI/pennylane-lightning-gpu/pull/27.
+  [(#564)](https://github.com/PennyLaneAI/pennylane-lightning/pull/564)
+
 * Move deprecated `stateprep` `QuantumScript` argument into the operation list in `mpitests/test_adjoint_jacobian.py`.
   [(#540)](https://github.com/PennyLaneAI/pennylane-lightning/pull/540)
 
@@ -50,7 +53,7 @@
 
 This release contains contributions from (in alphabetical order):
 
-Isaac De Vlugt, Vincent Michaud-Rioux, Shuli Shu
+Isaac De Vlugt, Vincent Michaud-Rioux, Lee James O'Riordan, Shuli Shu
 
 ---
 

diff --git a/pennylane_lightning/core/_serialize.py b/pennylane_lightning/core/_serialize.py
@@ -52,9 +52,12 @@ class QuantumScriptSerializer:
     """
 
     # pylint: disable=import-outside-toplevel, too-many-instance-attributes, c-extension-no-member
-    def __init__(self, device_name, use_csingle: bool = False, use_mpi: bool = False):
+    def __init__(
+        self, device_name, use_csingle: bool = False, use_mpi: bool = False, split_obs: bool = False
+    ):
         self.use_csingle = use_csingle
         self.device_name = device_name
+        self.split_obs = split_obs
         if device_name == "lightning.qubit":
             try:
                 import pennylane_lightning.lightning_qubit_ops as lightning_ops
@@ -189,6 +192,10 @@ def _tensor_ob(self, observable, wires_map: dict):
     def _hamiltonian(self, observable, wires_map: dict):
         coeffs = np.array(unwrap(observable.coeffs)).astype(self.rtype)
         terms = [self._ob(t, wires_map) for t in observable.ops]
+
+        if self.split_obs:
+            return [self.hamiltonian_obs([c], [t]) for (c, t) in zip(coeffs, terms)]
+
         return self.hamiltonian_obs(coeffs, terms)
 
     def _sparse_hamiltonian(self, observable, wires_map: dict):
@@ -240,6 +247,9 @@ def _pauli_sentence(self, observable, wires_map: dict):
         pwords, coeffs = zip(*observable.items())
         terms = [self._pauli_word(pw, wires_map) for pw in pwords]
         coeffs = np.array(coeffs).astype(self.rtype)
+
+        if self.split_obs:
+            return [self.hamiltonian_obs([c], [t]) for (c, t) in zip(coeffs, terms)]
         return self.hamiltonian_obs(coeffs, terms)
 
     # pylint: disable=protected-access
@@ -269,7 +279,18 @@ def serialize_observables(self, tape: QuantumTape, wires_map: dict) -> List:
                 the C++ backend
         """
 
-        return [self._ob(observable, wires_map) for observable in tape.observables]
+        serialized_obs = []
+        offset_indices = [0]
+
+        for observable in tape.observables:
+            ser_ob = self._ob(observable, wires_map)
+            if isinstance(ser_ob, list):
+                serialized_obs.extend(ser_ob)
+                offset_indices.append(offset_indices[-1] + len(ser_ob))
+            else:
+                serialized_obs.append(ser_ob)
+                offset_indices.append(offset_indices[-1] + 1)
+        return serialized_obs, offset_indices
 
     def serialize_ops(
         self, tape: QuantumTape, wires_map: dict

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.34.0-dev9"
+__version__ = "0.34.0-dev10"
diff --git a/pennylane_lightning/core/lightning_base.py b/pennylane_lightning/core/lightning_base.py
@@ -254,15 +254,18 @@ def _get_basis_state_index(self, state, wires):
         basis_states = qml.math.convert_like(basis_states, state)
         return int(qml.math.dot(state, basis_states))
 
-    # pylint: disable=too-many-function-args, assignment-from-no-return
-    def _process_jacobian_tape(self, tape, starting_state, use_device_state, use_mpi: bool = False):
+    # pylint: disable=too-many-function-args, assignment-from-no-return, too-many-arguments
+    def _process_jacobian_tape(
+        self, tape, starting_state, use_device_state, use_mpi: bool = False, split_obs: bool = False
+    ):
         state_vector = self._init_process_jacobian_tape(tape, starting_state, use_device_state)
 
-        obs_serialized = QuantumScriptSerializer(
-            self.short_name, self.use_csingle, use_mpi
+        obs_serialized, obs_idx_offsets = QuantumScriptSerializer(
+            self.short_name, self.use_csingle, use_mpi, split_obs
         ).serialize_observables(tape, self.wire_map)
+
         ops_serialized, use_sp = QuantumScriptSerializer(
-            self.short_name, self.use_csingle, use_mpi
+            self.short_name, self.use_csingle, use_mpi, split_obs
         ).serialize_ops(tape, self.wire_map)
 
         ops_serialized = self.create_ops_list(*ops_serialized)
@@ -300,6 +303,7 @@ def _process_jacobian_tape(self, tape, starting_state, use_device_state, use_mpi
             "tp_shift": tp_shift,
             "record_tp_rows": record_tp_rows,
             "all_params": all_params,
+            "obs_idx_offsets": obs_idx_offsets,
         }
 
     # pylint: disable=unnecessary-pass

diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/algorithms/AdjointJacobianGPU.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/algorithms/AdjointJacobianGPU.hpp
@@ -139,7 +139,7 @@ class AdjointJacobian final
         threads.reserve(num_gpus);
 
         // Hold results of threaded GPU executions
-        std::vector<std::future<std::vector<PrecisionT>>> futures;
+        std::vector<std::future<std::vector<PrecisionT>>> jac_futures;
 
         // Iterate over the chunked observables, and submit the Jacobian task
         // for execution
@@ -150,7 +150,7 @@ class AdjointJacobian final
                 std::ceil((obs.size() * (i + 1) / num_chunks) - 1));
 
             std::promise<std::vector<PrecisionT>> jac_subset_promise;
-            futures.emplace_back(jac_subset_promise.get_future());
+            jac_futures.emplace_back(jac_subset_promise.get_future());
 
             auto adj_lambda =
                 [&](std::promise<std::vector<PrecisionT>> j_promise,
@@ -195,11 +195,11 @@ class AdjointJacobian final
 
         /// Ensure the new local jacs are inserted and
         /// overwrite the 0 jacs values before returning
-        for (std::size_t i = 0; i < futures.size(); i++) {
+        for (std::size_t i = 0; i < jac_futures.size(); i++) {
             const auto first = static_cast<std::size_t>(
                 std::ceil(obs.size() * i / num_chunks));
 
-            auto jac_chunk = futures[i].get();
+            auto jac_chunk = jac_futures[i].get();
             for (std::size_t j = 0; j < jac_chunk.size(); j++) {
                 std::copy(jac_chunk.begin(), jac_chunk.end(),
                           jac.begin() + first * tp_size);

diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/algorithms/AdjointJacobianGPUMPI.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/algorithms/AdjointJacobianGPUMPI.hpp
@@ -21,7 +21,6 @@
 #include <future>
 #include <omp.h>
 #include <span>
-#include <thread>
 #include <variant>
 
 #include "AdjointJacobianBase.hpp"
@@ -102,8 +101,6 @@ class AdjointJacobianMPI final
 
     /**
      * @brief Batches the adjoint_jacobian method over the available GPUs.
-     * Explicitly forbids OMP_NUM_THREADS>1 to avoid issues with std::thread
-     * contention and state access issues.
      *
      * @param jac Preallocated vector for Jacobian data results.
      * @param jd JacobianData represents the QuantumTape to differentiate.
@@ -334,4 +331,4 @@ class AdjointJacobianMPI final
     }
 };
 
-} // namespace Pennylane::LightningGPU::Algorithms
+} // namespace Pennylane::LightningGPU::Algorithms
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -644,6 +644,7 @@ def _init_process_jacobian_tape(self, tape, starting_state, use_device_state):
                 self.apply(tape.operations)
             return self._gpu_state
 
+        # pylint: disable=too-many-branches
         def adjoint_jacobian(self, tape, starting_state=None, use_device_state=False):
             """Implements the adjoint method outlined in
             `Jones and Gacon <https://arxiv.org/abs/2009.02823>`__ to differentiate an input tape.
@@ -673,7 +674,7 @@ def adjoint_jacobian(self, tape, starting_state=None, use_device_state=False):
             self._check_adjdiff_supported_operations(tape.operations)
 
             processed_data = self._process_jacobian_tape(
-                tape, starting_state, use_device_state, self._mpi
+                tape, starting_state, use_device_state, self._mpi, self._batch_obs
             )
 
             if not processed_data:  # training_params is empty
@@ -691,24 +692,69 @@ def adjoint_jacobian(self, tape, starting_state=None, use_device_state=False):
             """
             adjoint_jacobian = _adj_dtype(self.use_csingle, self._mpi)()
 
-            if self._batch_obs:
-                adjoint_jacobian = adjoint_jacobian.batched
+            if self._batch_obs:  # Batching of Measurements
+                if not self._mpi:  # Single-node path, controlled batching over available GPUs
+                    num_obs = len(processed_data["obs_serialized"])
+                    batch_size = (
+                        num_obs
+                        if isinstance(self._batch_obs, bool)
+                        else self._batch_obs * self._dp.getTotalDevices()
+                    )
+                    jac = []
+                    for chunk in range(0, num_obs, batch_size):
+                        obs_chunk = processed_data["obs_serialized"][chunk : chunk + batch_size]
+                        jac_chunk = adjoint_jacobian.batched(
+                            self._gpu_state,
+                            obs_chunk,
+                            processed_data["ops_serialized"],
+                            trainable_params,
+                        )
+                        jac.extend(jac_chunk)
+                else:  # MPI path, restrict memory per known GPUs
+                    jac = adjoint_jacobian.batched(
+                        self._gpu_state,
+                        processed_data["obs_serialized"],
+                        processed_data["ops_serialized"],
+                        trainable_params,
+                    )
 
-            jac = adjoint_jacobian(
-                processed_data["state_vector"],
-                processed_data["obs_serialized"],
-                processed_data["ops_serialized"],
-                trainable_params,
-            )
+            else:
+                jac = adjoint_jacobian(
+                    self._gpu_state,
+                    processed_data["obs_serialized"],
+                    processed_data["ops_serialized"],
+                    trainable_params,
+                )
 
             jac = np.array(jac)  # only for parameters differentiable with the adjoint method
             jac = jac.reshape(-1, len(trainable_params))
-            jac_r = np.zeros((jac.shape[0], processed_data["all_params"]))
-
-            jac_r[:, processed_data["record_tp_rows"]] = jac
+            jac_r = np.zeros((len(tape.observables), processed_data["all_params"]))
+            if not self._batch_obs:
+                jac_r[:, processed_data["record_tp_rows"]] = jac
+            else:
+                # Reduce over decomposed expval(H), if required.
+                for idx in range(len(processed_data["obs_idx_offsets"][0:-1])):
+                    if (
+                        processed_data["obs_idx_offsets"][idx + 1]
+                        - processed_data["obs_idx_offsets"][idx]
+                    ) > 1:
+                        jac_r[idx, :] = np.sum(
+                            jac[
+                                processed_data["obs_idx_offsets"][idx] : processed_data[
+                                    "obs_idx_offsets"
+                                ][idx + 1],
+                                :,
+                            ],
+                            axis=0,
+                        )
+                    else:
+                        jac_r[idx, :] = jac[
+                            processed_data["obs_idx_offsets"][idx] : processed_data[
+                                "obs_idx_offsets"
+                            ][idx + 1],
+                            :,
+                        ]
 
-            if hasattr(qml, "active_return"):
-                return self._adjoint_jacobian_processing(jac_r) if qml.active_return() else jac_r
             return self._adjoint_jacobian_processing(jac_r)
 
         # pylint: disable=inconsistent-return-statements, line-too-long, missing-function-docstring

diff --git a/tests/test_adjoint_jacobian.py b/tests/test_adjoint_jacobian.py
@@ -357,7 +357,6 @@ def test_multiple_rx_gradient_expval_hamiltonian(self, tol, dev):
                 [-np.sin(params[0]) * np.cos(params[2]), 0, -np.cos(params[0]) * np.sin(params[2])]
             )
         )
-
         assert np.allclose(dev_jacobian, expected_jacobian, atol=tol, rtol=0)
 
     qubit_ops = [getattr(qml, name) for name in qml.ops._qubit__ops__]

diff --git a/tests/test_serialize.py b/tests/test_serialize.py
@@ -127,7 +127,7 @@ def test_tensor_non_tensor_return(self, use_csingle):
         tensor_prod_obs = TensorProdObsC64 if use_csingle else TensorProdObsC128
         named_obs = NamedObsC64 if use_csingle else NamedObsC128
 
-        s = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
+        s, _ = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
             tape, self.wires_dict
         )
 
@@ -147,7 +147,7 @@ def test_hermitian_return(self, use_csingle):
         hermitian_obs = HermitianObsC64 if use_csingle else HermitianObsC128
         c_dtype = np.complex64 if use_csingle else np.complex128
 
-        s = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
+        s, _ = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
             tape, self.wires_dict
         )
         s_expected = hermitian_obs(
@@ -168,7 +168,7 @@ def test_hermitian_tensor_return(self, use_csingle):
         c_dtype = np.complex64 if use_csingle else np.complex128
         tensor_prod_obs = TensorProdObsC64 if use_csingle else TensorProdObsC128
         hermitian_obs = HermitianObsC64 if use_csingle else HermitianObsC128
-        s = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
+        s, _ = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
             tape, self.wires_dict
         )
 
@@ -192,7 +192,7 @@ def test_mixed_tensor_return(self, use_csingle):
         hermitian_obs = HermitianObsC64 if use_csingle else HermitianObsC128
         named_obs = NamedObsC64 if use_csingle else NamedObsC128
 
-        s = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
+        s, _ = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
             tape, self.wires_dict
         )
 
@@ -225,7 +225,7 @@ def test_hamiltonian_return(self, use_csingle):
         r_dtype = np.float32 if use_csingle else np.float64
         c_dtype = np.complex64 if use_csingle else np.complex128
 
-        s = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
+        s, _ = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
             tape, self.wires_dict
         )
 
@@ -267,7 +267,7 @@ def test_hamiltonian_tensor_return(self, use_csingle):
         r_dtype = np.float32 if use_csingle else np.float64
         c_dtype = np.complex64 if use_csingle else np.complex128
 
-        s = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
+        s, _ = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
             tape, self.wires_dict
         )
 
@@ -322,7 +322,7 @@ def test_hamiltonian_mix_return(self, use_csingle):
         r_dtype = np.float32 if use_csingle else np.float64
         c_dtype = np.complex64 if use_csingle else np.complex128
 
-        s = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
+        s, _ = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
             tape, self.wires_dict
         )
 
@@ -374,7 +374,7 @@ def test_hamiltonian_mix_return(self, use_csingle):
     def test_op_arithmetic_uses_hamiltonian(self, use_csingle, obs, coeffs, terms):
         """Tests that an arithmetic obs with a PauliRep serializes as a Hamiltonian."""
         tape = qml.tape.QuantumTape(measurements=[qml.expval(obs)])
-        res = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
+        res, _ = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
             tape, self.wires_dict
         )
         assert len(res) == 1
@@ -400,7 +400,7 @@ def test_op_arithmetic_uses_hamiltonian(self, use_csingle, obs, coeffs, terms):
     def test_multi_wire_identity(self, use_csingle):
         """Tests that multi-wire Identity does not fail serialization."""
         tape = qml.tape.QuantumTape(measurements=[qml.expval(qml.Identity(wires=[1, 2]))])
-        res = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
+        res, _ = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
             tape, self.wires_dict
         )
         assert len(res) == 1

diff --git a/tests/test_serialize_chunk_obs.py b/tests/test_serialize_chunk_obs.py
@@ -37,12 +37,18 @@ class TestSerializeObs:
     def test_chunk_obs(self, use_csingle, obs_chunk):
         """Test chunking of observable array"""
         with qml.tape.QuantumTape() as tape:
+            qml.expval(
+                0.5 * qml.PauliX(0) @ qml.PauliZ(1)
+                + 0.7 * qml.PauliZ(0) @ qml.PauliX(1)
+                + 1.2 * qml.PauliY(0) @ qml.PauliY(1)
+            )
             qml.expval(qml.PauliZ(0) @ qml.PauliX(1))
             qml.expval(qml.PauliY(wires=1))
             qml.expval(qml.PauliX(0) @ qml.Hermitian([[0, 1], [1, 0]], wires=3) @ qml.Hadamard(2))
             qml.expval(qml.Hermitian(qml.PauliZ.compute_matrix(), wires=0) @ qml.Identity(1))
-        s = QuantumScriptSerializer(device_name, use_csingle).serialize_observables(
-            tape, self.wires_dict
-        )
+        s, offsets = QuantumScriptSerializer(
+            device_name, use_csingle, split_obs=True
+        ).serialize_observables(tape, self.wires_dict)
         obtained_chunks = pennylane_lightning.core.lightning_base._chunk_iterable(s, obs_chunk)
         assert len(list(obtained_chunks)) == int(np.ceil(len(s) / obs_chunk))
+        assert [0, 3, 4, 5, 6, 7] == offsets