Prevent reallocation of aligned memory (#572)

* Allow aligned alloc to be more useful
* Auto update version
* Remove comments
* Fix testing failures
* Remove redundant functions from LG & LK
* Re-add removed import
* Auto update version
* Auto update version
* Trigger CI
* Ensure dtype conversion happens when using lower prec SV
* Remove unused imports
* Auto update version
* Trigger CI
* Auto update version
* Trigger CI
* Re-add missing block to LQ
* Auto update version
* Trigger CI
* Add TODO and update changelog
* Ensure pip installed clang-format on path

---------

Co-authored-by: Dev version update bot <github-actions[bot]@users.noreply.github.com>
PennyLaneAI · Jan 4, 2024 · 3c648f7 · 3c648f7
1 parent dfad9d7
commit 3c648f7
Show file tree

Hide file tree

Showing 11 changed files with 58 additions and 44 deletions.
diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
@@ -36,6 +36,9 @@
 
 ### Improvements
 
+* Ensure aligned memory is used for state-vector numpy arrays, avoiding reallocations.
+  [(#572)](https://github.com/PennyLaneAI/pennylane-lightning/pull/572)
+
 * Unify error messages of shot measurement related unsupported observables to better Catalyst.
   [(#577)](https://github.com/PennyLaneAI/pennylane-lightning/pull/577)
 

diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml
@@ -37,10 +37,10 @@ jobs:
         uses: actions/checkout@v3
 
       - name: Install dependencies
-        run: sudo apt update && sudo apt -y install python3 python3-pip && python3 -m pip install -r requirements-dev.txt
+        run: sudo apt update && sudo apt -y install python3 python3-pip && python -m pip install pip~=22.0 && python3 -m pip install -r requirements-dev.txt
 
       - name: Run formatter
-        run: PATH=$PATH:$(dirname $(which python3)) ./bin/format --check ./pennylane_lightning/core/src
+        run: PATH=$PATH:$(dirname $(which python3))/ ./bin/format --check ./pennylane_lightning/core/src
 
   tidy-cpp:
     strategy:

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.34.0-dev26"
+__version__ = "0.34.0-dev27"
diff --git a/pennylane_lightning/core/src/bindings/Bindings.hpp b/pennylane_lightning/core/src/bindings/Bindings.hpp
@@ -149,11 +149,12 @@ auto getNumpyArrayAlignment(const py::array &numpyArray) -> CPUMemoryModel {
  * @return Numpy array
  */
 template <typename T>
-auto alignedNumpyArray(CPUMemoryModel memory_model, size_t size) -> py::array {
+auto alignedNumpyArray(CPUMemoryModel memory_model, size_t size,
+                       bool zeroInit = false) -> py::array {
     using Pennylane::Util::alignedAlloc;
     if (getAlignment<T>(memory_model) > alignof(std::max_align_t)) {
-        void *ptr =
-            alignedAlloc(getAlignment<T>(memory_model), sizeof(T) * size);
+        void *ptr = alignedAlloc(getAlignment<T>(memory_model),
+                                 sizeof(T) * size, zeroInit);
         auto capsule = py::capsule(ptr, &Util::alignedFree);
         return py::array{py::dtype::of<T>(), {size}, {sizeof(T)}, ptr, capsule};
     }
@@ -172,20 +173,24 @@ auto alignedNumpyArray(CPUMemoryModel memory_model, size_t size) -> py::array {
  * @param size Size of the array to create
  * @param dt Pybind11's datatype object
  */
-auto allocateAlignedArray(size_t size, const py::dtype &dt) -> py::array {
+auto allocateAlignedArray(size_t size, const py::dtype &dt,
+                          bool zeroInit = false) -> py::array {
+    // TODO: Move memset operations to here to reduce zeroInit pass-throughs.
     auto memory_model = bestCPUMemoryModel();
 
     if (dt.is(py::dtype::of<float>())) {
-        return alignedNumpyArray<float>(memory_model, size);
+        return alignedNumpyArray<float>(memory_model, size, zeroInit);
     }
     if (dt.is(py::dtype::of<double>())) {
-        return alignedNumpyArray<double>(memory_model, size);
+        return alignedNumpyArray<double>(memory_model, size, zeroInit);
     }
     if (dt.is(py::dtype::of<std::complex<float>>())) {
-        return alignedNumpyArray<std::complex<float>>(memory_model, size);
+        return alignedNumpyArray<std::complex<float>>(memory_model, size,
+                                                      zeroInit);
     }
     if (dt.is(py::dtype::of<std::complex<double>>())) {
-        return alignedNumpyArray<std::complex<double>>(memory_model, size);
+        return alignedNumpyArray<std::complex<double>>(memory_model, size,
+                                                       zeroInit);
     }
     throw py::type_error("Unsupported datatype.");
 }

diff --git a/pennylane_lightning/core/src/utils/Memory.hpp b/pennylane_lightning/core/src/utils/Memory.hpp
@@ -14,6 +14,7 @@
 #include <cstddef>
 #include <cstdint>
 #include <cstdlib>
+#include <cstring>
 #include <limits>
 #include <memory>
 #include <new>
@@ -35,23 +36,28 @@ namespace Pennylane::Util {
  * @param bytes Number of bytes to allocate
  * @return Pointer to the allocated memory
  */
-inline auto alignedAlloc(uint32_t alignment, size_t bytes) -> void * {
+inline auto alignedAlloc(uint32_t alignment, size_t bytes,
+                         bool zero_init = false) -> void * {
     if (bytes % alignment != 0) {
         bytes = alignment * (bytes / alignment + 1);
     }
+    void *p = nullptr;
+
 #if defined(__clang__) && defined(__APPLE__)
     /*
      * We use `posix_memalign` for MacOS as Mac does not support
      * `std::aligned_alloc` properly yet (even in MacOS 10.15).
      */
-    void *p;
     posix_memalign(&p, alignment, bytes);
-    return p;
 #elif defined(_MSC_VER)
-    return _aligned_malloc(bytes, alignment);
+    p = _aligned_malloc(bytes, alignment);
 #else
-    return std::aligned_alloc(alignment, bytes);
+    p = std::aligned_alloc(alignment, bytes);
 #endif
+    if (zero_init) {
+        std::memset(p, 0, bytes);
+    }
+    return p;
 }
 
 /**

diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -28,10 +28,7 @@
 
 try:
     from pennylane_lightning.lightning_gpu_ops import (
-        allocate_aligned_array,
         backend_info,
-        best_alignment,
-        get_alignment,
         StateVectorC128,
         StateVectorC64,
         MeasurementsC128,
@@ -331,14 +328,6 @@ def _asarray(arr, dtype=None):
             if not dtype:
                 dtype = arr.dtype
 
-            # We allocate a new aligned memory and copy data to there if alignment
-            # or dtype mismatches
-            # Note that get_alignment does not necessarily return CPUMemoryModel(Unaligned) even for
-            # numpy allocated memory as the memory location happens to be aligned.
-            if int(get_alignment(arr)) < int(best_alignment()) or arr.dtype != dtype:
-                new_arr = allocate_aligned_array(arr.size, np.dtype(dtype)).reshape(arr.shape)
-                np.copyto(new_arr, arr)
-                arr = new_arr
             return arr
 
         # pylint disable=missing-function-docstring

diff --git a/pennylane_lightning/lightning_kokkos/lightning_kokkos.py b/pennylane_lightning/lightning_kokkos/lightning_kokkos.py
@@ -32,8 +32,6 @@
     from pennylane_lightning.lightning_kokkos_ops import (
         allocate_aligned_array,
         backend_info,
-        best_alignment,
-        get_alignment,
         InitializationSettings,
         MeasurementsC128,
         MeasurementsC64,
@@ -232,8 +230,10 @@ def _asarray(arr, dtype=None):
             # or dtype mismatches
             # Note that get_alignment does not necessarily return CPUMemoryModel(Unaligned) even for
             # numpy allocated memory as the memory location happens to be aligned.
-            if int(get_alignment(arr)) < int(best_alignment()) or arr.dtype != dtype:
-                new_arr = allocate_aligned_array(arr.size, np.dtype(dtype)).reshape(arr.shape)
+            if arr.dtype != dtype:
+                new_arr = allocate_aligned_array(arr.size, np.dtype(dtype), False).reshape(
+                    arr.shape
+                )
                 np.copyto(new_arr, arr)
                 arr = new_arr
             return arr

diff --git a/pennylane_lightning/lightning_qubit/lightning_qubit.py b/pennylane_lightning/lightning_qubit/lightning_qubit.py
@@ -227,6 +227,7 @@ def __init__(  # pylint: disable=too-many-arguments
             # state as an array of dimension [2]*wires.
             self._state = self._create_basis_state(0)
             self._pre_rotated_state = self._state
+            self._c_dtype = c_dtype
 
             self._batch_obs = batch_obs
             self._mcmc = mcmc
@@ -259,8 +260,13 @@ def _asarray(arr, dtype=None):
             # Note that get_alignment does not necessarily return CPUMemoryModel(Unaligned)
             # numpy allocated memory as the memory location happens to be aligned.
             if int(get_alignment(arr)) < int(best_alignment()) or arr.dtype != dtype:
-                new_arr = allocate_aligned_array(arr.size, np.dtype(dtype)).reshape(arr.shape)
-                np.copyto(new_arr, arr)
+                new_arr = allocate_aligned_array(arr.size, np.dtype(dtype), False).reshape(
+                    arr.shape
+                )
+                if len(arr.shape):
+                    new_arr[:] = arr
+                else:
+                    np.copyto(new_arr, arr)
                 arr = new_arr
             return arr
 
@@ -273,17 +279,17 @@ def _create_basis_state(self, index):
                 representing the statevector of the basis state
             Note: This function does not support broadcasted inputs yet.
             """
-            state = np.zeros(2**self.num_wires, dtype=np.complex128)
+            state = allocate_aligned_array(2**self.num_wires, np.dtype(self.C_DTYPE), True)
             state[index] = 1
-            state = self._asarray(state, dtype=self.C_DTYPE)
             return self._reshape(state, [2] * self.num_wires)
 
         def reset(self):
             """Reset the device"""
             super().reset()
 
             # init the state vector to |00..0>
-            self._state = self._create_basis_state(0)
+            if not self.state[0] == 1.0 + 0j:
+                self._state = self._create_basis_state(0)
             self._pre_rotated_state = self._state
 
         @property

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -1,4 +1,4 @@
-pip==23.0
+pip~=22.0
 git+https://github.com/PennyLaneAI/pennylane.git@master
 ninja
 flaky

diff --git a/tests/test_adjoint_jacobian.py b/tests/test_adjoint_jacobian.py
@@ -583,7 +583,7 @@ def test_provide_starting_state(self, tol, dev):
 
         dM1 = dev.adjoint_jacobian(tape)
 
-        if device_name != "lightning.gpu":
+        if device_name == "lightning.kokkos":
             dev._pre_rotated_state = dev.state_vector  # necessary for lightning.kokkos
 
             qml.execute([tape], dev, None)

diff --git a/tests/test_arrays.py b/tests/test_arrays.py
@@ -22,14 +22,19 @@
 try:
     from pennylane_lightning.lightning_qubit_ops import allocate_aligned_array
 except (ImportError, ModuleNotFoundError):
-    try:
-        from pennylane_lightning.lightning_kokkos_ops import allocate_aligned_array
-    except (ImportError, ModuleNotFoundError):
-        pytest.skip("No binary module found. Skipping.", allow_module_level=True)
+    pytest.skip("No binary module found. Skipping.", allow_module_level=True)
 
 
 @pytest.mark.skipif(not ld._CPP_BINARY_AVAILABLE, reason="Lightning binary required")
 @pytest.mark.parametrize("dt", [np.dtype(np.complex64), np.dtype(np.complex128)])
-def test_allocate_aligned_array(dt):
-    arr = allocate_aligned_array(1024, dt)
+def test_allocate_aligned_array_unset(dt):
+    arr = allocate_aligned_array(1024, dt, False)
     assert arr.dtype == dt
+
+
+@pytest.mark.skipif(not ld._CPP_BINARY_AVAILABLE, reason="Lightning binary required")
+@pytest.mark.parametrize("dt", [np.dtype(np.complex64), np.dtype(np.complex128)])
+def test_allocate_aligned_array_set(dt):
+    arr = allocate_aligned_array(1024, dt, True)
+    assert arr.dtype == dt
+    assert np.all(arr == 0)