diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md index 9e94ac7ac..fbfcdabb9 100644 --- a/.github/CHANGELOG.md +++ b/.github/CHANGELOG.md @@ -36,6 +36,9 @@ ### Improvements +* Ensure aligned memory used for numpy arrays with state-vector without reallocations. + [(#572)](https://github.com/PennyLaneAI/pennylane-lightning/pull/572) + * Unify error messages of shot measurement related unsupported observables to better Catalyst. [(#577)](https://github.com/PennyLaneAI/pennylane-lightning/pull/577) diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml index 08a143106..560864d32 100644 --- a/.github/workflows/format.yml +++ b/.github/workflows/format.yml @@ -37,10 +37,10 @@ jobs: uses: actions/checkout@v3 - name: Install dependencies - run: sudo apt update && sudo apt -y install python3 python3-pip && python3 -m pip install -r requirements-dev.txt + run: sudo apt update && sudo apt -y install python3 python3-pip && python -m pip install pip~=22.0 && python3 -m pip install -r requirements-dev.txt - name: Run formatter - run: PATH=$PATH:$(dirname $(which python3)) ./bin/format --check ./pennylane_lightning/core/src + run: PATH=$PATH:$(dirname $(which python3))/ ./bin/format --check ./pennylane_lightning/core/src tidy-cpp: strategy: diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py index 64f71a994..361236ed2 100644 --- a/pennylane_lightning/core/_version.py +++ b/pennylane_lightning/core/_version.py @@ -16,4 +16,4 @@ Version number (major.minor.patch[-label]) """ -__version__ = "0.34.0-dev26" +__version__ = "0.34.0-dev27" diff --git a/pennylane_lightning/core/src/bindings/Bindings.hpp b/pennylane_lightning/core/src/bindings/Bindings.hpp index f20640e43..f4e2718af 100644 --- a/pennylane_lightning/core/src/bindings/Bindings.hpp +++ b/pennylane_lightning/core/src/bindings/Bindings.hpp @@ -149,11 +149,12 @@ auto getNumpyArrayAlignment(const py::array &numpyArray) -> CPUMemoryModel { * @return Numpy array */ template -auto alignedNumpyArray(CPUMemoryModel memory_model, size_t size) -> py::array { +auto alignedNumpyArray(CPUMemoryModel memory_model, size_t size, + bool zeroInit = false) -> py::array { using Pennylane::Util::alignedAlloc; if (getAlignment(memory_model) > alignof(std::max_align_t)) { - void *ptr = - alignedAlloc(getAlignment(memory_model), sizeof(T) * size); + void *ptr = alignedAlloc(getAlignment(memory_model), + sizeof(T) * size, zeroInit); auto capsule = py::capsule(ptr, &Util::alignedFree); return py::array{py::dtype::of(), {size}, {sizeof(T)}, ptr, capsule}; } @@ -172,20 +173,24 @@ auto alignedNumpyArray(CPUMemoryModel memory_model, size_t size) -> py::array { * @param size Size of the array to create * @param dt Pybind11's datatype object */ -auto allocateAlignedArray(size_t size, const py::dtype &dt) -> py::array { +auto allocateAlignedArray(size_t size, const py::dtype &dt, + bool zeroInit = false) -> py::array { + // TODO: Move memset operations to here to reduce zeroInit pass-throughs. auto memory_model = bestCPUMemoryModel(); if (dt.is(py::dtype::of())) { - return alignedNumpyArray(memory_model, size); + return alignedNumpyArray(memory_model, size, zeroInit); } if (dt.is(py::dtype::of())) { - return alignedNumpyArray(memory_model, size); + return alignedNumpyArray(memory_model, size, zeroInit); } if (dt.is(py::dtype::of>())) { - return alignedNumpyArray>(memory_model, size); + return alignedNumpyArray>(memory_model, size, + zeroInit); } if (dt.is(py::dtype::of>())) { - return alignedNumpyArray>(memory_model, size); + return alignedNumpyArray>(memory_model, size, + zeroInit); } throw py::type_error("Unsupported datatype."); } diff --git a/pennylane_lightning/core/src/utils/Memory.hpp b/pennylane_lightning/core/src/utils/Memory.hpp index 7796a8cd2..b9886d5a3 100644 --- a/pennylane_lightning/core/src/utils/Memory.hpp +++ b/pennylane_lightning/core/src/utils/Memory.hpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -35,23 +36,28 @@ namespace Pennylane::Util { * @param bytes Number of bytes to allocate * @return Pointer to the allocated memory */ -inline auto alignedAlloc(uint32_t alignment, size_t bytes) -> void * { +inline auto alignedAlloc(uint32_t alignment, size_t bytes, + bool zero_init = false) -> void * { if (bytes % alignment != 0) { bytes = alignment * (bytes / alignment + 1); } + void *p = nullptr; + #if defined(__clang__) && defined(__APPLE__) /* * We use `posix_memalign` for MacOS as Mac does not support * `std::aligned_alloc` properly yet (even in MacOS 10.15). */ - void *p; posix_memalign(&p, alignment, bytes); - return p; #elif defined(_MSC_VER) - return _aligned_malloc(bytes, alignment); + p = _aligned_malloc(bytes, alignment); #else - return std::aligned_alloc(alignment, bytes); + p = std::aligned_alloc(alignment, bytes); #endif + if (zero_init) { + std::memset(p, 0, bytes); + } + return p; } /** diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py index f424d1e14..3d0861d4a 100644 --- a/pennylane_lightning/lightning_gpu/lightning_gpu.py +++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py @@ -28,10 +28,7 @@ try: from pennylane_lightning.lightning_gpu_ops import ( - allocate_aligned_array, backend_info, - best_alignment, - get_alignment, StateVectorC128, StateVectorC64, MeasurementsC128, @@ -331,14 +328,6 @@ def _asarray(arr, dtype=None): if not dtype: dtype = arr.dtype - # We allocate a new aligned memory and copy data to there if alignment - # or dtype mismatches - # Note that get_alignment does not necessarily return CPUMemoryModel(Unaligned) even for - # numpy allocated memory as the memory location happens to be aligned. - if int(get_alignment(arr)) < int(best_alignment()) or arr.dtype != dtype: - new_arr = allocate_aligned_array(arr.size, np.dtype(dtype)).reshape(arr.shape) - np.copyto(new_arr, arr) - arr = new_arr return arr # pylint disable=missing-function-docstring diff --git a/pennylane_lightning/lightning_kokkos/lightning_kokkos.py b/pennylane_lightning/lightning_kokkos/lightning_kokkos.py index ea1c877ab..da7c58e5b 100644 --- a/pennylane_lightning/lightning_kokkos/lightning_kokkos.py +++ b/pennylane_lightning/lightning_kokkos/lightning_kokkos.py @@ -32,8 +32,6 @@ from pennylane_lightning.lightning_kokkos_ops import ( allocate_aligned_array, backend_info, - best_alignment, - get_alignment, InitializationSettings, MeasurementsC128, MeasurementsC64, @@ -232,8 +230,10 @@ def _asarray(arr, dtype=None): # or dtype mismatches # Note that get_alignment does not necessarily return CPUMemoryModel(Unaligned) even for # numpy allocated memory as the memory location happens to be aligned. - if int(get_alignment(arr)) < int(best_alignment()) or arr.dtype != dtype: - new_arr = allocate_aligned_array(arr.size, np.dtype(dtype)).reshape(arr.shape) + if arr.dtype != dtype: + new_arr = allocate_aligned_array(arr.size, np.dtype(dtype), False).reshape( + arr.shape + ) np.copyto(new_arr, arr) arr = new_arr return arr diff --git a/pennylane_lightning/lightning_qubit/lightning_qubit.py b/pennylane_lightning/lightning_qubit/lightning_qubit.py index ad70d8789..32e6a1d5f 100644 --- a/pennylane_lightning/lightning_qubit/lightning_qubit.py +++ b/pennylane_lightning/lightning_qubit/lightning_qubit.py @@ -227,6 +227,7 @@ def __init__( # pylint: disable=too-many-arguments # state as an array of dimension [2]*wires. self._state = self._create_basis_state(0) self._pre_rotated_state = self._state + self._c_dtype = c_dtype self._batch_obs = batch_obs self._mcmc = mcmc @@ -259,8 +260,13 @@ def _asarray(arr, dtype=None): # Note that get_alignment does not necessarily return CPUMemoryModel(Unaligned) # numpy allocated memory as the memory location happens to be aligned. if int(get_alignment(arr)) < int(best_alignment()) or arr.dtype != dtype: - new_arr = allocate_aligned_array(arr.size, np.dtype(dtype)).reshape(arr.shape) - np.copyto(new_arr, arr) + new_arr = allocate_aligned_array(arr.size, np.dtype(dtype), False).reshape( + arr.shape + ) + if len(arr.shape): + new_arr[:] = arr + else: + np.copyto(new_arr, arr) arr = new_arr return arr @@ -273,9 +279,8 @@ def _create_basis_state(self, index): representing the statevector of the basis state Note: This function does not support broadcasted inputs yet. """ - state = np.zeros(2**self.num_wires, dtype=np.complex128) + state = allocate_aligned_array(2**self.num_wires, np.dtype(self.C_DTYPE), True) state[index] = 1 - state = self._asarray(state, dtype=self.C_DTYPE) return self._reshape(state, [2] * self.num_wires) def reset(self): @@ -283,7 +288,8 @@ def reset(self): super().reset() # init the state vector to |00..0> - self._state = self._create_basis_state(0) + if not self.state[0] == 1.0 + 0j: + self._state = self._create_basis_state(0) self._pre_rotated_state = self._state @property diff --git a/requirements-dev.txt b/requirements-dev.txt index 1b2a7c1a5..10e01a32c 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,4 +1,4 @@ -pip==23.0 +pip~=22.0 git+https://github.com/PennyLaneAI/pennylane.git@master ninja flaky diff --git a/tests/test_adjoint_jacobian.py b/tests/test_adjoint_jacobian.py index 5783c81bf..58a156aa4 100644 --- a/tests/test_adjoint_jacobian.py +++ b/tests/test_adjoint_jacobian.py @@ -583,7 +583,7 @@ def test_provide_starting_state(self, tol, dev): dM1 = dev.adjoint_jacobian(tape) - if device_name != "lightning.gpu": + if device_name == "lightning.kokkos": dev._pre_rotated_state = dev.state_vector # necessary for lightning.kokkos qml.execute([tape], dev, None) diff --git a/tests/test_arrays.py b/tests/test_arrays.py index c3aef1fdf..2f3fccb8a 100644 --- a/tests/test_arrays.py +++ b/tests/test_arrays.py @@ -22,14 +22,19 @@ try: from pennylane_lightning.lightning_qubit_ops import allocate_aligned_array except (ImportError, ModuleNotFoundError): - try: - from pennylane_lightning.lightning_kokkos_ops import allocate_aligned_array - except (ImportError, ModuleNotFoundError): - pytest.skip("No binary module found. Skipping.", allow_module_level=True) + pytest.skip("No binary module found. Skipping.", allow_module_level=True) @pytest.mark.skipif(not ld._CPP_BINARY_AVAILABLE, reason="Lightning binary required") @pytest.mark.parametrize("dt", [np.dtype(np.complex64), np.dtype(np.complex128)]) -def test_allocate_aligned_array(dt): - arr = allocate_aligned_array(1024, dt) +def test_allocate_aligned_array_unset(dt): + arr = allocate_aligned_array(1024, dt, False) assert arr.dtype == dt + + +@pytest.mark.skipif(not ld._CPP_BINARY_AVAILABLE, reason="Lightning binary required") +@pytest.mark.parametrize("dt", [np.dtype(np.complex64), np.dtype(np.complex128)]) +def test_allocate_aligned_array_set(dt): + arr = allocate_aligned_array(1024, dt, True) + assert arr.dtype == dt + assert np.all(arr == 0)