From cd5120c9c717420bbd363586fc37568a209d3d59 Mon Sep 17 00:00:00 2001
From: Roll249 <80768386+Roll249@users.noreply.github.com>
Date: Wed, 4 Jun 2025 05:04:36 +0700
Subject: [PATCH 01/32] implemented cross-platform RAM probe (#636)

Mai (Roll249) implemented the probe, and Tyson updated the tests and author list.

---------

Co-authored-by: Tyson Jones <tyson.jones.input@gmail.com>
---
 AUTHORS.txt               |  2 ++
 README.md                 |  1 +
 quest/src/core/memory.cpp | 33 ++++++++++++++++++++++++++++-----
 tests/unit/channels.cpp   | 12 ++++++++++--
 tests/unit/matrices.cpp   | 24 ++++++++++++++++++++----
 tests/unit/qureg.cpp      | 36 ++++++++++++++++++++++++++++++------
 6 files changed, 91 insertions(+), 17 deletions(-)
diff --git a/AUTHORS.txt b/AUTHORS.txt
index 5b2cdd084..33ca79a1c 100644
--- a/AUTHORS.txt
+++ b/AUTHORS.txt
@@ -44,6 +44,8 @@ Dr Ian Bush [consultant]
     HPC
 
 External contributors:
+Mai Đức Khang
+    implemented RAM probe (for unitaryHACK issue #600)
 James Richings
     patched overflow in bitwise.hpp logic
 Luc Jaulmes
diff --git a/README.md b/README.md
index 260d4b8bb..9e05597a2 100644
--- a/README.md
+++ b/README.md
@@ -257,6 +257,7 @@ See the [docs](docs/README.md) for enabling acceleration and running the unit te
 
 In addition to QuEST's [authors](AUTHORS.txt), we sincerely thank the following external contributors to QuEST.
 
+- [Mai Đức Khang](https://github.com/Roll249) for implementing a RAM probe (unitaryHACK 2025 [#600](https://github.com/QuEST-Kit/QuEST/issues/600)). 
 - [James Richings](https://github.com/JPRichings) for patching a v4 overflow bug.
 - [Luc Jaulmes](https://github.com/lucjaulmes) for patching v4's CMake installation.
 - [Jakub Adamski](https://github.com/jjacobx) for optimising distributed communication of max-size messages.
diff --git a/quest/src/core/memory.cpp b/quest/src/core/memory.cpp
index 36eedb596..fc2a4c39c 100644
--- a/quest/src/core/memory.cpp
+++ b/quest/src/core/memory.cpp
@@ -21,6 +21,17 @@
 
 #include <cstdlib>
 
+// Platform-specific includes for RAM querying
+#if defined(__linux__)
+    #include <sys/sysinfo.h>
+#elif defined(__APPLE__)
+    #include <sys/types.h>
+    #include <sys/sysctl.h>
+#elif defined(_WIN32)
+    #define NOMINMAX
+    #include <windows.h>
+#endif
+
 
 
 /*
@@ -196,11 +207,23 @@ qindex mem_getMaxNumKrausMapMatricesBeforeLocalMemSizeofOverflow(int numQubits)
 
 
 qindex mem_tryGetLocalRamCapacityInBytes() {
-
-    /// @todo attempt to find total Ram
-
-    // if we're unable to find total RAM, throw an exception
-    // (which the caller should catch and gracefully continue)
+    #if defined(__linux__)
+        struct sysinfo info;
+        if (sysinfo(&info) == 0)
+            return (qindex) info.totalram * info.mem_unit;
+    #elif defined(__APPLE__)
+        int mib[2] = {CTL_HW, HW_MEMSIZE};
+        int64_t memsize = 0;
+        size_t len = sizeof(memsize);
+        if (sysctl(mib, 2, &memsize, &len, NULL, 0) == 0 && memsize > 0)
+            return (qindex) memsize;
+    #elif defined(_WIN32)
+        MEMORYSTATUSEX statex;
+        statex.dwLength = sizeof(statex);
+        if (GlobalMemoryStatusEx(&statex))
+            return (qindex) statex.ullTotalPhys;
+    #endif
+    // fallback (unsupported platform or failed query): throw, which the caller should catch and gracefully continue
     throw (mem::COULD_NOT_QUERY_RAM) false;
 }
 
diff --git a/tests/unit/channels.cpp b/tests/unit/channels.cpp
index b0a7ab314..e068368c5 100644
--- a/tests/unit/channels.cpp
+++ b/tests/unit/channels.cpp
@@ -126,7 +126,11 @@ TEST_CASE( "createKrausMap", TEST_CATEGORY ) {
             // GPU-accel or distributed) and whether memory-probers realised there was insufficient memory in
             // advance or whether it proceeded to malloc() which subsequently failed
             #ifndef SANITIZER_IS_ACTIVE
-            REQUIRE_THROWS_WITH( createKrausMap(12,1), ContainsSubstring("failed") || ContainsSubstring("insufficient available memory") || ContainsSubstring("available GPU memory") );
+            REQUIRE_THROWS_WITH( createKrausMap(12,1), 
+                ContainsSubstring("failed") || 
+                ContainsSubstring("insufficient available memory") || 
+                ContainsSubstring("exceeds the available memory") ||
+                ContainsSubstring("available GPU memory") );
             #endif
         }
 
@@ -512,7 +516,11 @@ TEST_CASE( "createSuperOp", TEST_CATEGORY ) {
             // GPU-accel or distributed) and whether memory-probers realised there was insufficient memory in
             // advance or whether it proceeded to malloc() which subsequently failed
             #ifndef SANITIZER_IS_ACTIVE
-            REQUIRE_THROWS_WITH( createSuperOp(12), ContainsSubstring("failed") || ContainsSubstring("insufficient available memory") || ContainsSubstring("available GPU memory") );
+            REQUIRE_THROWS_WITH( createSuperOp(12), 
+                ContainsSubstring("failed") || 
+                ContainsSubstring("insufficient available memory") || 
+                ContainsSubstring("available GPU memory") ||
+                ContainsSubstring("exceeds that available") );
             #endif
         }
     }
diff --git a/tests/unit/matrices.cpp b/tests/unit/matrices.cpp
index 591e3bc7b..ef92dbcda 100644
--- a/tests/unit/matrices.cpp
+++ b/tests/unit/matrices.cpp
@@ -456,7 +456,11 @@ TEST_CASE( "createCompMatr", TEST_CATEGORY ) {
             // GPU-accel or distributed) and whether memory-probers realised there was insufficient memory in
             // advance or whether it proceeded to malloc() which subsequently failed
             #ifndef SANITIZER_IS_ACTIVE
-            REQUIRE_THROWS_WITH( createCompMatr(25), ContainsSubstring("failed") || ContainsSubstring("insufficient available memory") || ContainsSubstring("available GPU memory") );
+            REQUIRE_THROWS_WITH( createCompMatr(25), 
+                ContainsSubstring("failed") || 
+                ContainsSubstring("insufficient available memory") || 
+                ContainsSubstring("available GPU memory") ||
+                ContainsSubstring("exceeds the available RAM") );
             #endif
         }
     }
@@ -529,7 +533,11 @@ TEST_CASE( "createDiagMatr", TEST_CATEGORY ) {
             // GPU-accel or distributed) and whether memory-probers realised there was insufficient memory in
             // advance or whether it proceeded to malloc() which subsequently failed
             #ifndef SANITIZER_IS_ACTIVE
-            REQUIRE_THROWS_WITH( createDiagMatr(50), ContainsSubstring("failed") || ContainsSubstring("insufficient available memory") || ContainsSubstring("available GPU memory") );
+            REQUIRE_THROWS_WITH( createDiagMatr(50), 
+                ContainsSubstring("failed") || 
+                ContainsSubstring("insufficient available memory") || 
+                ContainsSubstring("available GPU memory") ||
+                ContainsSubstring("exceeds the available RAM") );
             #endif
         }
     }
@@ -602,7 +610,11 @@ TEST_CASE( "createFullStateDiagMatr", TEST_CATEGORY ) {
             // GPU-accel or distributed) and whether memory-probers realised there was insufficient memory in
             // advance or whether it proceeded to malloc() which subsequently failed
             #ifndef SANITIZER_IS_ACTIVE
-            REQUIRE_THROWS_WITH( createFullStateDiagMatr(50), ContainsSubstring("failed") || ContainsSubstring("insufficient available memory") || ContainsSubstring("available GPU memory") );
+            REQUIRE_THROWS_WITH( createFullStateDiagMatr(50), 
+                ContainsSubstring("failed") || 
+                ContainsSubstring("insufficient available memory") || 
+                ContainsSubstring("available GPU memory") ||
+                ContainsSubstring("exceeds the available RAM") );
             #endif
         }
 
@@ -698,7 +710,11 @@ TEST_CASE( "createCustomFullStateDiagMatr", TEST_CATEGORY ) {
                 // GPU-accel or distributed) and whether memory-probers realised there was insufficient memory in
                 // advance or whether it proceeded to malloc() which subsequently failed
                 #ifndef SANITIZER_IS_ACTIVE
-                REQUIRE_THROWS_WITH( createCustomFullStateDiagMatr(50, mpi,gpu,omp), ContainsSubstring("failed") || ContainsSubstring("insufficient available memory") || ContainsSubstring("available GPU memory") );
+                REQUIRE_THROWS_WITH( createCustomFullStateDiagMatr(50, mpi,gpu,omp), 
+                    ContainsSubstring("failed") || 
+                    ContainsSubstring("insufficient available memory") || 
+                    ContainsSubstring("available GPU memory") ||
+                    ContainsSubstring("exceeds the available RAM") );
                 #endif
             }
         }
diff --git a/tests/unit/qureg.cpp b/tests/unit/qureg.cpp
index aaf3bb0cd..4b2b46311 100644
--- a/tests/unit/qureg.cpp
+++ b/tests/unit/qureg.cpp
@@ -125,7 +125,11 @@ TEST_CASE( "createQureg", TEST_CATEGORY ) {
             // GPU-accel or distributed) and whether memory-probers realised there was insufficient memory in
             // advance or whether it proceeded to malloc() which subsequently failed
             #ifndef SANITIZER_IS_ACTIVE
-            REQUIRE_THROWS_WITH( createQureg(50), ContainsSubstring("failed") || ContainsSubstring("insufficient available memory") || ContainsSubstring("available GPU memory") );
+            REQUIRE_THROWS_WITH( createQureg(50), 
+                ContainsSubstring("failed") || 
+                ContainsSubstring("insufficient available memory") || 
+                ContainsSubstring("available GPU memory") ||
+                ContainsSubstring("RAM") );
             #endif
         }
     }
@@ -210,7 +214,11 @@ TEST_CASE( "createDensityQureg", TEST_CATEGORY ) {
             // GPU-accel or distributed) and whether memory-probers realised there was insufficient memory in
             // advance or whether it proceeded to malloc() which subsequently failed
             #ifndef SANITIZER_IS_ACTIVE
-            REQUIRE_THROWS_WITH( createDensityQureg(25), ContainsSubstring("failed") || ContainsSubstring("insufficient available memory") || ContainsSubstring("available GPU memory") );
+            REQUIRE_THROWS_WITH( createDensityQureg(25), 
+                ContainsSubstring("failed") || 
+                ContainsSubstring("insufficient available memory") || 
+                ContainsSubstring("available GPU memory") ||
+                ContainsSubstring("RAM") );
             #endif
         }
     }
@@ -302,7 +310,11 @@ TEST_CASE( "createForcedQureg", TEST_CATEGORY ) {
             // GPU-accel or distributed) and whether memory-probers realised there was insufficient memory in
             // advance or whether it proceeded to malloc() which subsequently failed
             #ifndef SANITIZER_IS_ACTIVE
-            REQUIRE_THROWS_WITH( createForcedQureg(50), ContainsSubstring("failed") || ContainsSubstring("insufficient available memory") || ContainsSubstring("available GPU memory") );
+            REQUIRE_THROWS_WITH( createForcedQureg(50), 
+                ContainsSubstring("failed") || 
+                ContainsSubstring("insufficient available memory") || 
+                ContainsSubstring("available GPU memory") ||
+                ContainsSubstring("RAM") );
             #endif
         }
     }
@@ -395,7 +407,11 @@ TEST_CASE( "createForcedDensityQureg", TEST_CATEGORY ) {
             // GPU-accel or distributed) and whether memory-probers realised there was insufficient memory in
             // advance or whether it proceeded to malloc() which subsequently failed
             #ifndef SANITIZER_IS_ACTIVE
-            REQUIRE_THROWS_WITH( createForcedDensityQureg(25), ContainsSubstring("failed") || ContainsSubstring("insufficient available memory") || ContainsSubstring("available GPU memory") );
+            REQUIRE_THROWS_WITH( createForcedDensityQureg(25), 
+                ContainsSubstring("failed") || 
+                ContainsSubstring("insufficient available memory") || 
+                ContainsSubstring("available GPU memory") ||
+                ContainsSubstring("RAM") );
             #endif
         }
     }
@@ -531,8 +547,16 @@ TEST_CASE( "createCustomQureg", TEST_CATEGORY ) {
             // GPU-accel or distributed) and whether memory-probers realised there was insufficient memory in
             // advance or whether it proceeded to malloc() which subsequently failed
             #ifndef SANITIZER_IS_ACTIVE
-            REQUIRE_THROWS_WITH( createCustomQureg(50, 0, 0,0,0), ContainsSubstring("failed") || ContainsSubstring("insufficient available memory") || ContainsSubstring("available GPU memory") );
-            REQUIRE_THROWS_WITH( createCustomQureg(25, 1, 0,0,0), ContainsSubstring("failed") || ContainsSubstring("insufficient available memory") || ContainsSubstring("available GPU memory") );
+            REQUIRE_THROWS_WITH( createCustomQureg(50, 0, 0,0,0), 
+                ContainsSubstring("failed") || 
+                ContainsSubstring("insufficient available memory") || 
+                ContainsSubstring("available GPU memory") ||
+                ContainsSubstring("RAM") );
+            REQUIRE_THROWS_WITH( createCustomQureg(25, 1, 0,0,0), 
+                ContainsSubstring("failed") || 
+                ContainsSubstring("insufficient available memory") || 
+                ContainsSubstring("available GPU memory") ||
+                ContainsSubstring("RAM") );
             #endif
         }
     }

From 5fa4f609847a80e1303e5a12c2024a8259db8515 Mon Sep 17 00:00:00 2001
From: diogomaia00 <128249937+diogomaia00@users.noreply.github.com>
Date: Wed, 4 Jun 2025 15:29:19 +0100
Subject: [PATCH 02/32] added non-unitary Pauli gadgets (#637)

as part of unitaryHACK 2025, challenge issue #594

---------

Co-authored-by: Tyson Jones <tyson.jones.input@gmail.com>
---
 AUTHORS.txt                                   |  2 ++
 README.md                                     |  1 +
 .../automated/apply_no_unitary_pauli_gadget.c | 18 ++++++++++++
 quest/include/operations.h                    |  3 ++
 quest/src/api/operations.cpp                  | 16 ++++++++++
 quest/src/core/localiser.cpp                  |  8 ++---
 quest/src/core/localiser.hpp                  |  4 +--
 quest/src/core/memory.cpp                     |  1 +
 quest/src/core/utilities.cpp                  |  3 ++
 quest/src/core/utilities.hpp                  |  1 +
 tests/unit/operations.cpp                     | 29 +++++++++++++++++++
 tests/utils/linalg.cpp                        |  2 +-
 tests/utils/linalg.hpp                        |  2 +-
 13 files changed, 82 insertions(+), 8 deletions(-)
 create mode 100644 examples/automated/apply_no_unitary_pauli_gadget.c

diff --git a/AUTHORS.txt b/AUTHORS.txt
index 33ca79a1c..089bcc1c2 100644
--- a/AUTHORS.txt
+++ b/AUTHORS.txt
@@ -44,6 +44,8 @@ Dr Ian Bush [consultant]
     HPC
 
 External contributors:
+Diogo Pratas Maia
+    added non-unitary Pauli gadget (for unitaryHACK issue #594)
 Mai Đức Khang
     implemented RAM probe (for unitaryHACK issue #600)
 James Richings
diff --git a/README.md b/README.md
index 9e05597a2..9b03e0578 100644
--- a/README.md
+++ b/README.md
@@ -257,6 +257,7 @@ See the [docs](docs/README.md) for enabling acceleration and running the unit te
 
 In addition to QuEST's [authors](AUTHORS.txt), we sincerely thank the following external contributors to QuEST.
 
+- [Diogo Pratas Maia](https://github.com/diogomaia00) for implementing non-unitary Pauli gadgets (unitaryHACK 2025 [#594](https://github.com/QuEST-Kit/QuEST/issues/594)).
 - [Mai Đức Khang](https://github.com/Roll249) for implementing a RAM probe (unitaryHACK 2025 [#600](https://github.com/QuEST-Kit/QuEST/issues/600)). 
 - [James Richings](https://github.com/JPRichings) for patching a v4 overflow bug.
 - [Luc Jaulmes](https://github.com/lucjaulmes) for patching v4's CMake installation.
diff --git a/examples/automated/apply_no_unitary_pauli_gadget.c b/examples/automated/apply_no_unitary_pauli_gadget.c
new file mode 100644
index 000000000..41d679a50
--- /dev/null
+++ b/examples/automated/apply_no_unitary_pauli_gadget.c
@@ -0,0 +1,18 @@
+#include "quest.h"
+
+int main() {
+    initQuESTEnv();
+
+    Qureg qureg = createQureg(3);
+    PauliStr str = getInlinePauliStr("XYZ", {0,1,2});
+    qcomp angle = getQcomp(.4, .8);
+
+    initPlusState(qureg);
+    applyNonUnitaryPauliGadget(qureg, str, angle);
+
+    qreal norm = calcTotalProb(qureg);
+    reportScalar("norm", norm);
+
+    finalizeQuESTEnv();
+    return 0;
+}
\ No newline at end of file
diff --git a/quest/include/operations.h b/quest/include/operations.h
index 57e1dab31..3054802c2 100644
--- a/quest/include/operations.h
+++ b/quest/include/operations.h
@@ -5,6 +5,7 @@
  * instead exposed in decoherence.h
  * 
  * @author Tyson Jones
+ * @author Diogo Pratas Maia (non-unitary Pauli gadget)
  * 
  * @defgroup operations Operations
  * @ingroup api
@@ -1853,6 +1854,8 @@ void multiplyPauliGadget(Qureg qureg, PauliStr str, qreal angle);
  */
 void applyPauliGadget(Qureg qureg, PauliStr str, qreal angle);
 
+/// @notyetdoced
+void applyNonUnitaryPauliGadget(Qureg qureg, PauliStr str, qcomp angle);
 
 /// @notyetdoced
 void applyControlledPauliGadget(Qureg qureg, int control, PauliStr str, qreal angle);
diff --git a/quest/src/api/operations.cpp b/quest/src/api/operations.cpp
index dbef67707..f64336457 100644
--- a/quest/src/api/operations.cpp
+++ b/quest/src/api/operations.cpp
@@ -1432,6 +1432,22 @@ void applyPauliGadget(Qureg qureg, PauliStr str, qreal angle) {
     applyMultiStateControlledPauliGadget(qureg, nullptr, nullptr, 0, str, angle);
 }
 
+void applyNonUnitaryPauliGadget(Qureg qureg, PauliStr str, qcomp angle) {
+    validate_quregFields(qureg, __func__);
+    validate_pauliStrTargets(qureg, str, __func__);
+
+    qcomp phase = util_getPhaseFromGateAngle(angle);
+    localiser_statevec_anyCtrlPauliGadget(qureg, {}, {}, str, phase);
+
+    if (!qureg.isDensityMatrix)
+        return;
+
+    // conj(e^(i a XZ)) = e^(-i conj(a) XZ) but conj(Y)=-Y, so odd-Y undoes phase negation
+    phase = std::conj(phase) * (paulis_hasOddNumY(str) ? 1 : -1);
+    str = paulis_getShiftedPauliStr(str, qureg.numQubits);
+    localiser_statevec_anyCtrlPauliGadget(qureg, {}, {}, str, phase);
+}
+
 void applyControlledPauliGadget(Qureg qureg, int control, PauliStr str, qreal angle) {
     validate_quregFields(qureg, __func__);
     validate_controlAndPauliStrTargets(qureg, control, str, __func__);
diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp
index 9ef6148f1..022ace7a6 100644
--- a/quest/src/core/localiser.cpp
+++ b/quest/src/core/localiser.cpp
@@ -1244,7 +1244,7 @@ extern int paulis_getPrefixZSign(Qureg qureg, vector<int> prefixZ) ;
 extern qcomp paulis_getPrefixPaulisElem(Qureg qureg, vector<int> prefixY, vector<int> prefixZ);
 
 
-void anyCtrlZTensorOrGadget(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, bool isGadget, qreal phase) {
+void anyCtrlZTensorOrGadget(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, bool isGadget, qcomp phase) {     
     assertValidCtrlStates(ctrls, ctrlStates);
     setDefaultCtrlStates(ctrls, ctrlStates);
 
@@ -1339,14 +1339,14 @@ void localiser_statevec_anyCtrlPauliTensor(Qureg qureg, vector<int> ctrls, vecto
 }
 
 
-void localiser_statevec_anyCtrlPhaseGadget(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, qreal phase) {
+void localiser_statevec_anyCtrlPhaseGadget(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, qcomp phase) {
 
     bool isGadget = true;
-    anyCtrlZTensorOrGadget(qureg, ctrls, ctrlStates, targs, isGadget, phase); 
+    anyCtrlZTensorOrGadget(qureg, ctrls, ctrlStates, targs, isGadget, phase);
 }
 
 
-void localiser_statevec_anyCtrlPauliGadget(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, PauliStr str, qreal phase) {
+void localiser_statevec_anyCtrlPauliGadget(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, PauliStr str, qcomp phase) {
 
     // when str=IZ, we must use the above bespoke algorithm
     if (!paulis_containsXOrY(str)) {
diff --git a/quest/src/core/localiser.hpp b/quest/src/core/localiser.hpp
index a9b0391a8..8b9975aca 100644
--- a/quest/src/core/localiser.hpp
+++ b/quest/src/core/localiser.hpp
@@ -118,9 +118,9 @@ void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg qureg, vector<int> ctrls, ve
 
 void localiser_statevec_anyCtrlPauliTensor(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, PauliStr str, qcomp globalFactor=1);
 
-void localiser_statevec_anyCtrlPauliGadget(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, PauliStr str, qreal phase);
+void localiser_statevec_anyCtrlPauliGadget(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, PauliStr str, qcomp phase);
 
-void localiser_statevec_anyCtrlPhaseGadget(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, qreal phase);
+void localiser_statevec_anyCtrlPhaseGadget(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, qcomp phase);
 
 
 /*
diff --git a/quest/src/core/memory.cpp b/quest/src/core/memory.cpp
index fc2a4c39c..79d4301a4 100644
--- a/quest/src/core/memory.cpp
+++ b/quest/src/core/memory.cpp
@@ -10,6 +10,7 @@
  * allocators in gpu_config.cpp, and use NUMA strategies.
  * 
  * @author Tyson Jones
+ * @author Mai Đức Khang (CPU memory query)
  */
 
 #include "quest/include/types.h"
diff --git a/quest/src/core/utilities.cpp b/quest/src/core/utilities.cpp
index b1238f71e..471d14254 100644
--- a/quest/src/core/utilities.cpp
+++ b/quest/src/core/utilities.cpp
@@ -913,7 +913,10 @@ qreal util_getPhaseFromGateAngle(qreal angle) {
     return - angle / 2;
 }
 
+qcomp util_getPhaseFromGateAngle(qcomp angle) {
 
+    return -angle / qcomp(2.0, 0.0);
+}
 
 /*
  * DECOHERENCE FACTORS
diff --git a/quest/src/core/utilities.hpp b/quest/src/core/utilities.hpp
index cb6280ee8..f28055ca9 100644
--- a/quest/src/core/utilities.hpp
+++ b/quest/src/core/utilities.hpp
@@ -350,6 +350,7 @@ util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qin
 
 qreal util_getPhaseFromGateAngle(qreal angle);
 
+qcomp util_getPhaseFromGateAngle(qcomp angle);
 
 
 /*
diff --git a/tests/unit/operations.cpp b/tests/unit/operations.cpp
index 6676dfba9..74bca9046 100644
--- a/tests/unit/operations.cpp
+++ b/tests/unit/operations.cpp
@@ -1920,6 +1920,35 @@ TEST_CASE( "multiplyPauliStrSum", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG ) {
 }
 
 
+TEST_CASE( "applyNonUnitaryPauliGadget", TEST_CATEGORY ) {
+
+    PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );
+
+    SECTION( LABEL_CORRECTNESS ) {
+
+        // prepare a random Pauli string and angle
+        int numTargs = GENERATE_COPY( range(1, numQubits+1) );
+        auto targs = GENERATE_TARGS( numQubits, numTargs );
+        PauliStr str = getRandomPauliStr(targs);
+        qcomp angle = getRandomComplex();
+
+        // prepare the corresponding reference matrix exp(-i angle/2 pauli)
+        auto matrRef = getExponentialOfPauliMatrix(angle, getMatrix(str, numQubits));
+
+        auto testFunc = [&](Qureg qureg, auto& stateRef) {
+            applyNonUnitaryPauliGadget(qureg, str, angle);
+            applyReferenceOperator(stateRef, matrRef);
+        };
+
+        CAPTURE( targs, angle );
+        SECTION( LABEL_STATEVEC ) { TEST_ON_CACHED_QUREGS(statevecQuregs, statevecRef, testFunc); }
+        SECTION( LABEL_DENSMATR ) { TEST_ON_CACHED_QUREGS(densmatrQuregs, densmatrRef, testFunc); }
+    }
+
+    /// @todo input validation
+}
+
+
 /** @} (end defgroup) */
 
 
diff --git a/tests/utils/linalg.cpp b/tests/utils/linalg.cpp
index f4686fc44..c7477521c 100644
--- a/tests/utils/linalg.cpp
+++ b/tests/utils/linalg.cpp
@@ -345,7 +345,7 @@ qmatrix getExponentialOfDiagonalMatrix(qmatrix m) {
 }
 
 
-qmatrix getExponentialOfPauliMatrix(qreal arg, qmatrix m) {
+qmatrix getExponentialOfPauliMatrix(qcomp arg, qmatrix m) {
     
     // exp(-i arg/2 m) where m = prod(paulis)
     qmatrix id = getIdentityMatrix(m.size());
diff --git a/tests/utils/linalg.hpp b/tests/utils/linalg.hpp
index 87cd116b5..c761b5fe7 100644
--- a/tests/utils/linalg.hpp
+++ b/tests/utils/linalg.hpp
@@ -49,7 +49,7 @@ qmatrix getConjugate(qmatrix);
 qmatrix getConjugateTranspose(qmatrix);
 qmatrix getPowerOfDiagonalMatrix(qmatrix diag, qcomp power);
 qmatrix getExponentialOfDiagonalMatrix(qmatrix);
-qmatrix getExponentialOfPauliMatrix(qreal arg, qmatrix pauli);
+qmatrix getExponentialOfPauliMatrix(qcomp arg, qmatrix pauli);
 qmatrix getExponentialOfNormalisedPauliVector(qreal arg, qreal x, qreal y, qreal z);
 qmatrix getOrthonormalisedRows(qmatrix);
 qmatrix getOrthonormalisedRows(qmatrix);

From f3baf3455956c03f98f351e69bcdb5d47b45562b Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Thu, 12 Jun 2025 23:34:34 +0200
Subject: [PATCH 03/32] patched Trotter sign and added non-unitary Trotter
 (#647)

- applyTrotterizedPauliStrSumGadget() was documented to effect exp(i t H) but actually effected exp(-i t H), eep! Thankfully the doc warned the function was untested. It now correctly effects exp(i t H).
- defensively removed hardcoding of the scalar responsible for the above bug, which undoes the coefficient convention of the applyPauliGadget() functions
- added applyNonUnitaryTrotterizedPauliStrSumGadget() which permits a non-Hermitian Hamiltonian (i.e. with non-negligible imaginary components of the coefficients) and a complex angle parameter. Among other things, this permits simple imaginary-time evolution
- added the full doc for both functions
- fixed defunct doxygen command (@cppoverload)

Note that both these Trotter functions remain without unit tests since impending new functions are anticipated which will generalise the tests.
---
 quest/include/calculations.h |   6 +-
 quest/include/operations.h   | 178 ++++++++++++++++++++++++++++++++++-
 quest/src/api/operations.cpp |  34 +++++--
 quest/src/core/utilities.cpp |   5 +-
 quest/src/core/utilities.hpp |   1 -
 tests/unit/operations.cpp    |   4 +-
 6 files changed, 207 insertions(+), 21 deletions(-)

diff --git a/quest/include/calculations.h b/quest/include/calculations.h
index 3d721b573..b54af4492 100644
--- a/quest/include/calculations.h
+++ b/quest/include/calculations.h
@@ -336,7 +336,7 @@ qcomp calcExpecNonHermitianFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr
 /// @notyettested
 /// @notyetdoced
 /// @notyetvalidated
-/// @cppoverload
+/// @cppvectoroverload
 /// @see calcProbOfMultiQubitOutcome()
 qreal calcProbOfMultiQubitOutcome(Qureg qureg, std::vector<int> qubits, std::vector<int> outcomes);
 
@@ -354,7 +354,7 @@ std::vector<qreal> calcProbsOfAllMultiQubitOutcomes(Qureg qureg, std::vector<int
 /// @notyettested
 /// @notyetdoced
 /// @notyetvalidated
-/// @cppoverload
+/// @cppvectoroverload
 /// @see calcPartialTrace()
 Qureg calcPartialTrace(Qureg qureg, std::vector<int> traceOutQubits);
 
@@ -363,7 +363,7 @@ Qureg calcPartialTrace(Qureg qureg, std::vector<int> traceOutQubits);
 /// @notyettested
 /// @notyetdoced
 /// @notyetvalidated
-/// @cppoverload
+/// @cppvectoroverload
 /// @see calcReducedDensityMatrix()
 Qureg calcReducedDensityMatrix(Qureg qureg, std::vector<int> retainQubits);
 
diff --git a/quest/include/operations.h b/quest/include/operations.h
index 3054802c2..0b11980e7 100644
--- a/quest/include/operations.h
+++ b/quest/include/operations.h
@@ -1834,6 +1834,7 @@ void multiplyPauliGadget(Qureg qureg, PauliStr str, qreal angle);
      qcomp factor = cexp(- theta / 2 * 1.i);
      setQuregToSuperposition(factor, qureg, 0,qureg,0,qureg);
  *   ```
+ * - Passing @p angle=0 is equivalent to effecting the identity, leaving the state unchanged.
  *
  * @myexample
  * ```
@@ -1850,13 +1851,20 @@ void multiplyPauliGadget(Qureg qureg, PauliStr str, qreal angle);
     // concisely
     applyPauliGadget(qureg, getInlinePauliStr("XYZ",{0,1,7}), theta);
  * ```
- * - Passing @p angle=0 is equivalent to effecting the identity, leaving the state unchanged.
+ *
+ * @see
+ *  - applyNonUnitaryPauliGadget()
  */
 void applyPauliGadget(Qureg qureg, PauliStr str, qreal angle);
 
-/// @notyetdoced
+
+/** @notyetdoced
+ * 
+ * This function generalises applyPauliGadget() to accept a complex angle.
+ */
 void applyNonUnitaryPauliGadget(Qureg qureg, PauliStr str, qcomp angle);
 
+
 /// @notyetdoced
 void applyControlledPauliGadget(Qureg qureg, int control, PauliStr str, qreal angle);
 
@@ -2265,8 +2273,13 @@ extern "C" {
 void multiplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace);
 
 
-/** @notyetdoced
- * @notyettested
+/** @notyettested
+ * 
+ * Effects (an approximation to) the exponential of @p sum, weighted by @p angle, upon @p qureg,
+ * via the symmetrized Trotter-Suzuki decomposition (<a href="https://arxiv.org/abs/math-ph/0506007">arXiv</a>).
+ * Increasing @p reps (the number of Trotter repetitions) or @p order (an even, positive integer or one) 
+ * improves the accuracy of the approximation (reducing the "Trotter error" due to non-commuting 
+ * terms of @p sum), though increases the runtime linearly and exponentially respectively.
  * 
  * @formulae 
  * 
@@ -2275,14 +2288,18 @@ void multiplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace);
       \exp \left(\iu \, \theta \, \hat{H} \right)
  * @f]
  * via a Trotter-Suzuki decomposition of the specified @p order and number of repetitions (@p reps).
+ * Simulation is exact, regardless of @p order or @p reps, only when all terms in @p sum commute.
  * 
+ * @important
+ *   Note that @f$ \theta @f$ lacks the @f$ -\frac{1}{2} @f$ prefactor present in other functions like
+ *   applyPauliGadget().
  * 
  * To be precise, let @f$ r = @f$ @p reps and assume @p sum is composed of
  * @f$ T @f$-many terms of the form
  * @f[
       \hat{H} = \sum\limits_j^T c_j \, \hat{\sigma}_j
  * @f]
- * where @f$ c_j @f$ is the (necessarily real) coefficient of the @f$ j @f$-th PauliStr @f$ \hat{\sigma}_j @f$.
+ * where @f$ c_j @f$ is the coefficient of the @f$ j @f$-th PauliStr @f$ \hat{\sigma}_j @f$.
  * 
  * - When @p order=1, this function performs first-order Trotterisation, whereby
  *   @f[
@@ -2315,10 +2332,161 @@ void multiplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace);
  * 
  * > These formulations are taken from 'Finding Exponential Product Formulas
  * > of Higher Orders', Naomichi Hatano and Masuo Suzuki (2005) (<a href="https://arxiv.org/abs/math-ph/0506007">arXiv</a>).
+ * 
+ * @equivalences
+ * 
+ * - Time evolution of duration @f$ t @f$ under a time-independent Hamiltonian @p sum = @f$ \hat{H} @f$, as
+ *   per the unitary time evolution operator
+ *   @f[
+        \hat{U}(t) = \exp(- \iu \, t  \,\hat{H} \, / \, \hbar) 
+ *   @f]
+ *   is approximated via @f$ \theta = - t / \hbar @f$.
+ *   ```
+     qreal time = 3.14;
+     qreal angle = - time / hbar;
+     applyTrotterizedPauliStrSumGadget(qureg, sum, angle, order, reps);
+ *   ```
+ * - This function is equivalent to applyNonUnitaryTrotterizedPauliStrSumGadget() when passing
+ *   a @p qcomp instance with a zero imaginary component as the @p angle parameter. This latter 
+ *   function is useful for generalising dynamical simulation to imaginary-time evolution.
+ * 
+ * @constraints
+ * - Unitarity of the prescribed exponential(s) requires that @p sum is Hermitian, ergo containing
+ *   only real coefficients. Validation will check that @p sum is approximately Hermitian, permitting
+ *   coefficients with imaginary components smaller (in magnitude) than epsilon.
+ *   @f[ 
+        \max\limits_{i} \Big| \mathrm{Im}(c_i) \Big| \le \valeps
+ *   @f]
+ *   where the validation epsilon @f$ \valeps @f$ can be adjusted with setValidationEpsilon().
+ *   Otherwise, use applyNonUnitaryTrotterizedPauliStrSumGadget() to permit non-Hermitian @p sum
+ *   and ergo effect a non-unitary exponential(s). 
+ * - The @p angle parameter is necessarily real despite the validation epsilon, but can be relaxed
+ *   to an arbitrary complex scalar using applyNonUnitaryTrotterizedPauliStrSumGadget().
+ * - This function only ever effects @f$ \exp \left(\iu \, \theta \, \hat{H} \right) @f$ exactly
+ *   when all PauliStr in @p sum = @f$ \hat{H} @f$ commute. 
+ * 
+ * @param[in,out] qureg  the state to modify.
+ * @param[in]     sum    a weighted sum of Pauli strings to approximately exponentiate.
+ * @param[in]     angle  an effective prefactor of @p sum in the exponent.
+ * @param[in]     order  the order of the Trotter-Suzuki decomposition (e.g. @p 1, @p 2, @p 4, ...)
+ * @param[in]     reps   the number of Trotter repetitions
+ * 
+ * @throws @validationerror
+ * - if @p qureg or @p sum are uninitialised.
+ * - if @p sum is not approximately Hermitian.
+ * - if @p sum contains non-identities on qubits beyond the size of @p qureg.
+ * - if @p order is not 1 nor a positive, @b even integer.
+ * - if @p reps is not a positive integer.
+ * 
+ * @see
+ *  - applyPauliGadget()
+ *  - applyNonUnitaryTrotterizedPauliStrSumGadget()
+ * 
+ * @author Tyson Jones
  */
 void applyTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qreal angle, int order, int reps);
 
 
+/** @notyettested
+ * 
+ * A generalisation of applyTrotterizedPauliStrSumGadget() which accepts a complex angle and permits
+ * @p sum to be non-Hermitian, thereby effecting a potentially non-unitary and non-CPTP operation.
+ * 
+ * @formulae 
+ * 
+ * Let @f$ \hat{H} = @f$ @p sum and @f$ \theta = @f$ @p angle. This function approximates the action of
+ * @f[
+      \exp \left(\iu \, \theta \, \hat{H} \right)
+ * @f]
+ * via a Trotter-Suzuki decomposition of the specified @p order and number of repetitions (@p reps). 
+ * 
+ * See applyTrotterizedPauliStrSumGadget() for more information about the decomposition.
+ *
+ * @equivalences
+ * 
+ * - When @p angle is set to @f$ \theta = \iu \, \tau @f$ and @p sum = @f$ \hat{H} @f$ is Hermitian,
+ *   this function (approximately) evolves @p qureg in imaginary-time. That is, letting 
+ *   @f$ \hat{U}(t) = \exp(-\iu \, t \, \hat{H}) @f$ be the normalised unitary evolution operator, this 
+ *   function effects the imaginary-time operator
+ *   @f[
+        \hat{V}(\tau) = \hat{U}(t=-\iu \tau) = \exp(- \tau \hat{H}).
+ *   @f]
+ *   This operation drives the system toward the (unnormalised) groundstate.
+ *   Let @f$ \{ \ket{\phi_i} \} @f$ and @f$ \{ \lambda_i \} @f$ be the eigenstates and respective
+ *   eigenvalues of @f$ \hat{H} @f$, which are real due to Hermiticity.
+ *   @f[
+         \hat{H} = \sum \limits_i \lambda_i \ket{\phi_i}\bra{\phi_i},
+         \;\;\;\;\; \lambda_i \in \mathbb{R}.
+ *   @f]
+ *   
+ *   - When @p qureg is a statevector @f$ \svpsi @f$ and can ergo be expressed in the basis of 
+ *     @f$ \{ \ket{\phi_i} \} @f$ as @f$ \svpsi = \sum_i \alpha_i \ket{\phi_i} @f$, 
+ *     this function approximates
+ *     @f[
+          \svpsi \, \rightarrow  \, \hat{V}(\tau) \svpsi =
+          \sum\limits_i \alpha_i \exp(- \tau \, \lambda_i) \ket{\phi_i}.
+ *     @f]
+ *   - When @p qureg is a density matrix and is ergo expressible as
+ *     @f$ \dmrho = \sum\limits_{ij} \alpha_{ij} \ket{\phi_i}\bra{\phi_j} @f$, this function effects
+ *     @f[
+          \dmrho \, \rightarrow \, \hat{V}(\tau) \dmrho \hat{V}(\tau)^\dagger =
+          \sum\limits_{ij} \alpha_{ij} \exp(-\tau (\lambda_i + \lambda_j)) \ket{\phi_i}\bra{\phi_j}.
+ *     @f]
+ *
+ *   As @f$ \tau \rightarrow \infty @f$, the resulting unnormalised state approaches statevector
+ *   @f$ \svpsi \rightarrow \alpha_0 \exp(-\tau \lambda_0) \ket{\phi_0} @f$ or density matrix
+ *   @f$ \dmrho \rightarrow \alpha_{0,0} \exp(-2 \tau \lambda_0) \ket{\phi_0}\bra{\phi_0} @f$,
+ *   where @f$ \lambda_0 @f$ is the minimum eigenvalue and @f$ \ket{\phi_0} @f$ is the groundstate.
+ *   Assuming the initial overlap @f$ \alpha_0 @f$ is not zero (or exponentially tiny), 
+ *   subsequent renormalisation via setQuregToRenormalized() produces the pure 
+ *   ground-state @f$ \ket{\phi_0} @f$.
+ *
+ *   ```
+     // pray for a non-zero initial overlap
+     initRandomPureState(qureg); // works even for density matrices
+
+     // minimize then renormalise
+     qreal tau = 10; // impatient infinity
+     int order = 4;
+     int reps = 100;
+     applyNonUnitaryTrotterizedPauliStrSumGadget(qureg, hamil, tau * 1i, order, reps);
+     setQuregToRenormalized(qureg);
+
+     // ground-state (phi_0)
+     reportQureg(qureg);
+
+     // lowest lying eigenvalue (lambda_0)
+     qreal expec = calcExpecPauliStrSum(qureg, hamil);
+     reportScalar("expec", expec);
+ *   ```
+ *
+ *   Note degenerate eigenvalues will yield a pure superposition of the corresponding eigenstates, with 
+ *   coefficients informed by the initial, relative populations.
+ * 
+ * - When @p angle is real and @p sum is Hermitian (has approximately real coefficients), this
+ *   function is equivalent to applyTrotterizedPauliStrSumGadget()
+ * 
+ * @constraints
+ * - This function only ever effects @f$ \exp \left(\iu \, \theta \, \hat{H} \right) @f$ exactly
+ *   when all PauliStr in @p sum = @f$ \hat{H} @f$ commute. 
+ * 
+ * @param[in,out] qureg  the state to modify.
+ * @param[in]     sum    a weighted sum of Pauli strings to approximately exponentiate.
+ * @param[in]     angle  an effective prefactor of @p sum in the exponent.
+ * @param[in]     order  the order of the Trotter-Suzuki decomposition (e.g. @p 1, @p 2, @p 4, ...)
+ * @param[in]     reps   the number of Trotter repetitions
+ * 
+ * @throws @validationerror
+ * - if @p qureg or @p sum are uninitialised.
+ * - if @p sum contains non-identities on qubits beyond the size of @p qureg.
+ * - if @p order is not 1 nor a positive, @b even integer.
+ * - if @p reps is not a positive integer.
+ * 
+ * @author Tyson Jones
+ */
+void applyNonUnitaryTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qcomp angle, int order, int reps);
+
+
 // end de-mangler
 #ifdef __cplusplus
 }
diff --git a/quest/src/api/operations.cpp b/quest/src/api/operations.cpp
index f64336457..3b58f298f 100644
--- a/quest/src/api/operations.cpp
+++ b/quest/src/api/operations.cpp
@@ -1130,18 +1130,20 @@ void multiplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace) {
     // workspace -> qureg, and qureg -> sum * qureg
 }
 
-void applyFirstOrderTrotter(Qureg qureg, PauliStrSum sum, qreal angle, bool reverse) {
+void applyFirstOrderTrotter(Qureg qureg, PauliStrSum sum, qcomp angle, bool reverse) {
 
     // (internal, invoked by applyTrotterizedPauliStrSumGadget)
 
     for (qindex i=0; i<sum.numTerms; i++) {
         int j = reverse? sum.numTerms - i - 1 : i;
-        qreal arg = 2 * angle * std::real(sum.coeffs[j]); // 2 undoes Gadget convention
-        applyPauliGadget(qureg, sum.strings[j], arg); // caller disabled valiation therein
+
+        // effect exp(i angle * sum) by undoing gadget pre-factor
+        qcomp arg = angle * sum.coeffs[j] / util_getPhaseFromGateAngle(1);
+        applyNonUnitaryPauliGadget(qureg, sum.strings[j], arg); // caller disabled valiation therein
     }
 }
 
-void applyHigherOrderTrotter(Qureg qureg, PauliStrSum sum, qreal angle, int order) {
+void applyHigherOrderTrotter(Qureg qureg, PauliStrSum sum, qcomp angle, int order) {
 
     // (internal, invoked by applyTrotterizedPauliStrSumGadget)
 
@@ -1154,8 +1156,8 @@ void applyHigherOrderTrotter(Qureg qureg, PauliStrSum sum, qreal angle, int orde
     
     } else {
         qreal p = 1. / (4 - std::pow(4, 1./(order-1)));
-        qreal a = p * angle;
-        qreal b = (1-4*p) * angle;
+        qcomp a = p * angle;
+        qcomp b = (1-4*p) * angle;
 
         int lower = order - 2;
         applyFirstOrderTrotter(qureg, sum, a, lower);
@@ -1166,15 +1168,15 @@ void applyHigherOrderTrotter(Qureg qureg, PauliStrSum sum, qreal angle, int orde
     }
 }
 
-void applyTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qreal angle, int order, int reps) {
+void applyNonUnitaryTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qcomp angle, int order, int reps) {
     validate_quregFields(qureg, __func__);
     validate_pauliStrSumFields(sum, __func__);
     validate_pauliStrSumTargets(sum, qureg, __func__);
-    validate_pauliStrSumIsHermitian(sum, __func__);
     validate_trotterParams(qureg, order, reps, __func__);
+    // sum is permitted to be non-Hermitian
 
     // exp(i angle sum) = identity when angle=0
-    if (angle == 0)
+    if (angle == qcomp(0,0))
         return;
 
     // record validation state then disable to avoid repeated
@@ -1196,6 +1198,20 @@ void applyTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qreal angle
     /// implement these above or into another function?
 }
 
+void applyTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qreal angle, int order, int reps) {
+
+    // validate inputs here despite re-validation below so that func name is correct in error message
+    validate_quregFields(qureg, __func__);
+    validate_pauliStrSumFields(sum, __func__);
+    validate_pauliStrSumTargets(sum, qureg, __func__);
+    validate_trotterParams(qureg, order, reps, __func__);
+
+    // crucially, require that sum coefficients are real
+    validate_pauliStrSumIsHermitian(sum, __func__);
+
+    applyNonUnitaryTrotterizedPauliStrSumGadget(qureg, sum, angle, order, reps);
+}
+
 } // end de-mangler
 
 
diff --git a/quest/src/core/utilities.cpp b/quest/src/core/utilities.cpp
index 471d14254..1c22d43d8 100644
--- a/quest/src/core/utilities.cpp
+++ b/quest/src/core/utilities.cpp
@@ -912,12 +912,13 @@ qreal util_getPhaseFromGateAngle(qreal angle) {
 
     return - angle / 2;
 }
-
 qcomp util_getPhaseFromGateAngle(qcomp angle) {
 
-    return -angle / qcomp(2.0, 0.0);
+    return - angle / 2;
 }
 
+
+
 /*
  * DECOHERENCE FACTORS
  */
diff --git a/quest/src/core/utilities.hpp b/quest/src/core/utilities.hpp
index f28055ca9..6d741312b 100644
--- a/quest/src/core/utilities.hpp
+++ b/quest/src/core/utilities.hpp
@@ -349,7 +349,6 @@ util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qin
  */
 
 qreal util_getPhaseFromGateAngle(qreal angle);
-
 qcomp util_getPhaseFromGateAngle(qcomp angle);
 
 
diff --git a/tests/unit/operations.cpp b/tests/unit/operations.cpp
index 74bca9046..0fc33851c 100644
--- a/tests/unit/operations.cpp
+++ b/tests/unit/operations.cpp
@@ -200,7 +200,7 @@ void TEST_ON_CACHED_QUREG_AND_MATRIX(quregCache quregs, matrixCache matrices, au
  * - applyDiagMatrPower:    diagpower
  * - applyCompMatr:         compmatr
  * - applyPauliStr:         paulistr
- * - applyPauliGadgt:       pauligad
+ * - applyPauliGadget:      pauligad
 */
 
 enum ArgsFlag { none, scalar, axisrots, diagmatr, diagpower, compmatr, paulistr, pauligad };
@@ -1959,3 +1959,5 @@ TEST_CASE( "applyNonUnitaryPauliGadget", TEST_CATEGORY ) {
  */
 
 void applyTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qreal angle, int order, int reps);
+
+void applyNonUnitaryTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qcomp angle, int order, int reps);

From d4706b92a4f66cc541e6ba1a9f9bde0c4d85a075 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Fri, 13 Jun 2025 20:42:14 +0200
Subject: [PATCH 04/32] added controlled Trotter

New API functions:
- applyControlledTrotterizedPauliStrSumGadget()
- applyMultiControlledTrotterizedPauliStrSumGadget()
- applyMultiStateControlledTrotterizedPauliStrSumGadget()
- C++-only std::vector overloads of the latter two.

Additionally:
- renamed the internal constituent functions, like applyFirstOrderTrotter(), to explicit internal_applyFirstOrderTrotterRepetition()
- renamed paulis_getInds() to paulis_getTargetInds()

Note that new validation was required to check that no PauliStrSum non-identity Paulis overlapped the control qubits. This is relatively expensive; we build a PauliStrSum target-mask in O(#terms * #qubits) time whereas the previous most expensive validation (checking PauliStrSum targets do not exceed Qureg) costs O(#terms * log(#qubits)). Such costs are still completely occluded by those of simulating/processing a PauliStrSum in the backend, but might still attract lazy evaluation of the target-mask which is bound to the PauliStrSum instance. We have deferred any such optimisation and the associated struct changes since it necessitates an update to the PauliStrSum design (like new sync functions)
---
 quest/include/operations.h    |  44 +++++++++++
 quest/src/api/operations.cpp  | 135 ++++++++++++++++++++++++----------
 quest/src/api/paulis.cpp      |  35 ++++++++-
 quest/src/core/localiser.cpp  |   6 +-
 quest/src/core/validation.cpp |  42 +++++++++--
 quest/src/core/validation.hpp |   4 +
 tests/unit/operations.cpp     |   8 +-
 7 files changed, 221 insertions(+), 53 deletions(-)

diff --git a/quest/include/operations.h b/quest/include/operations.h
index 0b11980e7..0898165af 100644
--- a/quest/include/operations.h
+++ b/quest/include/operations.h
@@ -2387,6 +2387,30 @@ void multiplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace);
 void applyTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qreal angle, int order, int reps);
 
 
+/// @notyetdoced
+/// @notyettested
+/// @see
+///  - applyTrotterizedPauliStrSumGadget()
+///  - applyControlledCompMatr1()
+void applyControlledTrotterizedPauliStrSumGadget(Qureg qureg, int control, PauliStrSum sum, qreal angle, int order, int reps);
+
+
+/// @notyetdoced
+/// @notyettested
+/// @see
+///  - applyTrotterizedPauliStrSumGadget()
+///  - applyMultiControlledCompMatr1()
+void applyMultiControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls, int numControls, PauliStrSum sum, qreal angle, int order, int reps);
+
+
+/// @notyetdoced
+/// @notyettested
+/// @see
+///  - applyTrotterizedPauliStrSumGadget()
+///  - applyMultiStateControlledCompMatr1()
+void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls, int* states, int numControls, PauliStrSum sum, qreal angle, int order, int reps);
+
+
 /** @notyettested
  * 
  * A generalisation of applyTrotterizedPauliStrSumGadget() which accepts a complex angle and permits
@@ -2492,6 +2516,26 @@ void applyNonUnitaryTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, q
 }
 #endif
 
+#ifdef __cplusplus
+
+
+/// @notyettested
+/// @notyetvalidated
+/// @notyetdoced
+/// @cppvectoroverload
+/// @see applyMultiControlledTrotterizedPauliStrSumGadget()
+void applyMultiControlledTrotterizedPauliStrSumGadget(Qureg qureg, std::vector<int> controls, PauliStrSum sum, qreal angle, int order, int reps);
+
+
+/// @notyettested
+/// @notyetvalidated
+/// @notyetdoced
+/// @cppvectoroverload
+/// @see applyMultiStateControlledTrotterizedPauliStrSumGadget()
+void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, std::vector<int> controls, std::vector<int> states, PauliStrSum sum, qreal angle, int order, int reps);
+
+
+#endif // __cplusplus
 
 /** @} */
 
diff --git a/quest/src/api/operations.cpp b/quest/src/api/operations.cpp
index 3b58f298f..ad31411c3 100644
--- a/quest/src/api/operations.cpp
+++ b/quest/src/api/operations.cpp
@@ -1130,29 +1130,40 @@ void multiplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace) {
     // workspace -> qureg, and qureg -> sum * qureg
 }
 
-void applyFirstOrderTrotter(Qureg qureg, PauliStrSum sum, qcomp angle, bool reverse) {
-
-    // (internal, invoked by applyTrotterizedPauliStrSumGadget)
-
+void internal_applyFirstOrderTrotterRepetition(
+    Qureg qureg, vector<int>& ketCtrls, vector<int>& braCtrls,
+    vector<int>& states, PauliStrSum sum, qcomp angle, bool reverse
+) {
+    // apply each sum term as a gadget, in forward or reverse order
     for (qindex i=0; i<sum.numTerms; i++) {
         int j = reverse? sum.numTerms - i - 1 : i;
+        qcomp coeff = sum.coeffs[j];
+        PauliStr str = sum.strings[j];
 
-        // effect exp(i angle * sum) by undoing gadget pre-factor
-        qcomp arg = angle * sum.coeffs[j] / util_getPhaseFromGateAngle(1);
-        applyNonUnitaryPauliGadget(qureg, sum.strings[j], arg); // caller disabled valiation therein
-    }
-}
+        // effect |psi> -> exp(i angle * sum)|psi>
+        qcomp arg = angle * coeff;
+        localiser_statevec_anyCtrlPauliGadget(qureg, ketCtrls, states, str, arg);
 
-void applyHigherOrderTrotter(Qureg qureg, PauliStrSum sum, qcomp angle, int order) {
+        if (!qureg.isDensityMatrix)
+            continue;
 
-    // (internal, invoked by applyTrotterizedPauliStrSumGadget)
+        // effect rho -> rho dagger(i angle * sum)
+        arg *= paulis_hasOddNumY(str) ? 1 : -1;
+        str = paulis_getShiftedPauliStr(str, qureg.numQubits);
+        localiser_statevec_anyCtrlPauliGadget(qureg, braCtrls, states, str, arg);
+    }
+}
 
+void internal_applyHigherOrderTrotterRepetition(
+    Qureg qureg, vector<int>& ketCtrls, vector<int>& braCtrls,
+    vector<int>& states, PauliStrSum sum, qcomp angle, int order
+) {
     if (order == 1) {
-        applyFirstOrderTrotter(qureg, sum, angle, false);
+        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, angle, false);
     
     } else if (order == 2) {
-        applyFirstOrderTrotter(qureg, sum, angle/2, false);
-        applyFirstOrderTrotter(qureg, sum, angle/2, true);
+        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, angle/2, false);
+        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, angle/2, true);
     
     } else {
         qreal p = 1. / (4 - std::pow(4, 1./(order-1)));
@@ -1160,37 +1171,33 @@ void applyHigherOrderTrotter(Qureg qureg, PauliStrSum sum, qcomp angle, int orde
         qcomp b = (1-4*p) * angle;
 
         int lower = order - 2;
-        applyFirstOrderTrotter(qureg, sum, a, lower);
-        applyFirstOrderTrotter(qureg, sum, a, lower);
-        applyFirstOrderTrotter(qureg, sum, b, lower);
-        applyFirstOrderTrotter(qureg, sum, a, lower);
-        applyFirstOrderTrotter(qureg, sum, a, lower);
+        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, a, lower);
+        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, a, lower);
+        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, b, lower);
+        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, a, lower);
+        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, a, lower);
     }
 }
 
-void applyNonUnitaryTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qcomp angle, int order, int reps) {
-    validate_quregFields(qureg, __func__);
-    validate_pauliStrSumFields(sum, __func__);
-    validate_pauliStrSumTargets(sum, qureg, __func__);
-    validate_trotterParams(qureg, order, reps, __func__);
-    // sum is permitted to be non-Hermitian
-
+void internal_applyAllTrotterRepetitions(
+    Qureg qureg, int* controls, int* states, int numControls, 
+    PauliStrSum sum, qcomp angle, int order, int reps
+) {
     // exp(i angle sum) = identity when angle=0
     if (angle == qcomp(0,0))
         return;
 
-    // record validation state then disable to avoid repeated
-    // re-validations in each invoked applyPauliGadget() below
-    bool wasValidationEnabled = validateconfig_isEnabled();
-    validateconfig_disable();
+    // prepare control-qubit lists once for all invoked gadgets below
+    auto ketCtrlsVec = util_getVector(controls, numControls);
+    auto braCtrlsVec = (qureg.isDensityMatrix)? util_getBraQubits(ketCtrlsVec, qureg) : vector<int>{};
+    auto statesVec = util_getVector(states, numControls);
 
-    // perform sequence of applyPauliGadget()
-    for (int r=0; r<reps; r++)
-        applyHigherOrderTrotter(qureg, sum, angle/reps, order);
+    qcomp arg = angle / reps;
 
-    // potentially restore validation
-    if (wasValidationEnabled)
-        validateconfig_enable();
+    // perform carefully-ordered sequence of gadgets
+    for (int r=0; r<reps; r++)
+        internal_applyHigherOrderTrotterRepetition(
+            qureg, ketCtrlsVec, braCtrlsVec, statesVec, sum, arg, order);
 
     /// @todo
     /// the accuracy of Trotterisation is greatly improved by randomisation
@@ -1198,22 +1205,70 @@ void applyNonUnitaryTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, q
     /// implement these above or into another function?
 }
 
-void applyTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qreal angle, int order, int reps) {
+void applyNonUnitaryTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qcomp angle, int order, int reps) {
+    validate_quregFields(qureg, __func__);
+    validate_pauliStrSumFields(sum, __func__);
+    validate_pauliStrSumTargets(sum, qureg, __func__);
+    validate_trotterParams(qureg, order, reps, __func__);
+    // sum is permitted to be non-Hermitian
+
+    internal_applyAllTrotterRepetitions(qureg, nullptr, nullptr, 0, sum, angle, order, reps);
+}
 
-    // validate inputs here despite re-validation below so that func name is correct in error message
+void applyTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qreal angle, int order, int reps) {
     validate_quregFields(qureg, __func__);
     validate_pauliStrSumFields(sum, __func__);
     validate_pauliStrSumTargets(sum, qureg, __func__);
     validate_trotterParams(qureg, order, reps, __func__);
+    validate_pauliStrSumIsHermitian(sum, __func__);
+
+    internal_applyAllTrotterRepetitions(qureg, nullptr, nullptr, 0, sum, angle, order, reps);
+}
+
+void applyControlledTrotterizedPauliStrSumGadget(Qureg qureg, int control, PauliStrSum sum, qreal angle, int order, int reps) {
+    validate_quregFields(qureg, __func__);
+    validate_pauliStrSumFields(sum, __func__);
+    validate_controlAndPauliStrSumTargets(qureg, control, sum, __func__);
+    validate_trotterParams(qureg, order, reps, __func__);
+    validate_pauliStrSumIsHermitian(sum, __func__);
 
-    // crucially, require that sum coefficients are real
+    internal_applyAllTrotterRepetitions(qureg, &control, nullptr, 1, sum, angle, order, reps);
+}
+
+void applyMultiControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls, int numControls, PauliStrSum sum, qreal angle, int order, int reps) {
+    validate_quregFields(qureg, __func__);
+    validate_pauliStrSumFields(sum, __func__);
+    validate_controlsAndPauliStrSumTargets(qureg, controls, numControls, sum, __func__);
+    validate_trotterParams(qureg, order, reps, __func__);
     validate_pauliStrSumIsHermitian(sum, __func__);
 
-    applyNonUnitaryTrotterizedPauliStrSumGadget(qureg, sum, angle, order, reps);
+    internal_applyAllTrotterRepetitions(qureg, controls, nullptr, numControls, sum, angle, order, reps);
+}
+
+void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls, int* states, int numControls, PauliStrSum sum, qreal angle, int order, int reps) {
+    validate_quregFields(qureg, __func__);
+    validate_pauliStrSumFields(sum, __func__);
+    validate_controlsAndPauliStrSumTargets(qureg, controls, numControls, sum, __func__);
+    validate_controlStates(states, numControls, __func__); // permits states==nullptr
+    validate_trotterParams(qureg, order, reps, __func__);
+    validate_pauliStrSumIsHermitian(sum, __func__);
+
+    internal_applyAllTrotterRepetitions(qureg, controls, states, numControls, sum, angle, order, reps);
 }
 
 } // end de-mangler
 
+void applyMultiControlledTrotterizedPauliStrSumGadget(Qureg qureg, vector<int> controls, PauliStrSum sum, qreal angle, int order, int reps) {
+
+    applyMultiControlledTrotterizedPauliStrSumGadget(qureg, controls.data(), controls.size(), sum, angle, order, reps);
+}
+
+void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, vector<int> controls, vector<int> states, PauliStrSum sum, qreal angle, int order, int reps) {
+    validate_controlsMatchStates(controls.size(), states.size(), __func__);
+
+    applyMultiStateControlledTrotterizedPauliStrSumGadget(qureg, controls.data(), states.data(), controls.size(), sum, angle, order, reps);
+}
+
 
 
 /*
diff --git a/quest/src/api/paulis.cpp b/quest/src/api/paulis.cpp
index ad5d5c641..c770e5fb9 100644
--- a/quest/src/api/paulis.cpp
+++ b/quest/src/api/paulis.cpp
@@ -207,7 +207,7 @@ qcomp paulis_getPrefixPaulisElem(Qureg qureg, vector<int> prefixY, vector<int> p
 }
 
 
-vector<int> paulis_getInds(PauliStr str) {
+vector<int> paulis_getTargetInds(PauliStr str) {
 
     int maxInd = paulis_getIndOfLefmostNonIdentityPauli(str);
 
@@ -215,16 +215,33 @@ vector<int> paulis_getInds(PauliStr str) {
     inds.reserve(maxInd+1);
 
     for (int i=0; i<=maxInd; i++)
-        if (paulis_getPauliAt(str, i) != 0)
+        if (paulis_getPauliAt(str, i) != 0) // Id
             inds.push_back(i);
 
     return inds;
 }
 
 
+qindex paulis_getTargetBitMask(PauliStr str) {
+    
+    /// @todo 
+    /// would compile-time MAX_NUM_PAULIS_PER_STR bound be faster here,
+    /// since this function is invoked upon every PauliStrSum element?
+    int maxInd = paulis_getIndOfLefmostNonIdentityPauli(str);
+
+    qindex mask = 0;
+
+    for (int i=0; i<=maxInd; i++)
+        if (paulis_getPauliAt(str, i) != 0) // Id
+            mask = flipBit(mask, i);
+
+    return mask;
+}
+
+
 array<vector<int>,3> paulis_getSeparateInds(PauliStr str, Qureg qureg) {
 
-    vector<int> iXYZ = paulis_getInds(str);
+    vector<int> iXYZ = paulis_getTargetInds(str);
     vector<int> iX, iY, iZ;
 
     vector<int>* ptrs[] = {&iX, &iY, &iZ};
@@ -295,6 +312,18 @@ PAULI_MASK_TYPE paulis_getKeyOfSameMixedAmpsGroup(PauliStr str) {
 }
 
 
+qindex paulis_getTargetBitMask(PauliStrSum sum) {
+
+    qindex mask = 0;
+
+    // mask has 1 where any str has a != Id
+    for (int t=0; t<sum.numTerms; t++)
+        mask |= paulis_getTargetBitMask(sum.strings[t]);
+
+    return mask;
+}
+
+
 
 /*
  * PAULI STRING INITIALISATION
diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp
index 022ace7a6..cdaef4c24 100644
--- a/quest/src/core/localiser.cpp
+++ b/quest/src/core/localiser.cpp
@@ -1238,7 +1238,7 @@ template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, vector<int>, vecto
 
 
 extern bool paulis_containsXOrY(PauliStr str);
-extern vector<int> paulis_getInds(PauliStr str);
+extern vector<int> paulis_getTargetInds(PauliStr str);
 extern std::array<vector<int>,3> paulis_getSeparateInds(PauliStr str, Qureg qureg);
 extern int paulis_getPrefixZSign(Qureg qureg, vector<int> prefixZ) ;
 extern qcomp paulis_getPrefixPaulisElem(Qureg qureg, vector<int> prefixY, vector<int> prefixZ);
@@ -1334,7 +1334,7 @@ void localiser_statevec_anyCtrlPauliTensor(Qureg qureg, vector<int> ctrls, vecto
 
         bool isGadget = false;
         qreal phase = 0; // ignored
-        anyCtrlZTensorOrGadget(qureg, ctrls, ctrlStates, paulis_getInds(str), isGadget, phase);
+        anyCtrlZTensorOrGadget(qureg, ctrls, ctrlStates, paulis_getTargetInds(str), isGadget, phase);
     }
 }
 
@@ -1350,7 +1350,7 @@ void localiser_statevec_anyCtrlPauliGadget(Qureg qureg, vector<int> ctrls, vecto
 
     // when str=IZ, we must use the above bespoke algorithm
     if (!paulis_containsXOrY(str)) {
-        localiser_statevec_anyCtrlPhaseGadget(qureg, ctrls, ctrlStates, paulis_getInds(str), phase);
+        localiser_statevec_anyCtrlPhaseGadget(qureg, ctrls, ctrlStates, paulis_getTargetInds(str), phase);
         return;
     }
 
diff --git a/quest/src/core/validation.cpp b/quest/src/core/validation.cpp
index 4d316cdff..4b3406975 100644
--- a/quest/src/core/validation.cpp
+++ b/quest/src/core/validation.cpp
@@ -739,6 +739,10 @@ namespace report {
     string PAULI_STR_SUM_EXCEEDS_MATR_NUM_QUBITS =
         "The given PauliStrSum includes non-identity upon a qubit of index ${MAX_IND} and so is only compatible with FullStateDiagMatr containing at least ${NUM_PSS_QUBITS}. It cannot initialise the given ${NUM_MATR_QUBITS}-qubit FullStateDiagMatr.";
 
+    
+    string PAULI_STR_SUM_OVERLAPS_CONTROLS =
+        "A control qubit overlaps a non-identity Pauli operator in the given PauliStrSum.";
+
 
     /*
      * BASIS STATE INDICES
@@ -3250,8 +3254,11 @@ void validate_parsedStringIsNotEmpty(bool stringIsNotEmpty, const char* caller)
  */
 
 extern bool paulis_containsXOrY(PauliStrSum sum);
+extern qindex paulis_getTargetBitMask(PauliStrSum sum);
 extern int paulis_getIndOfLefmostNonIdentityPauli(PauliStrSum sum);
 
+bool areQubitsDisjoint(qindex qubitsMaskA, int* qubitsB, int numQubitsB);
+
 void validate_pauliStrSumFields(PauliStrSum sum, const char* caller) {
 
     assertThat(sum.numTerms > 0, report::INVALID_PAULI_STR_SUM_FIELDS, {{"${NUM_TERMS}", sum.numTerms}}, caller);
@@ -3292,6 +3299,22 @@ void validate_pauliStrSumTargets(PauliStrSum sum, Qureg qureg, const char* calle
     assertThat(qureg.numQubits >= minNumQb, report::PAULI_STR_SUM_EXCEEDS_QUREG_NUM_QUBITS, vars, caller);
 }
 
+void validate_controlsAndPauliStrSumTargets(Qureg qureg, int* ctrls, int numCtrls, PauliStrSum sum, const char* caller) {
+
+    // validate targets and controls in isolation
+    validate_pauliStrSumTargets(sum, qureg, caller);
+    validate_controls(qureg, ctrls, numCtrls, caller);
+
+    // validate that they do not overlap (i.e. sum has only I at ctrls, never X Y Z)
+    qindex targetMask = paulis_getTargetBitMask(sum);
+    assertThat(areQubitsDisjoint(targetMask, ctrls, numCtrls), report::PAULI_STR_SUM_OVERLAPS_CONTROLS, caller);
+}
+
+void validate_controlAndPauliStrSumTargets(Qureg qureg, int ctrl, PauliStrSum sum, const char* caller) {
+
+    validate_controlsAndPauliStrSumTargets(qureg, &ctrl, 1, sum, caller);
+}
+
 void validate_pauliStrSumCanInitMatrix(FullStateDiagMatr matr, PauliStrSum sum, const char* caller) {
 
     assertThat(!paulis_containsXOrY(sum), report::PAULI_STR_SUM_NOT_ALL_I_Z, caller);
@@ -3445,9 +3468,12 @@ void validate_localAmpIndices(Qureg qureg, qindex localStartInd, qindex numInds,
 
 bool areQubitsUnique(int* qubits, int numQubits) {
 
-    // assumes all elemtns of qubits are < 64
+    // assumes all elements of qubits are < 64
     qindex mask = 0;
 
+    // avoids calling getBitMask() so as to avoid
+    // gratuitous, full enumeration of qubits when
+    // numQubits is ridiculously long
     for (int n=0; n<numQubits; n++)
         if (getBit(mask, qubits[n]))
             return false;
@@ -3457,18 +3483,22 @@ bool areQubitsUnique(int* qubits, int numQubits) {
     return true;
 }
 
-bool areQubitsDisjoint(int* qubitsA, int numQubitsA, int* qubitsB, int numQubitsB) {
-
-    // assumes all elemtns of qubits are < 64
-    qindex maskA = getBitMask(qubitsA, numQubitsA);
+bool areQubitsDisjoint(qindex qubitsMaskA, int* qubitsB, int numQubitsB) {
 
     for (int n=0; n<numQubitsB; n++)
-        if (getBit(maskA, qubitsB[n]))
+        if (getBit(qubitsMaskA, qubitsB[n]))
             return false;
     
     return true;
 }
 
+bool areQubitsDisjoint(int* qubitsA, int numQubitsA, int* qubitsB, int numQubitsB) {
+
+    return areQubitsDisjoint(
+        getBitMask(qubitsA, numQubitsA), 
+        qubitsB, numQubitsB);
+}
+
 void assertValidQubit(Qureg qureg, int qubitInd, string msg, const char* caller) {
 
     tokenSubs vars = {
diff --git a/quest/src/core/validation.hpp b/quest/src/core/validation.hpp
index 2aa9da71a..0bf48b409 100644
--- a/quest/src/core/validation.hpp
+++ b/quest/src/core/validation.hpp
@@ -342,6 +342,10 @@ void validate_pauliStrSumIsHermitian(PauliStrSum sum, const char* caller);
 
 void validate_pauliStrSumTargets(PauliStrSum sum, Qureg qureg, const char* caller);
 
+void validate_controlAndPauliStrSumTargets(Qureg qureg, int ctrl, PauliStrSum sum, const char* caller);
+
+void validate_controlsAndPauliStrSumTargets(Qureg qureg, int* ctrls, int numCtrls, PauliStrSum sum, const char* caller);
+
 void validate_pauliStrSumCanInitMatrix(FullStateDiagMatr matr, PauliStrSum sum, const char* caller);
 
 
diff --git a/tests/unit/operations.cpp b/tests/unit/operations.cpp
index 0fc33851c..55bed11ab 100644
--- a/tests/unit/operations.cpp
+++ b/tests/unit/operations.cpp
@@ -1958,6 +1958,12 @@ TEST_CASE( "applyNonUnitaryPauliGadget", TEST_CATEGORY ) {
  * UNTESTED FUNCTIONS
  */
 
+void applyNonUnitaryTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qcomp angle, int order, int reps);
+
 void applyTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qreal angle, int order, int reps);
 
-void applyNonUnitaryTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qcomp angle, int order, int reps);
+void applyControlledTrotterizedPauliStrSumGadget(Qureg qureg, int control, PauliStrSum sum, qreal angle, int order, int reps);
+
+void applyMultiControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls, int numControls, PauliStrSum sum, qreal angle, int order, int reps);
+
+void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls, int* states, int numControls, PauliStrSum sum, qreal angle, int order, int reps);

From c357267563e8fa9701908f6ff8057c1f7277df63 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Sat, 14 Jun 2025 10:04:04 +0200
Subject: [PATCH 05/32] bound isCuQuantumEnabled to QuESTEnv

so that the COMPILE_CUQUANTUM preprocessor need only ever be consulted by the source during compilation, as proposed by Oliver in #645
---
 quest/include/environment.h   |  5 ++++-
 quest/include/qureg.h         |  2 +-
 quest/src/api/environment.cpp | 21 ++++++++++++---------
 tests/main.cpp                | 14 +++++++-------
 tests/unit/environment.cpp    |  7 ++++---
 5 files changed, 28 insertions(+), 21 deletions(-)

diff --git a/quest/include/environment.h b/quest/include/environment.h
index df24ab258..a71454f0e 100644
--- a/quest/include/environment.h
+++ b/quest/include/environment.h
@@ -32,11 +32,14 @@ extern "C" {
 /// @notyetdoced
 typedef struct {
 
-    // deployment mode
+    // deployment modes which can be runtime disabled
     int isMultithreaded;
     int isGpuAccelerated;
     int isDistributed;
 
+    // deployment modes which cannot be directly changed after compilation
+    int isCuQuantumEnabled;
+
     // distributed configuration
     int rank;
     int numNodes;
diff --git a/quest/include/qureg.h b/quest/include/qureg.h
index 39d744f0a..f3284fa14 100644
--- a/quest/include/qureg.h
+++ b/quest/include/qureg.h
@@ -329,7 +329,7 @@ Qureg createForcedDensityQureg(int numQubits);
  * </center>
  * 
  * @constraints
- * - Cannot use any deployment which has not been prior enabled during compilation, or disabled by createCustomQuESTEnv().
+ * - Cannot use any deployment which has not been prior enabled during compilation, or disabled by initCustomQuESTEnv().
  * - Cannot distribute @f$ N @f$ qubits over more than @f$ 2^N @f$ nodes (regardless of @p isDensMatr).
  * - Cannot distribute when the executable was not launched using MPI (e.g. via @c mpirun).
  * - Cannot GPU-accelerate when a GPU is not available at runtime, or has insufficient memory.
diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index 5c29cbcb8..63b6f41ef 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -114,8 +114,9 @@ void validateAndInitCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMulti
     /// should we warn here if each machine contains
     /// more GPUs than deployed MPI-processes (some GPUs idle)?
 
-    // use cuQuantum if compiled
-    if (useGpuAccel && gpu_isCuQuantumCompiled()) {
+    // cuQuantum is always used in GPU-accelerated envs when available
+    bool useCuQuantum = useGpuAccel && gpu_isCuQuantumCompiled();
+    if (useCuQuantum) {
         validate_gpuIsCuQuantumCompatible(caller); // assesses above bound GPU
         gpu_initCuQuantum();
     }
@@ -131,9 +132,10 @@ void validateAndInitCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMulti
         error_allocOfQuESTEnvFailed();
 
     // bind deployment info to global instance
-    globalEnvPtr->isMultithreaded  = useMultithread;
-    globalEnvPtr->isGpuAccelerated = useGpuAccel;
-    globalEnvPtr->isDistributed    = useDistrib;
+    globalEnvPtr->isMultithreaded    = useMultithread;
+    globalEnvPtr->isGpuAccelerated   = useGpuAccel;
+    globalEnvPtr->isDistributed      = useDistrib;
+    globalEnvPtr->isCuQuantumEnabled = useCuQuantum;
 
     // bind distributed info
     globalEnvPtr->rank     = (useDistrib)? comm_getRank()     : 0;
@@ -174,7 +176,7 @@ void printCompilationInfo() {
 
     print_table(
         "compilation", {
-        {"isMpiCompiled",      comm_isMpiCompiled()},
+        {"isMpiCompiled",       comm_isMpiCompiled()},
         {"isGpuCompiled",       gpu_isGpuCompiled()},
         {"isOmpCompiled",       cpu_isOpenmpCompiled()},
         {"isCuQuantumCompiled", gpu_isCuQuantumCompiled()},
@@ -186,9 +188,10 @@ void printDeploymentInfo() {
 
     print_table(
         "deployment", {
-        {"isMpiEnabled", globalEnvPtr->isDistributed},
-        {"isGpuEnabled", globalEnvPtr->isGpuAccelerated},
-        {"isOmpEnabled", globalEnvPtr->isMultithreaded},
+        {"isMpiEnabled",       globalEnvPtr->isDistributed},
+        {"isGpuEnabled",       globalEnvPtr->isGpuAccelerated},
+        {"isOmpEnabled",       globalEnvPtr->isMultithreaded},
+        {"isCuQuantumEnabled", globalEnvPtr->isCuQuantumEnabled},
     });
 }
 
diff --git a/tests/main.cpp b/tests/main.cpp
index d2ddd52cc..03294857a 100644
--- a/tests/main.cpp
+++ b/tests/main.cpp
@@ -88,13 +88,13 @@ class startListener : public Catch::EventListenerBase {
         QuESTEnv env = getQuESTEnv();
         std::cout << std::endl;
         std::cout << "QuEST execution environment:" << std::endl;
-        std::cout << "  precision:       " << FLOAT_PRECISION      << std::endl;
-        std::cout << "  multithreaded:   " << env.isMultithreaded  << std::endl;
-        std::cout << "  distributed:     " << env.isDistributed    << std::endl;
-        std::cout << "  GPU-accelerated: " << env.isGpuAccelerated << std::endl;
-        std::cout << "  cuQuantum:       " << (env.isGpuAccelerated && COMPILE_CUQUANTUM) << std::endl;
-        std::cout << "  num nodes:       " << env.numNodes         << std::endl;
-        std::cout << "  num qubits:      " << getNumCachedQubits() << std::endl;
+        std::cout << "  precision:       " << FLOAT_PRECISION        << std::endl;
+        std::cout << "  multithreaded:   " << env.isMultithreaded    << std::endl;
+        std::cout << "  distributed:     " << env.isDistributed      << std::endl;
+        std::cout << "  GPU-accelerated: " << env.isGpuAccelerated   << std::endl;
+        std::cout << "  cuQuantum:       " << env.isCuQuantumEnabled << std::endl;
+        std::cout << "  num nodes:       " << env.numNodes           << std::endl;
+        std::cout << "  num qubits:      " << getNumCachedQubits()   << std::endl;
         std::cout << "  num qubit perms: " << TEST_MAX_NUM_QUBIT_PERMUTATIONS << std::endl;
         std::cout << std::endl;
 
diff --git a/tests/unit/environment.cpp b/tests/unit/environment.cpp
index 6fc6d21ec..db9a1516c 100644
--- a/tests/unit/environment.cpp
+++ b/tests/unit/environment.cpp
@@ -133,9 +133,10 @@ TEST_CASE( "getQuESTEnv", TEST_CATEGORY ) {
 
         QuESTEnv env = getQuESTEnv();
 
-        REQUIRE( (env.isMultithreaded  == 0 || env.isMultithreaded  == 1) );
-        REQUIRE( (env.isGpuAccelerated == 0 || env.isGpuAccelerated == 1) );
-        REQUIRE( (env.isDistributed    == 0 || env.isDistributed    == 1) );
+        REQUIRE( (env.isMultithreaded    == 0 || env.isMultithreaded    == 1) );
+        REQUIRE( (env.isGpuAccelerated   == 0 || env.isGpuAccelerated   == 1) );
+        REQUIRE( (env.isDistributed      == 0 || env.isDistributed      == 1) );
+        REQUIRE( (env.isCuQuantumEnabled == 0 || env.isCuQuantumEnabled == 1) );
         
         REQUIRE( env.rank     >= 0 );
         REQUIRE( env.numNodes >= 0 );

From f28691cede4c66bcf106271be530667c448e31fa Mon Sep 17 00:00:00 2001
From: Oliver Thomson Brown <otbrown@users.noreply.github.com>
Date: Sat, 21 Jun 2025 13:05:55 +0100
Subject: [PATCH 06/32] Patched quest.h generation (#645)

- Promoted variables set in compile_option CMake function to parent scope, which ensures correct values are generated in the header file.
- Separated compilation configuration defines into config.h
- Added guards to check at least one of the required compiler macros is undefined as proposed by @TysonRayJones
- Removed guards from modes.h which prevented COMPILE macros from being defined, as proposed by @TysonRayJones
---
 CMakeLists.txt                        |  9 +++++++--
 quest/include/CMakeLists.txt          |  2 +-
 quest/include/config.h.in             | 22 ++++++++++++++++++++++
 quest/include/modes.h                 | 21 +--------------------
 quest/include/{quest.h.in => quest.h} | 12 ++----------
 5 files changed, 33 insertions(+), 33 deletions(-)
 create mode 100644 quest/include/config.h.in
 rename quest/include/{quest.h.in => quest.h} (87%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1080c55fc..d10721947 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -79,7 +79,7 @@ else()
   set(MULTI_LIB_HEADERS 0)
   function(compile_option VAR VALUE)
     target_compile_definitions(QuEST PRIVATE ${VAR}=${VALUE})
-    set(${VAR} ${VALUE})
+    set(${VAR} ${VALUE} PARENT_SCOPE)
   endfunction()
 endif()
 
@@ -576,13 +576,18 @@ install(FILES
         DESTINATION "${QuEST_INSTALL_CONFIGDIR}"
 )
 
-install(FILES "${CMAKE_CURRENT_BINARY_DIR}/include/quest.h"
+install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/quest/include/quest.h"
         DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
 )
 
+install(FILES "${CMAKE_CURRENT_BINARY_DIR}/include/quest/include/config.h"
+        DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/quest/include"
+)
+
 install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/quest/include"
         DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/quest"
         FILES_MATCHING PATTERN "*.h"
+        PATTERN "quest.h" EXCLUDE
 )
 
 install(EXPORT QuESTTargets
diff --git a/quest/include/CMakeLists.txt b/quest/include/CMakeLists.txt
index 86a542dc5..bf0d1cd09 100644
--- a/quest/include/CMakeLists.txt
+++ b/quest/include/CMakeLists.txt
@@ -2,4 +2,4 @@
 # @author Erich Essmann
 # @author Luc Jaulmes (using config file)
 
-configure_file(quest.h.in "${CMAKE_BINARY_DIR}/include/quest.h" @ONLY)
+configure_file(config.h.in "${CMAKE_BINARY_DIR}/include/quest/include/config.h" @ONLY)
diff --git a/quest/include/config.h.in b/quest/include/config.h.in
new file mode 100644
index 000000000..8326259b6
--- /dev/null
+++ b/quest/include/config.h.in
@@ -0,0 +1,22 @@
+#ifndef CONFIG_H
+#define CONFIG_H
+
+// be warned, the below is sensitive to whitespace after the backslash
+#if !defined(FLOAT_PRECISION)\
+    || !defined(COMPILE_MPI)\
+    || !defined(COMPILE_OPENMP)\
+    || !defined(COMPILE_CUDA)\
+    || !defined(COMPILE_CUQUANTUM)
+
+// bind compile settings to installed exec
+#if !@MULTI_LIB_HEADERS@
+#cmakedefine FLOAT_PRECISION @FLOAT_PRECISION@
+#cmakedefine01 COMPILE_MPI
+#cmakedefine01 COMPILE_OPENMP
+#cmakedefine01 COMPILE_CUDA
+#cmakedefine01 COMPILE_CUQUANTUM
+#endif
+
+#endif
+
+#endif
\ No newline at end of file
diff --git a/quest/include/modes.h b/quest/include/modes.h
index b90797acd..667a3f053 100644
--- a/quest/include/modes.h
+++ b/quest/include/modes.h
@@ -15,27 +15,8 @@
 
 
 
-// ensure all mode flags are defined
-
-#ifndef COMPILE_MPI
-    #error "Compiler must define COMPILE_MPI"
-#endif
-
-#ifndef COMPILE_OPENMP
-    #error "Compiler must define COMPILE_OPENMP"
-#endif
-
-#ifndef COMPILE_CUDA
-    #error "Compiler must define COMPILE_CUDA"
-#endif
-
-#ifndef COMPILE_CUQUANTUM
-    #error "Compiler must define COMPILE_CUQUANTUM"
-#endif
-
-
-
 // ensure all mode flags are valid values
+// (undefined macros are permitted since they evaluate to 0 in #if, per the C/C++ standards)
 
 #if ! (COMPILE_MPI == 0 || COMPILE_MPI == 1)
     #error "Macro COMPILE_MPI must have value 0 or 1"
diff --git a/quest/include/quest.h.in b/quest/include/quest.h
similarity index 87%
rename from quest/include/quest.h.in
rename to quest/include/quest.h
index 4ba420478..fcc49ab76 100644
--- a/quest/include/quest.h.in
+++ b/quest/include/quest.h
@@ -31,20 +31,12 @@
 #define QUEST_H
 
 
-// bind compile settings to installed exec
-#if !@MULTI_LIB_HEADERS@
-#cmakedefine FLOAT_PRECISION @FLOAT_PRECISION@
-#cmakedefine01 COMPILE_MPI
-#cmakedefine01 COMPILE_OPENMP
-#cmakedefine01 COMPILE_CUDA
-#cmakedefine01 COMPILE_CUQUANTUM
-#endif
-
-
 // include version first so it is accessible to 
 // debuggers in case a subsequent include fails
 #include "quest/include/version.h"
 
+#include "quest/include/config.h"
+
 // include before API headers since it validates
 // preprocessor configuration, and affirms macro
 // preconditions assumed by subsequent header

From eeedec7002b1a09f3412ee910cff3c54e22e4ae6 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Wed, 25 Jun 2025 23:09:16 +0200
Subject: [PATCH 07/32] added environment variables (#653)

which enable configuring QuEST's execution after compilation, before QuEST environment initialisation, solving some of the issues lamented in #645 and generally being more sensible/convenient. It also patched an esoteric bug in the parsing of floating-point numbers, affecting functions like initInlinePauliStrSum().

Refactor included:
- adding (basic) utilities for parsing environment variables.
- changing PERMIT_NODES_TO_SHARE_GPU and DEFAULT_VALIDATION_EPSILON from macros to environment variables. The latter empowers users to disable all numerically-sensitive validation without modifying or recompiling their code.
- patching the parsing of non-quadruple-precision floats which would previously see numbers beyond the qcomp-range silently over or underflow instead of throwing an error (see commit a66f797b69d2ed7cea07ca09b19b9d5487511bdb).
- inserted whitespaces into cmake error message about MacOS multithreading to make the advised commands clearer.

A subsequent commit will refactor some unit-testing macros to non-QuEST-managed environment variables.
---
 .github/workflows/test_paid.yml  |   5 +-
 CMakeLists.txt                   |  19 +--
 docs/launch.md                   |  24 +++-
 quest/include/environment.h      |   3 +
 quest/include/modes.h            |  76 ++++++++--
 quest/include/precision.h        |  31 ++--
 quest/src/api/environment.cpp    |  40 ++++--
 quest/src/comm/comm_routines.cpp |   6 +
 quest/src/core/CMakeLists.txt    |   1 +
 quest/src/core/envvars.cpp       | 158 ++++++++++++++++++++
 quest/src/core/envvars.hpp       |  37 +++++
 quest/src/core/errors.cpp        |  16 +++
 quest/src/core/errors.hpp        |  10 ++
 quest/src/core/parser.cpp        | 240 ++++++++++++++++++++++---------
 quest/src/core/parser.hpp        |  16 +++
 quest/src/core/validation.cpp    |  67 +++++++--
 quest/src/core/validation.hpp    |  13 +-
 quest/src/gpu/gpu_cuquantum.cuh  |   7 +-
 tests/main.cpp                   |  15 +-
 tests/unit/environment.cpp       |  16 ++-
 tests/unit/paulis.cpp            |  17 ++-
 utils/docs/Doxyfile              |   2 +
 22 files changed, 652 insertions(+), 167 deletions(-)
 create mode 100644 quest/src/core/envvars.cpp
 create mode 100644 quest/src/core/envvars.hpp

diff --git a/.github/workflows/test_paid.yml b/.github/workflows/test_paid.yml
index 878336f7f..c8fe34c03 100644
--- a/.github/workflows/test_paid.yml
+++ b/.github/workflows/test_paid.yml
@@ -257,12 +257,15 @@ jobs:
           -DCMAKE_CUDA_ARCHITECTURES=${{ env.cuda_arch }}
           -DTEST_ALL_DEPLOYMENTS=${{ env.test_all_deploys }}
           -DTEST_NUM_MIXED_DEPLOYMENT_REPETITIONS=${{ env.test_repetitions }}
-          -DPERMIT_NODES_TO_SHARE_GPU=${{ env.mpi_share_gpu }}
           -DCMAKE_CXX_FLAGS=${{ matrix.mpi == 'ON' && matrix.cuda == 'ON' && '-fno-lto' || '' }}
 
       - name: Compile
         run: cmake --build ${{ env.build_dir }} --parallel
 
+      # permit use of single GPU by multiple MPI processes (detriments performance)
+      - name: Set env-var to permit GPU sharing
+        run: echo "PERMIT_NODES_TO_SHARE_GPU=${{ env.mpi_share_gpu }}" >> $GITHUB_ENV
+
       # cannot use ctests when distributed, grr!
       - name: Run GPU + distributed v4 mixed tests (4 nodes sharing 1 GPU)
         run: |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d10721947..33e2dff42 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -231,15 +231,6 @@ if ((ENABLE_CUDA OR ENABLE_HIP) AND FLOAT_PRECISION STREQUAL 4)
   message(FATAL_ERROR "Quad precision is not supported on GPU. Please disable GPU acceleration or lower precision.")
 endif()
 
-option(
-  PERMIT_NODES_TO_SHARE_GPU
-  "Whether to permit multiple distributed nodes to share a single GPU at the detriment of performance. Turned OFF by default."
-  OFF
-)
-if (ENABLE_DISTRIBUTION AND (ENABLE_CUDA OR ENABLE_HIP))
-  message(STATUS "Permitting nodes to share GPUs is turned ${PERMIT_NODES_TO_SHARE_GPU}. Set PERMIT_NODES_TO_SHARE_GPU to modify.")
-endif()
-
 # Deprecated API
 option(
   ENABLE_DEPRECATED_API
@@ -318,7 +309,7 @@ if (ENABLE_MULTITHREADING)
   if (NOT OpenMP_FOUND)
     set(ErrorMsg "Could not find OpenMP, necessary for enabling multithreading.")
     if (APPLE AND CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-      string(APPEND ErrorMsg " Try first calling `brew install libomp` then `export OpenMP_ROOT=$(brew --prefix)/opt/libomp`")
+      string(APPEND ErrorMsg " Try first calling \n\tbrew install libomp\nthen\n\texport OpenMP_ROOT=$(brew --prefix)/opt/libomp")
     endif()
     message(FATAL_ERROR ${ErrorMsg})
   endif()
@@ -434,14 +425,6 @@ else()
 endif()
 
 
-if (ENABLE_DISTRIBUTION AND (ENABLE_CUDA OR ENABLE_HIP))
-  target_compile_definitions(
-    QuEST PRIVATE 
-    PERMIT_NODES_TO_SHARE_GPU=$<IF:$<BOOL:${PERMIT_NODES_TO_SHARE_GPU}>,1,0>
-  )
-endif()
-
-
 # add math library
 if (NOT MSVC)
   target_link_libraries(QuEST PRIVATE ${MATH_LIBRARY})
diff --git a/docs/launch.md b/docs/launch.md
index aadd02ca7..44a0f7fd7 100644
--- a/docs/launch.md
+++ b/docs/launch.md
@@ -22,6 +22,7 @@ Launching your [compiled](compile.md) QuEST application can be as straightforwar
 > - <a href="#launch_tests">Tests</a>
 >    * <a href="#launch_v4">v4</a>
 >    * <a href="#launch_v3">v3</a>
+> - <a href="#launch_configuring">Configuring</a>
 > - <a href="#launch_multithreading">Multithreading</a>
 >    * <a href="#launch_choosing-threads">Choosing threads</a>
 >    * <a href="#launch_monitoring-utilisation">Monitoring utilisation</a>
@@ -29,11 +30,11 @@ Launching your [compiled](compile.md) QuEST application can be as straightforwar
 > - <a href="#launch_gpu-acceleration">GPU-acceleration</a>
 >    * <a href="#launch_launching">Launching</a>
 >    * <a href="#launch_monitoring">Monitoring</a>
->    * <a href="#launch_configuring">Configuring</a>
+>    * <a href="#launch_configuring-1">Configuring</a>
 >    * <a href="#launch_benchmarking">Benchmarking</a>
 > - <a href="#launch_distribution">Distribution</a>
 >    * <a href="#launch_launching-1">Launching</a>
->    * <a href="#launch_configuring-1">Configuring</a>
+>    * <a href="#launch_configuring-2">Configuring</a>
 >    * <a href="#launch_benchmarking-1">Benchmarking</a>
 > - <a href="#launch_multi-gpu">Multi-GPU</a>
 > - <a href="#launch_supercomputers">Supercomputers</a>
@@ -243,6 +244,21 @@ ctest
 
 
 
+---------------------
+
+<!-- permit doxygen to reference section -->
+<a id="launch_configuring"></a>
+
+## Configuring
+
+QuEST execution can be configured prior to runtime using the below [environment variables](https://en.wikipedia.org/wiki/Environment_variable).
+
+- [`PERMIT_NODES_TO_SHARE_GPU`](https://quest-kit.github.io/QuEST/group__modes.html#ga7e12922138caa68ddaa6221e40f62dda)
+- [`DEFAULT_VALIDATION_EPSILON`](https://quest-kit.github.io/QuEST/group__modes.html#ga55810d6f3d23de810cd9b12a2bbb8cc2)
+
+
+
+
 ---------------------
 
 
@@ -429,7 +445,7 @@ Usage of GPU-acceleration can be (inadvisably) forced using [`createForcedQureg(
 
 
 <!-- permit doxygen to reference section -->
-<a id="launch_configuring"></a>
+<a id="launch_configuring-1"></a>
 
 ### Configuring
 
@@ -514,7 +530,7 @@ mpirun -np 1024 --oversubscribe ./mytests
 
 
 <!-- permit doxygen to reference section -->
-<a id="launch_configuring-1"></a>
+<a id="launch_configuring-2"></a>
 
 ### Configuring
 
diff --git a/quest/include/environment.h b/quest/include/environment.h
index a71454f0e..04f24bfe2 100644
--- a/quest/include/environment.h
+++ b/quest/include/environment.h
@@ -40,6 +40,9 @@ typedef struct {
     // deployment modes which cannot be directly changed after compilation
     int isCuQuantumEnabled;
 
+    // deployment configurations which can be changed via environment variables
+    int isGpuSharingEnabled;
+
     // distributed configuration
     int rank;
     int numNodes;
diff --git a/quest/include/modes.h b/quest/include/modes.h
index 667a3f053..2bb608be9 100644
--- a/quest/include/modes.h
+++ b/quest/include/modes.h
@@ -56,10 +56,6 @@
 
 // define optional-macro defaults (mostly to list them)
 
-#ifndef PERMIT_NODES_TO_SHARE_GPU
-#define PERMIT_NODES_TO_SHARE_GPU 0
-#endif
-
 #ifndef INCLUDE_DEPRECATED_FUNCTIONS
 #define INCLUDE_DEPRECATED_FUNCTIONS 0
 #endif
@@ -74,11 +70,6 @@
 #if 0
 
 
-    /// @notyetdoced
-    /// @macrodoc
-    const int PERMIT_NODES_TO_SHARE_GPU = 0;
-
-
     /// @notyetdoced
     /// @macrodoc
     const int INCLUDE_DEPRECATED_FUNCTIONS = 0;
@@ -93,6 +84,73 @@
 
 
 
+// document environment variables
+
+// spoof env-vars as consts to doc (hackily and hopefully temporarily)
+#if 0
+
+
+    /** @envvardoc
+     * 
+     * Specifies whether to permit multiple MPI processes to deploy to the same GPU.
+     * 
+     * @attention 
+     * This environment variable has no effect when either (or both) of distribution or 
+     * GPU-acceleration are disabled.
+     * 
+     * In multi-GPU execution, which combines distribution with GPU-acceleration, it is 
+     * prudent to assign each GPU to at most one MPI process in order to avoid superfluous 
+     * slowdown. Hence by default, initQuESTEnv() will forbid assigning multiple MPI processes 
+     * to the same GPU. This environment variable can be set to `1` to disable this validation, 
+     * permitting sharing of a single GPU, as is often useful for debugging or unit testing 
+     * (for example, testing multi-GPU execution when only a single GPU is available).
+     * 
+     * @warning
+     * Permitting GPU sharing may cause unintended behaviour when additionally using cuQuantum.
+     * 
+     * @envvarvalues
+     *  - forbid sharing: @p 0, @p '0', @p '', @p , (unspecified)
+     *  - permit sharing: @p 1, @p '1'
+     * 
+     * @author Tyson Jones
+     */
+    const int PERMIT_NODES_TO_SHARE_GPU = 0;
+
+
+    /** @envvardoc
+     * 
+     * Specifies the default validation epsilon. 
+     * 
+     * Setting `DEFAULT_VALIDATION_EPSILON` to a positive, real number overrides the 
+     * precision-specific default (`1E-5`, `1E-12`, `1E-13` for single, double and quadruple 
+     * precision respectively). The specified epsilon is used by QuEST for numerical validation
+     * unless overridden at runtime via setValidationEpsilon(), in which case it can be
+     * restored to that specified by this environment variable using setValidationEpsilonToDefault().
+     * 
+     * @envvarvalues
+     *  - setting @p DEFAULT_VALIDATION_EPSILON=0 disables numerical validation, as if the value
+     *    were instead infinity.
+     *  - setting @p DEFAULT_VALIDATION_EPSILON='' is equivalent to _not_ specifying the variable,
+     *    adopting instead the precision-specific default above.
+     *  - setting @p DEFAULT_VALIDATION_EPSILON=x where `x` is a positive, valid `qreal` in any
+     *    format accepted by `C` or `C++` (e.g. `0.01`, `1E-2`, `+1e-2`) will use `x` as the
+     *    default validation epsilon.
+     * 
+     * @constraints
+     * The function initQuESTEnv() will throw a validation error unless:
+     *   - The specified epsilon is `0` or positive.
+     *   - The specified epsilon does not exceed the maximum or minimum value which can be stored
+     *     in a `qreal`, which is specific to its precision.
+     * 
+     * @author Tyson Jones
+     */
+    const qreal DEFAULT_VALIDATION_EPSILON = 0;
+
+
+#endif
+
+
+
 // user flags for choosing automatic deployment; only accessible by C++ 
 // backend and C++ users; C users must hardcode -1 
 
diff --git a/quest/include/precision.h b/quest/include/precision.h
index cfd150855..f7a18e416 100644
--- a/quest/include/precision.h
+++ b/quest/include/precision.h
@@ -121,34 +121,19 @@
 
 
 /*
- * RE-CONFIGURABLE DEFAULT VALIDATION PRECISION
+ * DEFAULT VALIDATION PRECISION
  *
- * which is compile-time overridable by pre-defining DEFAULT_VALIDATION_EPSILON (e.g. 
- * in user code before importing QuEST, or passed as a preprocessor constant by the
- * compiler using argument -D), and runtime overridable using setValidationEpsilon()
+ * which is pre-run-time overridable by specifying the corresponding environment variable.
  */
 
-#ifndef DEFAULT_VALIDATION_EPSILON
-
-    #if FLOAT_PRECISION == 1
-        #define DEFAULT_VALIDATION_EPSILON 1E-5
-
-    #elif FLOAT_PRECISION == 2
-        #define DEFAULT_VALIDATION_EPSILON 1E-12
-
-    #elif FLOAT_PRECISION == 4
-        #define DEFAULT_VALIDATION_EPSILON 1E-13
-
-    #endif
-
-#endif
+#if FLOAT_PRECISION == 1
+    #define UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-5
 
-// spoofing above macros as typedefs and consts to doc
-#if 0
+#elif FLOAT_PRECISION == 2
+    #define UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-12
 
-    /// @notyetdoced
-    /// @macrodoc
-    const qreal DEFAULT_VALIDATION_EPSILON = 1E-12;
+#elif FLOAT_PRECISION == 4
+    #define UNSPECIFIED_DEFAULT_VALIDATION_EPSILON 1E-13
 
 #endif
 
diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index 63b6f41ef..6eef515c4 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -11,7 +11,9 @@
 
 #include "quest/src/core/errors.hpp"
 #include "quest/src/core/memory.hpp"
+#include "quest/src/core/parser.hpp"
 #include "quest/src/core/printer.hpp"
+#include "quest/src/core/envvars.hpp"
 #include "quest/src/core/autodeployer.hpp"
 #include "quest/src/core/validation.hpp"
 #include "quest/src/core/randomiser.hpp"
@@ -75,6 +77,9 @@ void validateAndInitCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMulti
     // this leads to undefined behaviour in distributed mode, as per the MPI
     validate_envNeverInit(globalEnvPtr != nullptr, hasEnvBeenFinalized, caller);
 
+    envvars_validateAndLoadEnvVars(caller);
+    validateconfig_setEpsilonToDefault();
+
     // ensure the chosen deployment is compiled and supported by hardware.
     // note that these error messages will be printed by every node because
     // validation occurs before comm_init() below, so all processes spawned
@@ -102,12 +107,17 @@ void validateAndInitCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMulti
     if (useGpuAccel)
         gpu_bindLocalGPUsToNodes();
 
-    // each MPI process must use a unique GPU. This is critical when
-    // initializing cuQuantum, so we don't re-init cuStateVec on any
-    // paticular GPU (causing runtime error), but still ensures we 
-    // keep good performance in our custom backend GPU code; there is
-    // no reason to use multi-nodes-per-GPU except for dev/debugging.
-    if (useGpuAccel && useDistrib && ! PERMIT_NODES_TO_SHARE_GPU)
+    // consult environment variable to decide whether to allow GPU sharing 
+    // (default = false) which informs whether below validation is triggered
+    bool permitGpuSharing = envvars_getWhetherGpuSharingIsPermitted();
+
+    // each MPI process should ordinarily use a unique GPU. This is 
+    // critical when initializing cuQuantum so that we don't re-init 
+    // cuStateVec on any particular GPU (which can apparently cause a
+    // so-far-unwitnessed runtime error), but is otherwise essential
+    // for good performance. GPU sharing is useful for unit testing,
+    // however, since it permits one GPU to test CUDA+MPI deployment
+    if (useGpuAccel && useDistrib && ! permitGpuSharing)
         validate_newEnvNodesEachHaveUniqueGpu(caller);
 
     /// @todo
@@ -132,10 +142,11 @@ void validateAndInitCustomQuESTEnv(int useDistrib, int useGpuAccel, int useMulti
         error_allocOfQuESTEnvFailed();
 
     // bind deployment info to global instance
-    globalEnvPtr->isMultithreaded    = useMultithread;
-    globalEnvPtr->isGpuAccelerated   = useGpuAccel;
-    globalEnvPtr->isDistributed      = useDistrib;
-    globalEnvPtr->isCuQuantumEnabled = useCuQuantum;
+    globalEnvPtr->isMultithreaded     = useMultithread;
+    globalEnvPtr->isGpuAccelerated    = useGpuAccel;
+    globalEnvPtr->isDistributed       = useDistrib;
+    globalEnvPtr->isCuQuantumEnabled  = useCuQuantum;
+    globalEnvPtr->isGpuSharingEnabled = permitGpuSharing;
 
     // bind distributed info
     globalEnvPtr->rank     = (useDistrib)? comm_getRank()     : 0;
@@ -188,10 +199,11 @@ void printDeploymentInfo() {
 
     print_table(
         "deployment", {
-        {"isMpiEnabled",       globalEnvPtr->isDistributed},
-        {"isGpuEnabled",       globalEnvPtr->isGpuAccelerated},
-        {"isOmpEnabled",       globalEnvPtr->isMultithreaded},
-        {"isCuQuantumEnabled", globalEnvPtr->isCuQuantumEnabled},
+        {"isMpiEnabled",        globalEnvPtr->isDistributed},
+        {"isGpuEnabled",        globalEnvPtr->isGpuAccelerated},
+        {"isOmpEnabled",        globalEnvPtr->isMultithreaded},
+        {"isCuQuantumEnabled",  globalEnvPtr->isCuQuantumEnabled},
+        {"isGpuSharingEnabled", globalEnvPtr->isGpuSharingEnabled},
     });
 }
 
diff --git a/quest/src/comm/comm_routines.cpp b/quest/src/comm/comm_routines.cpp
index 3c03d23f6..6e161db18 100644
--- a/quest/src/comm/comm_routines.cpp
+++ b/quest/src/comm/comm_routines.cpp
@@ -76,6 +76,12 @@ using std::vector;
  * 
  * - look into UCX CUDA multi-rail:
  *   https://docs.nvidia.com/networking/display/hpcxv215/unified+communication+-+x+framework+library#src-119764120_UnifiedCommunicationXFrameworkLibrary-Multi-RailMulti-Rail 
+ * 
+ * - by default, we validate to prevent sharing a GPU between multiple MPI processes since it is
+ *   easy to do unintentionally yet is rarely necessary (outside of unit testing) and can severely 
+ *   degrade performance. If we motivated a strong non-testing use-case for this however, we could
+ *   improve performance through use of CUDA's Multi-Process Service (MPS) which will prevent
+ *   serialisation of memcpy to distinct memory partitions and improve kernel scheduling.  
  */
 
 
diff --git a/quest/src/core/CMakeLists.txt b/quest/src/core/CMakeLists.txt
index e498d4569..9d11d16d7 100644
--- a/quest/src/core/CMakeLists.txt
+++ b/quest/src/core/CMakeLists.txt
@@ -4,6 +4,7 @@ target_sources(QuEST
   PRIVATE
   accelerator.cpp
   autodeployer.cpp
+  envvars.cpp
   errors.cpp
   localiser.cpp
   memory.cpp
diff --git a/quest/src/core/envvars.cpp b/quest/src/core/envvars.cpp
new file mode 100644
index 000000000..c88647e0e
--- /dev/null
+++ b/quest/src/core/envvars.cpp
@@ -0,0 +1,159 @@
+/** @file
+ * Functions for loading environment variables, useful for
+ * configuring QuEST ahead of calling initQuESTEnv(), after
+ * compilation.
+ * 
+ * @author Tyson Jones
+ */
+
+#include "quest/include/precision.h"
+#include "quest/include/types.h"
+
+#include "quest/src/core/envvars.hpp"
+#include "quest/src/core/errors.hpp"
+#include "quest/src/core/parser.hpp"
+#include "quest/src/core/validation.hpp"
+
+#include <string>
+#include <cstdlib>
+
+using std::string;
+
+
+
+/*
+ * FIXED ENV-VAR NAMES
+ */
+
+
+namespace envvar_names {
+    string PERMIT_NODES_TO_SHARE_GPU = "PERMIT_NODES_TO_SHARE_GPU";
+    string DEFAULT_VALIDATION_EPSILON = "DEFAULT_VALIDATION_EPSILON";
+}
+
+
+
+/*
+ * USER-OVERRIDABLE DEFAULT ENV-VAR VALUES
+ */
+
+
+namespace envvar_values {
+
+    // by default, do not permit GPU sharing since it sabotages performance
+    // and should only ever be carefully, deliberately enabled
+    bool PERMIT_NODES_TO_SHARE_GPU = false;
+
+    // by default, the initial validation epsilon (before being overridden
+    // by users at runtime) should depend on qreal (i.e. FLOAT_PRECISION)
+    qreal DEFAULT_VALIDATION_EPSILON = UNSPECIFIED_DEFAULT_VALIDATION_EPSILON;
+}
+
+
+// indicates whether envvars_validateAndLoadEnvVars() has been called
+bool global_areEnvVarsLoaded = false;
+
+
+
+/*
+ * PRIVATE UTILITIES
+ */
+
+
+bool isEnvVarSpecified(string name) {
+
+    // note var="" is considered unspecified, but var=" " is specified
+    const char* ptr = std::getenv(name.c_str());
+    return (ptr != nullptr) && (ptr[0] != '\0');
+}
+
+
+string getSpecifiedEnvVarValue(string name) {
+
+    // assumes isEnvVarSpecified returned true
+    // (calling getenv() a second time is fine)
+    return std::string(std::getenv(name.c_str()));
+}
+
+
+void assertEnvVarsAreLoaded() {
+
+    if (!global_areEnvVarsLoaded)
+        error_envVarsNotYetLoaded();
+}
+
+
+
+/*
+ * PRIVATE BESPOKE ENV-VAR LOADERS
+ *
+ * which we have opted to not-yet make generic 
+ * (e.g. for each type) since YAGNI
+ */
+
+
+void validateAndSetWhetherGpuSharingIsPermitted(const char* caller) {
+
+    // permit unspecified, falling back to default value
+    string name = envvar_names::PERMIT_NODES_TO_SHARE_GPU;
+    if (!isEnvVarSpecified(name))
+        return;
+
+    // otherwise ensure value == '0' or '1' precisely (no whitespace)
+    string value = getSpecifiedEnvVarValue(name);
+    validate_envVarPermitNodesToShareGpu(value, caller);
+
+    // overwrite default env-var value
+    envvar_values::PERMIT_NODES_TO_SHARE_GPU = (value[0] == '1');
+}
+
+
+void validateAndSetDefaultValidationEpsilon(const char* caller) {
+
+    // permit unspecified, falling back to the hardcoded precision-specific default
+    string name = envvar_names::DEFAULT_VALIDATION_EPSILON;
+    if (!isEnvVarSpecified(name))
+        return;
+    
+    // otherwise, validate user passed a non-negative real number
+    string value = getSpecifiedEnvVarValue(name);
+    validate_envVarDefaultValidationEpsilon(value, caller);
+
+    // overwrite default env-var value
+    envvar_values::DEFAULT_VALIDATION_EPSILON = parser_parseReal(value);    
+}
+
+
+
+/*
+ * PUBLIC
+ */
+
+
+void envvars_validateAndLoadEnvVars(const char* caller) {
+
+    // error if loaded twice since this indicates spaghetti
+    if (global_areEnvVarsLoaded)
+        error_envVarsAlreadyLoaded();
+
+    // load all env-vars
+    validateAndSetWhetherGpuSharingIsPermitted(caller);
+    validateAndSetDefaultValidationEpsilon(caller);
+
+    // ensure no re-loading
+    global_areEnvVarsLoaded = true;
+}
+
+
+bool envvars_getWhetherGpuSharingIsPermitted() {
+    assertEnvVarsAreLoaded();
+
+    return envvar_values::PERMIT_NODES_TO_SHARE_GPU;
+}
+
+
+qreal envvars_getDefaultValidationEpsilon() {
+    assertEnvVarsAreLoaded();
+
+    return envvar_values::DEFAULT_VALIDATION_EPSILON;
+}
diff --git a/quest/src/core/envvars.hpp b/quest/src/core/envvars.hpp
new file mode 100644
index 000000000..828d5605e
--- /dev/null
+++ b/quest/src/core/envvars.hpp
@@ -0,0 +1,39 @@
+/** @file
+ * Functions for loading environment variables, useful for
+ * configuring QuEST ahead of calling initQuESTEnv(), after
+ * compilation.
+ * 
+ * @author Tyson Jones
+ */
+
+#ifndef ENVVARS_HPP
+#define ENVVARS_HPP
+
+#include "quest/include/types.h"
+
+#include <string>
+
+
+namespace envvar_names { 
+    extern std::string PERMIT_NODES_TO_SHARE_GPU;
+    extern std::string DEFAULT_VALIDATION_EPSILON;
+}
+
+
+/*
+ * LOAD VARS
+ */
+
+void envvars_validateAndLoadEnvVars(const char* caller);
+
+
+/*
+ * GET VAR
+ */
+
+bool envvars_getWhetherGpuSharingIsPermitted();
+
+qreal envvars_getDefaultValidationEpsilon();
+
+
+#endif // ENVVARS_HPP
diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp
index a2c2649ca..2f44127c8 100644
--- a/quest/src/core/errors.cpp
+++ b/quest/src/core/errors.cpp
@@ -818,3 +818,19 @@ void assert_printerGivenPositiveNumNewlines() {
     if (printer_getNumTrailingNewlines() < min)
         raiseInternalError("A printer utility attempted to print one fewer than the user-set number of trailing newlines; but that number was zero! This violates prior validation.");
 }
+
+
+
+/*
+ * ENVIRONMENT VARIABLE ERRORS
+ */
+
+void error_envVarsNotYetLoaded() {
+
+    raiseInternalError("An environment variable was queried but all environment variables have not yet been loaded.");
+}
+
+void error_envVarsAlreadyLoaded() {
+
+    raiseInternalError("All environment variables were already loaded and validated yet re-loading was attempted.");
+}
diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp
index 8c39ee756..ce8f7e68c 100644
--- a/quest/src/core/errors.hpp
+++ b/quest/src/core/errors.hpp
@@ -339,4 +339,14 @@ void assert_printerGivenPositiveNumNewlines();
 
 
 
+/*
+ * ENVIRONMENT VARIABLE ERRORS
+ */
+
+void error_envVarsNotYetLoaded();
+
+void error_envVarsAlreadyLoaded();
+
+
+
 #endif // ERRORS_HPP
\ No newline at end of file
diff --git a/quest/src/core/parser.cpp b/quest/src/core/parser.cpp
index 31dad4e5f..8884acc4c 100644
--- a/quest/src/core/parser.cpp
+++ b/quest/src/core/parser.cpp
@@ -10,6 +10,7 @@
  * @author Tyson Jones
  */
 
+#include "quest/include/precision.h"
 #include "quest/include/types.h"
 #include "quest/include/paulis.h"
 
@@ -25,7 +26,6 @@
 #include <stdexcept>
 #include <algorithm>
 
-using std::stold;
 using std::regex;
 using std::vector;
 using std::string;
@@ -82,9 +82,9 @@ namespace patterns {
     string num = group(comp) + "|" + group(imag) + "|" + group(real);
 
     // no capturing because 'num' pollutes captured groups, and pauli syntax overlaps real integers
-    string pauli  = "[" + parser_RECOGNISED_PAULI_CHARS + "]";
+    string pauli = "[" + parser_RECOGNISED_PAULI_CHARS + "]";
     string paulis = group(optSpace + pauli + optSpace) + "+";
-    string line   = "^" + group(num) + space + optSpace + paulis + "$";
+    string weightedPaulis = "^" + group(num) + space + optSpace + paulis + "$";
 }
 
 
@@ -95,8 +95,8 @@ namespace regexes {
     regex imag(patterns::imag);
     regex comp(patterns::comp);
     regex num(patterns::num);
-    regex line(patterns::line);
     regex paulis(patterns::paulis);
+    regex weightedPaulis(patterns::weightedPaulis);
 }
 
 
@@ -172,6 +172,165 @@ int getNumPaulisInLine(string line) {
 
 
 
+/*
+ * REAL NUMBER PARSING
+ */
+
+
+qreal precisionAgnosticStringToFloat(string str) {
+
+    // remove whitespace which stold() et al cannot handle after the sign.
+    // beware this means that e.g. "1 0" (invalid number) would become "10"
+    // (valid) so this function cannot be used for duck-typing, though that
+    // is anyway the case since stold() et al permit "10abc"
+    removeWhiteSpace(str);
+
+    // below throws exception when the (prefix) of str cannot be/fit into a qreal
+    if (FLOAT_PRECISION == 1) return static_cast<qreal>(std::stof (str));
+    if (FLOAT_PRECISION == 2) return static_cast<qreal>(std::stod (str));
+    if (FLOAT_PRECISION == 4) return static_cast<qreal>(std::stold(str));
+
+    // unreachable
+    return -1;
+}
+
+
+bool parser_isAnySizedReal(string str) {
+
+    // we assume that all strings which match the regex can be parsed by
+    // precisionAgnosticStringToFloat() above (once whitespace is removed)
+    // EXCEPT strings which contain a number too large to store in the qreal 
+    // type (as is separately checked below). Note it is insufficient to merely 
+    // duck-type using stold() et al because such functions permit non-numerical 
+    // characters to follow the contained number which are silently removed (grr!)
+    smatch match;
+    return regex_match(str, match, regexes::real);
+}
+
+
+bool parser_isValidReal(string str) {
+
+    // reject str if it doesn't match regex
+    if (!parser_isAnySizedReal(str))
+        return false;
+
+    // check number is in-range of qreal via duck-typing
+    try {
+        precisionAgnosticStringToFloat(str);
+    } catch (const out_of_range&) {
+        return false;
+
+    // error if our regex permitted an unparsable string
+    } catch (const invalid_argument&) {
+        error_attemptedToParseRealFromInvalidString();
+    }
+
+    return true;
+}
+
+
+qreal parser_parseReal(string str) {
+
+    try {
+        return precisionAgnosticStringToFloat(str);
+    } catch (const invalid_argument&) {
+        error_attemptedToParseRealFromInvalidString();
+    } catch (const out_of_range&) {
+        error_attemptedToParseOutOfRangeReal();
+    }
+
+    // unreachable
+    return -1;
+}
+
+
+
+/*
+ * COMPLEX NUMBER PARSING
+ */
+
+
+bool parser_isAnySizedComplex(string str) {
+
+    // we assume that all strings which match the regex can be parsed to
+    // a qcomp (once whitespace is removed) EXCEPT strings which contain a 
+    // number too large to store in the qcomp type (as is separately checked 
+    // below). Note it is insufficient to merely duck-type each component using
+    // stold() et al because such functions permit non-numerical chars to 
+    // follow the contained number (grr!)
+    smatch match;
+
+    // must match real, imaginary or complex number regex
+    if (regex_match(str, match, regexes::real)) return true;
+    if (regex_match(str, match, regexes::imag)) return true;
+    if (regex_match(str, match, regexes::comp)) return true;
+
+    return false;
+}
+
+
+bool parser_isValidComplex(string str) {
+
+    // reject str if it doesn't match complex regex
+    if (!parser_isAnySizedComplex(str))
+        return false;
+
+    // we've so far guaranteed str has a valid form, but we must now check 
+    // each included complex component (which we enumerate) is in range of a qreal
+    sregex_iterator it(str.begin(), str.end(), regexes::real);
+    sregex_iterator end;
+
+    // valid coeffs contain 1 or 2 reals, never 0, which regex should have caught
+    if (it == end)
+        error_attemptedToParseComplexFromInvalidString();
+
+    // for each of the 1 or 2 components...
+    for (; it != end; it++) {
+
+        // check component is in-range of qreal via duck-typing
+        try {
+            precisionAgnosticStringToFloat(it->str(0));
+        } catch (const out_of_range&) {
+            return false;
+
+        // error if our regex permitted an unparsable component
+        } catch (const invalid_argument&) {
+            error_attemptedToParseComplexFromInvalidString();
+        }
+    }
+
+    // report that each/all detected components of str can form a valid qcomp
+    return true;
+}
+
+
+qcomp parser_parseComplex(string str) {
+
+    if (!parser_isValidComplex(str))
+        error_attemptedToParseComplexFromInvalidString();
+
+    // we are guaranteed to fully match real, imag or comp after prior validation
+    smatch match;
+
+    // extract and parse components and their signs (excluding imaginary symbol)
+    if (regex_match(str, match, regexes::real))
+        return qcomp(parser_parseReal(match.str(1)), 0);
+
+    if (regex_match(str, match, regexes::imag))
+        return qcomp(0, parser_parseReal(match.str(1)));
+
+    if (regex_match(str, match, regexes::comp))
+        return qcomp(
+            parser_parseReal(match.str(1)),
+            parser_parseReal(match.str(2)));
+    
+    // should be unreachable
+    error_attemptedToParseComplexFromInvalidString();
+    return qcomp(0,0); 
+}
+
+
+
 /*
  * VALIDATION
  *
@@ -188,15 +347,14 @@ bool isInterpretablePauliStrSumLine(string line) {
     // notation) followed by 1 or more space characters, then one or
     // more pauli codes/chars. It does NOT determine whether the coeff
     // can actually be instantiated as a qcomp
-    return regex_match(line, regexes::line);
+    return regex_match(line, regexes::weightedPaulis);
 }
 
 
-bool isCoeffValidInPauliStrSumLine(string line) {
+bool isPauliStrSumCoeffWithinQcompRange(string line) {
 
     // it is gauranteed that line is interpretable and contains a regex-matching
-    // coefficient, but we must additionally verify it is within range of stold,
-    // and isn't unexpectedly incompatible with stold in a way uncaptured by regex.
+    // coefficient, but we must additionally verify it is within range of qreal.
     // So we duck type each of the 1 or 2 matches with the real regex (i.e. one or 
     // both of the real and imaginary components of a complex coeff).
 
@@ -215,17 +373,17 @@ bool isCoeffValidInPauliStrSumLine(string line) {
     // enumerate all matches of 'real' regex in line
     for (; it != end; it++) {
 
-        // removed whitespace (stold cannot handle space between sign and number)
+        // remove whitespace (stold cannot handle space between sign and number)
         string match = it->str(0);
         removeWhiteSpace(match);
 
-        // return false if stold cannot parse the real as a long double
+        // return false if number cannot become a qreal
         try {
-            stold(match);
-        } catch (const invalid_argument&) {
-            return false;
+            precisionAgnosticStringToFloat(match);
         } catch (const out_of_range&) {
             return false;
+        } catch (const invalid_argument&) { // should be impossible (indicates bad regex)
+            return false;
         }
     }
 
@@ -256,8 +414,8 @@ void assertStringIsValidPauliStrSum(string lines, const char* caller) {
         validate_parsedPauliStrSumLineIsInterpretable(validLine, line, lineIndex, caller);
 
         // assert the coeff is parsable (e.g. doesn't exceed valid number range)
-        bool validCoeff = isCoeffValidInPauliStrSumLine(line);
-        validate_parsedPauliStrSumCoeffIsValid(validCoeff, line, lineIndex, caller);
+        bool validCoeff = isPauliStrSumCoeffWithinQcompRange(line);
+        validate_parsedPauliStrSumCoeffWithinQcompRange(validCoeff, line, lineIndex, caller);
 
         // assert the line has a consistent number of Paulis as previous
         int numLinePaulis = getNumPaulisInLine(line);
@@ -299,52 +457,6 @@ int parser_getPauliIntFromChar(char ch) {
  */
 
 
-qreal parseReal(string real) {
-    
-    // attempt to parse at max precision (long double) then cast down if necessary
-    try {
-        return static_cast<qreal>(stold(real));
-
-    // should be impossible if regex and validation works correctly
-    } catch (const invalid_argument&) {
-        error_attemptedToParseRealFromInvalidString();
-
-    // should be prior caught by validation
-    } catch (const out_of_range&) {
-        error_attemptedToParseOutOfRangeReal();
-    }
-
-    // unreachable
-    return -1;
-}
-
-
-qcomp parseCoeff(string coeff) {
-
-    // remove all superfluous spaces in coeff so stold is happy (it cannot tolerate spaces after +-)
-    removeWhiteSpace(coeff);
-
-    // we are gauranteed to fully match real, imag or comp after prior validation
-    smatch match;
-
-    // extract and parse components and their signs (excluding imaginary symbol)
-    if (regex_match(coeff, match, regexes::real))
-        return qcomp(parseReal(match.str(1)), 0);
-
-    if (regex_match(coeff, match, regexes::imag))
-        return qcomp(0, parseReal(match.str(1)));
-
-    if (regex_match(coeff, match, regexes::comp))
-        return qcomp(
-            parseReal(match.str(1)),
-            parseReal(match.str(2)));
-    
-    // should be unreachable
-    error_attemptedToParseComplexFromInvalidString();
-    return qcomp(0,0); 
-}
-
-
 PauliStr parsePaulis(string paulis, bool rightIsLeastSignificant) {
 
     // remove whitespace to make string compatible with getPauliStr()
@@ -363,14 +475,14 @@ PauliStr parsePaulis(string paulis, bool rightIsLeastSignificant) {
 }
 
 
-void parseLine(string line, qcomp &coeff, PauliStr &pauli, bool rightIsLeastSignificant) {
+void parseWeightedPaulis(string line, qcomp &coeff, PauliStr &pauli, bool rightIsLeastSignificant) {
 
     // separate line into substrings
     string coeffStr, pauliStr;
     separateStringIntoCoeffAndPaulis(line, coeffStr, pauliStr);
 
     // parse each, overwriting calller primitives
-    coeff = parseCoeff(coeffStr);
+    coeff = parser_parseComplex(coeffStr);
     pauli = parsePaulis(pauliStr, rightIsLeastSignificant);
 }
 
@@ -402,7 +514,7 @@ PauliStrSum parser_validateAndParsePauliStrSum(string lines, bool rightIsLeastSi
 
         qcomp coeff;
         PauliStr string;
-        parseLine(line, coeff, string, rightIsLeastSignificant); // validates
+        parseWeightedPaulis(line, coeff, string, rightIsLeastSignificant); // validates
 
         coeffs.push_back(coeff);
         strings.push_back(string);
diff --git a/quest/src/core/parser.hpp b/quest/src/core/parser.hpp
index 3e9d18c11..4a9df2d02 100644
--- a/quest/src/core/parser.hpp
+++ b/quest/src/core/parser.hpp
@@ -7,6 +7,7 @@
 #ifndef PARSER_HPP
 #define PARSER_HPP
 
+#include "quest/include/types.h"
 #include "quest/include/paulis.h"
 
 #include <string>
@@ -15,6 +16,21 @@ using std::string;
 
 
 
+/*
+ * PARSING NUMBERS
+ */
+
+bool parser_isAnySizedReal(string str);
+bool parser_isAnySizedComplex(string str);
+
+bool parser_isValidReal(string str);
+bool parser_isValidComplex(string str);
+
+qreal parser_parseReal(string str);
+qcomp parser_parseComplex(string str);
+
+
+
 /*
  * PARSING INDIVIDUAL PAULIS
  */
diff --git a/quest/src/core/validation.cpp b/quest/src/core/validation.cpp
index 4b3406975..3f242e6df 100644
--- a/quest/src/core/validation.cpp
+++ b/quest/src/core/validation.cpp
@@ -23,6 +23,7 @@
 #include "quest/src/core/utilities.hpp"
 #include "quest/src/core/parser.hpp"
 #include "quest/src/core/printer.hpp"
+#include "quest/src/core/envvars.hpp"
 #include "quest/src/comm/comm_config.hpp"
 #include "quest/src/comm/comm_routines.hpp"
 #include "quest/src/cpu/cpu_config.hpp"
@@ -31,6 +32,7 @@
 #include <algorithm>
 #include <iostream>
 #include <cstdlib>
+#include <cstring>
 #include <string>
 #include <vector>
 #include <map>
@@ -708,8 +710,8 @@ namespace report {
     string PARSED_PAULI_STR_SUM_INCONSISTENT_NUM_PAULIS_IN_LINE =
         "Line ${LINE_NUMBER} specified ${NUM_LINE_PAULIS} Pauli operators which is inconsistent with the number of Paulis of the previous lines (${NUM_PAULIS}).";
 
-    string PARSED_PAULI_STR_SUM_COEFF_IS_INVALID =
-        "The coefficient of line ${LINE_NUMBER} could not be converted to a qcomp, possibly due to it exceeding the valid numerical range.";
+    string PARSED_PAULI_STR_SUM_COEFF_EXCEEDS_QCOMP_RANGE =
+        "The coefficient of line ${LINE_NUMBER} is a valid floating-point number but exceeds the range which can be stored in a qcomp. Consider increasing FLOAT_PRECISION.";
 
     string PARSED_STRING_IS_EMPTY =
         "The given string was empty (contained only whitespace characters) and could not be parsed.";
@@ -1066,6 +1068,22 @@ namespace report {
     string TEMP_ALLOC_FAILED =
         "A temporary allocation of ${NUM_ELEMS} elements (each of ${NUM_BYTES_PER_ELEM} bytes) failed, possibly because of insufficient memory.";
 
+
+    /*
+     * ENVIRONMENT VARIABLES
+     */
+
+    string INVALID_PERMIT_NODES_TO_SHARE_GPU_ENV_VAR =
+        "The optional, boolean '" + envvar_names::PERMIT_NODES_TO_SHARE_GPU + "' environment variable was specified to an invalid value. The variable can be unspecified, or set to '', '0' or '1'.";
+
+    string DEFAULT_EPSILON_ENV_VAR_NOT_A_REAL =
+        "The optional '" + envvar_names::DEFAULT_VALIDATION_EPSILON + "' environment variable was not a recognisable real number.";
+
+    string DEFAULT_EPSILON_ENV_VAR_EXCEEDS_QREAL_RANGE = 
+        "The optional '" + envvar_names::DEFAULT_VALIDATION_EPSILON + "' environment variable was larger (in magnitude) than the maximum value which can be stored in a qreal.";
+
+    string DEFAULT_EPSILON_ENV_VAR_IS_NEGATIVE =
+        "The optional '" + envvar_names::DEFAULT_VALIDATION_EPSILON + "' environment variable was negative. The value must be zero or positive.";
 }
 
 
@@ -1153,13 +1171,18 @@ qreal REDUCTION_EPSILON_FACTOR = 100;
  * overwritten (so will stay validate_STRUCT_PROPERTY_UNKNOWN_FLAG)
  */
 
-static qreal global_validationEpsilon = DEFAULT_VALIDATION_EPSILON;
+// the default epsilon is not known until runtime since the macro
+// UNSPECIFIED_DEFAULT_VALIDATION_EPSILON may be overridden by the
+// DEFAULT_VALIDATION_EPSILON environment variable. We do not read
+// the env-var immediately since it may be malformed; we must wait for
+// initQuESTEnv() to validate and potentially throw an error
+static qreal global_validationEpsilon = -1; // must be overriden
 
 void validateconfig_setEpsilon(qreal eps) {
     global_validationEpsilon = eps;
 }
 void validateconfig_setEpsilonToDefault() {
-    global_validationEpsilon = DEFAULT_VALIDATION_EPSILON;
+    global_validationEpsilon = envvars_getDefaultValidationEpsilon();
 }
 qreal validateconfig_getEpsilon() {
     return global_validationEpsilon;
@@ -1364,13 +1387,8 @@ void validate_newEnvDistributedBetweenPower2Nodes(const char* caller) {
 
 void validate_newEnvNodesEachHaveUniqueGpu(const char* caller) {
 
-    // this validation can be disabled for debugging/dev purposes
-    // (caller should explicitly check this preprocessor too for clarity)
-    if (PERMIT_NODES_TO_SHARE_GPU)
-        return;
-
-    bool uniqueGpus = ! gpu_areAnyNodesBoundToSameGpu();
-    assertAllNodesAgreeThat(uniqueGpus, report::MULTIPLE_NODES_BOUND_TO_SAME_GPU, caller);
+    bool sharedGpus = gpu_areAnyNodesBoundToSameGpu();
+    assertAllNodesAgreeThat(!sharedGpus, report::MULTIPLE_NODES_BOUND_TO_SAME_GPU, caller);
 }
 
 void validate_gpuIsCuQuantumCompatible(const char* caller) {
@@ -3234,12 +3252,12 @@ void validate_parsedPauliStrSumLineHasConsistentNumPaulis(int numPaulis, int num
     assertThat(numPaulis == numLinePaulis, report::PARSED_PAULI_STR_SUM_INCONSISTENT_NUM_PAULIS_IN_LINE, vars, caller);
 }
 
-void validate_parsedPauliStrSumCoeffIsValid(bool isCoeffValid, string line, qindex lineIndex, const char* caller) {
+void validate_parsedPauliStrSumCoeffWithinQcompRange(bool isCoeffValid, string line, qindex lineIndex, const char* caller) {
 
     /// @todo we cannot yet report 'line' because tokenSubs so far only accepts integers :(
 
     tokenSubs vars = {{"${LINE_NUMBER}", lineIndex + 1}}; // lines begin at 1
-    assertThat(isCoeffValid, report::PARSED_PAULI_STR_SUM_COEFF_IS_INVALID, vars, caller);
+    assertThat(isCoeffValid, report::PARSED_PAULI_STR_SUM_COEFF_EXCEEDS_QCOMP_RANGE, vars, caller);
 }
 
 void validate_parsedStringIsNotEmpty(bool stringIsNotEmpty, const char* caller) {
@@ -4165,3 +4183,26 @@ void validate_tempAllocSucceeded(bool succeeded, qindex numElems, qindex numByte
 
     assertThat(succeeded, report::TEMP_ALLOC_FAILED, vars, caller);
 }
+
+
+
+/*
+ * ENVIRONMENT VARIABLES
+ */
+
+void validate_envVarPermitNodesToShareGpu(string varValue, const char* caller) {
+
+    // though caller should guarantee varValue contains at least one character, 
+    // we'll still check to avoid a segfault if this guarantee is broken
+    bool isValid = (varValue.size() == 1) && (varValue[0] == '0' || varValue[0] == '1');
+    assertThat(isValid, report::INVALID_PERMIT_NODES_TO_SHARE_GPU_ENV_VAR, caller);
+}
+
+void validate_envVarDefaultValidationEpsilon(string varValue, const char* caller) {
+
+    assertThat(parser_isAnySizedReal(varValue), report::DEFAULT_EPSILON_ENV_VAR_NOT_A_REAL, caller);
+    assertThat(parser_isValidReal(varValue), report::DEFAULT_EPSILON_ENV_VAR_EXCEEDS_QREAL_RANGE, caller);
+
+    qreal eps = parser_parseReal(varValue);
+    assertThat(eps >= 0, report::DEFAULT_EPSILON_ENV_VAR_IS_NEGATIVE, caller);
+}
diff --git a/quest/src/core/validation.hpp b/quest/src/core/validation.hpp
index 0bf48b409..92baac843 100644
--- a/quest/src/core/validation.hpp
+++ b/quest/src/core/validation.hpp
@@ -312,7 +312,7 @@ void validate_newPauliStrSumAllocs(PauliStrSum sum, qindex numBytesStrings, qind
 
 void validate_parsedPauliStrSumLineIsInterpretable(bool isInterpretable, string line, qindex lineIndex, const char* caller);
 
-void validate_parsedPauliStrSumCoeffIsValid(bool isCoeffValid, string line, qindex lineIndex, const char* caller);
+void validate_parsedPauliStrSumCoeffWithinQcompRange(bool isCoeffValid, string line, qindex lineIndex, const char* caller);
 
 void validate_parsedPauliStrSumLineHasConsistentNumPaulis(int numPaulis, int numLinePaulis, string line, qindex lineIndex, const char* caller);
 
@@ -488,7 +488,6 @@ void validate_densMatrExpecDiagMatrValueIsReal(qcomp value, qcomp exponent, cons
  * PARTIAL TRACE
  */
 
-
 void validate_quregCanBeReduced(Qureg qureg, int numTraceQubits, const char* caller);
 
 void validate_quregCanBeSetToReducedDensMatr(Qureg out, Qureg in, int numTraceQubits, const char* caller);
@@ -511,4 +510,14 @@ void validate_tempAllocSucceeded(bool succeeded, qindex numElems, qindex numByte
 
 
 
+/*
+ * ENVIRONMENT VARIABLES
+ */
+
+void validate_envVarPermitNodesToShareGpu(string varValue, const char* caller);
+
+void validate_envVarDefaultValidationEpsilon(string varValue, const char* caller);
+
+
+
 #endif // VALIDATION_HPP
\ No newline at end of file
diff --git a/quest/src/gpu/gpu_cuquantum.cuh b/quest/src/gpu/gpu_cuquantum.cuh
index 9f80881a1..3b1c55fdb 100644
--- a/quest/src/gpu/gpu_cuquantum.cuh
+++ b/quest/src/gpu/gpu_cuquantum.cuh
@@ -134,9 +134,10 @@ int deallocMemInPool(void* ctx, void* ptr, size_t size, cudaStream_t stream) {
 void gpu_initCuQuantum() {
 
     // the cuStateVec docs say custatevecCreate() should be called
-    // once per physical GPU, though oversubscribing MPI processes
-    // while setting PERMIT_NODES_TO_SHARE_GPU=1 worked fine in our
-    // testing - we will treat it as tolerable but undefined behaviour
+    // once per physical GPU, though assigning multiple MPI processes
+    // to each GPU with each calling custatevecCreate() below worked
+    // fine in our testing. We here tolerate oversubscription, letting
+    // prior validation prevent it (disabled by an environment variable)
 
     // create new stream and cuQuantum handle, binding to global config
     CUDA_CHECK( custatevecCreate(&config.handle) );
diff --git a/tests/main.cpp b/tests/main.cpp
index 03294857a..29647753e 100644
--- a/tests/main.cpp
+++ b/tests/main.cpp
@@ -88,13 +88,14 @@ class startListener : public Catch::EventListenerBase {
         QuESTEnv env = getQuESTEnv();
         std::cout << std::endl;
         std::cout << "QuEST execution environment:" << std::endl;
-        std::cout << "  precision:       " << FLOAT_PRECISION        << std::endl;
-        std::cout << "  multithreaded:   " << env.isMultithreaded    << std::endl;
-        std::cout << "  distributed:     " << env.isDistributed      << std::endl;
-        std::cout << "  GPU-accelerated: " << env.isGpuAccelerated   << std::endl;
-        std::cout << "  cuQuantum:       " << env.isCuQuantumEnabled << std::endl;
-        std::cout << "  num nodes:       " << env.numNodes           << std::endl;
-        std::cout << "  num qubits:      " << getNumCachedQubits()   << std::endl;
+        std::cout << "  precision:       " << FLOAT_PRECISION         << std::endl;
+        std::cout << "  multithreaded:   " << env.isMultithreaded     << std::endl;
+        std::cout << "  distributed:     " << env.isDistributed       << std::endl;
+        std::cout << "  GPU-accelerated: " << env.isGpuAccelerated    << std::endl;
+        std::cout << "  GPU-sharing ok:  " << env.isGpuSharingEnabled << std::endl;
+        std::cout << "  cuQuantum:       " << env.isCuQuantumEnabled  << std::endl;
+        std::cout << "  num nodes:       " << env.numNodes            << std::endl;
+        std::cout << "  num qubits:      " << getNumCachedQubits()    << std::endl;
         std::cout << "  num qubit perms: " << TEST_MAX_NUM_QUBIT_PERMUTATIONS << std::endl;
         std::cout << std::endl;
 
diff --git a/tests/unit/environment.cpp b/tests/unit/environment.cpp
index db9a1516c..6d4efb80d 100644
--- a/tests/unit/environment.cpp
+++ b/tests/unit/environment.cpp
@@ -54,6 +54,13 @@ TEST_CASE( "initQuESTEnv", TEST_CATEGORY ) {
     SECTION( LABEL_VALIDATION ) {
 
         REQUIRE_THROWS_WITH( initQuESTEnv(), ContainsSubstring( "already been initialised") );
+
+        // cannot automatically check other validations, such as:
+        // - has env been previously initialised then finalised?
+        // - is env distributed over power-of-2 nodes?
+        // - are environment-variables valid?
+        // - is max 1 MPI process bound to each GPU?
+        // - is GPU compatible with cuQuantum (if enabled)?
     }
 }
 
@@ -133,10 +140,11 @@ TEST_CASE( "getQuESTEnv", TEST_CATEGORY ) {
 
         QuESTEnv env = getQuESTEnv();
 
-        REQUIRE( (env.isMultithreaded    == 0 || env.isMultithreaded    == 1) );
-        REQUIRE( (env.isGpuAccelerated   == 0 || env.isGpuAccelerated   == 1) );
-        REQUIRE( (env.isDistributed      == 0 || env.isDistributed      == 1) );
-        REQUIRE( (env.isCuQuantumEnabled == 0 || env.isCuQuantumEnabled == 1) );
+        REQUIRE( (env.isMultithreaded     == 0 || env.isMultithreaded     == 1) );
+        REQUIRE( (env.isGpuAccelerated    == 0 || env.isGpuAccelerated    == 1) );
+        REQUIRE( (env.isDistributed       == 0 || env.isDistributed       == 1) );
+        REQUIRE( (env.isCuQuantumEnabled  == 0 || env.isCuQuantumEnabled  == 1) );
+        REQUIRE( (env.isGpuSharingEnabled == 0 || env.isGpuSharingEnabled == 1) );
         
         REQUIRE( env.rank     >= 0 );
         REQUIRE( env.numNodes >= 0 );
diff --git a/tests/unit/paulis.cpp b/tests/unit/paulis.cpp
index f18b57228..255d30c4b 100644
--- a/tests/unit/paulis.cpp
+++ b/tests/unit/paulis.cpp
@@ -362,8 +362,9 @@ TEST_CASE( "createInlinePauliStrSum", TEST_CATEGORY ) {
 
         SECTION( "coefficient parsing" ) {
 
-            vector<std::string> strs = {"1 X", "0 X", "0.1 X", "5E2-1i X", "-1E-50i X",  "1 - 6E-5i X", "-1.5E-15  -   5.123E-30i  0"};
-            vector<qcomp> coeffs     = { 1,     0,     0.1,     5E2-1_i,   -(1E-50)*1_i,  1 -(6E-5)*1_i, qcomp(-1.5E-15, -5.123E-30) };
+            // beware that when FLOAT_PRECISION=1, qcomp cannot store magnitudes smaller than 1E-37 (triggering a validation error)
+            vector<std::string> strs = {"1 X", "0 X", "0.1 X", "5E2-1i X", "-1E-25i X",  "1 - 6E-5i X", "-1.5E-15  -   5.123E-30i  0"};
+            vector<qcomp> coeffs     = { 1,     0,     0.1,     5E2-1_i,   -(1E-25)*1_i,  1 -(6E-5)*1_i, qcomp(-1.5E-15, -5.123E-30) };
 
             size_t i = GENERATE_REF( range(0, (int) strs.size()) );
             CAPTURE( strs[i], coeffs[i] );
@@ -377,7 +378,7 @@ TEST_CASE( "createInlinePauliStrSum", TEST_CATEGORY ) {
 
             PauliStrSum sum = createInlinePauliStrSum(R"(
                 + 5E2-1i     XYZ 
-                - 1E-50i     IXY 
+                - 1E-20i     IXY 
                 + 1 - 6E-5i  IIX 
                   0          III 
                   5.         XXX 
@@ -416,6 +417,12 @@ TEST_CASE( "createInlinePauliStrSum", TEST_CATEGORY ) {
             REQUIRE_NOTHROW( createInlinePauliStrSum("1 2 3") ); // = 1 * YZ and is legal
         }
 
+        SECTION( "out of range" ) {
+
+            // the max/min qcomp depend upon FLOAT_PRECISION but we'll lazily use something even quad-prec cannot store
+            REQUIRE_THROWS_WITH( createInlinePauliStrSum("-1E-9999 XYZ"), ContainsSubstring("exceeds the range which can be stored in a qcomp") );
+        }
+
         SECTION( "inconsistent number of qubits" ) {
 
             REQUIRE_THROWS_WITH( createInlinePauliStrSum("3 XYZ \n 2 YX"), ContainsSubstring("inconsistent") );
@@ -444,7 +451,7 @@ TEST_CASE( "createPauliStrSumFromFile", TEST_CATEGORY ) {
             file.open(fn);
             file << R"(
                 + 5E2-1i     XYZ 
-                - 1E-50i     IXY 
+                - 1E-20i     IXY 
                 + 1 - 6E-5i  IIX 
                 0            III 
                 5.           IXX
@@ -497,7 +504,7 @@ TEST_CASE( "createPauliStrSumFromReversedFile", TEST_CATEGORY ) {
             file.open(fn);
             file << R"(
                 + 5E2-1i     XYZ 
-                - 1E-50i     IXY 
+                - 1E-20i     IXY 
                 + 1 - 6E-5i  IIX 
                 0            III 
                 5.           IXX
diff --git a/utils/docs/Doxyfile b/utils/docs/Doxyfile
index 4eb40b524..daa1f0143 100644
--- a/utils/docs/Doxyfile
+++ b/utils/docs/Doxyfile
@@ -301,6 +301,8 @@ ALIASES += "notyetdoced=@note Documentation for this function or struct is under
 ALIASES += "cpponly=@remark This function is only available in C++."
 ALIASES += "conly=@remark This function is only available in C."
 ALIASES += "macrodoc=@note This entity is actually a macro."
+ALIASES += "envvardoc=@note This entity is actually an environment variable."
+ALIASES += "envvarvalues=@par Values"
 ALIASES += "neverdoced=@warning This entity is a macro, undocumented directly due to a Doxygen limitation. If you see this doc rendered, contact the devs!"
 ALIASES += "myexample=@par Example"
 ALIASES += "equivalences=@par Equivalences"

From cd01c6f79de405d34c227db8b584b9c8add5f176 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Thu, 26 Jun 2025 19:47:34 +0200
Subject: [PATCH 08/32] added test environment variables (#655)

which enables post-compilation pre-runtime configuring of the unit tests without hooking into QuEST's internal environment variable facilities. The macros...
- TEST_MAX_NUM_QUBIT_PERMUTATIONS
- TEST_MAX_NUM_SUPEROP_TARGETS
- TEST_ALL_DEPLOYMENTS
- TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS
are now environment variables, along with new variable TEST_NUM_QUBITS_IN_QUREG which controls the size of the Quregs in the unit tests.

With this commit, all preprocessors considered in #645 have become environment variables
---
 .github/workflows/test_free.yml |  9 +---
 .github/workflows/test_paid.yml | 23 +++++---
 CMakeLists.txt                  | 59 ---------------------
 docs/cmake.md                   |  6 +--
 docs/launch.md                  | 17 +++++-
 tests/CMakeLists.txt            |  8 ---
 tests/main.cpp                  | 11 +++-
 tests/unit/calculations.cpp     | 15 +++---
 tests/unit/decoherence.cpp      |  7 +--
 tests/unit/operations.cpp       |  9 ++--
 tests/utils/CMakeLists.txt      |  1 +
 tests/utils/cache.cpp           | 31 +++++------
 tests/utils/config.cpp          | 76 ++++++++++++++++++++++++++
 tests/utils/config.hpp          | 94 +++++++++++++++++++++++++++++++++
 tests/utils/lists.cpp           |  3 +-
 tests/utils/macros.hpp          | 51 ------------------
 16 files changed, 248 insertions(+), 172 deletions(-)
 create mode 100644 tests/utils/config.cpp
 create mode 100644 tests/utils/config.hpp

diff --git a/.github/workflows/test_free.yml b/.github/workflows/test_free.yml
index 3d3aace47..7efd540d3 100644
--- a/.github/workflows/test_free.yml
+++ b/.github/workflows/test_free.yml
@@ -54,10 +54,6 @@ jobs:
       build_dir: "build"
       depr_dir: "build/tests/deprecated"
 
-      # run all slow, rigorous tests (because runner is free)
-      num_qubit_perms: 0
-      test_all_deploys: ON
-
     # perform the job
     steps:
       - name: Get QuEST
@@ -71,14 +67,13 @@ jobs:
           -DENABLE_MULTITHREADING=OFF
           -DENABLE_DEPRECATED_API=${{ matrix.version == 3 && 'ON' || 'OFF' }}
           -DFLOAT_PRECISION=${{ matrix.precision }}
-          -DTEST_ALL_DEPLOYMENTS=${{ env.test_all_deploys }}
-          -DTEST_MAX_NUM_QUBIT_PERMUTATIONS=${{ env.num_qubit_perms }}
 
       # force 'Release' build (needed by MSVC to enable optimisations)
       - name: Compile
         run: cmake --build ${{ env.build_dir }} --config Release --parallel
 
-      # run v4 unit tests in random order, excluding the integration tests
+      # run v4 unit tests in random order, excluding the integration tests,
+      # using the default environment variables (e.g. test all permutations)
       # TODO:
       # ctest currently doesn't know of our Catch2 tags, so we
       # are manually excluding each integration test by name
diff --git a/.github/workflows/test_paid.yml b/.github/workflows/test_paid.yml
index c8fe34c03..4c17d9439 100644
--- a/.github/workflows/test_paid.yml
+++ b/.github/workflows/test_paid.yml
@@ -90,7 +90,7 @@ jobs:
 
       # tests will be non-comprehensive/faster to save $$$
       num_qubit_perms: 10
-      test_all_deploys: OFF
+      test_all_deploys: 0
 
     # perform the job
     steps:
@@ -139,6 +139,12 @@ jobs:
       - name: Compile
         run: cmake --build ${{ env.build_dir }} --parallel
 
+      # specifying only env-vars with non-default values
+      - name: Configure tests with environment variables
+        run: | 
+          echo "TEST_MAX_NUM_QUBIT_PERMUTATIONS=${{ env.num_qubit_perms }}" >> $GITHUB_ENV
+          echo "TEST_ALL_DEPLOYMENTS=${{ env.test_all_deploys }}" >> $GITHUB_ENV
+
       # cannot use ctests when distributed, grr!
       - name: Run multithreaded + distributed v4 tests (16 nodes, 4 threads eeach)
         if: ${{ matrix.mpi == 'ON' }}
@@ -206,11 +212,11 @@ jobs:
       # which will be shared between 4 MPI processes
       # (no --oversubscribe flag necessary on MPICH)
       num_mpi_nodes: 4
-      mpi_share_gpu: ON
+      mpi_share_gpu: 1
 
       # we will test all combinations of deployments
       # (e.g. CPU + MPI vs GPU), repeated 5 times each
-      test_all_deploys: ON
+      test_all_deploys: 1
       test_repetitions: 5
 
     # perform the job
@@ -255,16 +261,17 @@ jobs:
           -DENABLE_CUDA=${{ matrix.cuda }}
           -DENABLE_CUQUANTUM=${{ matrix.cuquantum }}
           -DCMAKE_CUDA_ARCHITECTURES=${{ env.cuda_arch }}
-          -DTEST_ALL_DEPLOYMENTS=${{ env.test_all_deploys }}
-          -DTEST_NUM_MIXED_DEPLOYMENT_REPETITIONS=${{ env.test_repetitions }}
           -DCMAKE_CXX_FLAGS=${{ matrix.mpi == 'ON' && matrix.cuda == 'ON' && '-fno-lto' || '' }}
 
       - name: Compile
         run: cmake --build ${{ env.build_dir }} --parallel
 
-      # permit use of single GPU by multiple MPI processes (detriments performance)
-      - name: Set env-var to permit GPU sharing
-        run: echo "PERMIT_NODES_TO_SHARE_GPU=${{ env.mpi_share_gpu }}" >> $GITHUB_ENV
+      # specify only env-vars with non-default values
+      - name: Configure tests with environment variables
+        run: | 
+          echo "TEST_ALL_DEPLOYMENTS=${{ env.test_all_deploys }}" >> $GITHUB_ENV
+          echo "TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS=${{ env.test_repetitions }}" >> $GITHUB_ENV
+          echo "PERMIT_NODES_TO_SHARE_GPU=${{ env.mpi_share_gpu }}" >> $GITHUB_ENV
 
       # cannot use ctests when distributed, grr!
       - name: Run GPU + distributed v4 mixed tests (4 nodes sharing 1 GPU)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 33e2dff42..933e23086 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -123,65 +123,6 @@ option(
   ON
 )
 
-string(CONCAT DESCRIPTON_OF_TEST_MAX_NUM_QUBIT_PERMUTATIONS  # for multiline description
-  "The maximum number of control and target qubit permutations under which to unit test each function. "
-  "Set to 0 (default) to test all permutations, or to a positive integer (e.g. 50) to accelerate the unit tests. "
-  "This is used to accelerate the v4 unit tests, and has no effect when ENABLE_TESTING is OFF, nor on the v3 deprecated tests."
-)
-set(TEST_MAX_NUM_QUBIT_PERMUTATIONS 0
-  CACHE 
-  STRING 
-  "${DESCRIPTON_OF_TEST_MAX_NUM_QUBIT_PERMUTATIONS}"
-)
-if (ENABLE_TESTING)
-  set(PERM_STRING "${TEST_MAX_NUM_QUBIT_PERMUTATIONS} random")
-  if (TEST_MAX_NUM_QUBIT_PERMUTATIONS EQUAL 0)
-    set(PERM_STRING "all")
-  endif()
-  message(STATUS "Tests will use ${PERM_STRING} qubit permutations. Set TEST_MAX_NUM_QUBIT_PERMUTATIONS to modify.")
-endif()
-
-string(CONCAT DESCRIPTON_OF_TEST_MAX_NUM_SUPEROP_TARGETS  # for multiline description
-  "The maximum number of superoperator targets for which to unit test functions mixKrausMap and mixSuperOp. "
-  "This is computationally equivalent to simulating unitaries with double the number of targets upon a density matrix. "
-  "Set to 0 to test all sizes which is likely prohibitively slow, or to a positive integer (e.g. the default of 4) to accelerate the unit tests. "
-  "This is used to accelerate the v4 unit tests, and has no effect when ENABLE_TESTING is OFF, nor on the v3 deprecated tests."
-)
-set(TEST_MAX_NUM_SUPEROP_TARGETS 4
-  CACHE 
-  STRING 
-  "${DESCRIPTON_OF_TEST_MAX_NUM_SUPEROP_TARGETS}"
-)
-if (ENABLE_TESTING)
-  set(PERM_STRING "${TEST_MAX_NUM_SUPEROP_TARGETS} random")
-  if (TEST_MAX_NUM_SUPEROP_TARGETS EQUAL 0)
-    set(PERM_STRING "all")
-  endif()
-  message(STATUS "Tests will use superoperators of up to ${PERM_STRING} qubits. Set TEST_MAX_NUM_SUPEROP_TARGETS to modify.")
-endif()
-
-set(TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS 10
-  CACHE 
-  STRING 
-  "The number of times (minimum=1) to repeat each mixed-deployment unit test for each deployment combination."
-)
-if (ENABLE_TESTING)
-  if (TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS LESS_EQUAL 0)
-    message(FATAL_ERROR "Cannot set TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS <= 0. Must be at least 1.")
-  endif()
-  message(STATUS "Tests will repeat each mixed-deployment unit test ${TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS} times. Set TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS to modify.")
-endif()
-
-string(CONCAT DESCRIPTION_OF_TEST_ALL_DEPLOYMENTS  # for multiline description
-  "Whether unit tests will be run using all possible deployment combinations (i.e. OpenMP, CUDA, MPI) in-turn (ON), "
-  "or only once using all available deployments simultaneously (OFF). Turned ON by default. "
-  "This is used to accelerate the v4 unit tests, and has no effect when ENABLE_TESTING is OFF, nor on the v3 deprecated tests."
-)
-option(TEST_ALL_DEPLOYMENTS "${DESCRIPTION_OF_TEST_ALL_DEPLOYMENTS}" ON)
-if (ENABLE_TESTING)
-  message(STATUS "Testing all deployments is turned ${TEST_ALL_DEPLOYMENTS}. Set TEST_ALL_DEPLOYMENTS to modify.")
-endif()
-
 # Multithreading
 option(
   ENABLE_MULTITHREADING 
diff --git a/docs/cmake.md b/docs/cmake.md
index 5007553e3..336dcf28c 100644
--- a/docs/cmake.md
+++ b/docs/cmake.md
@@ -55,10 +55,8 @@ make
 | `ENABLE_TESTING` | (`OFF`), `ON` | Determines whether to additionally build QuEST's unit and integration tests. If built, tests can be run from the `build` directory with `make test`, or `ctest`, or manually launched with `./tests/tests` which enables distribution (i.e. `mpirun -np 8 ./tests/tests`) |
 | `ENABLE_DEPRECATED_API` | (`OFF`), `ON` | As described above. When enabled alongside testing, the `v3 deprecated` unit tests will additionally be compiled and can be run from within `build` via `cd tests/deprecated; ctest`, or manually launched with `./tests/deprecated/dep_tests` (enabling distribution, as above).
 | `DOWNLOAD_CATCH2` | (`ON`), `OFF` | QuEST's tests require Catch2. By default, if you don't have Catch2 installed (or CMake doesn't find it) it will be downloaded from Github and built for you. If you don't want that to happen, for example because you _do_ have Catch2 installed, set this to `OFF`. |
-| `TEST_MAX_NUM_QUBIT_PERMUTATIONS` | (`0`), Integer | Determines the maximum number of control and target qubit permutations with which to test each API function. Set to `0` to test all permutations, or to a positive integer (e.g. `50`) to accelerate the unit tests. |
-| `TEST_MAX_NUM_SUPEROP_TARGETS` | (`4`), Integer | Determines the maximum number of superoperator targets in the unit tests (for functions `mixKrausMap` and `mixSuperOp`). Set to `0` to impose no maximum (which is extraordinarily slow), or a positive integer (e.g. `3`) to accelerate the unit tests. |
-| `TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS` | (`10`), Integer | Determines the number of times (minimum of `1`) to repeat the randomised unit tests of functions which accept multiple distinctly-deployed `Qureg`s. Set to a small, positive integer to accelerate mixed-deployment unit tests. |
-| `TEST_ALL_DEPLOYMENTS` | (`ON`), `OFF` | Determines whether unit tests will be repeatedly run using all possible combinations of available `Qureg` deployments (i.e. `OpenMP` and/or `CUDA` and/or `MPI`), else only once using all available deployments simultaneously. Set to `OFF` to accelerate unit tests. |
+
+> As of `v4.2`, macros which configure the unit tests, such as `TEST_MAX_NUM_QUBIT_PERMUTATIONS`, have become environment variables specified before launch. See [`launch.md`](launch.md).
 
 ---------------------------
 
diff --git a/docs/launch.md b/docs/launch.md
index 44a0f7fd7..a76ce612b 100644
--- a/docs/launch.md
+++ b/docs/launch.md
@@ -216,6 +216,21 @@ Test project /build
 Alas tests launched in this way cannot be deployed with distribution.
 
 
+#### Environment variables
+
+The `v4` unit tests make use of the below, optional environment variables to control their rigour and runtime.
+
+
+| Environment variable  | Default | Description |
+| -------- | ------- | ------- |
+| `TEST_NUM_QUBITS_IN_QUREG` | `6` | The number of qubits in the Qureg(s) undergoing unit testing. In addition to operation upon larger Quregs being exponentially slower, beware that more qubits permit more variations and permutations of input parameters like target qubits, factorially increasing the number of tests per operation. |
+| `TEST_MAX_NUM_QUBIT_PERMUTATIONS`  | `0` | The maximum number of control and target qubit permutations under which to unit test each function. Set to `0` (default) to test all permutations, or to a positive integer (e.g. `50`) to accelerate the unit tests. See more info [here](https://quest-kit.github.io/QuEST/group__testutilsconfig.html#gac5adcc10bd26c56f20344f5ae3d9ba41). |
+| `TEST_MAX_NUM_SUPEROP_TARGETS` | `4` | The maximum number of superoperator targets for which to unit test functions `mixKrausMap()` and `mixSuperOp()`. These are computationally equivalent to simulating unitaries with double the number of targets upon a density matrix. Set to `0` to test all sizes which is likely prohibitively slow, or to a positive integer (e.g. the default of `4`) to accelerate the unit tests. |
+| `TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS` | `10` | The number of times (minimum of `1`) to repeat each random mixed-deployment unit test for each deployment combination. |
+| `TEST_ALL_DEPLOYMENTS` | `1` | Whether unit tests will be run using all possible deployment combinations (i.e. OpenMP, CUDA, MPI) in-turn (`=1`), or only once using all available deployments simultaneously (`=0`). |
+
+
+
 
 <!-- permit doxygen to reference section -->
 <a id="launch_v3"></a>
@@ -256,7 +271,7 @@ QuEST execution can be configured prior to runtime using the below [environment
 - [`PERMIT_NODES_TO_SHARE_GPU`](https://quest-kit.github.io/QuEST/group__modes.html#ga7e12922138caa68ddaa6221e40f62dda)
 - [`DEFAULT_VALIDATION_EPSILON`](https://quest-kit.github.io/QuEST/group__modes.html#ga55810d6f3d23de810cd9b12a2bbb8cc2)
 
-
+Note the unit tests in the preceding section accept additional environment variables.
 
 
 ---------------------
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index ba4fe0d78..31b0ea75a 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -1,5 +1,4 @@
 # @author Oliver Thomson Brown
-# @author Tyson Jones (macros and MPI ctests)
 
 add_executable(tests
   main.cpp
@@ -16,12 +15,5 @@ if (ENABLE_DEPRECATED_API)
 endif()
 
 
-# map test-related cmake vars to preprocessors
-target_compile_definitions(tests PRIVATE TEST_MAX_NUM_QUBIT_PERMUTATIONS=${TEST_MAX_NUM_QUBIT_PERMUTATIONS})
-target_compile_definitions(tests PRIVATE TEST_MAX_NUM_SUPEROP_TARGETS=${TEST_MAX_NUM_SUPEROP_TARGETS})
-target_compile_definitions(tests PRIVATE TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS=${TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS})
-target_compile_definitions(tests PRIVATE TEST_ALL_DEPLOYMENTS=$<BOOL:${TEST_ALL_DEPLOYMENTS}>)
-
-
 # let Catch2 register all tests with CTest
 catch_discover_tests(tests)
diff --git a/tests/main.cpp b/tests/main.cpp
index 29647753e..fca57f5ff 100644
--- a/tests/main.cpp
+++ b/tests/main.cpp
@@ -50,6 +50,7 @@
 #include <string>
 
 #include "quest.h"
+#include "tests/utils/config.hpp"
 #include "tests/utils/cache.hpp"
 #include "tests/utils/macros.hpp"
 #include "tests/utils/random.hpp"
@@ -95,8 +96,14 @@ class startListener : public Catch::EventListenerBase {
         std::cout << "  GPU-sharing ok:  " << env.isGpuSharingEnabled << std::endl;
         std::cout << "  cuQuantum:       " << env.isCuQuantumEnabled  << std::endl;
         std::cout << "  num nodes:       " << env.numNodes            << std::endl;
-        std::cout << "  num qubits:      " << getNumCachedQubits()    << std::endl;
-        std::cout << "  num qubit perms: " << TEST_MAX_NUM_QUBIT_PERMUTATIONS << std::endl;
+        std::cout << std::endl;
+
+        std::cout << "Testing configuration:" << std::endl;
+        std::cout << "  test all deployments:  " << getWhetherToTestAllDeployments()         << std::endl;
+        std::cout << "  num qubits in qureg:   " << getNumCachedQubits()                     << std::endl;
+        std::cout << "  max num qubit perms:   " << getMaxNumTestedQubitPermutations()       << std::endl;
+        std::cout << "  max num superop targs: " << getMaxNumTestedSuperoperatorTargets()    << std::endl;
+        std::cout << "  num mixed-deploy reps: " << getNumTestedMixedDeploymentRepetitions() << std::endl;
         std::cout << std::endl;
 
         std::cout << "Tested Qureg deployments:" << std::endl;
diff --git a/tests/unit/calculations.cpp b/tests/unit/calculations.cpp
index e0b4e9962..4b4db284e 100644
--- a/tests/unit/calculations.cpp
+++ b/tests/unit/calculations.cpp
@@ -23,6 +23,7 @@
 #include "tests/utils/lists.hpp"
 #include "tests/utils/macros.hpp"
 #include "tests/utils/random.hpp"
+#include "tests/utils/config.hpp"
 #include "tests/utils/cache.hpp"
 #include "tests/utils/measure.hpp"
 
@@ -412,7 +413,7 @@ TEST_CASE( "calcInnerProduct", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG ) {
         qmatrix refDM = getRefDensmatr();
         auto apiFunc = calcInnerProduct;
 
-        GENERATE( range(0, TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS) );
+        GENERATE( range(0, getNumTestedMixedDeploymentRepetitions()) );
 
         SECTION( LABEL_STATEVEC LABEL_DELIMITER LABEL_STATEVEC ) {
 
@@ -461,7 +462,7 @@ TEST_CASE( "calcFidelity", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG ) {
         qmatrix refDM = getRefDensmatr();
         auto apiFunc = calcFidelity;
 
-        GENERATE( range(0, TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS) );
+        GENERATE( range(0, getNumTestedMixedDeploymentRepetitions()) );
 
         SECTION( LABEL_STATEVEC LABEL_DELIMITER LABEL_STATEVEC ) {
 
@@ -503,7 +504,7 @@ TEST_CASE( "calcDistance", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG ) {
         qmatrix refDM = getRefDensmatr();
         auto apiFunc = calcDistance;
 
-        GENERATE( range(0, TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS) );
+        GENERATE( range(0, getNumTestedMixedDeploymentRepetitions()) );
 
         SECTION( LABEL_STATEVEC LABEL_DELIMITER LABEL_STATEVEC ) {
 
@@ -550,7 +551,7 @@ TEST_CASE( "calcExpecFullStateDiagMatr", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG )
         qmatrix refMatr = getRandomDiagonalHermitian(getNumCachedQubits());
         auto apiFunc = calcExpecFullStateDiagMatr;
 
-        GENERATE( range(0, TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS) );
+        GENERATE( range(0, getNumTestedMixedDeploymentRepetitions()) );
 
         SECTION( LABEL_STATEVEC ) {
 
@@ -579,7 +580,7 @@ TEST_CASE( "calcExpecNonHermitianFullStateDiagMatr", TEST_CATEGORY LABEL_MIXED_D
         qmatrix refMatr = getRandomDiagonalMatrix(getPow2(getNumCachedQubits()));
         auto apiFunc = calcExpecNonHermitianFullStateDiagMatr;
 
-        GENERATE( range(0, TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS) );
+        GENERATE( range(0, getNumTestedMixedDeploymentRepetitions()) );
 
         SECTION( LABEL_STATEVEC ) {
 
@@ -635,7 +636,7 @@ TEST_CASE( "calcExpecFullStateDiagMatrPower", TEST_CATEGORY LABEL_MIXED_DEPLOY_T
 
         CAPTURE( exponent );
 
-        GENERATE( range(0, TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS) );
+        GENERATE( range(0, getNumTestedMixedDeploymentRepetitions()) );
 
         SECTION( LABEL_STATEVEC ) {
 
@@ -676,7 +677,7 @@ TEST_CASE( "calcExpecNonHermitianFullStateDiagMatrPower", TEST_CATEGORY LABEL_MI
 
         CAPTURE( exponent );
 
-        GENERATE( range(0, TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS) );
+        GENERATE( range(0, getNumTestedMixedDeploymentRepetitions()) );
 
         SECTION( LABEL_STATEVEC ) {
 
diff --git a/tests/unit/decoherence.cpp b/tests/unit/decoherence.cpp
index 5fc373a44..f36c491bb 100644
--- a/tests/unit/decoherence.cpp
+++ b/tests/unit/decoherence.cpp
@@ -15,6 +15,7 @@
 
 #include "tests/utils/qvector.hpp"
 #include "tests/utils/qmatrix.hpp"
+#include "tests/utils/config.hpp"
 #include "tests/utils/cache.hpp"
 #include "tests/utils/compare.hpp"
 #include "tests/utils/convert.hpp"
@@ -276,7 +277,7 @@ TEST_CASE( "mixKrausMap", TEST_CATEGORY ) {
 
     SECTION( LABEL_CORRECTNESS ) {
 
-        int maxFlag = TEST_MAX_NUM_SUPEROP_TARGETS;
+        int maxFlag = getMaxNumTestedSuperoperatorTargets();
         int numQubits = getNumCachedQubits();
         int maxNumTargs = (maxFlag != 0 && numQubits > maxFlag)?
             maxFlag : numQubits;
@@ -305,7 +306,7 @@ TEST_CASE( "mixSuperOp", TEST_CATEGORY ) {
     SECTION( LABEL_CORRECTNESS ) {
 
         int numQubits = getNumCachedQubits();
-        int maxFlag = TEST_MAX_NUM_SUPEROP_TARGETS;
+        int maxFlag = getMaxNumTestedSuperoperatorTargets();
         int maxNumTargs = (maxFlag != 0 && numQubits > maxFlag)?
             maxFlag : numQubits;
 
@@ -336,7 +337,7 @@ TEST_CASE( "mixQureg", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG ) {
 
         CAPTURE( prob );
         
-        GENERATE( range(0, TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS) );
+        GENERATE( range(0, getNumTestedMixedDeploymentRepetitions()) );
 
         SECTION( LABEL_DENSMATR LABEL_DELIMITER LABEL_STATEVEC ) { 
 
diff --git a/tests/unit/operations.cpp b/tests/unit/operations.cpp
index 55bed11ab..041000c4f 100644
--- a/tests/unit/operations.cpp
+++ b/tests/unit/operations.cpp
@@ -22,6 +22,7 @@
 #include <catch2/matchers/catch_matchers_string.hpp>
 #include <catch2/generators/catch_generators_range.hpp>
 
+#include "tests/utils/config.hpp"
 #include "tests/utils/cache.hpp"
 #include "tests/utils/qvector.hpp"
 #include "tests/utils/qmatrix.hpp"
@@ -1738,7 +1739,7 @@ TEST_CASE( "multiplyFullStateDiagMatr", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG ) {
         qmatrix refMatr = getRandomDiagonalMatrix(getPow2(numQubits));
         auto apiFunc = multiplyFullStateDiagMatr;
 
-        GENERATE( range(0, TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS) );
+        GENERATE( range(0, getNumTestedMixedDeploymentRepetitions()) );
 
         SECTION( LABEL_STATEVEC ) {
 
@@ -1776,7 +1777,7 @@ TEST_CASE( "multiplyFullStateDiagMatrPower", TEST_CATEGORY LABEL_MIXED_DEPLOY_TA
 
         CAPTURE( exponent );
         
-        GENERATE( range(0, TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS) );
+        GENERATE( range(0, getNumTestedMixedDeploymentRepetitions()) );
 
         SECTION( LABEL_STATEVEC ) {
 
@@ -1814,7 +1815,7 @@ TEST_CASE( "applyFullStateDiagMatr", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG ) {
         qmatrix refMatr = getRandomDiagonalUnitary(numQubits);
         auto apiFunc = applyFullStateDiagMatr;
 
-        GENERATE( range(0, TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS) );
+        GENERATE( range(0, getNumTestedMixedDeploymentRepetitions()) );
 
         SECTION( LABEL_STATEVEC ) {
 
@@ -1858,7 +1859,7 @@ TEST_CASE( "applyFullStateDiagMatrPower", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG )
 
         CAPTURE( exponent );
 
-        GENERATE( range(0, TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS) );
+        GENERATE( range(0, getNumTestedMixedDeploymentRepetitions()) );
 
         if (!testRealExp)
             setValidationEpsilon(0);
diff --git a/tests/utils/CMakeLists.txt b/tests/utils/CMakeLists.txt
index cae21694b..c9c06075f 100644
--- a/tests/utils/CMakeLists.txt
+++ b/tests/utils/CMakeLists.txt
@@ -4,6 +4,7 @@ target_sources(tests
   PUBLIC
   cache.cpp
   compare.cpp
+  config.cpp
   convert.cpp
   evolve.cpp
   linalg.cpp
diff --git a/tests/utils/cache.cpp b/tests/utils/cache.cpp
index 21dee8482..ec5001bcc 100644
--- a/tests/utils/cache.cpp
+++ b/tests/utils/cache.cpp
@@ -12,6 +12,7 @@
 #include "qmatrix.hpp"
 #include "macros.hpp"
 #include "linalg.hpp"
+#include "config.hpp"
 #include "cache.hpp"
 
 #include <unordered_map>
@@ -36,19 +37,15 @@ quregCache densmatrs1;
 quregCache densmatrs2;
 matrixCache matrices;
 
-
-
-/*
- * while the number of qubits in the unit-test Quregs/matr
- * is fixed, it is defined privately here (with internal
- * linkage) so that it can be changed between compilations
- * without having to recompiling the entire test suite
- */
-
-static constexpr int NUM_QUBITS_IN_CACHE = 6;
-
 int getNumCachedQubits() {
-    return NUM_QUBITS_IN_CACHE;
+
+    // we are merely aliasing the below env-var fetching function
+    // to minimise a diff since pre-runtime controlling the tested
+    // Qureg sizes is experimental and not fully designed (we may
+    // eventually wish to specify different sizes for statevectors
+    // vs density matrices, or control integration test Qureg sizes
+    // also through environment variables, etc)
+    return getNumQubitsInUnitTestedQuregs();
 }
 
 
@@ -68,7 +65,7 @@ deployInfo getSupportedDeployments() {
     bool gpu = env.isGpuAccelerated;
 
     // return only the "most-accelerated" deployment, unless all are desired
-    bool one = ! TEST_ALL_DEPLOYMENTS;
+    bool one = ! getWhetherToTestAllDeployments();
 
     // add only those supported to the output list, in order of preference.
     // flag order is (MPI, GPU, OMP), matching createCustomQureg
@@ -100,7 +97,7 @@ quregCache createCachedStatevecsOrDensmatrs(bool isDensMatr) {
 
     // only add supported-deployment quregs to the cache
     for (auto [label, mpi, gpu, omp] : getSupportedDeployments())
-        out[label] = createCustomQureg(NUM_QUBITS_IN_CACHE, isDensMatr, mpi, gpu, omp);
+        out[label] = createCustomQureg(getNumCachedQubits(), isDensMatr, mpi, gpu, omp);
 
     return out;
 }
@@ -184,7 +181,7 @@ void createCachedFullStateDiagMatrs() {
 
     // only add supported-deployment matrices to the cache
     for (auto [label, mpi, gpu, omp] : getSupportedDeployments())
-        matrices[label] = createCustomFullStateDiagMatr(NUM_QUBITS_IN_CACHE, mpi, gpu, omp);
+        matrices[label] = createCustomFullStateDiagMatr(getNumCachedQubits(), mpi, gpu, omp);
 }
 
 void destroyCachedFullStateDiagMatrs() {
@@ -213,8 +210,8 @@ matrixCache getCachedFullStateDiagMatrs() {
  */
 
 qvector getRefStatevec() {
-    return getZeroVector(getPow2(NUM_QUBITS_IN_CACHE));
+    return getZeroVector(getPow2(getNumCachedQubits()));
 }
 qmatrix getRefDensmatr() {
-    return getZeroMatrix(getPow2(NUM_QUBITS_IN_CACHE));
+    return getZeroMatrix(getPow2(getNumCachedQubits()));
 }
diff --git a/tests/utils/config.cpp b/tests/utils/config.cpp
new file mode 100644
index 000000000..c5362e899
--- /dev/null
+++ b/tests/utils/config.cpp
@@ -0,0 +1,76 @@
+/** @file
+ * Testing utilities for loading environment variables
+ * which configure the unit tests, independent of QuEST's 
+ * internal environment variable facilities
+ *
+ * @author Tyson Jones
+ */
+
+#include <string>
+#include <cstdlib>
+#include <stdexcept>
+
+using std::string;
+
+
+/*
+ * PRIVATE
+ */
+
+string getEnvVarValue(string name) {
+
+    // unspecified var returns empty string
+    const char* ptr = std::getenv(name.c_str());
+    return (ptr == nullptr)? "" : std::string(ptr);
+}
+
+int getIntEnvVarValueOrDefault(string name, int defaultValue) {
+
+    string strValue = getEnvVarValue(name);
+    int intValue = defaultValue;
+
+    // overwrite default only when passed variable is interpretable
+    try {
+        intValue = std::stoi(strValue);
+    } 
+    catch (const std::out_of_range&) { } 
+    catch (const std::invalid_argument&) { }
+    return intValue;
+}
+
+
+/*
+ * PUBLIC
+ *
+ * which each call std::getenv only once
+ */
+
+int getNumQubitsInUnitTestedQuregs() {
+
+    static int value = getIntEnvVarValueOrDefault("TEST_NUM_QUBITS_IN_QUREG", 6);
+    return value;
+}
+
+int getMaxNumTestedQubitPermutations() {
+
+    static int value = getIntEnvVarValueOrDefault("TEST_MAX_NUM_QUBIT_PERMUTATIONS", 0);
+    return value;
+}
+
+int getMaxNumTestedSuperoperatorTargets() {
+
+    static int value = getIntEnvVarValueOrDefault("TEST_MAX_NUM_SUPEROP_TARGETS", 4);
+    return value;
+}
+
+int getNumTestedMixedDeploymentRepetitions() {
+
+    static int value = getIntEnvVarValueOrDefault("TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS", 10);
+    return value;
+}
+
+bool getWhetherToTestAllDeployments() {
+
+    static bool value = getIntEnvVarValueOrDefault("TEST_ALL_DEPLOYMENTS", 1);
+    return value;
+}
diff --git a/tests/utils/config.hpp b/tests/utils/config.hpp
new file mode 100644
index 000000000..a1ef142c5
--- /dev/null
+++ b/tests/utils/config.hpp
@@ -0,0 +1,94 @@
+/** @file
+ * Testing utilities for loading environment variables
+ * which configure the unit tests, independent of QuEST's 
+ * internal environment variable facilities
+ *
+ * @author Tyson Jones
+ */
+
+
+/** @file
+ * @author Tyson Jones
+ * 
+ * @defgroup testutilsconfig Config
+ * @ingroup testutils
+ * @brief
+ * Testing utilities for loading environment variables
+ * which configure the unit tests, independent of QuEST's 
+ * internal environment variable facilities
+ * @{
+ */
+
+#ifndef CONFIG_HPP
+#define CONFIG_HPP
+
+
+/*
+ * SPECIFYING ENV-VARS 
+ */
+
+// spoofing env-vars as consts to doc; beware that the values below
+// merely duplicate but do not change the default values
+// which are hardcoded in config.cpp
+#if 0
+
+    /// @envvardoc
+    const int TEST_NUM_QUBITS_IN_QUREG = 6;
+
+    /** @envvardoc
+     * 
+     * Specifies the maximum number of control and target qubit permutations for which to unit test each relevant 
+     * API function.
+     * 
+     * Many QuEST functions accept a varying number of target qubits (like applyCompMatr()) and/or control qubits
+     * (like applyMultiControlledCompMatr()). The unit tests will run these functions, passing every possible number
+     * of target qubits (alongside every possible number of control qubits, if possible), from one (zero) up to the
+     * number contained within the tested `Qureg` (minus the number of target qubits).
+     * 
+     * For each of these tested number-of-targets and number-of-controls combinations, there are factorially-many 
+     * possible choices of the arbitrarily-ordered qubit indices, i.e. sub-permutations of all Qureg qubits. 
+     * By default, the unit tests deterministically check every permutation in-turn. This can become prohibitively 
+     * slow when the tested `Qureg` are large. For example, there are `604,800` unique, non-overlapping choices of 
+     * `4`  targets and `3` controls in a Qureg containing `10` qubits.
+     * 
+     * When this environment variable is set to a non-zero value, the unit tests will forego testing every permutation
+     * and instead perform only the number specified, randomising the involved qubits. This can significantly speed up the
+     * tests though risks missing esoteric edge-cases. The runtime of the tests is approximately linearly proportional
+     * to the specified number of permutations. When the specified non-zero value exceeds the number of unique 
+     * permutations, the tests will revert to deterministically evaluating each once.
+     * 
+     * @envvarvalues
+     * 
+     * - set to `0` (default) to systematically test all permutations.
+     * - set to a positive integer (e.g. `50`) to test (at most) that many random permutations and accelerate the tests.
+     * 
+     * @author Tyson Jones
+     */
+    const int TEST_MAX_NUM_QUBIT_PERMUTATIONS = 0;
+
+    /// @envvardoc
+    const int TEST_MAX_NUM_SUPEROP_TARGETS = 4;
+
+    /// @envvardoc
+    const int TEST_ALL_DEPLOYMENTS = 1;
+
+    /// @envvardoc
+    const int TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS = 10;
+
+#endif
+
+
+/*
+ * ACCESSING ENV-VARS 
+ */
+
+int getNumQubitsInUnitTestedQuregs();
+int getMaxNumTestedQubitPermutations();
+int getMaxNumTestedSuperoperatorTargets();
+int getNumTestedMixedDeploymentRepetitions();
+bool getWhetherToTestAllDeployments();
+
+
+#endif // CONFIG_HPP
+
+/** @} (end defgroup) */
diff --git a/tests/utils/lists.cpp b/tests/utils/lists.cpp
index aa20f34db..b7ea3c340 100644
--- a/tests/utils/lists.cpp
+++ b/tests/utils/lists.cpp
@@ -10,6 +10,7 @@
 
 #include "quest.h"
 
+#include "config.hpp"
 #include "lists.hpp"
 #include "macros.hpp"
 #include "random.hpp"
@@ -264,7 +265,7 @@ listpair GENERATE_CTRLS_AND_TARGS(int numQubits, int numCtrls, int numTargs) {
 
     // impose a limit on the number of {ctrls,targs} to generate (max-int if none set)
     int numPerms = getNumPermutations(numQubits, numCtrls + numTargs);
-    int maxPerms = TEST_MAX_NUM_QUBIT_PERMUTATIONS;
+    int maxPerms = getMaxNumTestedQubitPermutations();
     if (maxPerms == 0)
         maxPerms = std::numeric_limits<int>::max();
 
diff --git a/tests/utils/macros.hpp b/tests/utils/macros.hpp
index 7d5882051..da924ad88 100644
--- a/tests/utils/macros.hpp
+++ b/tests/utils/macros.hpp
@@ -14,57 +14,6 @@
 #include <catch2/catch_test_macros.hpp>
 
 
-/**
- * macros which affect the speed and rigour of the unit tests, useful
- * for accelerating tests on particular platforms (e.g. paid github runners).
- * The default values are those which perform the most rigorous tests at
- * the slowest speed, so adjusting these macros accelerates tests. 
- *
- * @todo
- * These are clunky preprocessors (invoking full recompilation when changed),
- * rather than runtime arguments, because of the nuisance of passing such
- * args to cmake. It can be done however using environment variables; see
- * https://stackoverflow.com/questions/28812533/
- */
-
-// 0 = perform all, and a sensible value to accelerate tests is 50
-#ifndef TEST_MAX_NUM_QUBIT_PERMUTATIONS
-#define TEST_MAX_NUM_QUBIT_PERMUTATIONS 0
-#endif
-
-// 0 = perform all (very slow), while 4 limits to superops = 8-qubit matrices
-#ifndef TEST_MAX_NUM_SUPEROP_TARGETS
-#define TEST_MAX_NUM_SUPEROP_TARGETS 4
-#endif
-
-// 0 = use all available deployments at once, 1 = try all combinations in-turn
-#ifndef TEST_ALL_DEPLOYMENTS
-#define TEST_ALL_DEPLOYMENTS 1
-#endif
-
-// number of times to repeat each "[mixed]" test (minimum 1)
-#ifndef TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS
-#define TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS 10
-#endif
-
-// spoofing above macros as consts to doc
-#if 0
-
-    /// @macrodoc
-    const int TEST_MAX_NUM_QUBIT_PERMUTATIONS = 0;
-
-    /// @macrodoc
-    const int TEST_MAX_NUM_SUPEROP_TARGETS = 4;
-
-    /// @macrodoc
-    const int TEST_ALL_DEPLOYMENTS = 1;
-
-    /// @macrodoc
-    const int TEST_NUM_MIXED_DEPLOYMENT_REPETITIONS = 10;
-
-#endif
-
-
 /*
  * preconditions to the internal unit testing functions are checked using 
  * DEMAND rather than Catch2's REQUIRE, so that they are not counted in the 

From aa11c23f39210445390d9d5c460c250a6ec57931 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Thu, 26 Jun 2025 19:51:49 +0200
Subject: [PATCH 09/32] patched distributed validation test

The API functions createFullStateDiagMatr() and createCustomFullStateDiagMatr() worked correctly though their "insufficient distributed memory" validation error messages were erroneously excluded from the expected message lists in their respective unit tests.
---
 quest/src/core/validation.cpp | 2 +-
 tests/unit/matrices.cpp       | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/quest/src/core/validation.cpp b/quest/src/core/validation.cpp
index 3f242e6df..a8223cb3f 100644
--- a/quest/src/core/validation.cpp
+++ b/quest/src/core/validation.cpp
@@ -1599,7 +1599,7 @@ void assertQuregFitsInCpuMem(int numQubits, int isDensMatr, int isDistrib, QuEST
         vars["${NUM_NODES}"] = numQuregNodes;
 
     // require expensive node consensus in case of heterogeneous RAMs
-    assertAllNodesAgreeThat(quregFitsInMem, report::NEW_QUREG_CANNOT_FIT_INTO_NON_DISTRIB_CPU_MEM, vars, caller);
+    assertAllNodesAgreeThat(quregFitsInMem, msg, vars, caller);
 }
 
 void assertQuregFitsInGpuMem(int numQubits, int isDensMatr, int isDistrib, int isGpuAccel, QuESTEnv env, const char* caller) {
diff --git a/tests/unit/matrices.cpp b/tests/unit/matrices.cpp
index ef92dbcda..05abe33a2 100644
--- a/tests/unit/matrices.cpp
+++ b/tests/unit/matrices.cpp
@@ -614,7 +614,8 @@ TEST_CASE( "createFullStateDiagMatr", TEST_CATEGORY ) {
                 ContainsSubstring("failed") || 
                 ContainsSubstring("insufficient available memory") || 
                 ContainsSubstring("available GPU memory") ||
-                ContainsSubstring("exceeds the available RAM") );
+                ContainsSubstring("exceeds the available RAM") ||
+                ContainsSubstring("exceeds the local available RAM") );
             #endif
         }
 
@@ -714,7 +715,8 @@ TEST_CASE( "createCustomFullStateDiagMatr", TEST_CATEGORY ) {
                     ContainsSubstring("failed") || 
                     ContainsSubstring("insufficient available memory") || 
                     ContainsSubstring("available GPU memory") ||
-                    ContainsSubstring("exceeds the available RAM") );
+                    ContainsSubstring("exceeds the available RAM") ||
+                    ContainsSubstring("exceeds the local available RAM") );
                 #endif
             }
         }

From 738ca891eee654d93b586afd10ffff3381009a07 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 30 Jun 2025 20:04:02 +0200
Subject: [PATCH 10/32] added postMultiply functions (#657)

For every existing multiply*() function, such as multiplyCompMatr(), this commit adds a corresponding postMultiply*() function which operates upon a density matrix from the right-hand side. This is useful for preparing density matrices in non-physical states which appear as sub-expressions within things like commutators and the Lindbladian.

Implementing these functions involved:
- updating the templating of "any target dense matrix" function, inadvertently simplifying the associated instantiation and dispatch macros by re-using those for the "any target diagonal matrix" function
- adding new utilities and logic for obtaining/effecting the transpose of a function, to undo the transpose effected via operation upon the bra-qubits of a vectorised density matrix
- extending and refactoring the unit tests with postMultiply references

We additionally added the below expected but missing functions from the API:
- multiplyPauliX
- multiplyPauliY
- multiplyPauliZ
---
 quest/include/operations.h        | 202 +++++++++++++++++++
 quest/src/api/operations.cpp      | 313 ++++++++++++++++++++++++++++--
 quest/src/core/accelerator.cpp    | 107 +++++-----
 quest/src/core/accelerator.hpp    |  38 ++--
 quest/src/core/errors.cpp         |  17 ++
 quest/src/core/errors.hpp         |   4 +
 quest/src/core/localiser.cpp      |  60 +++---
 quest/src/core/localiser.hpp      |   8 +-
 quest/src/core/utilities.cpp      |  28 +++
 quest/src/core/utilities.hpp      |   9 +
 quest/src/cpu/cpu_subroutines.cpp |  82 +++++---
 quest/src/cpu/cpu_subroutines.hpp |   4 +-
 quest/src/gpu/gpu_cuquantum.cuh   |   7 +-
 quest/src/gpu/gpu_kernels.cuh     |  54 ++++--
 quest/src/gpu/gpu_subroutines.cpp |  75 ++++---
 quest/src/gpu/gpu_subroutines.hpp |   4 +-
 tests/unit/operations.cpp         | 270 +++++++++++++++++++++-----
 tests/utils/evolve.cpp            |  21 ++
 tests/utils/evolve.hpp            |  42 ++--
 19 files changed, 1078 insertions(+), 267 deletions(-)

diff --git a/quest/include/operations.h b/quest/include/operations.h
index 0898165af..0fb06560f 100644
--- a/quest/include/operations.h
+++ b/quest/include/operations.h
@@ -89,6 +89,7 @@ extern "C" {
  * - getCompMatr1()
  * - getInlineCompMatr1()
  * - applyCompMatr1()
+ * - postMultiplyCompMatr1()
  * - applyQubitProjector()
  * - multiplyCompMatr()
  * @author Tyson Jones
@@ -96,6 +97,57 @@ extern "C" {
 void multiplyCompMatr1(Qureg qureg, int target, CompMatr1 matrix);
 
 
+/** @notyettested
+ * 
+ * Multiplies a general one-qubit dense @p matrix upon the specified @p target 
+ * qubit of the density matrix @p qureg, from the right-hand side.
+ *  
+ * @formulae
+ * Let @f$ \dmrho = @f$ @p qureg, @f$ \hat{M} = @f$ @p matrix and @f$ t = @f$ @p target, 
+ * and notate @f$\hat{M}_t@f$ as per applyCompMatr1(). Unlike applyCompMatr1() however,
+ * this function only ever right-multiplies @p matrix upon @p qureg.
+ * 
+ * Explicitly
+ *   @f[ 
+        \dmrho \rightarrow \dmrho \, \hat{M}_t
+ *   @f]
+ * where @f$ \hat{M} @f$ is neither conjugated nor transposed, and there are no additional
+ * constraints like unitarity.
+ * 
+ * In general, this function will break the normalisation of @p qureg and result in a
+ * non-physical state, and is useful for preparing sub-expressions of formulae like
+ * the Lindbladian.
+ *
+ * @myexample
+ * ```
+    Qureg qureg = createDensityQureg(5);
+
+    CompMatr1 matrix = getInlineCompMatr1({
+        {0.1, 0.2},
+        {0.3i, 0.4i}
+    });
+
+    postMultiplyCompMatr1(qureg, 2, matrix); 
+ * ```
+ *
+ * @param[in,out] qureg  the state to modify.
+ * @param[in]     target the index of the target qubit.
+ * @param[in]     matrix the Z-basis matrix to post-multiply.
+ * @throws @validationerror
+ * - if @p qureg or @p matrix are uninitialised.
+ * - if @p qureg is not a density matrix.
+ * - if @p target is an invalid qubit index.
+ * @see
+ * - getCompMatr1()
+ * - getInlineCompMatr1()
+ * - applyCompMatr1()
+ * - multiplyCompMatr1()
+ * - multiplyCompMatr()
+ * @author Tyson Jones
+ */
+void postMultiplyCompMatr1(Qureg qureg, int target, CompMatr1 matrix);
+
+
 /** Applies a general one-qubit dense unitary @p matrix to the specified @p target 
  * qubit of @p qureg.
  * 
@@ -162,6 +214,7 @@ digraph {
  * - getCompMatr1()
  * - getInlineCompMatr1()
  * - multiplyCompMatr1()
+ * - postMultiplyCompMatr1()
  * - applyControlledCompMatr1()
  * - applyCompMatr2()
  * - applyCompMatr()
@@ -346,6 +399,14 @@ extern "C" {
 void multiplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matr);
 
 
+/// @notyetdoced
+/// @notyettested
+/// @notyetvalidated
+/// @see
+/// - postMultiplyCompMatr1()
+void postMultiplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matrix);
+
+
 /** @notyetdoced
  * 
  * Applies a general two-qubit dense unitary @p matrix to qubits @p target1 and
@@ -557,6 +618,14 @@ extern "C" {
 void multiplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matrix);
 
 
+/// @notyetdoced
+/// @notyettested
+/// @notyetvalidated
+/// @see
+/// - postMultiplyCompMatr1()
+void postMultiplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matrix);
+
+
 /** @notyetdoced
  * 
  * @formulae
@@ -612,6 +681,14 @@ void applyMultiStateControlledCompMatr(Qureg qureg, int* controls, int* states,
 void multiplyCompMatr(Qureg qureg, std::vector<int> targets, CompMatr matr);
 
 
+/// @notyettested
+/// @notyetvalidated
+/// @notyetdoced
+/// @cppvectoroverload
+/// @see postMultiplyCompMatr()
+void postMultiplyCompMatr(Qureg qureg, std::vector<int> targets, CompMatr matr);
+
+
 /// @notyettested
 /// @notyetvalidated
 /// @notyetdoced
@@ -667,6 +744,12 @@ extern "C" {
 void multiplyDiagMatr1(Qureg qureg, int target, DiagMatr1 matr);
 
 
+/// @notyettested
+/// @notyetvalidated
+/// @notyetdoced
+void postMultiplyDiagMatr1(Qureg qureg, int target, DiagMatr1 matrix);
+
+
 /// @notyetdoced
 /// @see applyCompMatr1()
 void applyDiagMatr1(Qureg qureg, int target, DiagMatr1 matr);
@@ -734,6 +817,12 @@ extern "C" {
 void multiplyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matr);
 
 
+/// @notyettested
+/// @notyetvalidated
+/// @notyetdoced
+void postMultiplyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matrix);
+
+
 /// @notyetdoced
 /// @see applyCompMatr1()
 void applyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matr);
@@ -801,6 +890,12 @@ extern "C" {
 void multiplyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matrix);
 
 
+/// @notyettested
+/// @notyetvalidated
+/// @notyetdoced
+void postMultiplyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matrix);
+
+
 /// @notyetdoced
 /// @see applyCompMatr1()
 void applyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matrix);
@@ -828,6 +923,12 @@ void applyMultiStateControlledDiagMatr(Qureg qureg, int* controls, int* states,
 void multiplyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMatr matrix, qcomp exponent);
 
 
+/// @notyettested
+/// @notyetvalidated
+/// @notyetdoced
+void postMultiplyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMatr matrix, qcomp exponent);
+
+
 /** @notyetdoced
  *
  * @formulae
@@ -874,6 +975,14 @@ void applyMultiStateControlledDiagMatrPower(Qureg qureg, int* controls, int* sta
 void multiplyDiagMatr(Qureg qureg, std::vector<int> targets, DiagMatr matrix);
 
 
+/// @notyettested
+/// @notyetvalidated
+/// @notyetdoced
+/// @cppvectoroverload
+/// @see postMultiplyDiagMatr()
+void postMultiplyDiagMatr(Qureg qureg, std::vector<int> targets, DiagMatr matrix);
+
+
 /// @notyettested
 /// @notyetvalidated
 /// @notyetdoced
@@ -914,6 +1023,14 @@ void applyMultiStateControlledDiagMatr(Qureg qureg, std::vector<int> controls, s
 void multiplyDiagMatrPower(Qureg qureg, std::vector<int> targets, DiagMatr matrix, qcomp exponent);
 
 
+/// @notyettested
+/// @notyetvalidated
+/// @notyetdoced
+/// @cppvectoroverload
+/// @see postMultiplyDiagMatrPower()
+void postMultiplyDiagMatrPower(Qureg qureg, std::vector<int> targets, DiagMatr matrix, qcomp exponent);
+
+
 /// @notyettested
 /// @notyetvalidated
 /// @notyetdoced
@@ -979,6 +1096,18 @@ void multiplyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix);
 void multiplyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp exponent);
 
 
+/// @notyetdoced
+/// @notyettested
+/// @notyetvalidated
+void postMultiplyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix);
+
+
+/// @notyetdoced
+/// @notyettested
+/// @notyetvalidated
+void postMultiplyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp exponent);
+
+
 /// @notyetdoced
 /// @notyetvalidated
 void applyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix);
@@ -1143,6 +1272,12 @@ extern "C" {
 void multiplySwap(Qureg qureg, int qubit1, int qubit2);
 
 
+/// @notyetdoced
+/// @notyettested
+/// @notyetvalidated
+void postMultiplySwap(Qureg qureg, int qubit1, int qubit2);
+
+
 /** Applies a SWAP gate between @p qubit1 and @p qubit2 of @p qureg.
  * 
  * @diagram
@@ -1264,20 +1399,41 @@ extern "C" {
 
 
 /// @notyetdoced
+/// @notyettested
 /// @see multiplyCompMatr1()
 void multiplyPauliX(Qureg qureg, int target);
 
 
 /// @notyetdoced
+/// @notyettested
 /// @see multiplyCompMatr1()
 void multiplyPauliY(Qureg qureg, int target);
 
 
 /// @notyetdoced
+/// @notyettested
 /// @see multiplyCompMatr1()
 void multiplyPauliZ(Qureg qureg, int target);
 
 
+/// @notyetdoced
+/// @notyettested
+/// @see postMultiplyCompMatr1()
+void postMultiplyPauliX(Qureg qureg, int target);
+
+
+/// @notyetdoced
+/// @notyettested
+/// @see postMultiplyCompMatr1()
+void postMultiplyPauliY(Qureg qureg, int target);
+
+
+/// @notyetdoced
+/// @notyettested
+/// @see postMultiplyCompMatr1()
+void postMultiplyPauliZ(Qureg qureg, int target);
+
+
 /// @notyetdoced
 void applyPauliX(Qureg qureg, int target);
 
@@ -1408,6 +1564,12 @@ extern "C" {
 void multiplyPauliStr(Qureg qureg, PauliStr str);
 
 
+/// @notyetdoced
+/// @notyettested
+/// @notyetvalidated
+void postMultiplyPauliStr(Qureg qureg, PauliStr str);
+
+
 /// @notyetdoced
 void applyPauliStr(Qureg qureg, PauliStr str);
 
@@ -1796,6 +1958,12 @@ extern "C" {
 void multiplyPauliGadget(Qureg qureg, PauliStr str, qreal angle);
 
 
+/// @notyetdoced
+/// @notyettested
+/// @notyetvalidated
+void postMultiplyPauliGadget(Qureg qureg, PauliStr str, qreal angle);
+
+
 /** @notyetdoced
  * 
  * @formulae
@@ -1929,6 +2097,12 @@ extern "C" {
 void multiplyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal angle);
 
 
+/// @notyetdoced
+/// @notyettested
+/// @notyetvalidated
+void postMultiplyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal angle);
+
+
 /** @notyetdoced
  * 
  * @formulae
@@ -2201,6 +2375,14 @@ void applyMultiQubitPhaseShift(Qureg qureg, int* targets, int numTargets, qreal
 void multiplyPhaseGadget(Qureg qureg, std::vector<int> targets, qreal angle);
 
 
+/// @notyettested
+/// @notyetvalidated
+/// @notyetdoced
+/// @cppvectoroverload
+/// @see postMultiplyPhaseGadget()
+void postMultiplyPhaseGadget(Qureg qureg, std::vector<int> targets, qreal angle);
+
+
 /// @notyettested
 /// @notyetvalidated
 /// @notyetdoced
@@ -2273,6 +2455,12 @@ extern "C" {
 void multiplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace);
 
 
+/// @notyetdoced
+/// @notyettested
+/// @notyetvalidated
+void postMultiplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace);
+
+
 /** @notyettested
  * 
  * Effects (an approximation to) the exponential of @p sum, weighted by @p angle, upon @p qureg,
@@ -2558,6 +2746,12 @@ extern "C" {
 void multiplyMultiQubitNot(Qureg qureg, int* targets, int numTargets);
 
 
+/// @notyetdoced
+/// @notyettested
+/// @notyetvalidated
+void postMultiplyMultiQubitNot(Qureg qureg, int* targets, int numTargets);
+
+
 /// @notyetdoced
 void applyMultiQubitNot(Qureg qureg, int* targets, int numTargets);
 
@@ -2592,6 +2786,14 @@ void applyMultiStateControlledMultiQubitNot(Qureg qureg, int* controls, int* sta
 void multiplyMultiQubitNot(Qureg qureg, std::vector<int> targets);
 
 
+/// @notyettested
+/// @notyetvalidated
+/// @notyetdoced
+/// @cppvectoroverload
+/// @see postMultiplyMultiQubitNot()
+void postMultiplyMultiQubitNot(Qureg qureg, std::vector<int> targets);
+
+
 /// @notyettested
 /// @notyetvalidated
 /// @notyetdoced
diff --git a/quest/src/api/operations.cpp b/quest/src/api/operations.cpp
index ad31411c3..1b57f7d2a 100644
--- a/quest/src/api/operations.cpp
+++ b/quest/src/api/operations.cpp
@@ -59,6 +59,12 @@ void validateAndApplyAnyCtrlAnyTargUnitaryMatrix(Qureg qureg, int* ctrls, int* s
     ctrlVec = util_getBraQubits(ctrlVec, qureg);
     targVec = util_getBraQubits(targVec, qureg);
     localiser_statevec_anyCtrlAnyTargAnyMatr(qureg, ctrlVec, stateVec, targVec, matr, conj);
+
+    /// @todo
+    /// the above logic always performs two in-turn operations upon density matrices, 
+    /// though when matr is diagonal (DiagMatr*), they can be trivially combined into 
+    /// a single operation which enumerates the state only once. We perform this
+    /// optimisation for FullStateDiagMatr elsewhere. Consider optimising here too!
 }
 
 
@@ -75,7 +81,21 @@ void multiplyCompMatr1(Qureg qureg, int target, CompMatr1 matrix) {
     validate_matrixFields(matrix, __func__); // matrix can be non-unitary
 
     bool conj = false;
-    localiser_statevec_anyCtrlOneTargDenseMatr(qureg, {}, {}, target, matrix, conj);
+    bool transp = false;
+    localiser_statevec_anyCtrlOneTargDenseMatr(qureg, {}, {}, target, matrix, conj, transp);
+}
+
+void postMultiplyCompMatr1(Qureg qureg, int target, CompMatr1 matrix) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_target(qureg, target, __func__);
+    validate_matrixFields(matrix, __func__); // matrix can be non-unitary
+    
+    // rho matrix ~ transpose(matrix) (x) I ||rho>>
+    bool conj = false;
+    bool transp = true;
+    int qubit = util_getBraQubit(target, qureg);
+    localiser_statevec_anyCtrlOneTargDenseMatr(qureg, {}, {}, qubit, matrix, conj, transp);
 }
 
 void applyCompMatr1(Qureg qureg, int target, CompMatr1 matrix) {
@@ -126,7 +146,23 @@ void multiplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matrix)
     validate_mixedAmpsFitInNode(qureg, 2, __func__);
 
     bool conj = false;
-    localiser_statevec_anyCtrlTwoTargDenseMatr(qureg, {}, {}, target1, target2, matrix, conj);
+    bool transp = false;
+    localiser_statevec_anyCtrlTwoTargDenseMatr(qureg, {}, {}, target1, target2, matrix, conj, transp);
+}
+
+void postMultiplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matrix) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_twoTargets(qureg, target1, target2, __func__);
+    validate_matrixFields(matrix, __func__); // matrix can be non-unitary
+    validate_mixedAmpsFitInNode(qureg, 2, __func__);
+
+    // rho matrix ~ transpose(matrix) (x) I ||rho>>
+    bool conj = false;
+    bool transp = true;
+    int qubit1 = util_getBraQubit(target1, qureg);
+    int qubit2 = util_getBraQubit(target2, qureg);
+    localiser_statevec_anyCtrlTwoTargDenseMatr(qureg, {}, {}, qubit1, qubit2, matrix, conj, transp);
 }
 
 void applyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matrix) {
@@ -181,7 +217,22 @@ void multiplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matrix
     validate_mixedAmpsFitInNode(qureg, numTargets, __func__);
 
     bool conj = false;
-    localiser_statevec_anyCtrlAnyTargDenseMatr(qureg, {}, {}, util_getVector(targets, numTargets), matrix, conj);
+    bool transp = false;
+    localiser_statevec_anyCtrlAnyTargDenseMatr(qureg, {}, {}, util_getVector(targets, numTargets), matrix, conj, transp);
+}
+
+void postMultiplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matrix) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_targets(qureg, targets, numTargets, __func__);
+    validate_matrixDimMatchesTargets(matrix, numTargets, __func__); // also validates fields and is-sync, but not unitarity
+    validate_mixedAmpsFitInNode(qureg, numTargets, __func__);
+
+    // rho matrix ~ transpose(matrix) (x) I ||rho>>
+    bool conj = false;
+    bool transp = true;
+    auto qubits = util_getBraQubits(util_getVector(targets, numTargets), qureg);
+    localiser_statevec_anyCtrlAnyTargDenseMatr(qureg, {}, {}, qubits, matrix, conj, transp);
 }
 
 void applyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matrix) {
@@ -211,6 +262,11 @@ void multiplyCompMatr(Qureg qureg, vector<int> targets, CompMatr matr) {
     multiplyCompMatr(qureg, targets.data(), targets.size(), matr);
 }
 
+void postMultiplyCompMatr(Qureg qureg, vector<int> targets, CompMatr matr) {
+
+    postMultiplyCompMatr(qureg, targets.data(), targets.size(), matr);
+}
+
 void applyCompMatr(Qureg qureg, vector<int> targets, CompMatr matr) {
 
     applyCompMatr(qureg, targets.data(), targets.size(), matr);
@@ -249,6 +305,17 @@ void multiplyDiagMatr1(Qureg qureg, int target, DiagMatr1 matrix) {
     localiser_statevec_anyCtrlOneTargDiagMatr(qureg, {}, {}, target, matrix, conj);
 }
 
+void postMultiplyDiagMatr1(Qureg qureg, int target, DiagMatr1 matrix) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_target(qureg, target, __func__);
+    validate_matrixFields(matrix, __func__); // matrix can be non-unitary
+
+    bool conj = false;
+    int qubit = util_getBraQubit(target, qureg);
+    localiser_statevec_anyCtrlOneTargDiagMatr(qureg, {}, {}, qubit, matrix, conj);
+}
+
 void applyDiagMatr1(Qureg qureg, int target, DiagMatr1 matrix) {
 
     validateAndApplyAnyCtrlAnyTargUnitaryMatrix(qureg, nullptr, nullptr, 0, &target, 1, matrix, __func__);
@@ -299,6 +366,18 @@ void multiplyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matrix)
     localiser_statevec_anyCtrlTwoTargDiagMatr(qureg, {}, {}, target1, target2, matrix, conj);
 }
 
+void postMultiplyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matrix) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_twoTargets(qureg, target1, target2, __func__);
+    validate_matrixFields(matrix, __func__); // matrix can be non-unitary
+
+    bool conj = false;
+    int qubit1 = util_getBraQubit(target1, qureg);
+    int qubit2 = util_getBraQubit(target2, qureg);
+    localiser_statevec_anyCtrlTwoTargDiagMatr(qureg, {}, {}, qubit1, qubit2, matrix, conj);
+}
+
 void applyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matrix) {
 
     int targs[] = {target1, target2};
@@ -351,7 +430,20 @@ void multiplyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matrix
 
     bool conj = false;
     qcomp exponent = 1;
-    localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, {}, {}, util_getVector(targets, numTargets), matrix, exponent, conj);
+    auto qubits = util_getVector(targets, numTargets);
+    localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, {}, {}, qubits, matrix, exponent, conj);
+}
+
+void postMultiplyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matrix) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_targets(qureg, targets, numTargets, __func__);
+    validate_matrixDimMatchesTargets(matrix, numTargets, __func__); // also validates fields and is-sync, but not unitarity
+
+    bool conj = false;
+    qcomp exponent = 1;
+    auto qubits = util_getBraQubits(util_getVector(targets, numTargets), qureg);
+    localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, {}, {}, qubits, matrix, exponent, conj);
 }
 
 void applyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matrix) {
@@ -381,6 +473,11 @@ void multiplyDiagMatr(Qureg qureg, vector<int> targets, DiagMatr matrix) {
     multiplyDiagMatr(qureg, targets.data(), targets.size(), matrix);
 }
 
+void postMultiplyDiagMatr(Qureg qureg, vector<int> targets, DiagMatr matrix) {
+
+    postMultiplyDiagMatr(qureg, targets.data(), targets.size(), matrix);
+}
+
 void applyDiagMatr(Qureg qureg, vector<int> targets, DiagMatr matrix) {
 
     applyDiagMatr(qureg, targets.data(), targets.size(), matrix);
@@ -420,7 +517,20 @@ void multiplyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMatr m
     validate_matrixExpIsNonDiverging(matrix, exponent, __func__); // harmlessly re-validates fields and is-sync
 
     bool conj = false;
-    localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, {}, {}, util_getVector(targets, numTargets), matrix, exponent, conj);
+    auto qubits = util_getVector(targets, numTargets);
+    localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, {}, {}, qubits, matrix, exponent, conj);
+}
+
+void postMultiplyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMatr matrix, qcomp exponent) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_targets(qureg, targets, numTargets, __func__);
+    validate_matrixDimMatchesTargets(matrix, numTargets, __func__); // also validates fields and is-sync, but not unitarity
+    validate_matrixExpIsNonDiverging(matrix, exponent, __func__); // harmlessly re-validates fields and is-sync
+
+    bool conj = false;
+    auto qubits = util_getBraQubits(util_getVector(targets, numTargets), qureg);
+    localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, {}, {}, qubits, matrix, exponent, conj);
 }
 
 void applyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMatr matrix, qcomp exponent)  {
@@ -505,6 +615,11 @@ void multiplyDiagMatrPower(Qureg qureg, vector<int> targets, DiagMatr matrix, qc
     multiplyDiagMatrPower(qureg, targets.data(), targets.size(), matrix, exponent);
 }
 
+void postMultiplyDiagMatrPower(Qureg qureg, vector<int> targets, DiagMatr matrix, qcomp exponent) {
+
+    postMultiplyDiagMatrPower(qureg, targets.data(), targets.size(), matrix, exponent);
+}
+
 void applyDiagMatrPower(Qureg qureg, vector<int> targets, DiagMatr matrix, qcomp exponent) {
 
     applyDiagMatrPower(qureg, targets.data(), targets.size(), matrix, exponent);
@@ -548,12 +663,39 @@ void multiplyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp
     validate_matrixAndQuregAreCompatible(matrix, qureg, false, __func__); // matrix can be non-unitary
     validate_matrixExpIsNonDiverging(matrix, exponent, __func__);
 
-    bool onlyMultiply = true;
+    // rho -> matrix^exponent rho
+    bool leftMultiply = true;
+    bool rightMultiply = false;
+    bool rightConj = false;
+
     (qureg.isDensityMatrix)?
-        localiser_densmatr_allTargDiagMatr(qureg, matrix, exponent, onlyMultiply):
+        localiser_densmatr_allTargDiagMatr(qureg, matrix, exponent, leftMultiply, rightMultiply, rightConj):
         localiser_statevec_allTargDiagMatr(qureg, matrix, exponent);
 }
 
+void postMultiplyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_matrixFields(matrix, __func__);
+    validate_matrixAndQuregAreCompatible(matrix, qureg, false, __func__); // matrix can be non-unitary
+
+    postMultiplyFullStateDiagMatrPower(qureg, matrix, 1); // harmlessly re-validates
+}
+
+void postMultiplyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp exponent) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_matrixFields(matrix, __func__);
+    validate_matrixAndQuregAreCompatible(matrix, qureg, false, __func__); // matrix can be non-unitary
+    validate_matrixExpIsNonDiverging(matrix, exponent, __func__);
+
+    // rho -> rho matrix^exponent
+    bool leftMultiply = false;
+    bool rightMultiply = true;
+    bool rightConj = false;
+    localiser_densmatr_allTargDiagMatr(qureg, matrix, exponent, leftMultiply, rightMultiply, rightConj);
+}
+
 void applyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix) {
     validate_quregFields(qureg, __func__);
     validate_matrixFields(matrix, __func__);
@@ -571,9 +713,13 @@ void applyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp ex
     validate_unitaryExponentIsReal(exponent, __func__);
     validate_matrixExpIsNonDiverging(matrix, exponent, __func__);
 
-    bool onlyMultiply = false;
+    // rho -> matrix^exponent rho conj(matrix^exponent)
+    bool leftMultiply = true;
+    bool rightMultiply = true;
+    bool rightConj = true;
+
     (qureg.isDensityMatrix)?
-        localiser_densmatr_allTargDiagMatr(qureg, matrix, exponent, onlyMultiply):
+        localiser_densmatr_allTargDiagMatr(qureg, matrix, exponent, leftMultiply, rightMultiply, rightConj):
         localiser_statevec_allTargDiagMatr(qureg, matrix, exponent);
 }
 
@@ -751,6 +897,16 @@ void multiplySwap(Qureg qureg, int qubit1, int qubit2) {
     localiser_statevec_anyCtrlSwap(qureg, {}, {}, qubit1, qubit2);
 }
 
+void postMultiplySwap(Qureg qureg, int qubit1, int qubit2) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_twoTargets(qureg, qubit1, qubit2, __func__);
+
+    qubit1 = util_getBraQubit(qubit1, qureg);
+    qubit2 = util_getBraQubit(qubit2, qureg);
+    localiser_statevec_anyCtrlSwap(qureg, {}, {}, qubit1, qubit2);
+}
+
 void applySwap(Qureg qureg, int qubit1, int qubit2) {
     validate_quregFields(qureg, __func__);
     validate_twoTargets(qureg, qubit1, qubit2, __func__);
@@ -884,6 +1040,61 @@ void applyMultiStateControlledSqrtSwap(Qureg qureg, vector<int> controls, vector
 
 extern "C" {
 
+void multiplyPauliX(Qureg qureg, int target) {
+    validate_quregFields(qureg, __func__);
+    validate_target(qureg, target, __func__);
+
+    PauliStr str = getPauliStr("X", {target});
+    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
+}
+
+void multiplyPauliY(Qureg qureg, int target) {
+    validate_quregFields(qureg, __func__);
+    validate_target(qureg, target, __func__);
+
+    PauliStr str = getPauliStr("Y", {target});
+    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
+}
+
+void multiplyPauliZ(Qureg qureg, int target) {
+    validate_quregFields(qureg, __func__);
+    validate_target(qureg, target, __func__);
+
+    PauliStr str = getPauliStr("Z", {target});
+    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
+}
+
+void postMultiplyPauliX(Qureg qureg, int target) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_target(qureg, target, __func__);
+
+    PauliStr str = getPauliStr("X", {target});
+    str = paulis_getShiftedPauliStr(str, qureg.numQubits);
+    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
+}
+
+void postMultiplyPauliY(Qureg qureg, int target) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_target(qureg, target, __func__);
+
+    qcomp factor = -1; // undo transpose
+    PauliStr str = getPauliStr("Y", {target});
+    str = paulis_getShiftedPauliStr(str, qureg.numQubits);
+    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str, factor);
+}
+
+void postMultiplyPauliZ(Qureg qureg, int target) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_target(qureg, target, __func__);
+
+    PauliStr str = getPauliStr("Z", {target});
+    str = paulis_getShiftedPauliStr(str, qureg.numQubits);
+    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
+}
+
 void applyPauliX(Qureg qureg, int target) {
     validate_quregFields(qureg, __func__);
     validate_target(qureg, target, __func__);
@@ -1034,6 +1245,16 @@ void multiplyPauliStr(Qureg qureg, PauliStr str) {
     localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
 }
 
+void postMultiplyPauliStr(Qureg qureg, PauliStr str) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_pauliStrTargets(qureg, str, __func__);
+
+    qcomp factor = paulis_hasOddNumY(str)? -1 : 1; // undo transpose
+    str = paulis_getShiftedPauliStr(str, qureg.numQubits);
+    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str, factor);
+}
+
 void applyPauliStr(Qureg qureg, PauliStr str) {
     validate_quregFields(qureg, __func__);
     validate_pauliStrTargets(qureg, str, __func__);
@@ -1120,7 +1341,7 @@ void multiplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace) {
     localiser_statevec_setQuregToSuperposition(0, workspace, 1, qureg, 0, qureg);
     localiser_statevec_initUniformState(qureg, 0);
 
-    // apply each term in-turn, mixing into output qureg, then undo using idempotency
+    // left-multiply each term in-turn, mixing into output qureg, then undo using idempotency
     for (qindex i=0; i<sum.numTerms; i++) {
         localiser_statevec_anyCtrlPauliTensor(workspace, {}, {}, sum.strings[i]);
         localiser_statevec_setQuregToSuperposition(1, qureg, sum.coeffs[i], workspace, 0, workspace);
@@ -1130,6 +1351,31 @@ void multiplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace) {
     // workspace -> qureg, and qureg -> sum * qureg
 }
 
+void postMultiplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace) {
+    validate_quregFields(qureg, __func__);
+    validate_quregFields(workspace, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_quregCanBeWorkspace(qureg, workspace, __func__);
+    validate_pauliStrSumFields(sum, __func__);
+    validate_pauliStrSumTargets(sum, qureg, __func__);
+
+    // clone qureg to workspace, set qureg to blank
+    localiser_statevec_setQuregToSuperposition(0, workspace, 1, qureg, 0, qureg);
+    localiser_statevec_initUniformState(qureg, 0);
+
+    // post-multiply each term in-turn, mixing into output qureg, then undo using idempotency
+    for (qindex i=0; i<sum.numTerms; i++) {
+        PauliStr str =  paulis_getShiftedPauliStr(sum.strings[i], qureg.numQubits);
+        qcomp factor = paulis_hasOddNumY(str)? -1 : 1; // undoes transpose
+
+        localiser_statevec_anyCtrlPauliTensor(workspace, {}, {}, str, factor);
+        localiser_statevec_setQuregToSuperposition(1, qureg, sum.coeffs[i], workspace, 0, workspace);
+        localiser_statevec_anyCtrlPauliTensor(workspace, {}, {}, str, factor);
+    }
+
+    // workspace -> qureg, and qureg -> qureg * sum
+}
+
 void internal_applyFirstOrderTrotterRepetition(
     Qureg qureg, vector<int>& ketCtrls, vector<int>& braCtrls,
     vector<int>& states, PauliStrSum sum, qcomp angle, bool reverse
@@ -1496,6 +1742,17 @@ void multiplyPauliGadget(Qureg qureg, PauliStr str, qreal angle) {
     localiser_statevec_anyCtrlPauliGadget(qureg, {}, {}, str, phase);
 }
 
+void postMultiplyPauliGadget(Qureg qureg, PauliStr str, qreal angle) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_pauliStrTargets(qureg, str, __func__);
+
+    qreal factor = paulis_hasOddNumY(str)? -1 : 1;
+    qreal phase = factor * util_getPhaseFromGateAngle(angle);
+    str = paulis_getShiftedPauliStr(str, qureg.numQubits);
+    localiser_statevec_anyCtrlPauliGadget(qureg, {}, {}, str, phase);
+}
+
 void applyPauliGadget(Qureg qureg, PauliStr str, qreal angle) {
     validate_quregFields(qureg, __func__);
     validate_pauliStrTargets(qureg, str, __func__);
@@ -1589,7 +1846,18 @@ void multiplyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal angle)
     validate_targets(qureg, targets, numTargets, __func__);
 
     qreal phase = util_getPhaseFromGateAngle(angle);
-    localiser_statevec_anyCtrlPhaseGadget(qureg, {}, {}, util_getVector(targets,numTargets), phase);
+    auto qubits = util_getVector(targets, numTargets);
+    localiser_statevec_anyCtrlPhaseGadget(qureg, {}, {}, qubits, phase);
+}
+
+void postMultiplyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal angle) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_targets(qureg, targets, numTargets, __func__);
+
+    qreal phase = util_getPhaseFromGateAngle(angle);
+    auto qubits = util_getBraQubits(util_getVector(targets, numTargets), qureg);
+    localiser_statevec_anyCtrlPhaseGadget(qureg, {}, {}, qubits, phase);
 }
 
 void applyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal angle) {
@@ -1643,6 +1911,11 @@ void multiplyPhaseGadget(Qureg qureg, vector<int> targets, qreal angle) {
     multiplyPhaseGadget(qureg, targets.data(), targets.size(), angle);
 }
 
+void postMultiplyPhaseGadget(Qureg qureg, vector<int> targets, qreal angle) {
+
+    postMultiplyPhaseGadget(qureg, targets.data(), targets.size(), angle);
+}
+
 void applyPhaseGadget(Qureg qureg, vector<int> targets, qreal angle) {
 
     applyPhaseGadget(qureg, targets.data(), targets.size(), angle);
@@ -1763,7 +2036,18 @@ void multiplyMultiQubitNot(Qureg qureg, int* targets, int numTargets) {
     validate_targets(qureg, targets, numTargets, __func__);
 
     // harmlessly re-validates
-    multiplyPauliStr(qureg, getPauliStr(std::string(numTargets, 'X'), targets, numTargets));
+    PauliStr str = getPauliStr(std::string(numTargets, 'X'), targets, numTargets);
+    multiplyPauliStr(qureg, str);
+}
+
+void postMultiplyMultiQubitNot(Qureg qureg, int* targets, int numTargets) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_targets(qureg, targets, numTargets, __func__);
+
+    // harmlessly re-validates
+    PauliStr str = getPauliStr(std::string(numTargets, 'X'), targets, numTargets);
+    postMultiplyPauliStr(qureg, str);
 }
 
 void applyMultiQubitNot(Qureg qureg, int* targets, int numTargets) {
@@ -1809,6 +2093,11 @@ void multiplyMultiQubitNot(Qureg qureg, vector<int> targets) {
     multiplyMultiQubitNot(qureg, targets.data(), targets.size());
 }
 
+void postMultiplyMultiQubitNot(Qureg qureg, vector<int> targets) {
+
+    postMultiplyMultiQubitNot(qureg, targets.data(), targets.size());
+}
+
 void applyMultiQubitNot(Qureg qureg, vector<int> targets) {
 
     applyMultiQubitNot(qureg, targets.data(), targets.size());
diff --git a/quest/src/core/accelerator.cpp b/quest/src/core/accelerator.cpp
index 9f877fba3..5e3e40f55 100644
--- a/quest/src/core/accelerator.cpp
+++ b/quest/src/core/accelerator.cpp
@@ -63,6 +63,12 @@ using std::min;
         ((b2)? funcname<false,true> : funcname<false,false>))
 
 
+#define GET_CPU_OR_GPU_FOUR_BOOL_FUNC_OPTIMISED_FOR_FIRST_BOOL( isgpu, funcsuffix, value, fixed1,fixed2,fixed3 ) \
+    ((isgpu)? \
+        ((value)? gpu_##funcsuffix<true, fixed1,fixed2,fixed3> : gpu_##funcsuffix<false, fixed1,fixed2,fixed3> ) : \
+        ((value)? cpu_##funcsuffix<true, fixed1,fixed2,fixed3> : cpu_##funcsuffix<false, fixed1,fixed2,fixed3> ))
+
+
 #if (MAX_OPTIMISED_NUM_CTRLS != 5) || (MAX_OPTIMISED_NUM_TARGS != 5)
     #error "The number of optimised, templated QuEST functions was inconsistent between accelerator's source and header."
 #endif
@@ -108,9 +114,9 @@ using std::min;
 
 
 /// @todo
-/// GET_CPU_OR_GPU_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS,
-/// as defined below, is only ever called by used by anyCtrlAnyTargDenseMatr,
-/// which only ever receives numTargs>=3 (due to accelerator redirecting 
+/// GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS as defined below
+/// is used by anyCtrlAnyTargDiagMatr and anyCtrlAnyTargDenseMatr; the 
+/// latter only ever receives numTargs>=3 (due to accelerator redirecting 
 /// fewer targets to faster bespoke functions which e.g. avoid global GPU
 /// cache emory access). This means its instantiation with numTargs=0,1,2
 /// is useless, though contributes to 42% of the function's compilation
@@ -118,38 +124,7 @@ using std::min;
 /// can ergo non-negligibly speed up compilation by avoiding these redundant 
 /// instances at the cost of increased code complexity/asymmetry. Consider!
 
-#define GET_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(f, numctrls, numtargs, c) \
-    (vector <CONJ_ARR(f)> { \
-        CONJ_ARR(f) {&f<0,0,c>,  &f<0,1,c>,  &f<0,2,c>,  &f<0,3,c>,  &f<0,4,c>,  &f<0,5,c>,  &f<0,-1,c>}, \
-        CONJ_ARR(f) {&f<1,0,c>,  &f<1,1,c>,  &f<1,2,c>,  &f<1,3,c>,  &f<1,4,c>,  &f<1,5,c>,  &f<1,-1,c>}, \
-        CONJ_ARR(f) {&f<2,0,c>,  &f<2,1,c>,  &f<2,2,c>,  &f<2,3,c>,  &f<2,4,c>,  &f<2,5,c>,  &f<2,-1,c>}, \
-        CONJ_ARR(f) {&f<3,0,c>,  &f<3,1,c>,  &f<3,2,c>,  &f<3,3,c>,  &f<3,4,c>,  &f<3,5,c>,  &f<3,-1,c>}, \
-        CONJ_ARR(f) {&f<4,0,c>,  &f<4,1,c>,  &f<4,2,c>,  &f<4,3,c>,  &f<4,4,c>,  &f<4,5,c>,  &f<4,-1,c>}, \
-        CONJ_ARR(f) {&f<5,0,c>,  &f<5,1,c>,  &f<5,2,c>,  &f<5,3,c>,  &f<5,4,c>,  &f<5,5,c>,  &f<5,-1,c>}, \
-        CONJ_ARR(f) {&f<-1,0,c>, &f<-1,1,c>, &f<-1,2,c>, &f<-1,3,c>, &f<-1,4,c>, &f<-1,5,c>, &f<-1,-1,c>}}) \
-    [std::min((int) numctrls, MAX_OPTIMISED_NUM_CTRLS + 1)] \
-    [std::min((int) numtargs, MAX_OPTIMISED_NUM_TARGS + 1)]
-
-#define CONJ_ARR(f) vector<decltype(&f<0,0,false>)>
-
-#define GET_CPU_OR_GPU_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(funcsuffix, qureg, numctrls, numtargs, conj) \
-    ((qureg.isGpuAccelerated)? \
-        ((conj)? \
-            GET_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( gpu_##funcsuffix, numctrls, numtargs, true ) : \
-            GET_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( gpu_##funcsuffix, numctrls, numtargs, false ) ) : \
-        ((conj)? \
-            GET_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( cpu_##funcsuffix, numctrls, numtargs, true ) : \
-            GET_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( cpu_##funcsuffix, numctrls, numtargs, false ) ) )
-
-
-/// @todo
-/// This has gotten a bit ridiculous. Is there a way to use (likely)
-/// more abominable pre-processor mischief which negates the need
-/// to repeat the entire macro(s) when the number of templated
-/// parameters grows?
-
-
-#define GET_EXPONENTIABLE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(f, numctrls, numtargs, c, h) \
+#define GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(f, numctrls, numtargs, c, h) \
     (vector <POWER_CONJ_ARR(f)> { \
         POWER_CONJ_ARR(f) {&f<0,0,c,h>,  &f<0,1,c,h>,  &f<0,2,c,h>,  &f<0,3,c,h>,  &f<0,4,c,h>,  &f<0,5,c,h>,  &f<0,-1,c,h>}, \
         POWER_CONJ_ARR(f) {&f<1,0,c,h>,  &f<1,1,c,h>,  &f<1,2,c,h>,  &f<1,3,c,h>,  &f<1,4,c,h>,  &f<1,5,c,h>,  &f<1,-1,c,h>}, \
@@ -163,22 +138,25 @@ using std::min;
 
 #define POWER_CONJ_ARR(f) vector<decltype(&f<0,0,false,false>)>
 
-#define GET_CPU_OR_GPU_EXPONENTIABLE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(funcsuffix, qureg, numctrls, numtargs, conj, haspower) \
+#define GET_CPU_OR_GPU_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(funcsuffix, qureg, numctrls, numtargs, conj, haspower) \
     ((qureg.isGpuAccelerated)? \
         ((conj)? \
             ((haspower)? \
-                GET_EXPONENTIABLE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( gpu_##funcsuffix, numctrls, numtargs, true, true ) : \
-                GET_EXPONENTIABLE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( gpu_##funcsuffix, numctrls, numtargs, true, false ) ) : \
+                GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( gpu_##funcsuffix, numctrls, numtargs, true, true ) : \
+                GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( gpu_##funcsuffix, numctrls, numtargs, true, false ) ) : \
             ((haspower)? \
-                GET_EXPONENTIABLE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( gpu_##funcsuffix, numctrls, numtargs, false, true ) : \
-                GET_EXPONENTIABLE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( gpu_##funcsuffix, numctrls, numtargs, false, false ) ) ) : \
+                GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( gpu_##funcsuffix, numctrls, numtargs, false, true ) : \
+                GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( gpu_##funcsuffix, numctrls, numtargs, false, false ) ) ) : \
         ((conj)? \
             ((haspower)? \
-                GET_EXPONENTIABLE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( cpu_##funcsuffix, numctrls, numtargs, true, true ) : \
-                GET_EXPONENTIABLE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( cpu_##funcsuffix, numctrls, numtargs, true, false ) ) : \
+                GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( cpu_##funcsuffix, numctrls, numtargs, true, true ) : \
+                GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( cpu_##funcsuffix, numctrls, numtargs, true, false ) ) : \
             ((haspower)? \
-                GET_EXPONENTIABLE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( cpu_##funcsuffix, numctrls, numtargs, false, true ) : \
-                GET_EXPONENTIABLE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( cpu_##funcsuffix, numctrls, numtargs, false, false ) ) ) )
+                GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( cpu_##funcsuffix, numctrls, numtargs, false, true ) : \
+                GET_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( cpu_##funcsuffix, numctrls, numtargs, false, false ) ) ) )
+
+/// @todo
+/// The above macro spaghetti is diabolical - update using C++ metaprogramming!
 
 
 
@@ -329,9 +307,9 @@ void accel_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector<int> ctrls,
 }
 
 
-void accel_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr matr, bool conj) {
+void accel_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr matr, bool conj, bool transp) {
 
-    auto func = GET_CPU_OR_GPU_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( statevec_anyCtrlAnyTargDenseMatr_sub, qureg, ctrls.size(), targs.size(), conj );
+    auto func = GET_CPU_OR_GPU_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( statevec_anyCtrlAnyTargDenseMatr_sub, qureg, ctrls.size(), targs.size(), conj, transp );
     func(qureg, ctrls, ctrlStates, targs, matr);
 }
 
@@ -360,7 +338,7 @@ void accel_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, v
 
     bool hasPower = exponent != qcomp(1, 0);
 
-    auto func = GET_CPU_OR_GPU_EXPONENTIABLE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( statevec_anyCtrlAnyTargDiagMatr_sub, qureg, ctrls.size(), targs.size(), conj, hasPower );
+    auto func = GET_CPU_OR_GPU_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( statevec_anyCtrlAnyTargDiagMatr_sub, qureg, ctrls.size(), targs.size(), conj, hasPower );
     func(qureg, ctrls, ctrlStates, targs, matr, exponent);
 }
 
@@ -414,7 +392,30 @@ void accel_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qco
 }
 
 
-void accel_densmatr_allTargDiagMatr_subA(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool multiplyOnly) {
+auto getDenseMatrAllTargDiagMatrFunc(bool isGpu, qcomp exponent, bool multiplyLeft, bool multiplyRight, bool conjRight) {
+
+    // this helper function exists, dissimilar from the function-agnostic macros used
+    // by other functions, because densmatr_allTargDiagMatr_sub() does not accept every
+    // possible combination of its boolean template parameters 
+    assert_fullStateDiagMatrTemplateParamsAreValid(multiplyLeft, multiplyRight, conjRight);
+
+    bool hasPower = exponent != qcomp(1, 0);
+
+    if (multiplyLeft && multiplyRight && conjRight)
+        return GET_CPU_OR_GPU_FOUR_BOOL_FUNC_OPTIMISED_FOR_FIRST_BOOL( isGpu, densmatr_allTargDiagMatr_sub, hasPower, true,true,true );
+
+    if (multiplyLeft && ! multiplyRight && ! conjRight)
+        return GET_CPU_OR_GPU_FOUR_BOOL_FUNC_OPTIMISED_FOR_FIRST_BOOL( isGpu, densmatr_allTargDiagMatr_sub, hasPower, true,false,false );
+
+    if (! multiplyLeft && multiplyRight && ! conjRight)
+        return GET_CPU_OR_GPU_FOUR_BOOL_FUNC_OPTIMISED_FOR_FIRST_BOOL( isGpu, densmatr_allTargDiagMatr_sub, hasPower, false,true,false );
+
+    // unreachable
+    return (void (*)(Qureg, FullStateDiagMatr, qcomp)) nullptr;
+}
+
+
+void accel_densmatr_allTargDiagMatr_subA(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool multiplyLeft, bool multiplyRight, bool conjRight) {
 
     // matr is always local, qureg can be local or distributed...
     assert_fullStateDiagMatrIsLocal(matr);
@@ -423,9 +424,9 @@ void accel_densmatr_allTargDiagMatr_subA(Qureg qureg, FullStateDiagMatr matr, qc
     bool quregGPU = qureg.isGpuAccelerated;
     bool matrGPU = matr.isGpuAccelerated;
 
-    bool hasPower = exponent != qcomp(1, 0);
-    auto cpuFunc = GET_FUNC_OPTIMISED_FOR_TWO_BOOLS( cpu_densmatr_allTargDiagMatr_sub, hasPower, multiplyOnly );
-    auto gpuFunc = GET_FUNC_OPTIMISED_FOR_TWO_BOOLS( gpu_densmatr_allTargDiagMatr_sub, hasPower, multiplyOnly );
+    // which determines which function is called
+    auto gpuFunc = getDenseMatrAllTargDiagMatrFunc(true,  exponent, multiplyLeft, multiplyRight, conjRight);
+    auto cpuFunc = getDenseMatrAllTargDiagMatrFunc(false, exponent, multiplyLeft, multiplyRight, conjRight);
 
     // when deployments match, we trivially call the common backend
     if ( quregGPU &&  matrGPU) gpuFunc(qureg, matr, exponent);
@@ -475,7 +476,7 @@ void accel_densmatr_allTargDiagMatr_subA(Qureg qureg, FullStateDiagMatr matr, qc
 }
 
 
-void accel_densmatr_allTargDiagMatr_subB(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool multiplyOnly) {
+void accel_densmatr_allTargDiagMatr_subB(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool multiplyLeft, bool multiplyRight, bool conjRight) {
 
     assert_fullStateDiagMatrIsDistributed(matr);
     assert_acceleratorQuregIsDistributed(qureg);
@@ -500,7 +501,7 @@ void accel_densmatr_allTargDiagMatr_subB(Qureg qureg, FullStateDiagMatr matr, qc
     temp.cpuElems = qureg.cpuCommBuffer;
     temp.gpuElems = qureg.gpuCommBuffer;
 
-    accel_densmatr_allTargDiagMatr_subA(qureg, temp, exponent, multiplyOnly);
+    accel_densmatr_allTargDiagMatr_subA(qureg, temp, exponent, multiplyLeft, multiplyRight, conjRight);
 }
 
 
diff --git a/quest/src/core/accelerator.hpp b/quest/src/core/accelerator.hpp
index 01cf3efcd..5480d8133 100644
--- a/quest/src/core/accelerator.hpp
+++ b/quest/src/core/accelerator.hpp
@@ -101,22 +101,22 @@ using std::vector;
     template returntype funcname <-1,numtargs, conj>  args;
 
 
-#define INSTANTIATE_EXPONENTIABLE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(returntype, funcname, args) \
-    private_EXPONENTIABLE_CONJUGABLE_INSTANTIATE_outer(returntype, funcname, true,  true,  args) \
-    private_EXPONENTIABLE_CONJUGABLE_INSTANTIATE_outer(returntype, funcname, true,  false, args) \
-    private_EXPONENTIABLE_CONJUGABLE_INSTANTIATE_outer(returntype, funcname, false, true,  args) \
-    private_EXPONENTIABLE_CONJUGABLE_INSTANTIATE_outer(returntype, funcname, false, false, args)
-
-#define private_EXPONENTIABLE_CONJUGABLE_INSTANTIATE_outer(returntype, funcname, conj, haspower, args) \
-    private_EXPONENTIABLE_CONJUGABLE_INSTANTIATE_inner(returntype, funcname, 0, conj, haspower, args) \
-    private_EXPONENTIABLE_CONJUGABLE_INSTANTIATE_inner(returntype, funcname, 1, conj, haspower, args) \
-    private_EXPONENTIABLE_CONJUGABLE_INSTANTIATE_inner(returntype, funcname, 2, conj, haspower, args) \
-    private_EXPONENTIABLE_CONJUGABLE_INSTANTIATE_inner(returntype, funcname, 3, conj, haspower, args) \
-    private_EXPONENTIABLE_CONJUGABLE_INSTANTIATE_inner(returntype, funcname, 4, conj, haspower, args) \
-    private_EXPONENTIABLE_CONJUGABLE_INSTANTIATE_inner(returntype, funcname, 5, conj, haspower, args) \
-    private_EXPONENTIABLE_CONJUGABLE_INSTANTIATE_inner(returntype, funcname,-1, conj, haspower, args)
-
-#define private_EXPONENTIABLE_CONJUGABLE_INSTANTIATE_inner(returntype, funcname, numtargs, conj, haspower, args) \
+#define INSTANTIATE_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(returntype, funcname, args) \
+    private_TWO_BOOL_INSTANTIATE_outer(returntype, funcname, true,  true,  args) \
+    private_TWO_BOOL_INSTANTIATE_outer(returntype, funcname, true,  false, args) \
+    private_TWO_BOOL_INSTANTIATE_outer(returntype, funcname, false, true,  args) \
+    private_TWO_BOOL_INSTANTIATE_outer(returntype, funcname, false, false, args)
+
+#define private_TWO_BOOL_INSTANTIATE_outer(returntype, funcname, conj, haspower, args) \
+    private_TWO_BOOL_INSTANTIATE_inner(returntype, funcname, 0, conj, haspower, args) \
+    private_TWO_BOOL_INSTANTIATE_inner(returntype, funcname, 1, conj, haspower, args) \
+    private_TWO_BOOL_INSTANTIATE_inner(returntype, funcname, 2, conj, haspower, args) \
+    private_TWO_BOOL_INSTANTIATE_inner(returntype, funcname, 3, conj, haspower, args) \
+    private_TWO_BOOL_INSTANTIATE_inner(returntype, funcname, 4, conj, haspower, args) \
+    private_TWO_BOOL_INSTANTIATE_inner(returntype, funcname, 5, conj, haspower, args) \
+    private_TWO_BOOL_INSTANTIATE_inner(returntype, funcname,-1, conj, haspower, args)
+
+#define private_TWO_BOOL_INSTANTIATE_inner(returntype, funcname, numtargs, conj, haspower, args) \
     template returntype funcname <0, numtargs, conj, haspower>  args; \
     template returntype funcname <1, numtargs, conj, haspower>  args; \
     template returntype funcname <2, numtargs, conj, haspower>  args; \
@@ -194,7 +194,7 @@ void accel_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, vector<int> ctrls,
 
 void accel_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2, CompMatr2 matr);
 
-void accel_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr matr, bool conj);
+void accel_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr matr, bool conj, bool transp);
 
 
 /*
@@ -209,8 +209,8 @@ void accel_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, v
 
 void accel_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent);
 
-void accel_densmatr_allTargDiagMatr_subA(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool multiplyOnly);
-void accel_densmatr_allTargDiagMatr_subB(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool multiplyOnly);
+void accel_densmatr_allTargDiagMatr_subA(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool multiplyLeft, bool multiplyRight, bool conjRight);
+void accel_densmatr_allTargDiagMatr_subB(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool multiplyLeft, bool multiplyRight, bool conjRight);
 
 
 
diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp
index 2f44127c8..a24516891 100644
--- a/quest/src/core/errors.cpp
+++ b/quest/src/core/errors.cpp
@@ -431,6 +431,18 @@ void assert_fullStateDiagMatrIsDistributed(FullStateDiagMatr matr) {
         raiseInternalError("An accelerator function received a non-distributed FullStateDiagMatr where a distributed one was expected.");
 }
 
+void assert_fullStateDiagMatrTemplateParamsAreValid(bool multiplyLeft, bool multiplyRight, bool conjRight) {
+
+    bool valid = (
+        (  multiplyLeft &&   multiplyRight &&   conjRight) || // matr qureg conj(matr)
+        (  multiplyLeft && ! multiplyRight && ! conjRight) || // matr qureg
+        (! multiplyLeft &&   multiplyRight && ! conjRight)    //      qureg matr
+    );
+
+    if (!valid)
+        raiseInternalError("The accelerator function accel_densmatr_allTargDiagMatr_subA() recieved an invalid combination of template parameters.");
+}
+
 void assert_acceleratorQuregIsDistributed(Qureg qureg) {
 
     if (!qureg.isDistributed)
@@ -603,6 +615,11 @@ void error_gpuDeadCopyMatrixFunctionCalled() {
     raiseInternalError("The internal GPU function copyMatrixIfGpuCompiled() was called, though is intended as dead-code - matrices needing copying to GPU should be stored as flat row-wise lists.");
 }
 
+void error_gpuDenseMatrixConjugatedAndTransposed() {
+
+    raiseInternalError("The GPU + cuQuantum implementation of anyCtrlAnyTargDenseMatr() assumes that at most one of template arguments ApplyConj and ApplyTransp is true, though this was violated.");
+}
+
 void assert_quregIsGpuAccelerated(Qureg qureg) {
 
     if (!qureg.isGpuAccelerated)
diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp
index ce8f7e68c..7f6a0c609 100644
--- a/quest/src/core/errors.hpp
+++ b/quest/src/core/errors.hpp
@@ -153,6 +153,8 @@ void assert_fullStateDiagMatrIsLocal(FullStateDiagMatr matr);
 
 void assert_fullStateDiagMatrIsDistributed(FullStateDiagMatr matr);
 
+void assert_fullStateDiagMatrTemplateParamsAreValid(bool multiplyLeft, bool multiplyRight, bool conjRight);
+
 void assert_acceleratorQuregIsDistributed(Qureg qureg);
 
 void assert_quregAndFullStateDiagMatrAreBothOrNeitherDistrib(Qureg qureg, FullStateDiagMatr matr);
@@ -227,6 +229,8 @@ void error_gpuUnexpectedlyInaccessible();
 
 void error_gpuDeadCopyMatrixFunctionCalled();
 
+void error_gpuDenseMatrixConjugatedAndTransposed();
+
 void assert_gpuIsAccessible();
 
 void assert_gpuHasBeenBound(bool isBound);
diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp
index cdaef4c24..71cb8f211 100644
--- a/quest/src/core/localiser.cpp
+++ b/quest/src/core/localiser.cpp
@@ -953,7 +953,7 @@ void anyCtrlOneTargDenseMatrOnPrefix(Qureg qureg, vector<int> ctrls, vector<int>
 }
 
 
-void localiser_statevec_anyCtrlOneTargDenseMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, CompMatr1 matr, bool conj) {
+void localiser_statevec_anyCtrlOneTargDenseMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, CompMatr1 matr, bool conj, bool transp) {
     assertValidCtrlStates(ctrls, ctrlStates);
     setDefaultCtrlStates(ctrls, ctrlStates);
 
@@ -964,8 +964,11 @@ void localiser_statevec_anyCtrlOneTargDenseMatr(Qureg qureg, vector<int> ctrls,
     // retain only suffix control qubits as relevant to communication and local amp modification
     removePrefixQubitsAndStates(qureg, ctrls, ctrlStates);
 
+    // at most one of conj or transp will be true (but the logic remains correct if both are true)
     if (conj) 
         matr = util_getConj(matr);
+    if (transp)
+        matr = util_getTranspose(matr);
 
     // perform embarrassingly parallel routine or communication-inducing swaps
     doesGateRequireComm(qureg, targ)?
@@ -983,18 +986,21 @@ void localiser_statevec_anyCtrlOneTargDenseMatr(Qureg qureg, vector<int> ctrls,
  */
 
 
-void anyCtrlTwoOrAnyTargDenseMatrOnSuffix(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr2 matr, bool conj) {
-    if (conj) matr = util_getConj(matr);
+void anyCtrlTwoOrAnyTargDenseMatrOnSuffix(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr2 matr, bool conj, bool transp) {
+    if (conj) 
+        matr = util_getConj(matr);
+    if (transp)
+        matr = util_getTranspose(matr);
     accel_statevec_anyCtrlTwoTargDenseMatr_sub(qureg, ctrls, ctrlStates, targs[0], targs[1], matr);
 }
-void anyCtrlTwoOrAnyTargDenseMatrOnSuffix(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr  matr, bool conj) {
-    accel_statevec_anyCtrlAnyTargDenseMatr_sub(qureg, ctrls, ctrlStates, targs, matr, conj);
+void anyCtrlTwoOrAnyTargDenseMatrOnSuffix(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr  matr, bool conj, bool transp) {
+    accel_statevec_anyCtrlAnyTargDenseMatr_sub(qureg, ctrls, ctrlStates, targs, matr, conj, transp);
 }
 
 
 // T can be CompMatr2 or CompMatr
 template <typename T>
-void anyCtrlTwoOrAnyTargDenseMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, T matr, bool conj) {
+void anyCtrlTwoOrAnyTargDenseMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, T matr, bool conj, bool transp) {
 
     // node has nothing to do if all local amps violate control condition
     if (!doAnyLocalStatesHaveQubitValues(qureg, ctrls, ctrlStates))
@@ -1005,7 +1011,7 @@ void anyCtrlTwoOrAnyTargDenseMatr(Qureg qureg, vector<int> ctrls, vector<int> ct
 
         // using only the suffix ctrls
         removePrefixQubitsAndStates(qureg, ctrls, ctrlStates);
-        anyCtrlTwoOrAnyTargDenseMatrOnSuffix(qureg, ctrls, ctrlStates, targs, matr, conj);
+        anyCtrlTwoOrAnyTargDenseMatrOnSuffix(qureg, ctrls, ctrlStates, targs, matr, conj, transp);
         return;
     }
 
@@ -1032,7 +1038,7 @@ void anyCtrlTwoOrAnyTargDenseMatr(Qureg qureg, vector<int> ctrls, vector<int> ct
 
         // perform embarrassingly parallel simulation using only the new suffix ctrls
         removePrefixQubitsAndStates(qureg, newCtrls, ctrlStates);
-        anyCtrlTwoOrAnyTargDenseMatrOnSuffix(qureg, newCtrls, ctrlStates, newTargs, matr, conj);
+        anyCtrlTwoOrAnyTargDenseMatrOnSuffix(qureg, newCtrls, ctrlStates, newTargs, matr, conj, transp);
     }
 
     // undo swaps, again invoking communication
@@ -1040,15 +1046,15 @@ void anyCtrlTwoOrAnyTargDenseMatr(Qureg qureg, vector<int> ctrls, vector<int> ct
 }
 
 
-void localiser_statevec_anyCtrlTwoTargDenseMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2, CompMatr2 matr, bool conj) {
+void localiser_statevec_anyCtrlTwoTargDenseMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2, CompMatr2 matr, bool conj, bool transp) {
     assertValidCtrlStates(ctrls, ctrlStates);
     setDefaultCtrlStates(ctrls, ctrlStates);
 
-    anyCtrlTwoOrAnyTargDenseMatr(qureg, ctrls, ctrlStates, {targ1,targ2}, matr, conj);
+    anyCtrlTwoOrAnyTargDenseMatr(qureg, ctrls, ctrlStates, {targ1,targ2}, matr, conj, transp);
 }
 
 
-void localiser_statevec_anyCtrlAnyTargDenseMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr matr, bool conj) {
+void localiser_statevec_anyCtrlAnyTargDenseMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr matr, bool conj, bool transp) {
     assertValidCtrlStates(ctrls, ctrlStates);
     setDefaultCtrlStates(ctrls, ctrlStates);
 
@@ -1060,19 +1066,19 @@ void localiser_statevec_anyCtrlAnyTargDenseMatr(Qureg qureg, vector<int> ctrls,
     // its convenient generality, so we divert to the one-targ routine when possible, copying the 
     // heap CPU matrix (assumed consistent with GPU memory) into stack memory
     if (targs.size() == 1)
-        localiser_statevec_anyCtrlOneTargDenseMatr(qureg, ctrls, ctrlStates, targs[0], getCompMatr1(matr.cpuElems), conj);
+        localiser_statevec_anyCtrlOneTargDenseMatr(qureg, ctrls, ctrlStates, targs[0], getCompMatr1(matr.cpuElems), conj, transp);
     
     // similarly, bespoke two-targ routines are preferable although they offer no communication
     // benefit because they call the same any-targ localiser, but still accelerate GPU memory access.
     // this function call is the same as below, but we explicitly pass a CompMatr2 type in lieu of 
     // CompMatr, which avoids having to copy the CompMatr dynamic memory into accelerator backends
     else if (targs.size() == 2)
-        localiser_statevec_anyCtrlTwoTargDenseMatr(qureg, ctrls, ctrlStates, targs[0], targs[1], getCompMatr2(matr.cpuElems), conj);
+        localiser_statevec_anyCtrlTwoTargDenseMatr(qureg, ctrls, ctrlStates, targs[0], targs[1], getCompMatr2(matr.cpuElems), conj, transp);
     
     // call the any-targ routine when given 3 or more targs, which may still invoke bespoke,
     // fixed-targ instances of backend templated functions depending the number of targs
     else
-        anyCtrlTwoOrAnyTargDenseMatr(qureg, ctrls, ctrlStates, targs, matr, conj);
+        anyCtrlTwoOrAnyTargDenseMatr(qureg, ctrls, ctrlStates, targs, matr, conj, transp);
 }
 
 
@@ -1165,7 +1171,7 @@ void localiser_statevec_allTargDiagMatr(Qureg qureg, FullStateDiagMatr matr, qco
 }
 
 
-void localiser_densmatr_allTargDiagMatr(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool multiplyOnly) {
+void localiser_densmatr_allTargDiagMatr(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool multiplyLeft, bool multiplyRight, bool conjRight) {
     assert_localiserGivenDensMatr(qureg);
 
     // the diagonal matr has quadratically fewer elements than the density-matrix
@@ -1191,7 +1197,7 @@ void localiser_densmatr_allTargDiagMatr(Qureg qureg, FullStateDiagMatr matr, qco
     // when the matrix is not distributed, we call the same routine despite whether qureg 
     // is distributed or not; that merely changes how many qureg columns get updated
     if (!matrDist) {
-        accel_densmatr_allTargDiagMatr_subA(qureg, matr, exponent, multiplyOnly);
+        accel_densmatr_allTargDiagMatr_subA(qureg, matr, exponent, multiplyLeft, multiplyRight, conjRight);
         return;
     }
 
@@ -1200,7 +1206,7 @@ void localiser_densmatr_allTargDiagMatr(Qureg qureg, FullStateDiagMatr matr, qco
 
     // matr elems are inside qureg buffer, but we still pass matr struct along to
     // accelerator, because it is going to perform mischief to re-use subA().
-    accel_densmatr_allTargDiagMatr_subB(qureg, matr, exponent, multiplyOnly); 
+    accel_densmatr_allTargDiagMatr_subB(qureg, matr, exponent, multiplyLeft, multiplyRight, conjRight); 
 }
 
 
@@ -1215,12 +1221,17 @@ void localiser_densmatr_allTargDiagMatr(Qureg qureg, FullStateDiagMatr matr, qco
 
 template <class T>
 void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, T matr, bool conj) {
-    if constexpr (util_isDiagMatr <T>()) localiser_statevec_anyCtrlAnyTargDiagMatr(qureg,  ctrls, ctrlStates, targs, matr, 1, conj); // exponent=1
-    if constexpr (util_isDiagMatr1<T>()) localiser_statevec_anyCtrlOneTargDiagMatr(qureg,  ctrls, ctrlStates, targs[0], matr, conj);
+
+    // this function is never invoked by operations which require transposing matr
+    bool transp = false;
+    qcomp expo = 1;
+
+    if constexpr (util_isDiagMatr <T>()) localiser_statevec_anyCtrlAnyTargDiagMatr(qureg,  ctrls, ctrlStates, targs,        matr, expo, conj);
+    if constexpr (util_isDiagMatr1<T>()) localiser_statevec_anyCtrlOneTargDiagMatr(qureg,  ctrls, ctrlStates, targs[0],           matr, conj);
     if constexpr (util_isDiagMatr2<T>()) localiser_statevec_anyCtrlTwoTargDiagMatr(qureg,  ctrls, ctrlStates, targs[0], targs[1], matr, conj);
-    if constexpr (util_isCompMatr <T>()) localiser_statevec_anyCtrlAnyTargDenseMatr(qureg, ctrls, ctrlStates, targs, matr, conj);
-    if constexpr (util_isCompMatr1<T>()) localiser_statevec_anyCtrlOneTargDenseMatr(qureg, ctrls, ctrlStates, targs[0], matr, conj);
-    if constexpr (util_isCompMatr2<T>()) localiser_statevec_anyCtrlTwoTargDenseMatr(qureg, ctrls, ctrlStates, targs[0], targs[1], matr, conj);
+    if constexpr (util_isCompMatr <T>()) localiser_statevec_anyCtrlAnyTargDenseMatr(qureg, ctrls, ctrlStates, targs,              matr, conj, transp);
+    if constexpr (util_isCompMatr1<T>()) localiser_statevec_anyCtrlOneTargDenseMatr(qureg, ctrls, ctrlStates, targs[0],           matr, conj, transp);
+    if constexpr (util_isCompMatr2<T>()) localiser_statevec_anyCtrlTwoTargDenseMatr(qureg, ctrls, ctrlStates, targs[0], targs[1], matr, conj, transp);
 }
 
 template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, vector<int>, vector<int>, vector<int>, DiagMatr,  bool);
@@ -1670,12 +1681,13 @@ CompMatr getSpoofedCompMatrFromSuperOp(SuperOp op) {
 void localiser_densmatr_superoperator(Qureg qureg, SuperOp op, vector<int> ketTargs) {
     assert_localiserGivenDensMatr(qureg);
 
-    // effect the superoperator as a (non-conjugated) dense matrix on the ket + bra qubits
+    // effect the superoperator as a dense matrix on the ket + bra qubits
     bool conj = false;
+    bool transp = false;
     auto braTargs = util_getBraQubits(ketTargs, qureg);
     auto allTargs = util_getConcatenated(ketTargs, braTargs);
     CompMatr matr = getSpoofedCompMatrFromSuperOp(op);
-    localiser_statevec_anyCtrlAnyTargDenseMatr(qureg, {}, {}, allTargs, matr, conj);
+    localiser_statevec_anyCtrlAnyTargDenseMatr(qureg, {}, {}, allTargs, matr, conj, transp);
 }
 
 
diff --git a/quest/src/core/localiser.hpp b/quest/src/core/localiser.hpp
index 8b9975aca..50413fe68 100644
--- a/quest/src/core/localiser.hpp
+++ b/quest/src/core/localiser.hpp
@@ -83,11 +83,11 @@ void localiser_statevec_anyCtrlSwap(Qureg qureg, vector<int> ctrls, vector<int>
  * DENSE MATRICES
  */
 
-void localiser_statevec_anyCtrlOneTargDenseMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, CompMatr1 matr, bool conj);
+void localiser_statevec_anyCtrlOneTargDenseMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ, CompMatr1 matr, bool conj, bool transp);
 
-void localiser_statevec_anyCtrlTwoTargDenseMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2, CompMatr2 matr, bool conj);
+void localiser_statevec_anyCtrlTwoTargDenseMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2, CompMatr2 matr, bool conj, bool transp);
 
-void localiser_statevec_anyCtrlAnyTargDenseMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr matr, bool conj);
+void localiser_statevec_anyCtrlAnyTargDenseMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr matr, bool conj, bool transp);
 
 
 /*
@@ -101,7 +101,7 @@ void localiser_statevec_anyCtrlTwoTargDiagMatr(Qureg qureg, vector<int> ctrls, v
 void localiser_statevec_anyCtrlAnyTargDiagMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, DiagMatr matr, qcomp exponent, bool conj);
 
 void localiser_statevec_allTargDiagMatr(Qureg qureg, FullStateDiagMatr matr, qcomp exponent);
-void localiser_densmatr_allTargDiagMatr(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool multiplyOnly);
+void localiser_densmatr_allTargDiagMatr(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool multiplyLeft, bool multiplyRight, bool conjRight);
 
 
 /*
diff --git a/quest/src/core/utilities.cpp b/quest/src/core/utilities.cpp
index 1c22d43d8..16891f234 100644
--- a/quest/src/core/utilities.cpp
+++ b/quest/src/core/utilities.cpp
@@ -409,6 +409,34 @@ void util_setConj(DiagMatr matrix) {
 }
 
 
+/*
+ * MATRIX TRANSPOSITION
+ */
+
+// type T can be qcomp*[2] or qcomp*[4]
+template <typename T>
+void setDenseElemsTranspose(T elems, qindex dim) {
+    for (qindex i=0; i<dim; i++) {
+        for (qindex j=0; j<i; j++) {
+            qcomp temp = elems[i][j];
+            elems[i][j] = elems[j][i];
+            elems[j][i] = temp;
+        }
+    }
+}
+
+CompMatr1 util_getTranspose(CompMatr1 matrix) {
+    CompMatr1 transposed = matrix; // local copy, transposed in-place below
+    setDenseElemsTranspose(transposed.elems, matrix.numRows);
+    return transposed;
+}
+CompMatr2 util_getTranspose(CompMatr2 matrix) {
+    CompMatr2 transposed = matrix; // local copy, transposed in-place below
+    setDenseElemsTranspose(transposed.elems, matrix.numRows);
+    return transposed;
+}
+
+
 
 /*
  * MATRIX UNITARITY
diff --git a/quest/src/core/utilities.hpp b/quest/src/core/utilities.hpp
index 6d741312b..44acce231 100644
--- a/quest/src/core/utilities.hpp
+++ b/quest/src/core/utilities.hpp
@@ -257,6 +257,15 @@ void util_setConj(DiagMatr matrix);
 
 
 
+/*
+ * MATRIX TRANSPOSITION
+ */
+
+CompMatr1 util_getTranspose(CompMatr1 matrix);
+CompMatr2 util_getTranspose(CompMatr2 matrix);
+
+
+
 /*
  * MATRIX PROPERTIES
  */
diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp
index 9c90e08c1..4a783a372 100644
--- a/quest/src/cpu/cpu_subroutines.cpp
+++ b/quest/src/cpu/cpu_subroutines.cpp
@@ -479,7 +479,7 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, cpu_statevec_anyCtrlTwoTargDense
  */
 
 
-template <int NumCtrls, int NumTargs, bool ApplyConj>
+template <int NumCtrls, int NumTargs, bool ApplyConj, bool ApplyTransp>
 void cpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr matr) {
     
     assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
@@ -548,8 +548,13 @@ void cpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
                 // loop may be unrolled
                 for (qindex j=0; j<numTargAmps; j++) {
 
-                    // matr.cpuElems[k][j] = matr.cpuElemsFlat[l]
-                    qindex l = fast_getMatrixFlatIndex(k, j, numTargAmps);
+                    // matr.cpuElemsFlat[l] = matr.cpuElems[k][j] OR matr.cpuElems[j][k]
+                    qindex l;
+                    if constexpr (ApplyTransp)
+                        l = fast_getMatrixFlatIndex(j, k, numTargAmps);
+                    else
+                        l = fast_getMatrixFlatIndex(k, j, numTargAmps);
+
                     qcomp elem = matr.cpuElemsFlat[l];
 
                     // optionally conjugate matrix elems on the fly to avoid pre-modifying heap structure
@@ -569,7 +574,7 @@ void cpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
 }
 
 
-INSTANTIATE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( void, cpu_statevec_anyCtrlAnyTargDenseMatr_sub, (Qureg, vector<int>, vector<int>, vector<int>, CompMatr) )
+INSTANTIATE_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( void, cpu_statevec_anyCtrlAnyTargDenseMatr_sub, (Qureg, vector<int>, vector<int>, vector<int>, CompMatr) )
 
 
 
@@ -701,7 +706,14 @@ void cpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vec
 }
 
 
-INSTANTIATE_EXPONENTIABLE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( void, cpu_statevec_anyCtrlAnyTargDiagMatr_sub, (Qureg, vector<int>, vector<int>, vector<int>, DiagMatr, qcomp) )
+INSTANTIATE_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( void, cpu_statevec_anyCtrlAnyTargDiagMatr_sub, (Qureg, vector<int>, vector<int>, vector<int>, DiagMatr, qcomp) )
+
+
+/// @todo
+/// there is currently no density matrix version of anyCtrlAnyTargDiagMatr_sub();
+/// instead, operations.cpp invokes the statevector version twice as it does for
+/// dense matrices. This re-enumeration of the state however can be avoided since
+/// the matrix is diagonal, as done below for cpu_densmatr_allTargDiagMatr_sub()
 
 
 
@@ -737,28 +749,46 @@ void cpu_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp
 }
 
 
-template <bool HasPower, bool MultiplyOnly>
+template <bool HasPower, bool MultiplyLeft, bool MultiplyRight, bool ConjRight>
 void cpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent) {
 
+    // unlike other functions, this function handles all scenarios of...
+    // - qureg -> matr qureg conj(matr)
+    // - qureg -> matr qureg
+    // - qureg ->      qureg matr
+    // and all of the above where matr is raised to a power. This is an
+    // optimisation permitted by diagonality of matr, avoiding superfluous
+    // re-enumeration of the state otherwise invoked by operations.cpp
+
     assert_exponentMatchesTemplateParam(exponent, HasPower);
 
-    // every iteration modifies one qureg amp, using one matr element
+    // every iteration modifies one qureg amp, using one or two matr elements
     qindex numIts = qureg.numAmpsPerNode;
 
     #pragma omp parallel for if(qureg.isMultithreaded||matr.isMultithreaded)
     for (qindex n=0; n<numIts; n++) {
 
-        // i = global row of nth local index
-        qindex i = fast_getQuregGlobalRowFromFlatIndex(n, matr.numElems);
-        qcomp fac = matr.cpuElems[i];
+        // the nth local amplitude will be multiplied by fac
+        qcomp fac = 1;
 
-        // compile-time decide if applying power to avoid in-loop branching...
-        // (beware that complex pow() is numerically unstable; see below)
-        if constexpr (HasPower)
-            fac = std::pow(fac, exponent);
+        // update fac to effect rho -> (matr * rho) or (matr^exponent * rho)
+        if constexpr (MultiplyLeft) {
 
-        // and whether we should also right-apply matr to qureg
-        if constexpr (!MultiplyOnly) {
+            // i = global row of nth local amp
+            qindex i = fast_getQuregGlobalRowFromFlatIndex(n, matr.numElems);
+            qcomp term = matr.cpuElems[i];
+
+            // compile-time decide if applying power to avoid in-loop branching...
+            // (beware that complex pow() is numerically unstable as detailed below)
+            if constexpr (HasPower)
+                term = std::pow(term, exponent);
+
+            fac = term;
+        }
+
+        // update fac to additionally include rho -> (rho * matr) or
+        // (rho * conj(matr)), or the same exponentiated
+        if constexpr (MultiplyRight) {
 
             // m = global index corresponding to n
             qindex m = concatenateBits(qureg.rank, n, qureg.logNumAmpsPerNode);
@@ -767,16 +797,18 @@ void cpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp
             qindex j = fast_getQuregGlobalColFromFlatIndex(m, matr.numElems);
             qcomp term = matr.cpuElems[j];
 
-            // right-apply matrix elem may also need to be exponentiated.
             // beware that pow(qcomp,qcomp) below gives notable error over pow(qreal,qreal) 
             // (by producing an unexpected non-zero imaginary component) when the base is real 
             // and negative, and the exponent is an integer. We tolerate this heightened error
             // because we have no reason to think matr is real (it's not constrained Hermitian).
-            if constexpr(HasPower)
+            if constexpr (HasPower)
                 term = std::pow(term, exponent);
 
-            // conj after pow
-            fac *= std::conj(term);
+            // conj strictly after pow, to effect conj(matr^exponent)
+            if constexpr (ConjRight)
+                term = std::conj(term);
+
+            fac *= term;
         }
 
         qureg.cpuAmps[n] *= fac;
@@ -787,10 +819,12 @@ void cpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp
 template void cpu_statevec_allTargDiagMatr_sub<true> (Qureg, FullStateDiagMatr, qcomp);
 template void cpu_statevec_allTargDiagMatr_sub<false>(Qureg, FullStateDiagMatr, qcomp);
 
-template void cpu_densmatr_allTargDiagMatr_sub<true, true>  (Qureg, FullStateDiagMatr, qcomp);
-template void cpu_densmatr_allTargDiagMatr_sub<true, false> (Qureg, FullStateDiagMatr, qcomp);
-template void cpu_densmatr_allTargDiagMatr_sub<false, true> (Qureg, FullStateDiagMatr, qcomp);
-template void cpu_densmatr_allTargDiagMatr_sub<false, false>(Qureg, FullStateDiagMatr, qcomp);
+template void cpu_densmatr_allTargDiagMatr_sub<false, true,  true,  true>  (Qureg, FullStateDiagMatr, qcomp); // matr qureg conj(matr)
+template void cpu_densmatr_allTargDiagMatr_sub<false, true,  false, false> (Qureg, FullStateDiagMatr, qcomp); // matr qureg
+template void cpu_densmatr_allTargDiagMatr_sub<false, false, true,  false> (Qureg, FullStateDiagMatr, qcomp); //      qureg matr
+template void cpu_densmatr_allTargDiagMatr_sub<true,  true,  true,  true>  (Qureg, FullStateDiagMatr, qcomp); // matr^P qureg conj(matr^P)
+template void cpu_densmatr_allTargDiagMatr_sub<true,  true,  false, false> (Qureg, FullStateDiagMatr, qcomp); // matr^P qureg
+template void cpu_densmatr_allTargDiagMatr_sub<true,  false, true,  false> (Qureg, FullStateDiagMatr, qcomp); //      qureg matr^P
 
 
 
diff --git a/quest/src/cpu/cpu_subroutines.hpp b/quest/src/cpu/cpu_subroutines.hpp
index 661de150a..ea570ba2b 100644
--- a/quest/src/cpu/cpu_subroutines.hpp
+++ b/quest/src/cpu/cpu_subroutines.hpp
@@ -67,7 +67,7 @@ template <int NumCtrls> void cpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qur
 
 template <int NumCtrls> void cpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2, CompMatr2 matr);
 
-template <int NumCtrls, int NumTargs, bool ApplyConj> void cpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr matr);
+template <int NumCtrls, int NumTargs, bool ApplyConj, bool ApplyTransp> void cpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr matr);
 
 
 /*
@@ -82,7 +82,7 @@ template <int NumCtrls, int NumTargs, bool ApplyConj, bool HasPower> void cpu_st
 
 template <bool HasPower> void cpu_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent);
 
-template <bool HasPower, bool MultiplyOnly> void cpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent);
+template <bool HasPower, bool MultiplyLeft, bool MultiplyRight, bool ConjRight> void cpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent);
 
 
 /*
diff --git a/quest/src/gpu/gpu_cuquantum.cuh b/quest/src/gpu/gpu_cuquantum.cuh
index 3b1c55fdb..13afaae87 100644
--- a/quest/src/gpu/gpu_cuquantum.cuh
+++ b/quest/src/gpu/gpu_cuquantum.cuh
@@ -197,14 +197,11 @@ void cuquantum_statevec_anyCtrlSwap_subA(Qureg qureg, vector<int> ctrls, vector<
  */
 
 
-void cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, cu_qcomp* flatMatrElems) {
+void cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, cu_qcomp* flatMatrElems, bool applyAdj) {
 
     // this funciton is called 'subA' instead of just 'sub', because it is also called in 
     // the one-target case whereby it is strictly the embarrassingly parallel _subA scenario
 
-    // do not adjoint matrix
-    int adj = 0;
-
     // use automatic workspace management
     void* work = nullptr;
     size_t workSize = 0;
@@ -212,7 +209,7 @@ void cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(Qureg qureg, vector<int>
     CUDA_CHECK( custatevecApplyMatrix(
         config.handle, 
         toCuQcomps(qureg.gpuAmps), CUQUANTUM_QCOMP, qureg.logNumAmpsPerNode, 
-        flatMatrElems, CUQUANTUM_QCOMP, CUSTATEVEC_MATRIX_LAYOUT_ROW, adj, 
+        flatMatrElems, CUQUANTUM_QCOMP, CUSTATEVEC_MATRIX_LAYOUT_ROW, applyAdj, 
         targs.data(), targs.size(),
         ctrls.data(), ctrlStates.data(), ctrls.size(), 
         CUSTATEVEC_COMPUTE_DEFAULT,
diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh
index 4cf860bee..4fdd7ceea 100644
--- a/quest/src/gpu/gpu_kernels.cuh
+++ b/quest/src/gpu/gpu_kernels.cuh
@@ -293,7 +293,7 @@ __forceinline__ __device__ qindex getThreadsNthGlobalArrInd(qindex n, qindex thr
 }
 
 
-template <int NumCtrls, int NumTargs, bool ApplyConj>
+template <int NumCtrls, int NumTargs, bool ApplyConj, bool ApplyTransp>
 __global__ void kernel_statevec_anyCtrlFewTargDenseMatr(
     cu_qcomp* amps, qindex numThreads,
     int* ctrlsAndTargs, int numCtrls, qindex ctrlsAndTargsMask, int* targs,
@@ -341,8 +341,12 @@ __global__ void kernel_statevec_anyCtrlFewTargDenseMatr(
         #pragma unroll
         for (qindex l=0; l<numTargAmps; l++) {
 
-            // h = flat index of matrix's (k,l)-th element
-            qindex h = fast_getMatrixFlatIndex(k, l, numTargAmps);
+            // h = flat index of matrix's (k,l)-th or (l,k)-th element
+            qindex h;
+            if constexpr (ApplyTransp)
+                h = fast_getMatrixFlatIndex(l, k, numTargAmps);
+            else
+                h = fast_getMatrixFlatIndex(k, l, numTargAmps);
 
             // optionally conjugate matrix elem
             cu_qcomp elem = flatMatrElems[h];
@@ -356,7 +360,7 @@ __global__ void kernel_statevec_anyCtrlFewTargDenseMatr(
 }
 
 
-template <int NumCtrls, bool ApplyConj>
+template <int NumCtrls, bool ApplyConj, bool ApplyTransp>
 __global__ void kernel_statevec_anyCtrlManyTargDenseMatr(
     cu_qcomp* globalCache,
     cu_qcomp* amps, qindex numThreads, qindex numBatchesPerThread,
@@ -398,10 +402,16 @@ __global__ void kernel_statevec_anyCtrlManyTargDenseMatr(
         
             for (qindex l=0; l<numTargAmps; l++) {
                 qindex j = getThreadsNthGlobalArrInd(l, t, numThreads);
-                qindex h = fast_getMatrixFlatIndex(k, l, numTargAmps);
 
-                // optionally conjugate matrix elem
+                // h = flat index of matrix's (k,l)-th or (l,k)-th element
+                qindex h;
+                if constexpr (ApplyTransp)
+                    h = fast_getMatrixFlatIndex(l, k, numTargAmps);
+                else
+                    h = fast_getMatrixFlatIndex(k, l, numTargAmps);
+
                 cu_qcomp elem = flatMatrElems[h];
+
                 if constexpr (ApplyConj)
                     elem.y *= -1;
 
@@ -553,34 +563,38 @@ __global__ void kernel_statevec_anyCtrlAnyTargDiagMatr_sub(
  */
 
 
-template <bool HasPower, bool MultiplyOnly>
+template <bool HasPower, bool MultiplyLeft, bool MultiplyRight, bool ConjRight> 
 __global__ void kernel_densmatr_allTargDiagMatr_sub(
     cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode,
     cu_qcomp* elems, qindex numElems, cu_qcomp exponent
 ) {
     GET_THREAD_IND(n, numThreads);
 
-    // i = global row of nth local index
-    qindex i = n % numElems;
-    cu_qcomp fac = elems[i];
+    cu_qcomp fac = getCuQcomp(1, 0);
 
-    if constexpr (HasPower)
-        fac = getCompPower(fac, exponent);
+    if constexpr (MultiplyLeft) {
 
-    if constexpr (!MultiplyOnly) {
+        qindex i = fast_getQuregGlobalRowFromFlatIndex(n, numElems);
+        cu_qcomp term = elems[i];
 
-        // m = global index corresponding to n
-        qindex m = concatenateBits(rank, n, logNumAmpsPerNode);
+        if constexpr (HasPower)
+            term = getCompPower(term, exponent);
 
-        // j = global column corresponding to n
-        qindex j = m / numElems;
+        fac = term;
+    }
+
+    if constexpr (MultiplyRight) {
+
+        qindex m = concatenateBits(rank, n, logNumAmpsPerNode);
+        qindex j = fast_getQuregGlobalColFromFlatIndex(m, numElems);
         cu_qcomp term = elems[j];
 
-        if constexpr(HasPower)
+        if constexpr (HasPower)
             term = getCompPower(term, exponent);
 
-        // conj after pow
-        term.y *= -1;
+        if constexpr (ConjRight)
+            term.y *= -1;
+
         fac = fac * term;
     }
 
diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp
index 0b0bc46a3..0e7bb9385 100644
--- a/quest/src/gpu/gpu_subroutines.cpp
+++ b/quest/src/gpu/gpu_subroutines.cpp
@@ -290,8 +290,9 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector<int> ctrls, v
 
 #if COMPILE_CUQUANTUM
 
+    bool applyAdj = false;
     auto arr = unpackMatrixToCuQcomps(matr);
-    cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(qureg, ctrls, ctrlStates, {targ}, arr.data());
+    cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(qureg, ctrls, ctrlStates, {targ}, arr.data(), applyAdj);
 
 #elif COMPILE_CUDA
 
@@ -358,8 +359,9 @@ void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
 
 #if COMPILE_CUQUANTUM
 
+    bool applyAdj = false;
     auto arr = unpackMatrixToCuQcomps(matr);
-    cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(qureg, ctrls, ctrlStates, {targ1, targ2}, arr.data());
+    cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(qureg, ctrls, ctrlStates, {targ1, targ2}, arr.data(), applyAdj);
 
 #elif COMPILE_CUDA
 
@@ -393,7 +395,7 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevec_anyCtrlTwoTargDense
  */
 
 
-template <int NumCtrls, int NumTargs, bool ApplyConj>
+template <int NumCtrls, int NumTargs, bool ApplyConj, bool ApplyTransp>
 void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr matr) {
 
     assert_numCtrlsMatchesNumCtrlStatesAndTemplateParam(ctrls.size(), ctrlStates.size(), NumCtrls);
@@ -404,14 +406,21 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
     auto matrElemsPtr = toCuQcomps(matr.gpuElemsFlat);
     auto matrElemsLen = matr.numRows * matr.numRows;
 
-    // conjugate every matrix element if necessary (cuStateVec cannot conj for us; only adjoint)
-    if (ApplyConj)
+    // assert the pre-condition assumed below
+    if (ApplyConj && ApplyTransp)
+        error_gpuDenseMatrixConjugatedAndTransposed();
+
+    // cuStateVec can effect the adjoint, but not the individual conjugate or transpose,
+    // and alas we only ever use one at a time (because applying matrix to the bra-qubits of
+    // a vectorised density matrix effectively transposes the matrix), so we effect transpose
+    // by manually conjugating then telling cuQuantum to adjoint (hehe!)
+    if (ApplyConj || ApplyTransp)
         thrust_setElemsToConjugate(matrElemsPtr, matrElemsLen);
 
-    cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(qureg, ctrls, ctrlStates, targs, matrElemsPtr);
+    cuquantum_statevec_anyCtrlAnyTargDenseMatrix_subA(qureg, ctrls, ctrlStates, targs, matrElemsPtr, ApplyTransp);
 
-    // undo conjugation (which is only not done if cuQuantum encounters a non-recoverable internal error)
-    if (ApplyConj)
+    // undo changes (which is only not done if cuQuantum encounters a non-recoverable internal error)
+    if (ApplyConj || ApplyTransp)
         thrust_setElemsToConjugate(matrElemsPtr, matrElemsLen);
 
 #elif COMPILE_CUDA
@@ -454,10 +463,12 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
         qindex numThreads = numBatches;
         qindex numBlocks = getNumBlocks(numThreads);
 
-        kernel_statevec_anyCtrlFewTargDenseMatr<NumCtrls, NumTargs, ApplyConj> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
-            ampsPtr, numThreads, 
-            qubitsPtr, nCtrls, qubitStateMask, 
-            targsPtr, matrPtr
+        kernel_statevec_anyCtrlFewTargDenseMatr
+            <NumCtrls, NumTargs, ApplyConj, ApplyTransp> 
+            <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+                ampsPtr, numThreads, 
+                qubitsPtr, nCtrls, qubitStateMask, 
+                targsPtr, matrPtr
         );
 
     } else {
@@ -490,11 +501,13 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
         qindex numKernelInvocations = numBlocks * NUM_THREADS_PER_BLOCK;
         qcomp* cache = gpu_getCacheOfSize(powerOf2(targs.size()), numKernelInvocations);
 
-        kernel_statevec_anyCtrlManyTargDenseMatr <NumCtrls, ApplyConj> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
-            toCuQcomps(cache),
-            ampsPtr, numThreads, numBatchesPerThread, 
-            qubitsPtr, nCtrls, qubitStateMask, 
-            targsPtr, targs.size(), powerOf2(targs.size()), matrPtr
+        kernel_statevec_anyCtrlManyTargDenseMatr 
+            <NumCtrls, ApplyConj, ApplyTransp> 
+            <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+                toCuQcomps(cache),
+                ampsPtr, numThreads, numBatchesPerThread, 
+                qubitsPtr, nCtrls, qubitStateMask, 
+                targsPtr, targs.size(), powerOf2(targs.size()), matrPtr
         );
     }
 
@@ -504,7 +517,7 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
 }
 
 
-INSTANTIATE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( void, gpu_statevec_anyCtrlAnyTargDenseMatr_sub, (Qureg, vector<int>, vector<int>, vector<int>, CompMatr) )
+INSTANTIATE_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( void, gpu_statevec_anyCtrlAnyTargDenseMatr_sub, (Qureg, vector<int>, vector<int>, vector<int>, CompMatr) )
 
 
 
@@ -709,7 +722,7 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vec
 }
 
 
-INSTANTIATE_EXPONENTIABLE_CONJUGABLE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( void, gpu_statevec_anyCtrlAnyTargDiagMatr_sub, (Qureg, vector<int>, vector<int>, vector<int>, DiagMatr, qcomp) )
+INSTANTIATE_TWO_BOOL_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS( void, gpu_statevec_anyCtrlAnyTargDiagMatr_sub, (Qureg, vector<int>, vector<int>, vector<int>, DiagMatr, qcomp) )
 
 
 
@@ -736,23 +749,21 @@ void gpu_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp
 }
 
 
-template <bool HasPower, bool MultiplyOnly>
+template <bool HasPower, bool MultiplyLeft, bool MultiplyRight, bool ConjRight>
 void gpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent) {
 
     assert_exponentMatchesTemplateParam(exponent, HasPower);
 
-    // in theory, we could use cuQuantum when HasPower=MultiplyOnly=true,
-    // treating FullStateDiagMatr like an N/2-qubit DiagMatr upon a SV,
-    // but this scenario is not worth the code complication
-
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode;
     qindex numBlocks = getNumBlocks(numThreads);
 
-    kernel_densmatr_allTargDiagMatr_sub <HasPower, MultiplyOnly> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
-        toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode,
-        toCuQcomps(util_getGpuMemPtr(matr)), matr.numElems, toCuQcomp(exponent)
+    kernel_densmatr_allTargDiagMatr_sub 
+        <HasPower, MultiplyLeft, MultiplyRight, ConjRight> 
+        <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+            toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode,
+            toCuQcomps(util_getGpuMemPtr(matr)), matr.numElems, toCuQcomp(exponent)
     );
 
 #else
@@ -764,10 +775,12 @@ void gpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp
 template void gpu_statevec_allTargDiagMatr_sub<true >(Qureg, FullStateDiagMatr, qcomp);
 template void gpu_statevec_allTargDiagMatr_sub<false>(Qureg, FullStateDiagMatr, qcomp);
 
-template void gpu_densmatr_allTargDiagMatr_sub<true, true>  (Qureg, FullStateDiagMatr, qcomp);
-template void gpu_densmatr_allTargDiagMatr_sub<true, false> (Qureg, FullStateDiagMatr, qcomp);
-template void gpu_densmatr_allTargDiagMatr_sub<false, true> (Qureg, FullStateDiagMatr, qcomp);
-template void gpu_densmatr_allTargDiagMatr_sub<false, false>(Qureg, FullStateDiagMatr, qcomp);
+template void gpu_densmatr_allTargDiagMatr_sub<false, true,  true,  true>  (Qureg, FullStateDiagMatr, qcomp); // matr qureg conj(matr)
+template void gpu_densmatr_allTargDiagMatr_sub<false, true,  false, false> (Qureg, FullStateDiagMatr, qcomp); // matr qureg
+template void gpu_densmatr_allTargDiagMatr_sub<false, false, true,  false> (Qureg, FullStateDiagMatr, qcomp); //      qureg matr
+template void gpu_densmatr_allTargDiagMatr_sub<true,  true,  true,  true>  (Qureg, FullStateDiagMatr, qcomp); // matr^P qureg conj(matr^P)
+template void gpu_densmatr_allTargDiagMatr_sub<true,  true,  false, false> (Qureg, FullStateDiagMatr, qcomp); // matr^P qureg
+template void gpu_densmatr_allTargDiagMatr_sub<true,  false, true,  false> (Qureg, FullStateDiagMatr, qcomp); //      qureg matr^P
 
 
 
diff --git a/quest/src/gpu/gpu_subroutines.hpp b/quest/src/gpu/gpu_subroutines.hpp
index 04fabe0a2..7ec3f6696 100644
--- a/quest/src/gpu/gpu_subroutines.hpp
+++ b/quest/src/gpu/gpu_subroutines.hpp
@@ -60,7 +60,7 @@ template <int NumCtrls> void gpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qur
 
 template <int NumCtrls> void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, int targ1, int targ2, CompMatr2 matr);
 
-template <int NumCtrls, int NumTargs, bool ApplyConj> void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr matr);
+template <int NumCtrls, int NumTargs, bool ApplyConj, bool ApplyTransp> void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, CompMatr matr);
 
 
 /*
@@ -75,7 +75,7 @@ template <int NumCtrls, int NumTargs, bool ApplyConj, bool HasPower> void gpu_st
 
 template <bool HasPower> void gpu_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent);
 
-template <bool HasPower, bool MultiplyOnly> void gpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent);
+template <bool HasPower, bool MultiplyLeft, bool MultiplyRight, bool ConjRight> void gpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent);
 
 
 /*
diff --git a/tests/unit/operations.cpp b/tests/unit/operations.cpp
index 041000c4f..b8522321a 100644
--- a/tests/unit/operations.cpp
+++ b/tests/unit/operations.cpp
@@ -581,6 +581,25 @@ qmatrix getReferenceMatrix(auto matrixRefGen, vector<int> targs, auto additional
 }
 
 
+/*
+ * Template parameters which specify how the reference 
+ * operator should be applied upon the reference state. 
+ * Let |psi> be a statevector, rho be a density matrix, 
+ * and matr be an operator matrix. The options perform:
+ * 
+ * apply:    |psi> -> matr |psi>,  rho -> matr rho adj(matr)
+ * multiply: |psi> -> matr |psi>,  rho -> matr rho
+ * postmultiply:                   rho -> rho matr
+ * 
+ * Note this is necessarily a template parameter (rather
+ * than just a runtime parameter) only because the
+ * postMultiplyReferenceOperator() function is defined
+ * only upon qmatrix (for density matrices)
+ */
+
+enum ApplyFlag { apply, multiply, postmultiply };
+
+
 /*
  * display all/only relevant inputs given to an 
  * API operation when its subsequent test fails.
@@ -681,8 +700,8 @@ void CAPTURE_RELEVANT( vector<int> ctrls, vector<int> states, vector<int> targs,
  * matrixRefGen) is formatted.
  */
 
-template <NumQubitsFlag Ctrls, NumQubitsFlag Targs, ArgsFlag Args>
-void testOperationCorrectness(auto operation, auto matrixRefGen, bool multiplyOnly) {
+template <NumQubitsFlag Ctrls, NumQubitsFlag Targs, ArgsFlag Args, ApplyFlag Apply>
+void testOperationCorrectness(auto operation, auto matrixRefGen) {
 
     PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );
 
@@ -724,18 +743,25 @@ void testOperationCorrectness(auto operation, auto matrixRefGen, bool multiplyOn
         auto allArgs = std::tuple_cat(tuple{operation, qureg}, primaryArgs, furtherArgs);
         std::apply(apiFunc, allArgs);
 
-        // update reference state
-        (multiplyOnly)?
-            multiplyReferenceOperator(stateRef, ctrls, states, targs, matrixRef):
-            applyReferenceOperator(stateRef, ctrls, states, targs, matrixRef);
+        // update reference state (ctrls & states happen to only ever be used by apply)
+        if constexpr (Apply == apply)        applyReferenceOperator(       stateRef, ctrls, states, targs, matrixRef);
+        if constexpr (Apply == multiply)     multiplyReferenceOperator(    stateRef, ctrls, states, targs, matrixRef);
+        if constexpr (Apply == postmultiply) postMultiplyReferenceOperator(stateRef, ctrls, states, targs, matrixRef);
     };
 
     // report operation's input parameters if any subsequent test fails
     CAPTURE_RELEVANT<Ctrls,Targs,Args>( ctrls, states, targs, furtherArgs );
 
-    // test API operation on all available deployment combinations (e.g. OMP, MPI, MPI+GPU, etc)
-    SECTION( LABEL_STATEVEC ) { TEST_ON_CACHED_QUREGS(statevecQuregs, statevecRef, testFunc); }
-    SECTION( LABEL_DENSMATR ) { TEST_ON_CACHED_QUREGS(densmatrQuregs, densmatrRef, testFunc); }
+    // test API operation on all available deployment combinations (e.g. OMP, MPI, MPI+GPU, etc),
+    // though the postMultiply*() functions do not accept statevectors
+    if constexpr (Apply != postmultiply) {
+        SECTION( LABEL_STATEVEC ) { 
+            TEST_ON_CACHED_QUREGS(statevecQuregs, statevecRef, testFunc); 
+        }
+    }
+    SECTION( LABEL_DENSMATR ) { 
+        TEST_ON_CACHED_QUREGS(densmatrQuregs, densmatrRef, testFunc);
+    }
 
     // free any heap-alloated API matrices and restore epsilon
     freeRemainingArgs<Targs,Args>(furtherArgs);
@@ -786,11 +812,11 @@ auto getFixedRemainingArgs(vector<int> targs) {
     if constexpr (Args == pauligad)  return tuple{ getPauliStr("XXX", targs), 0 }; // (XXX, angle)
 }
 
-template <NumQubitsFlag Ctrls, NumQubitsFlag Targs, ArgsFlag Args>
-void testOperationValidation(auto operation, bool multiplyOnly) {
+template <NumQubitsFlag Ctrls, NumQubitsFlag Targs, ArgsFlag Args, ApplyFlag Apply>
+void testOperationValidation(auto operation) {
 
-    // use any cached Qureg
-    Qureg qureg = getCachedStatevecs().begin()->second;
+    // use any cached Qureg (though postMultiply*() functions accept only density matrices)
+    Qureg qureg = getCachedDensmatrs().begin()->second;
 
     // in lieu of preparing random inputs like testOperationCorrectness()
     // above, we instead obtain simple, fixed, compatible inputs
@@ -963,7 +989,7 @@ void testOperationValidation(auto operation, bool multiplyOnly) {
             return;
 
         // which enforce unitarity
-        if (multiplyOnly)
+        if (Apply != apply)
             return;
 
         if constexpr (Args == compmatr || Args == diagmatr)
@@ -1005,6 +1031,9 @@ void testOperationValidation(auto operation, bool multiplyOnly) {
 
     SECTION( "targeted amps fit in node" ) {
 
+        // simplest to trigger validation using a statevector
+        qureg = getCachedStatevecs().begin()->second;
+
         // can only be validated when environment AND qureg
         // are distributed (over more than 1 node, of course)
         if (qureg.numNodes < 2)
@@ -1050,7 +1079,7 @@ void testOperationValidation(auto operation, bool multiplyOnly) {
     SECTION( "non-unitary exponent" ) {
 
         // not relevant for functions which do not assert unitarity
-        if (multiplyOnly)
+        if (Apply != apply)
             return;
 
         if constexpr (Args == diagpower)
@@ -1065,7 +1094,7 @@ void testOperationValidation(auto operation, bool multiplyOnly) {
 
         // when being applied as a unitary, abs(elem)=1 so there's no
         // possibility of divergence (we'd merely trigger isUnitary)
-        if (!multiplyOnly)
+        if (Apply == apply)
             return;
 
         if constexpr (Args == diagpower)
@@ -1086,6 +1115,18 @@ void testOperationValidation(auto operation, bool multiplyOnly) {
             REQUIRE_THROWS_WITH( apiFunc(), ContainsSubstring("zero vector") );
     }
 
+    SECTION( "qureg type" ) {
+
+        // only postMultiply*() functions discriminate Qureg
+        if (Apply != postmultiply)
+            return;
+
+        // use any statevector
+        qureg = getCachedStatevecs().begin()->second;
+
+        REQUIRE_THROWS_WITH( apiFunc(), ContainsSubstring("Expected a density matrix") );
+    }
+
     freeRemainingArgs<Targs,Args>(furtherArgs);
 }
 
@@ -1095,27 +1136,20 @@ void testOperationValidation(auto operation, bool multiplyOnly) {
  * inputs as indicated by the template flags
  */
 
-template <NumQubitsFlag Ctrls, NumQubitsFlag Targs, ArgsFlag Args>
-void testOperation(auto operation, auto matrixRefGen, bool multiplyOnly) {
+template <NumQubitsFlag Ctrls, NumQubitsFlag Targs, ArgsFlag Args, ApplyFlag Apply>
+void testOperation(auto operation, auto matrixRefGen) {
 
     assertNumQubitsFlagsAreValid(Ctrls, Targs);
 
     SECTION( LABEL_CORRECTNESS ) { 
-        testOperationCorrectness<Ctrls,Targs,Args>(operation, matrixRefGen, multiplyOnly); 
+        testOperationCorrectness<Ctrls,Targs,Args,Apply>(operation, matrixRefGen); 
     }
 
     SECTION( LABEL_VALIDATION ) { 
-        testOperationValidation<Ctrls,Targs,Args>(operation, multiplyOnly); 
+        testOperationValidation<Ctrls,Targs,Args,Apply>(operation);
     }
 }
 
-template <NumQubitsFlag Ctrls, NumQubitsFlag Targs, ArgsFlag Args>
-void testOperation(auto operation, auto matrixRefGen) {
-
-    bool multiplyOnly = false;
-    testOperation<Ctrls,Targs,Args>(operation, matrixRefGen, multiplyOnly);
-}
-
 
 /*
  * perform unit tests for the four distinctly-controlled
@@ -1234,13 +1268,13 @@ void testOperation(auto operation, auto matrixRefGen) {
 // defines a Catch2 test-case for the implied function
 #define TEST_CASE_OPERATION( namesuffix, numctrls, numtargs, argtype, matrixgen ) \
     TEST_CASE( GET_FUNC_NAME_STR(numctrls, namesuffix), TEST_CATEGORY ) {         \
-        testOperation<numctrls, numtargs, argtype>(                               \
+        testOperation<numctrls, numtargs, argtype, apply>(                        \
             GET_CASTED_FUNC(namesuffix, numctrls, numtargs, argtype),             \
             matrixgen);                                                           \
     }
  
-// automate the testing of a function for all its controlled variants
-#define TEST_ALL_CTRL_OPERATIONS( namesuffix, numtargs, argtype, matrixgen ) \
+// automate the testing of an apply*() function for all its controlled variants
+#define TEST_ALL_CTRL_OPERATIONS( namesuffix, numtargs, argtype, matrixgen )   \
     TEST_CASE_OPERATION( namesuffix, zero,      numtargs, argtype, matrixgen ) \
     TEST_CASE_OPERATION( namesuffix, one,       numtargs, argtype, matrixgen ) \
     TEST_CASE_OPERATION( namesuffix, any,       numtargs, argtype, matrixgen ) \
@@ -1289,61 +1323,105 @@ TEST_ALL_CTRL_OPERATIONS( PhaseGadget, any, scalar, VariableSizeParameterisedMat
  * non-controlled operations with no C++ overloads
  */
 
-TEST_CASE( "multiplyPauliStr",        TEST_CATEGORY ) { testOperation<zero,any,paulistr>(multiplyPauliStr,    nullptr, true); }
-TEST_CASE( "multiplyPauliGadget",     TEST_CATEGORY ) { testOperation<zero,any,pauligad>(multiplyPauliGadget, nullptr, true); }
-TEST_CASE( "multiplyCompMatr1",       TEST_CATEGORY ) { testOperation<zero,one,compmatr>(multiplyCompMatr1,   nullptr, true); }
-TEST_CASE( "multiplyCompMatr2",       TEST_CATEGORY ) { testOperation<zero,two,compmatr>(multiplyCompMatr2,   nullptr, true); }
-TEST_CASE( "multiplyDiagMatr1",       TEST_CATEGORY ) { testOperation<zero,one,diagmatr>(multiplyDiagMatr1,   nullptr, true); }
-TEST_CASE( "multiplyDiagMatr2",       TEST_CATEGORY ) { testOperation<zero,two,diagmatr>(multiplyDiagMatr2,   nullptr, true); }
-TEST_CASE( "applyPhaseFlip",          TEST_CATEGORY ) { testOperation<zero,one,none>  (applyPhaseFlip,          VariableSizeMatrices::PF(1)); }
-TEST_CASE( "applyTwoQubitPhaseFlip",  TEST_CATEGORY ) { testOperation<zero,two,none>  (applyTwoQubitPhaseFlip,  VariableSizeMatrices::PF(2)); }
-TEST_CASE( "applyPhaseShift",         TEST_CATEGORY ) { testOperation<zero,one,scalar>(applyPhaseShift,         ParameterisedMatrices::PS); }
-TEST_CASE( "applyTwoQubitPhaseShift", TEST_CATEGORY ) { testOperation<zero,two,scalar>(applyTwoQubitPhaseShift, ParameterisedMatrices::PS2); }
+TEST_CASE( "applyPhaseFlip",          TEST_CATEGORY ) { testOperation<zero,one,none,apply>  (applyPhaseFlip,          VariableSizeMatrices::PF(1)); }
+TEST_CASE( "applyTwoQubitPhaseFlip",  TEST_CATEGORY ) { testOperation<zero,two,none,apply>  (applyTwoQubitPhaseFlip,  VariableSizeMatrices::PF(2)); }
+TEST_CASE( "applyPhaseShift",         TEST_CATEGORY ) { testOperation<zero,one,scalar,apply>(applyPhaseShift,         ParameterisedMatrices::PS  ); }
+TEST_CASE( "applyTwoQubitPhaseShift", TEST_CATEGORY ) { testOperation<zero,two,scalar,apply>(applyTwoQubitPhaseShift, ParameterisedMatrices::PS2 ); }
+
+TEST_CASE( "multiplySwap",            TEST_CATEGORY ) { testOperation<zero,two,none,multiply>(multiplySwap, FixedMatrices::SWAP); }
+TEST_CASE( "multiplyPauliX",          TEST_CATEGORY ) { testOperation<zero,one,none,multiply>(multiplyPauliX, FixedMatrices::X); }
+TEST_CASE( "multiplyPauliY",          TEST_CATEGORY ) { testOperation<zero,one,none,multiply>(multiplyPauliY, FixedMatrices::Y); }
+TEST_CASE( "multiplyPauliZ",          TEST_CATEGORY ) { testOperation<zero,one,none,multiply>(multiplyPauliZ, FixedMatrices::Z); }
+TEST_CASE( "multiplyPauliStr",        TEST_CATEGORY ) { testOperation<zero,any,paulistr,multiply>(multiplyPauliStr,    nullptr); }
+TEST_CASE( "multiplyPauliGadget",     TEST_CATEGORY ) { testOperation<zero,any,pauligad,multiply>(multiplyPauliGadget, nullptr); }
+TEST_CASE( "multiplyCompMatr1",       TEST_CATEGORY ) { testOperation<zero,one,compmatr,multiply>(multiplyCompMatr1,   nullptr); }
+TEST_CASE( "multiplyCompMatr2",       TEST_CATEGORY ) { testOperation<zero,two,compmatr,multiply>(multiplyCompMatr2,   nullptr); }
+TEST_CASE( "multiplyDiagMatr1",       TEST_CATEGORY ) { testOperation<zero,one,diagmatr,multiply>(multiplyDiagMatr1,   nullptr); }
+TEST_CASE( "multiplyDiagMatr2",       TEST_CATEGORY ) { testOperation<zero,two,diagmatr,multiply>(multiplyDiagMatr2,   nullptr); }
+
+TEST_CASE( "postMultiplySwap",            TEST_CATEGORY ) { testOperation<zero,two,none,postmultiply>(postMultiplySwap, FixedMatrices::SWAP); }
+TEST_CASE( "postMultiplyPauliX",          TEST_CATEGORY ) { testOperation<zero,one,none,postmultiply>(postMultiplyPauliX, FixedMatrices::X); }
+TEST_CASE( "postMultiplyPauliY",          TEST_CATEGORY ) { testOperation<zero,one,none,postmultiply>(postMultiplyPauliY, FixedMatrices::Y); }
+TEST_CASE( "postMultiplyPauliZ",          TEST_CATEGORY ) { testOperation<zero,one,none,postmultiply>(postMultiplyPauliZ, FixedMatrices::Z); }
+TEST_CASE( "postMultiplyPauliStr",        TEST_CATEGORY ) { testOperation<zero,any,paulistr,postmultiply>(postMultiplyPauliStr,    nullptr); }
+TEST_CASE( "postMultiplyPauliGadget",     TEST_CATEGORY ) { testOperation<zero,any,pauligad,postmultiply>(postMultiplyPauliGadget, nullptr); }
+TEST_CASE( "postMultiplyCompMatr1",       TEST_CATEGORY ) { testOperation<zero,one,compmatr,postmultiply>(postMultiplyCompMatr1,   nullptr); }
+TEST_CASE( "postMultiplyCompMatr2",       TEST_CATEGORY ) { testOperation<zero,two,compmatr,postmultiply>(postMultiplyCompMatr2,   nullptr); }
+TEST_CASE( "postMultiplyDiagMatr1",       TEST_CATEGORY ) { testOperation<zero,one,diagmatr,postmultiply>(postMultiplyDiagMatr1,   nullptr); }
+TEST_CASE( "postMultiplyDiagMatr2",       TEST_CATEGORY ) { testOperation<zero,two,diagmatr,postmultiply>(postMultiplyDiagMatr2,   nullptr); }
 
 
 /*
  * non-controlled operations which have a C++ overload
  * (because they accept qubit lists which become vector),
  * and so which require explicit casting to resolve the
- * compiler ambiguity
+ * compiler ambiguity (spaghetti 4 lyf)
  */
 
+TEST_CASE( "applyMultiQubitPhaseFlip",  TEST_CATEGORY ) {
+    auto func = static_cast<void(*)(Qureg, int*, int)>(applyMultiQubitPhaseFlip);
+    testOperation<zero,any,none,apply>(func, VariableSizeMatrices::PF);
+}
+
+TEST_CASE( "applyMultiQubitPhaseShift",  TEST_CATEGORY ) {
+    auto func = static_cast<void(*)(Qureg, int*, int, qreal)>(applyMultiQubitPhaseShift);
+    testOperation<zero,any,scalar,apply>(func, VariableSizeParameterisedMatrices::PS);
+}
+
+
 TEST_CASE( "multiplyCompMatr",  TEST_CATEGORY ) { 
     auto func = static_cast<void(*)(Qureg, int*, int, CompMatr)>(multiplyCompMatr);
-    testOperation<zero,any,compmatr>(func, nullptr, true); 
+    testOperation<zero,any,compmatr,multiply>(func, nullptr); 
 }
 
 TEST_CASE( "multiplyDiagMatr",  TEST_CATEGORY ) {
     auto func = static_cast<void(*)(Qureg, int*, int, DiagMatr)>(multiplyDiagMatr);
-    testOperation<zero,any,diagmatr>(func, nullptr, true);
+    testOperation<zero,any,diagmatr,multiply>(func, nullptr);
 }
 
 TEST_CASE( "multiplyDiagMatrPower",  TEST_CATEGORY ) {
     auto func = static_cast<void(*)(Qureg, int*, int, DiagMatr, qcomp)>(multiplyDiagMatrPower);
-    testOperation<zero,any,diagpower>(func, nullptr, true);
+    testOperation<zero,any,diagpower,multiply>(func, nullptr);
 }
 
 TEST_CASE( "multiplyMultiQubitNot",  TEST_CATEGORY ) {
     auto func = static_cast<void(*)(Qureg, int*, int)>(multiplyMultiQubitNot);
-    testOperation<zero,any,none>(func, VariableSizeMatrices::X, true);
+    testOperation<zero,any,none,multiply>(func, VariableSizeMatrices::X);
 }
 
 TEST_CASE( "multiplyPhaseGadget",  TEST_CATEGORY ) {
     auto func = static_cast<void(*)(Qureg, int*, int, qreal)>(multiplyPhaseGadget);
-    testOperation<zero,any,scalar>(func, VariableSizeParameterisedMatrices::Z, true);
+    testOperation<zero,any,scalar,multiply>(func, VariableSizeParameterisedMatrices::Z);
 }
 
-TEST_CASE( "applyMultiQubitPhaseFlip",  TEST_CATEGORY ) {
-    auto func = static_cast<void(*)(Qureg, int*, int)>(applyMultiQubitPhaseFlip);
-    testOperation<zero,any,none>(func, VariableSizeMatrices::PF);
+
+TEST_CASE( "postMultiplyCompMatr",  TEST_CATEGORY ) { 
+    auto func = static_cast<void(*)(Qureg, int*, int, CompMatr)>(postMultiplyCompMatr);
+    testOperation<zero,any,compmatr,postmultiply>(func, nullptr); 
 }
 
-TEST_CASE( "applyMultiQubitPhaseShift",  TEST_CATEGORY ) {
-    auto func = static_cast<void(*)(Qureg, int*, int, qreal)>(applyMultiQubitPhaseShift);
-    testOperation<zero,any,scalar>(func, VariableSizeParameterisedMatrices::PS);
+TEST_CASE( "postMultiplyDiagMatr",  TEST_CATEGORY ) {
+    auto func = static_cast<void(*)(Qureg, int*, int, DiagMatr)>(postMultiplyDiagMatr);
+    testOperation<zero,any,diagmatr,postmultiply>(func, nullptr);
+}
+
+TEST_CASE( "postMultiplyDiagMatrPower",  TEST_CATEGORY ) {
+    auto func = static_cast<void(*)(Qureg, int*, int, DiagMatr, qcomp)>(postMultiplyDiagMatrPower);
+    testOperation<zero,any,diagpower,postmultiply>(func, nullptr);
+}
+
+TEST_CASE( "postMultiplyMultiQubitNot",  TEST_CATEGORY ) {
+    auto func = static_cast<void(*)(Qureg, int*, int)>(postMultiplyMultiQubitNot);
+    testOperation<zero,any,none,postmultiply>(func, VariableSizeMatrices::X);
+}
+
+TEST_CASE( "postMultiplyPhaseGadget",  TEST_CATEGORY ) {
+    auto func = static_cast<void(*)(Qureg, int*, int, qreal)>(postMultiplyPhaseGadget);
+    testOperation<zero,any,scalar,postmultiply>(func, VariableSizeParameterisedMatrices::Z);
 }
 
 
+
 /*
  * operations which need custom logic
  */
@@ -1804,6 +1882,65 @@ TEST_CASE( "multiplyFullStateDiagMatrPower", TEST_CATEGORY LABEL_MIXED_DEPLOY_TA
 }
 
 
+TEST_CASE( "postMultiplyFullStateDiagMatr", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG ) {
+
+    PREPARE_TEST( numQubits, cachedSV, cachedDM, refSV, refDM );
+
+    auto cachedMatrs = getCachedFullStateDiagMatrs();
+
+    SECTION( LABEL_CORRECTNESS ) {
+
+        qmatrix refMatr = getRandomDiagonalMatrix(getPow2(numQubits));
+        auto apiFunc = postMultiplyFullStateDiagMatr;
+
+        GENERATE( range(0, getNumTestedMixedDeploymentRepetitions()) );
+
+        SECTION( LABEL_DENSMATR ) {
+
+            auto refFunc = [&] (qmatrix& state, qmatrix matr) { postMultiplyReferenceOperator(state, matr); };
+
+            TEST_ON_CACHED_QUREG_AND_MATRIX( cachedDM, cachedMatrs, apiFunc, refDM, refMatr, refFunc);
+        }
+    }
+
+    /// @todo input validation
+}
+
+
+TEST_CASE( "postMultiplyFullStateDiagMatrPower", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG ) {
+
+    PREPARE_TEST( numQubits, cachedSV, cachedDM, refSV, refDM );
+
+    auto cachedMatrs = getCachedFullStateDiagMatrs();
+
+    SECTION( LABEL_CORRECTNESS ) {
+
+        qmatrix refMatr = getRandomDiagonalMatrix(getPow2(numQubits));
+        qcomp exponent = getRandomComplex();
+
+        auto apiFunc = [&](Qureg qureg, FullStateDiagMatr matr) { 
+            return postMultiplyFullStateDiagMatrPower(qureg, matr, exponent);
+        };
+
+        CAPTURE( exponent );
+        
+        GENERATE( range(0, getNumTestedMixedDeploymentRepetitions()) );
+
+        SECTION( LABEL_DENSMATR ) {
+
+            auto refFunc = [&] (qmatrix& state, qmatrix matr) { 
+                matr = getPowerOfDiagonalMatrix(matr, exponent);
+                postMultiplyReferenceOperator(state, matr);
+            };
+
+            TEST_ON_CACHED_QUREG_AND_MATRIX( cachedDM, cachedMatrs, apiFunc, refDM, refMatr, refFunc);
+        }
+    }
+
+    /// @todo input validation
+}
+
+
 TEST_CASE( "applyFullStateDiagMatr", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG ) {
 
     PREPARE_TEST( numQubits, cachedSV, cachedDM, refSV, refDM );
@@ -1921,6 +2058,35 @@ TEST_CASE( "multiplyPauliStrSum", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG ) {
 }
 
 
+TEST_CASE( "postMultiplyPauliStrSum", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG ) {
+
+    PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );
+
+    SECTION( LABEL_CORRECTNESS ) {
+
+        int numQubits = getNumCachedQubits();
+        int numTerms = GENERATE_COPY( 1, 2, 10 );
+
+        PauliStrSum sum = createRandomPauliStrSum(numQubits, numTerms);
+
+        auto testFunc = [&](Qureg qureg, auto& ref) {
+
+            // must use (and ergo make) an identically-deployed workspace
+            Qureg workspace = createCloneQureg(qureg);
+            postMultiplyPauliStrSum(qureg, sum, workspace);
+            destroyQureg(workspace);
+
+            ref = ref * getMatrix(sum, numQubits);
+        };
+
+        CAPTURE( numTerms );
+        SECTION( LABEL_DENSMATR ) { TEST_ON_CACHED_QUREGS(densmatrQuregs, densmatrRef, testFunc); }
+    }
+
+    /// @todo input validation
+}
+
+
 TEST_CASE( "applyNonUnitaryPauliGadget", TEST_CATEGORY ) {
 
     PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );
diff --git a/tests/utils/evolve.cpp b/tests/utils/evolve.cpp
index 14df4de7f..38b43cc9c 100644
--- a/tests/utils/evolve.cpp
+++ b/tests/utils/evolve.cpp
@@ -180,6 +180,13 @@ void multiplyReferenceOperator(qmatrix& state, qmatrix matrix) {
     state = matrix * state;
 }
 
+void postMultiplyReferenceOperator(qmatrix& state, qmatrix matrix) {
+    DEMAND( state.size() == matrix.size() );
+
+    // we right-multiply upon density matrices only
+    state = state * matrix;
+}
+
 
 // overloads with ctrls, states and targs (given sub-operator)
 
@@ -206,6 +213,12 @@ void multiplyReferenceOperator(qmatrix& state, vector<int> ctrls, vector<int> ct
     multiplyReferenceOperator(state, left);
 }
 
+void postMultiplyReferenceOperator(qmatrix& state, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, qmatrix matrix) {
+    
+    qmatrix left = getFullStateOperator(ctrls, ctrlStates, targs, matrix, getLog2(state.size()));
+    postMultiplyReferenceOperator(state, left);
+}
+
 
 // overloads with only ctrls and targs
 
@@ -225,6 +238,10 @@ void multiplyReferenceOperator(qmatrix& state, vector<int> ctrls, vector<int> ta
     
     multiplyReferenceOperator(state, ctrls, {}, targs, matrix);
 }
+void postMultiplyReferenceOperator(qmatrix& state, vector<int> ctrls, vector<int> targs, qmatrix matrix) {
+    
+    postMultiplyReferenceOperator(state, ctrls, {}, targs, matrix);
+}
 
 
 // overloads with only targs
@@ -245,6 +262,10 @@ void multiplyReferenceOperator(qmatrix& state, vector<int> targs, qmatrix matrix
 
     multiplyReferenceOperator(state, {}, {}, targs, matrix);
 }
+void postMultiplyReferenceOperator(qmatrix& state, vector<int> targs, qmatrix matrix) {
+
+    postMultiplyReferenceOperator(state, {}, {}, targs, matrix);
+}
 
 
 // overloads with only targs and kraus operators
diff --git a/tests/utils/evolve.hpp b/tests/utils/evolve.hpp
index e0b186e67..876130167 100644
--- a/tests/utils/evolve.hpp
+++ b/tests/utils/evolve.hpp
@@ -21,25 +21,29 @@
 using std::vector;
 
 
-void applyReferenceOperator(   qvector& state, vector<int> ctrls, vector<int> states, vector<int> targs, qmatrix matrix);
-void applyReferenceOperator(   qmatrix& state, vector<int> ctrls, vector<int> states, vector<int> targs, qmatrix matrix);
-void multiplyReferenceOperator(qvector& state, vector<int> ctrls, vector<int> states, vector<int> targs, qmatrix matrix);
-void multiplyReferenceOperator(qmatrix& state, vector<int> ctrls, vector<int> states, vector<int> targs, qmatrix matrix);
-
-void applyReferenceOperator(   qvector& state, vector<int> ctrls, vector<int> targs, qmatrix matrix);
-void applyReferenceOperator(   qmatrix& state, vector<int> ctrls, vector<int> targs, qmatrix matrix);
-void multiplyReferenceOperator(qvector& state, vector<int> ctrls, vector<int> targs, qmatrix matrix);
-void multiplyReferenceOperator(qmatrix& state, vector<int> ctrls, vector<int> targs, qmatrix matrix);
-
-void applyReferenceOperator(   qvector& state, vector<int> targs, qmatrix matrix);
-void applyReferenceOperator(   qmatrix& state, vector<int> targs, qmatrix matrix);
-void multiplyReferenceOperator(qvector& state, vector<int> targs, qmatrix matrix);
-void multiplyReferenceOperator(qmatrix& state, vector<int> targs, qmatrix matrix);
-
-void applyReferenceOperator(   qvector& state, qmatrix matrix);
-void applyReferenceOperator(   qmatrix& state, qmatrix matrix);
-void multiplyReferenceOperator(qvector& state, qmatrix matrix);
-void multiplyReferenceOperator(qmatrix& state, qmatrix matrix);
+void applyReferenceOperator(       qvector& state, vector<int> ctrls, vector<int> states, vector<int> targs, qmatrix matrix);
+void applyReferenceOperator(       qmatrix& state, vector<int> ctrls, vector<int> states, vector<int> targs, qmatrix matrix);
+void multiplyReferenceOperator(    qvector& state, vector<int> ctrls, vector<int> states, vector<int> targs, qmatrix matrix);
+void multiplyReferenceOperator(    qmatrix& state, vector<int> ctrls, vector<int> states, vector<int> targs, qmatrix matrix);
+void postMultiplyReferenceOperator(qmatrix& state, vector<int> ctrls, vector<int> states, vector<int> targs, qmatrix matrix);
+
+void applyReferenceOperator(       qvector& state, vector<int> ctrls, vector<int> targs, qmatrix matrix);
+void applyReferenceOperator(       qmatrix& state, vector<int> ctrls, vector<int> targs, qmatrix matrix);
+void multiplyReferenceOperator(    qvector& state, vector<int> ctrls, vector<int> targs, qmatrix matrix);
+void multiplyReferenceOperator(    qmatrix& state, vector<int> ctrls, vector<int> targs, qmatrix matrix);
+void postMultiplyReferenceOperator(qmatrix& state, vector<int> ctrls, vector<int> targs, qmatrix matrix);
+
+void applyReferenceOperator(       qvector& state, vector<int> targs, qmatrix matrix);
+void applyReferenceOperator(       qmatrix& state, vector<int> targs, qmatrix matrix);
+void multiplyReferenceOperator(    qvector& state, vector<int> targs, qmatrix matrix);
+void multiplyReferenceOperator(    qmatrix& state, vector<int> targs, qmatrix matrix);
+void postMultiplyReferenceOperator(qmatrix& state, vector<int> targs, qmatrix matrix);
+
+void applyReferenceOperator(       qvector& state, qmatrix matrix);
+void applyReferenceOperator(       qmatrix& state, qmatrix matrix);
+void multiplyReferenceOperator(    qvector& state, qmatrix matrix);
+void multiplyReferenceOperator(    qmatrix& state, qmatrix matrix);
+void postMultiplyReferenceOperator(qmatrix& state, qmatrix matrix);
 
 void applyReferenceOperator(qmatrix& state, vector<int> targs, vector<qmatrix> matrices);
 

From 378ad71021c2911db38c46eb5e5de09cfdfee155 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Wed, 9 Jul 2025 11:18:36 -0400
Subject: [PATCH 11/32] restored NUMA-awareness (#658)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Luc:
v3.7 was sensible on NUMA machines “by default” through first-touch initialization. This had been lost in v4 as identified by James Richings. Here’s some basic NUMA-aware allocation, and a little love for general parallel/OpenMP usage.

- If we’re on *nix _and_ we find libnuma, we enable NUMA-aware allocations
- Add & use cpu_allocNumaArray() and cpu_deallocNumaArray() for the state-vector allocations (as the current alloc functions are also used for many smaller regions). Fall back to normal allocation functions if NUMA-unaware.
- Perform zero-initialization in parallel (still with std::fill() but use a parallel region)
- Make getCurrentNumThreads() work inside parallel regions (!)
- Add getAvailableNumThreads() to get thread count outside parallel regions. Improve this from previous getCurrentNumThreads() to only call the omp function once (rather than once per thread).

Luc coded the logic and Tyson added doc and error-handling. PR #658 replaced the original of #652

---------

Co-authored-by: Luc Jaulmes <ljaulmes@ed.ac.uk>
---
 CMakeLists.txt                    |  19 ++-
 quest/src/api/environment.cpp     |   4 +-
 quest/src/api/qureg.cpp           |   4 +-
 quest/src/core/autodeployer.cpp   |   2 +-
 quest/src/core/errors.cpp         |  36 ++++++
 quest/src/core/errors.hpp         |  15 +++
 quest/src/core/memory.cpp         |   1 +
 quest/src/core/utilities.cpp      |  37 ++++++
 quest/src/core/utilities.hpp      |   4 +
 quest/src/cpu/cpu_config.cpp      | 197 +++++++++++++++++++++++++++---
 quest/src/cpu/cpu_config.hpp      |   9 +-
 quest/src/cpu/cpu_subroutines.cpp |  20 ++-
 12 files changed, 323 insertions(+), 25 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 933e23086..e2e52d85e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,7 +1,7 @@
 # @author Oliver Thomson Brown
 # @author Erich Essmann (patches including MSVC support)
 # @author Tyson Jones (patches including clang multithreading)
-# @author Luc Jaulmes (patching install)
+# @author Luc Jaulmes (NUMA awareness, patching install)
 #
 # Contributions to previous builds from:
 #  - Ania Brown
@@ -262,6 +262,23 @@ if (ENABLE_MULTITHREADING)
     OpenMP::OpenMP_C
   )
 
+  # Find NUMA - location of NUMA headers
+  if (WIN32)
+    compile_option(NUMA_AWARE 0)
+    message(WARNING "Building on Windows, QuEST will not be aware of numa locality")
+  else()
+    include(FindPkgConfig)
+    pkg_search_module(NUMA numa IMPORTED_TARGET GLOBAL)
+    if (${NUMA_FOUND})
+      compile_option(NUMA_AWARE ${NUMA_FOUND})
+      target_link_libraries(QuEST PRIVATE PkgConfig::NUMA)
+      message(STATUS "NUMA awareness is enabled.")
+    else()
+      compile_option(NUMA_AWARE 0)
+      message(WARNING "libnuma not found, QuEST will not be aware of numa locality")
+    endif()
+  endif()
+
   if (VERBOSE_LIB_NAME)
     string(CONCAT LIB_NAME ${LIB_NAME} "+mt")
   endif()
diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index 6eef515c4..541491899 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -225,7 +225,7 @@ void printCpuInfo() {
         "cpu", {
         {"numCpuCores",   printer_toStr(std::thread::hardware_concurrency()) + pm},
         {"numOmpProcs",   (cpu_isOpenmpCompiled())? printer_toStr(cpu_getNumOpenmpProcessors()) + pm : na},
-        {"numOmpThrds",   (cpu_isOpenmpCompiled())? printer_toStr(cpu_getCurrentNumThreads()) + pn : na},
+        {"numOmpThrds",   (cpu_isOpenmpCompiled())? printer_toStr(cpu_getAvailableNumThreads()) + pn : na},
         {"cpuMemory",     ram},
         {"cpuMemoryFree", un},
     });
@@ -494,7 +494,7 @@ void getEnvironmentString(char str[200]) {
 
     QuESTEnv env = getQuESTEnv();
 
-    int numThreads = cpu_isOpenmpCompiled()? cpu_getCurrentNumThreads() : 1;
+    int numThreads = cpu_isOpenmpCompiled()? cpu_getAvailableNumThreads() : 1;
     int cuQuantum = env.isGpuAccelerated && gpu_isCuQuantumCompiled();
     int gpuDirect = env.isGpuAccelerated && gpu_isDirectGpuCommPossible();
 
diff --git a/quest/src/api/qureg.cpp b/quest/src/api/qureg.cpp
index 98068f079..7d68528a1 100644
--- a/quest/src/api/qureg.cpp
+++ b/quest/src/api/qureg.cpp
@@ -154,7 +154,7 @@ Qureg validateAndCreateCustomQureg(int numQubits, int isDensMatr, int useDistrib
     Qureg qureg = qureg_populateNonHeapFields(numQubits, isDensMatr, useDistrib, useGpuAccel, useMultithread);
 
     // always allocate CPU memory
-    qureg.cpuAmps = cpu_allocArray(qureg.numAmpsPerNode); // nullptr if failed
+    qureg.cpuAmps = cpu_allocNumaArray(qureg.numAmpsPerNode); // nullptr if failed
 
     // conditionally allocate GPU memory and communication buffers (even if numNodes == 1).
     // note that in distributed settings but where useDistrib=false, each node will have a
@@ -334,7 +334,7 @@ void destroyQureg(Qureg qureg) {
     validate_quregFields(qureg, __func__);
 
     // free CPU memory
-    cpu_deallocArray(qureg.cpuAmps);
+    cpu_deallocNumaArray(qureg.cpuAmps, qureg.numAmpsPerNode);
 
     // free CPU communication buffer
     if (qureg.isDistributed)
diff --git a/quest/src/core/autodeployer.cpp b/quest/src/core/autodeployer.cpp
index 2b6645e42..27c412687 100644
--- a/quest/src/core/autodeployer.cpp
+++ b/quest/src/core/autodeployer.cpp
@@ -36,7 +36,7 @@ void autodep_chooseQuESTEnvDeployment(int &useDistrib, int &useGpuAccel, int &us
 
     // and we require more than 1 thread available at QuESTEnv creation
     if (useMultithread == modeflag::USE_AUTO)
-        useMultithread = (cpu_isOpenmpCompiled())? (cpu_getCurrentNumThreads() > 1) : 0;
+        useMultithread = (cpu_isOpenmpCompiled())? (cpu_getAvailableNumThreads() > 1) : 0;
 }
 
 
diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp
index a24516891..4f68090d8 100644
--- a/quest/src/core/errors.cpp
+++ b/quest/src/core/errors.cpp
@@ -5,6 +5,7 @@
  * deployment is consistent with the compiled deployment modes.
  * 
  * @author Tyson Jones
+ * @author Luc Jaulmes (NUMA & pagesize errors)
  */
 
 #include "quest/include/types.h"
@@ -104,6 +105,41 @@ void error_memSizeQueriedButWouldOverflow() {
     raiseInternalError("Attempted to obtain memory necessary to allocate a distributed object's single-node partition but it overflowed size_t despite prior validation.");
 }
 
+void error_gettingPageSizeFailed() {
+
+    raiseInternalError("Failed to get the page size.");
+}
+
+void error_pageSizeNotAPowerOf2() {
+
+    raiseInternalError("The discovered page size was not a power of 2. Get Dr Denning on the phone.");
+}
+
+void error_pageSizeNotAMultipleOfQcomp() {
+
+    raiseInternalError("The page size was indivisible by the number of bytes in a qcomp.");
+}
+
+void error_gettingNumNumaNodesFailed() {
+
+    raiseInternalError("Failed to get the NUMA node count");
+}
+
+void error_numaAllocOrDeallocAttemptedOnWindows() {
+
+    raiseInternalError("NUMA-aware memory allocation or deallocation was attempted on Windows though this is not yet implemented, indicating a potential build issue.");
+}
+
+void error_numaBindingFailed() {
+
+    raiseInternalError("The binding of memory pages to NUMA nodes (with mbind) unexpectedly failed, despite prior reservation (with mmap) succeeding.");
+}
+
+void error_numaUnmappingFailed() {
+
+    raiseInternalError("NUMA-aware memory deallocation unexpectedly failed.");
+}
+
 
 
 /*
diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp
index 7f6a0c609..e99166af4 100644
--- a/quest/src/core/errors.hpp
+++ b/quest/src/core/errors.hpp
@@ -5,6 +5,7 @@
  * deployment is consistent with the compiled deployment modes.
  * 
  * @author Tyson Jones
+ * @author Luc Jaulmes (NUMA & pagesize errors)
  */
 
 #ifndef ERRORS_HPP
@@ -50,6 +51,20 @@ void error_allocOfQuESTEnvFailed();
 
 void error_memSizeQueriedButWouldOverflow();
 
+void error_gettingPageSizeFailed();
+
+void error_pageSizeNotAPowerOf2();
+
+void error_pageSizeNotAMultipleOfQcomp();
+
+void error_gettingNumNumaNodesFailed();
+
+void error_numaAllocOrDeallocAttemptedOnWindows();
+
+void error_numaBindingFailed();
+
+void error_numaUnmappingFailed();
+
 
 
 /*
diff --git a/quest/src/core/memory.cpp b/quest/src/core/memory.cpp
index 79d4301a4..c8b81fc88 100644
--- a/quest/src/core/memory.cpp
+++ b/quest/src/core/memory.cpp
@@ -30,6 +30,7 @@
     #include <sys/sysctl.h>
 #elif defined(_WIN32)
     #define NOMINMAX
+    #define WIN32_LEAN_AND_MEAN
     #include <windows.h>
 #endif
 
diff --git a/quest/src/core/utilities.cpp b/quest/src/core/utilities.cpp
index 16891f234..e30aa86d6 100644
--- a/quest/src/core/utilities.cpp
+++ b/quest/src/core/utilities.cpp
@@ -5,6 +5,7 @@
  * logic, matrix algebra, and channel parameters.
  * 
  * @author Tyson Jones
+ * @author Luc Jaulmes (distributing ranges over blocks)
  */
 
 #include "quest/include/types.h"
@@ -25,6 +26,7 @@
 
 #include <functional>
 #include <algorithm>
+#include <utility>
 #include <complex>
 #include <cmath>
 #include <vector>
@@ -930,6 +932,41 @@ util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qin
     return out;
 }
 
+std::pair<qindex, qindex> util_getBlockMultipleSubRange(
+    qindex rangeLen, qindex blockLen, int idSubRange, int numSubRanges
+) {
+    // divides a range into whole blocks (and a single leftover sub-block) and
+    // attempts to uniformly distribute the blocks across the specified number of
+    // sub-ranges. When the blocks do not divide evenly between sub-ranges, the
+    // leftover blocks are spread apart across sub-ranges. When the range does not 
+    // divide evenly into blocks, the overflow is given to the final sub-range.
+
+    qindex numFullBlocks = rangeLen / blockLen; // floors
+    qindex subBlockLen = rangeLen % blockLen;
+
+    qindex baseNumBlocksPerSubRange = numFullBlocks / numSubRanges;
+    qindex numExtraBlocks = numFullBlocks % numSubRanges;
+
+    // determine how many extra blocks this subrange should contain
+    qindex prevExtra = (idSubRange * numExtraBlocks) / numSubRanges;
+    qindex prevShift = (idSubRange * numExtraBlocks) % numSubRanges;
+    bool hereExtra = (prevShift + numExtraBlocks) >= numSubRanges;
+
+    // allocate blocks to this sub-range
+    qindex startBlockInd = idSubRange * baseNumBlocksPerSubRange + prevExtra;
+    qindex endBlockInd = startBlockInd + baseNumBlocksPerSubRange + hereExtra;
+
+    // find this sub-range indices within [0, rangeLen)
+    qindex startInd = startBlockInd * blockLen;
+    qindex endInd = endBlockInd * blockLen; // exclusive
+
+    // arbitrarily allocate the leftover sub-block to the final sub-range
+    if (idSubRange == numSubRanges - 1)
+        endInd += subBlockLen;
+
+    return std::make_pair(startInd, endInd);
+}
+
 
 
 /*
diff --git a/quest/src/core/utilities.hpp b/quest/src/core/utilities.hpp
index 44acce231..c514821bf 100644
--- a/quest/src/core/utilities.hpp
+++ b/quest/src/core/utilities.hpp
@@ -21,6 +21,7 @@
 
 #include <type_traits>
 #include <functional>
+#include <utility>
 #include <string>
 #include <vector>
 #include <array>
@@ -351,6 +352,8 @@ bool util_areAnyVectorElemsWithinNode(int rank, qindex numElemsPerNode, qindex s
 
 util_VectorIndexRange util_getLocalIndRangeOfVectorElemsWithinNode(int rank, qindex numElemsPerNode, qindex elemStartInd, qindex numInds);
 
+std::pair<qindex, qindex> util_getBlockMultipleSubRange(qindex rangeLen, qindex blockLen, int idSubRange, int numSubRanges);
+
 
 
 /*
@@ -361,6 +364,7 @@ qreal util_getPhaseFromGateAngle(qreal angle);
 qcomp util_getPhaseFromGateAngle(qcomp angle);
 
 
+
 /*
  * DECOHERENCE FACTORS
  */
diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp
index f27471d38..e488e6a9c 100644
--- a/quest/src/cpu/cpu_config.cpp
+++ b/quest/src/cpu/cpu_config.cpp
@@ -3,17 +3,21 @@
  * configuration, and allocating and copying RAM data.
  * 
  * @author Tyson Jones
+ * @author Luc Jaulmes (NUMA awareness)
  */
 
 #include "quest/include/modes.h"
 #include "quest/include/types.h"
 #include "quest/include/paulis.h"
 
+#include "quest/src/core/memory.hpp"
 #include "quest/src/core/errors.hpp"
+#include "quest/src/core/bitwise.hpp"
 
 #include <vector>
 #include <cstring>
 #include <cstdlib>
+#include <cstdint>
 
 using std::vector;
 
@@ -30,10 +34,35 @@ using std::vector;
 #endif
 
 
+/// @todo
+/// Windows provides a NUMA API we could access in theory, although we 
+/// forego the hassle for now - who is running QuEST on big multi-core 
+/// Windows? This validation protects against enabling NUMA awareness
+/// on Windows but silently recieving no benefit due to no NUMA API calls
+
+#if NUMA_AWARE && defined(_WIN32)
+    #error "NUMA awareness is not currently supported on non-POSIX systems like Windows."
+#endif
+
+
 #if COMPILE_OPENMP
     #include <omp.h>
 #endif
 
+#if NUMA_AWARE && ! defined(_WIN32)
+    #include <sys/mman.h>
+    #include <numaif.h>
+    #include <numa.h>
+#endif
+
+#if defined(_WIN32)
+    #define NOMINMAX
+    #define WIN32_LEAN_AND_MEAN
+    #include <windows.h>
+#else
+    #include <unistd.h>
+#endif
+
 
 
 /*
@@ -46,11 +75,12 @@ bool cpu_isOpenmpCompiled() {
 }
 
 
-int cpu_getCurrentNumThreads() {
+int cpu_getAvailableNumThreads() {
 #if COMPILE_OPENMP
     int n = -1;
 
     #pragma omp parallel shared(n)
+    #pragma omp single
     n = omp_get_num_threads();
 
     return n;
@@ -90,31 +120,140 @@ int cpu_getOpenmpThreadInd() {
 }
 
 
+int cpu_getCurrentNumThreads() {
+#if COMPILE_OPENMP
+    return omp_get_num_threads();
+#else
+    return 1;
+#endif
+}
+
+
 
 /*
  * MEMORY ALLOCATION
  */
 
 
-qcomp* cpu_allocArray(qindex length) {
+qindex getNumPagesToContainArray(long pageLen, qindex arrLen) {
 
-    /// @todo
-    /// here, we calloc the entire array in a serial setting, rather than one malloc 
-    /// followed by threads subsequently memset'ing their own partitions. The latter
-    /// approach would distribute the array pages across NUMA nodes, accelerating 
-    /// their subsequent access by the same threads (via NUMA's first-touch policy).
-    /// We have so far foregone this optimisation since a thread's memory-access pattern
-    /// in many of the QuEST functions is non-trivial, and likely to be inconsistent 
-    /// with the memset pattern. As such, I expect the benefit is totally occluded
-    /// and only introduces potential new bugs - but this should be tested and confirmed!
-
-    // we call calloc over malloc in order to fail immediately if mem isn't available;
-    // caller must handle nullptr result
+    // round up to the nearest page
+    return static_cast<qindex>(std::ceil(arrLen / (qreal) pageLen));
+}
+
+
+long cpu_getPageSize() {
+
+    // avoid repeated queries to this fixed value
+    static long pageSize = 0;
+    if (pageSize > 0)
+        return pageSize;
+
+    // obtain pageSize for the first time
+#if defined(_WIN32)
+    SYSTEM_INFO sysInfo;
+    GetSystemInfo(&sysInfo);
+    pageSize = sysInfo.dwPageSize;
+#else
+    pageSize = sysconf(_SC_PAGESIZE);
+#endif
+
+    // rigorously check the found pagesize is valid
+    // and consistent with preconditions assumed by
+    // callers, to avoid extremely funky bugs on
+    // esoteric future systems
+
+    if (pageSize <= 0)
+        error_gettingPageSizeFailed();
+
+    if (!isPowerOf2(pageSize))
+        error_pageSizeNotAPowerOf2();
 
+    if (pageSize % sizeof(qcomp) != 0)
+        error_pageSizeNotAMultipleOfQcomp();
+
+    return pageSize;
+}
+
+
+qcomp* cpu_allocArray(qindex length) {
     return (qcomp*) calloc(length, sizeof(qcomp));
 }
 
 
+qcomp* cpu_allocNumaArray(qindex length) {
+#if ! NUMA_AWARE
+    return cpu_allocArray(length);
+
+#elif defined(_WIN32)
+    error_numaAllocOrDeallocAttemptedOnWindows();
+
+#else
+    // we will divide array's memory into pages
+    long pageSize = cpu_getPageSize();
+    qindex arraySize = length * sizeof(qcomp); // gauranteed no overflow
+
+    // if entire array fits within a single page, alloc like normal
+    if (arraySize <= pageSize)
+        return cpu_allocArray(length);
+
+    // otherwise we will bind pages across NUMA nodes
+    static int numNodes = numa_num_configured_nodes();
+    if (numNodes < 1)
+        error_gettingNumNumaNodesFailed();
+
+    qindex numPages = getNumPagesToContainArray(pageSize, arraySize);
+    qindex numBytes = numPages * pageSize; // prior validation gaurantees no overflow
+    
+    // allocate memory, potentially more than arraySize (depending on page divisibility)
+    void *rawAddr = mmap(NULL, numBytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    
+    // indicate memory alloc failure to caller (no NUMA-specific validation error message)
+    if (rawAddr == MAP_FAILED)
+        return nullptr;
+
+    // if there is only a single NUMA node, then all memory access will occur within it
+    qcomp* outAddr = reinterpret_cast<qcomp*>(rawAddr);
+    if (numNodes == 1)
+        return outAddr;
+
+    // otherwise, we bind continguous pages to NUMA nodes, distributing the pages 
+    // attemptedly uniformly and spreading remaining pages maximally apart
+    qindex baseNumPagesPerNode = numPages / numNodes; // floors
+    qindex remainingNumPagesTotal = numPages % numNodes;
+
+    // use integer type for safe address arithmetic below
+    uintptr_t offsetAddr = reinterpret_cast<uintptr_t>(rawAddr);
+
+    for (int node=0, shift=numNodes; node < numNodes; ++node) {
+
+        // decide number of pages to bind to NUMA node
+        shift -= remainingNumPagesTotal;
+        qindex numPagesInNode = baseNumPagesPerNode + (shift <= 0);
+        qindex numBytesInNode = numPagesInNode * pageSize; // validation prevents overflow
+
+        // bind those pages from the offset address to the node (identified by mask)
+        unsigned long nodeMask = 1UL << node;
+        unsigned long numBitsInMask = 8 * sizeof(nodeMask);
+        void* nodeAddr = reinterpret_cast<void*>(offsetAddr);
+        long success = mbind(nodeAddr, numBytesInNode, MPOL_BIND, &nodeMask, numBitsInMask, 0);
+
+        // treat bind failure as internal error (even though it can result from insufficient kernel mem),
+        // rather than permitting silent fallback to non-NUMA awareness which might be astonishingly slow
+        if (success == -1)
+            error_numaBindingFailed();
+
+        // prepare next node's address
+        offsetAddr += numPagesInNode * pageSize;
+        if (shift <= 0)
+            shift += numNodes;
+    }
+
+    return outAddr;
+#endif
+}
+
+
 void cpu_deallocArray(qcomp* arr) {
 
     // arr can safely be nullptr
@@ -122,6 +261,36 @@ void cpu_deallocArray(qcomp* arr) {
 }
 
 
+void cpu_deallocNumaArray(qcomp* arr, qindex length) {
+
+    // musn't pass nullptr to munmap() below
+    if (arr == nullptr)
+        return;
+
+#if ! NUMA_AWARE
+    cpu_deallocArray(arr);
+
+#elif defined(_WIN32)
+    error_numaAllocOrDeallocAttemptedOnWindows();
+
+#else
+    qindex arrSize = length * sizeof(qcomp);
+    long pageSize = cpu_getPageSize();
+
+    // sub-page arrays were allocated with calloc()
+    if (arrSize <= pageSize)
+        return cpu_deallocArray(arr);
+
+    qindex numPages = getNumPagesToContainArray(pageSize, arrSize);
+    qindex numBytes = numPages * pageSize; // gauranteed no overflow
+    int success = munmap(arr, numBytes);
+
+    if (success == -1)
+        error_numaUnmappingFailed();
+#endif
+}
+
+
 qcomp** cpu_allocAndInitMatrixWrapper(qcomp* arr, qindex dim) {
 
     // do not allocate if arr alloc failed (caller will handle)
diff --git a/quest/src/cpu/cpu_config.hpp b/quest/src/cpu/cpu_config.hpp
index 48f54b44f..21d39e359 100644
--- a/quest/src/cpu/cpu_config.hpp
+++ b/quest/src/cpu/cpu_config.hpp
@@ -23,7 +23,7 @@ using std::vector;
 
 bool cpu_isOpenmpCompiled();
 
-int cpu_getCurrentNumThreads();
+int cpu_getAvailableNumThreads();
 
 int cpu_getNumOpenmpProcessors();
 
@@ -35,6 +35,8 @@ int cpu_getNumOpenmpProcessors();
 
 int cpu_getOpenmpThreadInd();
 
+int cpu_getCurrentNumThreads();
+
 
 
 /*
@@ -44,6 +46,9 @@ int cpu_getOpenmpThreadInd();
 qcomp* cpu_allocArray(qindex length);
 void cpu_deallocArray(qcomp* arr);
 
+qcomp* cpu_allocNumaArray(qindex length);
+void cpu_deallocNumaArray(qcomp* arr, qindex length);
+
 qcomp** cpu_allocAndInitMatrixWrapper(qcomp* arr, qindex dim);
 void cpu_deallocMatrixWrapper(qcomp** wrapper);
 
@@ -60,6 +65,8 @@ PauliStr* cpu_allocPauliStrings(qindex numStrings);
 void cpu_deallocPauliStrings(PauliStr* strings);
 
 
+long cpu_getPageSize();
+
 
 /*
  * MEMORY MOVEMENT
diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp
index 4a783a372..ce51f2b78 100644
--- a/quest/src/cpu/cpu_subroutines.cpp
+++ b/quest/src/cpu/cpu_subroutines.cpp
@@ -9,6 +9,7 @@
  * 
  * @author Tyson Jones
  * @author Oliver Brown (OpenMP 'if' clauses)
+ * @author Luc Jaulmes (optimised initUniformState)
  * @author Richard Meister (helped patch on LLVM)
  * @author Kshitij Chhabra (patched v3 clauses with gcc9)
  * @author Ania (Anna) Brown (developed QuEST v1 logic)
@@ -907,7 +908,7 @@ void cpu_statevector_anyCtrlPauliTensorOrGadget_subA(
     // whenever each thread has at least 1 iteration for itself. And of course
     // we serialise both inner and outer loops when qureg multithreading is off.
 
-    if (!qureg.isMultithreaded || numOuterIts >= cpu_getCurrentNumThreads()) {
+    if (!qureg.isMultithreaded || numOuterIts >= cpu_getAvailableNumThreads()) {
     
         // parallel
         #pragma omp parallel for if(qureg.isMultithreaded)
@@ -2389,9 +2390,20 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS( void, cpu_densmatr_multiQubitProjector
 
 void cpu_statevec_initUniformState_sub(Qureg qureg, qcomp amp) {
 
-    // faster on average (though perhaps not for large quregs)
-    // than a custom multithreaded loop
-    std::fill(qureg.cpuAmps, qureg.cpuAmps + qureg.numAmpsPerNode, amp);
+    // approx-uniformly distribute modified memory pages across threads,
+    // in the hope that each std::fill() will touch only memory within 
+    // the thread's corresponding NUMA node, for best performance 
+
+    int numAmpsPerPage = cpu_getPageSize() / sizeof(qcomp); // divides evenly
+
+    #pragma omp parallel if(qureg.isMultithreaded)
+    {
+        const auto [start, end] = util_getBlockMultipleSubRange(
+            qureg.numAmpsPerNode, numAmpsPerPage,
+            cpu_getOpenmpThreadInd(), cpu_getCurrentNumThreads());
+
+        std::fill(qureg.cpuAmps + start, qureg.cpuAmps + end, amp);
+    }
 }
 
 

From 37e222ea2f3526d29cb14b01c319f4fabb62ec7e Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Sun, 13 Jul 2025 12:21:20 -0400
Subject: [PATCH 12/32] patched broken CUDA state after malloc failure

Danny Hindson discovered a bug wherein a failing cudaMalloc() call deliberately induced by the 'out-of-memory' unit test of createFullStateDiagMatr() breaks subsequent GPU simulation. This is because a failing cudaMalloc corrupts the CUDA API state until being explicitly cleared using the undocumented facility of cudaGetLastError (which clears "non-sticky" errors). We correct this, and defensively check for irrecoverable sticky errors.
---
 quest/src/core/errors.cpp    |  5 ++++
 quest/src/core/errors.hpp    |  2 ++
 quest/src/gpu/gpu_config.cpp | 57 ++++++++++++++++++++++++++++++++----
 3 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp
index 4f68090d8..c1cfd2a02 100644
--- a/quest/src/core/errors.cpp
+++ b/quest/src/core/errors.cpp
@@ -698,6 +698,11 @@ void error_cudaCallFailed(const char* msg, const char* func, const char* caller,
     raiseInternalError(err);
 }
 
+void error_cudaEncounteredIrrecoverableError() {
+
+    raiseInternalError("The CUDA API encountered an irrecoverable \"sticky\" error which was attemptedly cleared as if it were non-sticky.");
+}
+
 
 
 /*
diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp
index e99166af4..7097650a7 100644
--- a/quest/src/core/errors.hpp
+++ b/quest/src/core/errors.hpp
@@ -266,6 +266,8 @@ void assert_applyFullStateDiagMatrTempGpuAllocSucceeded(qcomp* gpuPtr);
 
 void error_cudaCallFailed(const char* msg, const char* func, const char* caller, const char* file, int line);
 
+void error_cudaEncounteredIrrecoverableError();
+
 
 
 /*
diff --git a/quest/src/gpu/gpu_config.cpp b/quest/src/gpu/gpu_config.cpp
index 0b420c22a..87a1c5192 100644
--- a/quest/src/gpu/gpu_config.cpp
+++ b/quest/src/gpu/gpu_config.cpp
@@ -46,14 +46,16 @@
 /*
  * CUDA ERROR HANDLING
  *
- * which is only defined when CUDA-compiling, since it is invoked only a macro (defined
- * in gpu_config.hpp) which wraps CUDA API calls
+ * which are only defined when CUDA-compiling, since only ever invoked
+ * when encountering issues through use of the CUDA API
  */
 
 #if COMPILE_CUDA
 
 void assertCudaCallSucceeded(int result, const char* call, const char* caller, const char* file, int line) {
 
+    // this function is only invoked by the CUDA_CHECK macro defined in gpu_config.hpp header
+
     // result (int) is actually type cudaError_t but we cannot use this CUDA-defined type
     // in gpu_config.hpp (since it's included by non-CUDA-compiled files), and we wish to keep
     // the signature consistent.
@@ -63,6 +65,32 @@ void assertCudaCallSucceeded(int result, const char* call, const char* caller, c
         error_cudaCallFailed(cudaGetErrorString(code), call, caller, file, line);
 }
 
+void clearPossibleCudaError() {
+
+    // beware that in addition to clearing anticipated CUDA errors (like
+    // cudaMalloc failing), this function will check that the CUDA API is
+    // generally working (i.e. has not encountered an irrecoverable error),
+    // including whether e.g. the CUDA drivers match the runtime version. It
+    // should ergo never be called in settings where GPU is compiled but not
+    // runtime activated, since such settings see CUDA be in an acceptably
+    // broken state - calling this function would throw an internal error
+
+    // clear "non-sticky" errors so that future CUDA API use is not corrupted
+    cudaError_t initialCode = cudaGetLastError();
+
+    // nothing to do if no error had occurred
+    if (initialCode == cudaSuccess)
+        return;
+
+    // sync and re-check if error code is erroneously unchanged, which 
+    // indicates that CUDA encountered an irrecoverable "sticky" error
+    CUDA_CHECK( cudaDeviceSynchronize() );
+
+    cudaError_t finalCode = cudaGetLastError();
+    if (initialCode == finalCode)
+        error_cudaEncounteredIrrecoverableError();
+}
+
 #endif
 
 
@@ -153,6 +181,12 @@ int gpu_getNumberOfLocalGpus() {
     // is called but no devices exist, which we handle
     int num;
     auto status = cudaGetDeviceCount(&num);
+
+    // treat query failure as indication of no local GPUs
+    // so do not call clearPossibleCudaError(). This is
+    // necessary because cudaGetDeviceCount() can report
+    // driver version errors when QuEST is GPU-compiled
+    // on a platform without a GPU, which we tolerate
     return (status == cudaSuccess)? num : 0;
 
 #else
@@ -176,8 +210,12 @@ bool gpu_isGpuAvailable() {
         struct cudaDeviceProp props;
         auto status = cudaGetDeviceProperties(&props, deviceInd);
 
-        // if the query failed, device is anyway unusable
-        if (status != cudaSuccess) 
+        // if the query failed, device is anyway unusable; we do not
+        // clear the error with clearPossibleCudaError() since this
+        // can trigger an internal error when QuEST is GPU-compiled
+        // but no valid GPU exists (hence no valid driver), like
+        // occurs on cluster submission nodes
+        if (status != cudaSuccess)
             continue;
 
         // if the device is a real GPU, it's 'major' compute capability is != 9999 (meaning emulation)
@@ -405,9 +443,16 @@ qcomp* gpu_allocArray(qindex length) {
     qcomp* ptr;
     cudaError_t errCode = cudaMalloc(&ptr, numBytes);
 
-    // intercept memory-alloc error and merely return nullptr pointer (to be handled by validation)
-    if (errCode == cudaErrorMemoryAllocation)
+    // intercept memory-alloc error (handled by caller's validation)
+    if (errCode == cudaErrorMemoryAllocation) {
+
+        // malloc failure can break CUDA API state, so recover it in
+        // case execution is continuing (e.g. by unit tests)
+        clearPossibleCudaError();
+
+        // indicate alloc failure
         return nullptr;
+    }
 
     // pass all other unexpected errors to internal error handling
     CUDA_CHECK(errCode);

From c49f8d2ee92fd4055106daab2e36001d7ff3d498 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Sun, 20 Jul 2025 00:52:22 +0200
Subject: [PATCH 13/32] separated API multiplications from operations

in order to shrink the bloated set of operations, making those remaining "standard" and trace-preserving (with the exception of the applyQubitProjector and applyMultiQubitProjector). The new multiplications module is catered to "raw" linear algebra upon density matrices
---
 quest/include/multiplication.h   | 760 +++++++++++++++++++++++++++++++
 quest/include/operations.h       | 426 +----------------
 quest/include/quest.h            |   1 +
 quest/src/api/CMakeLists.txt     |   1 +
 quest/src/api/multiplication.cpp | 624 +++++++++++++++++++++++++
 quest/src/api/operations.cpp     | 459 +------------------
 tests/unit/CMakeLists.txt        |   1 +
 tests/unit/multiplication.cpp    |   7 +
 tests/unit/operations.cpp        | 409 +++++++++--------
 9 files changed, 1640 insertions(+), 1048 deletions(-)
 create mode 100644 quest/include/multiplication.h
 create mode 100644 quest/src/api/multiplication.cpp
 create mode 100644 tests/unit/multiplication.cpp

diff --git a/quest/include/multiplication.h b/quest/include/multiplication.h
new file mode 100644
index 000000000..8ebc7aa7f
--- /dev/null
+++ b/quest/include/multiplication.h
@@ -0,0 +1,760 @@
+/** @file
+ * API signatures for directly pre- and post-multiplying 
+ * operators upon density matrices, likely constituting 
+ * non-physical operations which break state normalisation.
+ * 
+ * @author Tyson Jones
+ * 
+ * @defgroup multiplication Multiplication
+ * @ingroup api
+ * @brief Functions for directly multiplying operators upon 
+ *        density matrices.
+ * @{
+ */
+
+#ifndef MULTIPLICATION_H
+#define MULTIPLICATION_H
+
+#include "quest/include/qureg.h"
+#include "quest/include/paulis.h"
+#include "quest/include/matrices.h"
+#include "quest/include/channels.h"
+
+#ifdef __cplusplus
+    #include <vector>
+#endif
+
+
+/*
+ * unlike some other headers, we here intermix the C and C++-only
+ * signatures, grouping them semantically & by their doc groups
+ */
+
+
+
+/** 
+ * @defgroup mult_compmatr1 CompMatr1
+ * @brief Functions for pre- or post-multiplying general one-qubit dense matrices
+ *        (as CompMatr1) upon density matrices.
+ * @{
+ */
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** Multiplies a general one-qubit dense @p matrix upon the specified @p target 
+ * qubit of @p qureg.
+ *  
+ * @formulae
+ * Let @f$ \hat{M} = @f$ @p matrix and @f$ t = @f$ @p target, and notate 
+ * @f$\hat{M}_t@f$ as per applyCompMatr1(). Unlike applyCompMatr1() however,
+ * this function only ever left-multiplies @p matrix upon @p qureg, regardless
+ * of whether it is a statevector or density matrix.
+ * 
+ * Explicitly,
+ * - When @p qureg is a statevector @f$ \svpsi @f$, this function effects
+ *   @f[ 
+        \svpsi \rightarrow \hat{M}_t \, \svpsi.
+ *   @f]
+ * - When @p qureg is a density matrix @f$\dmrho@f$, this function effects
+ *   @f[ 
+        \dmrho \rightarrow \hat{M}_t \, \dmrho.
+ *   @f]
+ *  
+ * There are no additional constraints like unitarity.
+ *
+ * @myexample
+ * ```
+    Qureg qureg = createDensityQureg(5);
+
+    CompMatr1 matrix = getInlineCompMatr1({
+        {0.1, 0.2},
+        {0.3i, 0.4i}
+    });
+
+    multiplyCompMatr1(qureg, 2, matrix); 
+ * ```
+ *
+ * @param[in,out] qureg  the state to modify.
+ * @param[in]     target the index of the target qubit.
+ * @param[in]     matrix the Z-basis matrix to multiply.
+ * @throws @validationerror
+ * - if @p qureg or @p matrix are uninitialised.
+ * - if @p target is an invalid qubit index.
+ * @see
+ * - getCompMatr1()
+ * - getInlineCompMatr1()
+ * - applyCompMatr1()
+ * - postMultiplyCompMatr1()
+ * - applyQubitProjector()
+ * - multiplyCompMatr()
+ * @author Tyson Jones
+ */
+void multiplyCompMatr1(Qureg qureg, int target, CompMatr1 matrix);
+
+
+/** @notyettested
+ * 
+ * Multiplies a general one-qubit dense @p matrix upon the specified @p target 
+ * qubit of the density matrix @p qureg, from the right-hand side.
+ *  
+ * @formulae
+ * Let @f$ \dmrho = @f$ @p qureg, @f$ \hat{M} = @f$ @p matrix and @f$ t = @f$ @p target, 
+ * and notate @f$\hat{M}_t@f$ as per applyCompMatr1(). Unlike applyCompMatr1() however,
+ * this function only ever right-multiplies @p matrix upon @p qureg.
+ * 
+ * Explicitly
+ *   @f[ 
+        \dmrho \rightarrow \dmrho \, \hat{M}_t
+ *   @f]
+ * where @f$ \hat{M} @f$ is not conjugated nor transposed, and there are no additional 
+ * constraints like unitarity.
+ * 
+ * In general, this function will break the normalisation of @p qureg and result in a
+ * non-physical state, and is useful for preparing sub-expressions of formulae like
+ * the Lindbladian.
+ *
+ * @myexample
+ * ```
+    Qureg qureg = createDensityQureg(5);
+
+    CompMatr1 matrix = getInlineCompMatr1({
+        {0.1, 0.2},
+        {0.3i, 0.4i}
+    });
+
+    postMultiplyCompMatr1(qureg, 2, matrix); 
+ * ```
+ *
+ * @param[in,out] qureg  the state to modify.
+ * @param[in]     target the index of the target qubit.
+ * @param[in]     matrix the Z-basis matrix to post-multiply.
+ * @throws @validationerror
+ * - if @p qureg or @p matrix are uninitialised.
+ * - if @p qureg is not a density matrix.
+ * - if @p target is an invalid qubit index.
+ * @see
+ * - getCompMatr1()
+ * - getInlineCompMatr1()
+ * - applyCompMatr1()
+ * - multiplyCompMatr1()
+ * - multiplyCompMatr()
+ * @author Tyson Jones
+ */
+void postMultiplyCompMatr1(Qureg qureg, int target, CompMatr1 matrix);
+
+
+// end de-mangler
+#ifdef __cplusplus
+}
+#endif
+
+/** @} */
+
+
+
+/** 
+ * @defgroup mult_compmatr2 CompMatr2
+ * @brief Functions for pre- or post-multiplying general two-qubit dense matrices
+ *        (as CompMatr2) upon density matrices.
+ * @{
+ */
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/// @notyetdoced
+/// @see
+/// - applyCompMatr2()
+/// - multiplyCompMatr1()
+void multiplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matr);
+
+
+/// @notyetdoced
+/// @notyettested
+/// @notyetvalidated
+/// @see
+/// - postMultiplyCompMatr1
+void postMultiplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matrix);
+
+
+// end de-mangler
+#ifdef __cplusplus
+}
+#endif
+
+/** @} */
+
+
+
+
+/** 
+ * @defgroup mult_compmatr CompMatr
+ * @brief Functions for pre- or post-multiplying general many-target dense matrices
+ *        (as CompMatr) upon density matrices.
+ * @{
+ */
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** @notyetdoced
+ * 
+ * @see
+ * - applyCompMatr()
+ * - multiplyCompMatr1()
+ */
+void multiplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matrix);
+
+
+/// @notyetdoced
+/// @notyettested
+/// @notyetvalidated
+/// @see
+/// - postMultiplyCompMatr1
+void postMultiplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matrix);
+
+
+// end de-mangler
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+
+
+/// @notyettested
+/// @notyetvalidated
+/// @notyetdoced
+/// @cppvectoroverload
+/// @see multiplyCompMatr()
+void multiplyCompMatr(Qureg qureg, std::vector<int> targets, CompMatr matr);
+
+
+/// @notyettested
+/// @notyetvalidated
+/// @notyetdoced
+/// @cppvectoroverload
+/// @see postMultiplyCompMatr()
+void postMultiplyCompMatr(Qureg qureg, std::vector<int> targets, CompMatr matr);
+
+
+#endif 
+
+/** @} */
+
+
+
+/** 
+ * @defgroup mult_diagmatr1 DiagMatr1
+ * @brief Functions for pre- or post-multiplying general single-qubit diagonal 
+ *        matrices (as DiagMatr1) upon density matrices.
+ * @{
+ */
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/// @notyetdoced
+/// @see multiplyCompMatr1()
+void multiplyDiagMatr1(Qureg qureg, int target, DiagMatr1 matr);
+
+
+/// @notyettested
+/// @notyetvalidated
+/// @notyetdoced
+void postMultiplyDiagMatr1(Qureg qureg, int target, DiagMatr1 matrix);
+
+
+// end de-mangler
+#ifdef __cplusplus
+}
+#endif
+
+/** @} */
+
+
+
+/** 
+ * @defgroup mult_diagmatr2 DiagMatr2
+ * @brief Functions for pre- or post-multiplying general two-qubit diagonal 
+ *        matrices (as DiagMatr2) upon density matrices.
+ * @{
+ */
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/// @notyetdoced
+/// @see multiplyCompMatr1()
+void multiplyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matr);
+
+
+/// @notyettested
+/// @notyetvalidated
+/// @notyetdoced
+void postMultiplyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matrix);
+
+
+// end de-mangler
+#ifdef __cplusplus
+}
+#endif
+
+/** @} */
+
+
+
+/** 
+ * @defgroup mult_diagmatr DiagMatr
+ * @brief Functions for pre- or post-multiplying general any-target diagonal 
+ *        matrices (as DiagMatr), or powers thereof, upon density matrices.
+ * @{
+ */
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/// @notyetdoced
+/// @see multiplyCompMatr1()
+void multiplyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matrix);
+
+
+/// @notyettested
+/// @notyetvalidated
+/// @notyetdoced
+void postMultiplyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matrix);
+
+
+/// @notyetdoced
+/// @see
+/// - multiplyCompMatr1()
+/// - applyDiagMatrPower()
+void multiplyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMatr matrix, qcomp exponent);
+
+
+/// @notyettested
+/// @notyetvalidated
+/// @notyetdoced
+void postMultiplyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMatr matrix, qcomp exponent);
+
+
+// end de-mangler
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+
+
+/// @notyettested
+/// @notyetvalidated
+/// @notyetdoced
+/// @cppvectoroverload
+/// @see multiplyDiagMatr()
+void multiplyDiagMatr(Qureg qureg, std::vector<int> targets, DiagMatr matrix);
+
+
+/// @notyettested
+/// @notyetvalidated
+/// @notyetdoced
+/// @cppvectoroverload
+/// @see postMultiplyDiagMatr()
+void postMultiplyDiagMatr(Qureg qureg, std::vector<int> targets, DiagMatr matrix);
+
+
+/// @notyettested
+/// @notyetvalidated
+/// @notyetdoced
+/// @cppvectoroverload
+/// @see multiplyDiagMatrPower()
+void multiplyDiagMatrPower(Qureg qureg, std::vector<int> targets, DiagMatr matrix, qcomp exponent);
+
+
+/// @notyettested
+/// @notyetvalidated
+/// @notyetdoced
+/// @cppvectoroverload
+/// @see postMultiplyDiagMatrPower()
+void postMultiplyDiagMatrPower(Qureg qureg, std::vector<int> targets, DiagMatr matrix, qcomp exponent);
+
+
+#endif 
+
+/** @} */
+
+
+
+/** 
+ * @defgroup mult_fullstatediagmatr FullStateDiagMatr
+ * @brief Functions for pre- or post-multiplying general full-state diagonal 
+ *        matrices (FullStateDiagMatr), or powers thereof, upon density matrices.
+ * @{
+ */
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/// @notyetdoced
+/// @notyetvalidated
+/// @see
+/// - multiplyCompMatr1
+void multiplyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix);
+
+
+/// @notyetdoced
+/// @notyettested
+/// @notyetvalidated
+void postMultiplyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix);
+
+
+/// @notyetdoced
+/// @notyetvalidated
+/// @see
+/// - multiplyCompMatr1
+/// - applyDiagMatrPower
+void multiplyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp exponent);
+
+
+/// @notyetdoced
+/// @notyettested
+/// @notyetvalidated
+void postMultiplyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp exponent);
+
+
+// end de-mangler
+#ifdef __cplusplus
+}
+#endif
+
+/** @} */
+
+
+
+/** 
+ * @defgroup mult_swap Swap
+ * @brief Functions for pre- or post-multiplying the two-qubit SWAP
+ *        gate upon density matrices.
+ * @{
+ */
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/// @notyetdoced
+/// @see multiplyCompMatr1()
+void multiplySwap(Qureg qureg, int qubit1, int qubit2);
+
+
+/// @notyetdoced
+/// @notyettested
+/// @notyetvalidated
+void postMultiplySwap(Qureg qureg, int qubit1, int qubit2);
+
+
+// end de-mangler
+#ifdef __cplusplus
+}
+#endif
+
+/** @} */
+
+
+
+/** 
+ * @defgroup mult_pauli Pauli
+ * @brief Functions for pre- or post-multiplying the individual one-qubit 
+ *        Pauli operators upon density matrices.
+ * @{
+ */
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/// @notyetdoced
+/// @notyettested
+/// @see multiplyCompMatr1()
+void multiplyPauliX(Qureg qureg, int target);
+
+
+/// @notyetdoced
+/// @notyettested
+/// @see multiplyCompMatr1()
+void multiplyPauliY(Qureg qureg, int target);
+
+
+/// @notyetdoced
+/// @notyettested
+/// @see multiplyCompMatr1()
+void multiplyPauliZ(Qureg qureg, int target);
+
+
+/// @notyetdoced
+/// @notyettested
+/// @see postMultiplyCompMatr1()
+void postMultiplyPauliX(Qureg qureg, int target);
+
+
+/// @notyetdoced
+/// @notyettested
+/// @see postMultiplyCompMatr1()
+void postMultiplyPauliY(Qureg qureg, int target);
+
+
+/// @notyetdoced
+/// @notyettested
+/// @see postMultiplyCompMatr1()
+void postMultiplyPauliZ(Qureg qureg, int target);
+
+
+// end de-mangler
+#ifdef __cplusplus
+}
+#endif
+
+/** @} */
+
+
+
+/** 
+ * @defgroup mult_paulistr PauliStr
+ * @brief Functions for pre- or post-multiplying a tensor product of 
+ *       Pauli operators (as a PauliStr) upon density matrices.
+ * @{
+ */
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/// @notyetdoced
+/// @see multiplyCompMatr1()
+void multiplyPauliStr(Qureg qureg, PauliStr str);
+
+
+/// @notyetdoced
+/// @notyettested
+/// @notyetvalidated
+void postMultiplyPauliStr(Qureg qureg, PauliStr str);
+
+
+// end de-mangler
+#ifdef __cplusplus
+}
+#endif
+
+/** @} */
+
+
+
+/** 
+ * @defgroup mult_pauligadget Pauli gadgets
+ * @brief Functions for pre- or post-multiplying many-qubit rotations around 
+ *        arbitrary PauliStr upon density matrices.
+ * @{
+ */
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/// @notyetdoced
+/// @see 
+/// - multiplyCompMatr1()
+/// - applyPauliGadget()
+void multiplyPauliGadget(Qureg qureg, PauliStr str, qreal angle);
+
+
+/// @notyetdoced
+/// @notyettested
+/// @notyetvalidated
+void postMultiplyPauliGadget(Qureg qureg, PauliStr str, qreal angle);
+
+
+// end de-mangler
+#ifdef __cplusplus
+}
+#endif
+
+/** @} */
+
+
+
+/** 
+ * @defgroup mult_phasegadget Phase gates
+ * @brief Functions for pre- or post-multiplying many-qubit rotations around 
+ *        the Pauli Z axis upon density matrices.
+ * @{
+ */
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/// @notyetdoced
+/// @see 
+/// - multiplyCompMatr1()
+/// - applyPhaseGadget
+void multiplyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal angle);
+
+
+/// @notyetdoced
+/// @notyettested
+/// @notyetvalidated
+void postMultiplyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal angle);
+
+
+// end de-mangler
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+
+
+/// @notyettested
+/// @notyetvalidated
+/// @notyetdoced
+/// @cppvectoroverload
+/// @see multiplyPhaseGadget()
+void multiplyPhaseGadget(Qureg qureg, std::vector<int> targets, qreal angle);
+
+
+/// @notyettested
+/// @notyetvalidated
+/// @notyetdoced
+/// @cppvectoroverload
+/// @see postMultiplyPhaseGadget()
+void postMultiplyPhaseGadget(Qureg qureg, std::vector<int> targets, qreal angle);
+
+
+#endif
+
+/** @} */
+
+
+
+/** 
+ * @defgroup mult_nots Many-not gates
+ * @brief Functions for pre- or post-multiplying many-qubit NOT gates 
+ *        upon density matrices.
+ * @{
+ */
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/// @notyetdoced
+/// @see multiplyCompMatr1()
+void multiplyMultiQubitNot(Qureg qureg, int* targets, int numTargets);
+
+
+/// @notyetdoced
+/// @notyettested
+/// @notyetvalidated
+void postMultiplyMultiQubitNot(Qureg qureg, int* targets, int numTargets);
+
+
+// end de-mangler
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+
+
+/// @notyettested
+/// @notyetvalidated
+/// @notyetdoced
+/// @cppvectoroverload
+/// @see multiplyMultiQubitNot()
+void multiplyMultiQubitNot(Qureg qureg, std::vector<int> targets);
+
+
+/// @notyettested
+/// @notyetvalidated
+/// @notyetdoced
+/// @cppvectoroverload
+/// @see postMultiplyMultiQubitNot()
+void postMultiplyMultiQubitNot(Qureg qureg, std::vector<int> targets);
+
+
+#endif
+
+/** @} */
+
+
+
+/** 
+ * @defgroup mult_paulistrsum PauliStrSum
+ * @brief Functions for pre- or post-multiplying weighted sums of Pauli 
+ *        tensors upon a density matrix.
+ * @{
+ */
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/// @notyetdoced
+/// @notyetvalidated
+/// @see multiplyCompMatr1()
+void multiplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace);
+
+
+/// @notyetdoced
+/// @notyettested
+/// @notyetvalidated
+void postMultiplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace);
+
+
+// end de-mangler
+#ifdef __cplusplus
+}
+#endif
+
+/** @} */
+
+
+
+#endif // MULTIPLICATION_H
+
+/** @} */ // (end file-wide doxygen defgroup)
diff --git a/quest/include/operations.h b/quest/include/operations.h
index 0fb06560f..bbeaf12d2 100644
--- a/quest/include/operations.h
+++ b/quest/include/operations.h
@@ -1,15 +1,19 @@
 /** @file
- * API signatures for effecting operators (such as gates and unitaries) 
- * upon Quregs which are instantiated as either statevectors or 
- * density matrices. This excludes decoherence channels which are
- * instead exposed in decoherence.h
+ * API signatures for effecting mostly physical and/or trace
+ * preserving operators, such as unitaries, gates and 
+ * measurements, upon Quregs which are instantiated as either
+ * statevectors or density matrices. This excludes Trotterised
+ * gadgets and evolutions (exposed instead in trotterisation.h),
+ * functions to pre- or post-multiply operators upon density
+ * matrices (multiplication.h) and decoherence channels
+ * (decoherence.h).
  * 
  * @author Tyson Jones
  * @author Diogo Pratas Maia (non-unitary Pauli gadget)
  * 
  * @defgroup operations Operations
  * @ingroup api
- * @brief Functions for effecting operators upon Quregs.
+ * @brief Functions for effecting standard operators upon Quregs.
  * @{
  */
 
@@ -46,108 +50,6 @@ extern "C" {
 #endif
 
 
-/** Multiplies a general one-qubit dense @p matrix upon the specified @p target 
- * qubit of @p qureg.
- *  
- * @formulae
- * Let @f$ \hat{M} = @f$ @p matrix and @f$ t = @f$ @p target, and notate 
- * @f$\hat{M}_t@f$ as per applyCompMatr1(). Unlike applyCompMatr1() however,
- * this function only ever left-multiplies @p matrix upon @p qureg, regardless
- * of whether it is a statevector or density matrix.
- * 
- * Explicitly,
- * - When @p qureg is a statevector @f$ \svpsi @f$, this function effects
- *   @f[ 
-        \svpsi \rightarrow \hat{M}_t \, \svpsi.
- *   @f]
- * - When @p qureg is a density matrix @f$\dmrho@f$, this function effects
- *   @f[ 
-        \dmrho \rightarrow \hat{M}_t \, \dmrho.
- *   @f]
- *
- * There are no additional constraints like unitarity.
- *
- * @myexample
- * ```
-    Qureg qureg = createDensityQureg(5);
-
-    CompMatr1 matrix = getInlineCompMatr1({
-        {0.1, 0.2},
-        {0.3i, 0.4i}
-    });
-
-    multiplyCompMatr1(qureg, 2, matrix); 
- * ```
- *
- * @param[in,out] qureg  the state to modify.
- * @param[in]     target the index of the target qubit.
- * @param[in]     matrix the Z-basis matrix to multiply.
- * @throws @validationerror
- * - if @p qureg or @p matrix are uninitialised.
- * - if @p target is an invalid qubit index.
- * @see
- * - getCompMatr1()
- * - getInlineCompMatr1()
- * - applyCompMatr1()
- * - postMultiplyCompMatr1()
- * - applyQubitProjector()
- * - multiplyCompMatr()
- * @author Tyson Jones
- */
-void multiplyCompMatr1(Qureg qureg, int target, CompMatr1 matrix);
-
-
-/** @notyettested
- * 
- * Multiplies a general one-qubit dense @p matrix upon the specified @p target 
- * qubit of the density matrix @p qureg, from the right-hand side.
- *  
- * @formulae
- * Let @f$ \dmrho = @f$ @p qureg, @f$ \hat{M} = @f$ @p matrix and @f$ t = @f$ @p target, 
- * and notate @f$\hat{M}_t@f$ as per applyCompMatr1(). Unlike applyCompMatr1() however,
- * this function only ever right-multiplies @p matrix upon @p qureg.
- * 
- * Explicitly
- *   @f[ 
-        \dmrho \rightarrow \dmrho \, \hat{M}_t
- *   @f]
- * where @f$ \hat{M} @f$ is not conjugated nor transposed, and there are no additional 
- * constraints like unitarity.
- * 
- * In general, this function will break the normalisation of @p qureg and result in a
- * non-physical state, and is useful for preparing sub-expressions of formulae like
- * the Linbladian.
- *
- * @myexample
- * ```
-    Qureg qureg = createDensityQureg(5);
-
-    CompMatr1 matrix = getInlineCompMatr1({
-        {0.1, 0.2},
-        {0.3i, 0.4i}
-    });
-
-    postMultiplyCompMatr1(qureg, 2, matrix); 
- * ```
- *
- * @param[in,out] qureg  the state to modify.
- * @param[in]     target the index of the target qubit.
- * @param[in]     matrix the Z-basis matrix to post-multiply.
- * @throws @validationerror
- * - if @p qureg or @p matrix are uninitialised.
- * - if @p qureg is not a density matrix.
- * - if @p target is an invalid qubit index.
- * @see
- * - getCompMatr1()
- * - getInlineCompMatr1()
- * - applyCompMatr1()
- * - multiplyCompMatr1()
- * - multiplyCompMatr()
- * @author Tyson Jones
- */
-void postMultiplyCompMatr1(Qureg qureg, int target, CompMatr1 matrix);
-
-
 /** Applies a general one-qubit dense unitary @p matrix to the specified @p target 
  * qubit of @p qureg.
  * 
@@ -392,21 +294,6 @@ extern "C" {
 #endif
 
 
-/// @notyetdoced
-/// @see
-/// - applyCompMatr2()
-/// - multiplyCompMatr1()
-void multiplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matr);
-
-
-/// @notyetdoced
-/// @notyettested
-/// @notyetvalidated
-/// @see
-/// - postMultiplyCompMatr1
-void postMultiplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matrix);
-
-
 /** @notyetdoced
  * 
  * Applies a general two-qubit dense unitary @p matrix to qubits @p target1 and
@@ -435,6 +322,8 @@ digraph {
  *
  * @see
  * - applyCompMatr1()
+ * - multiplyCompMatr2()
+ * - postMultiplyCompMatr2()
  */
 void applyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matrix);
 
@@ -609,23 +498,6 @@ extern "C" {
 #endif
 
 
-/** @notyetdoced
- * 
- * @see
- * - applyCompMatr()
- * - multiplyCompMatr1()
- */
-void multiplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matrix);
-
-
-/// @notyetdoced
-/// @notyettested
-/// @notyetvalidated
-/// @see
-/// - postMultiplyCompMatr1
-void postMultiplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matrix);
-
-
 /** @notyetdoced
  * 
  * @formulae
@@ -643,6 +515,8 @@ void postMultiplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr ma
  *
  * @see
  * - applyCompMatr1()
+ * - multiplyCompMatr()
+ * - postMultiplyCompMatr()
  */
 void applyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matr);
 
@@ -673,22 +547,6 @@ void applyMultiStateControlledCompMatr(Qureg qureg, int* controls, int* states,
 #ifdef __cplusplus
 
 
-/// @notyettested
-/// @notyetvalidated
-/// @notyetdoced
-/// @cppvectoroverload
-/// @see multiplyCompMatr()
-void multiplyCompMatr(Qureg qureg, std::vector<int> targets, CompMatr matr);
-
-
-/// @notyettested
-/// @notyetvalidated
-/// @notyetdoced
-/// @cppvectoroverload
-/// @see postMultiplyCompMatr()
-void postMultiplyCompMatr(Qureg qureg, std::vector<int> targets, CompMatr matr);
-
-
 /// @notyettested
 /// @notyetvalidated
 /// @notyetdoced
@@ -739,19 +597,12 @@ extern "C" {
 #endif
 
 
-/// @notyetdoced
-/// @see multiplyCompMatr1()
-void multiplyDiagMatr1(Qureg qureg, int target, DiagMatr1 matr);
-
-
-/// @notyettested
-/// @notyetvalidated
-/// @notyetdoced
-void postMultiplyDiagMatr1(Qureg qureg, int target, DiagMatr1 matrix);
-
-
-/// @notyetdoced
-/// @see applyCompMatr1()
+/** @notyetdoced
+ * @see 
+ * - applyCompMatr1()
+ * - multiplyDiagMatr1()
+ * - postMultiplyDiagMatr1()
+ */
 void applyDiagMatr1(Qureg qureg, int target, DiagMatr1 matr);
 
 
@@ -812,17 +663,6 @@ extern "C" {
 #endif
 
 
-/// @notyetdoced
-/// @see multiplyCompMatr1()
-void multiplyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matr);
-
-
-/// @notyettested
-/// @notyetvalidated
-/// @notyetdoced
-void postMultiplyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matrix);
-
-
 /// @notyetdoced
 /// @see applyCompMatr1()
 void applyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matr);
@@ -885,17 +725,6 @@ extern "C" {
 #endif
 
 
-/// @notyetdoced
-/// @see multiplyCompMatr1()
-void multiplyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matrix);
-
-
-/// @notyettested
-/// @notyetvalidated
-/// @notyetdoced
-void postMultiplyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matrix);
-
-
 /// @notyetdoced
 /// @see applyCompMatr1()
 void applyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matrix);
@@ -916,19 +745,6 @@ void applyMultiControlledDiagMatr(Qureg qureg, int* controls, int numControls, i
 void applyMultiStateControlledDiagMatr(Qureg qureg, int* controls, int* states, int numControls, int* targets, int numTargets, DiagMatr matrix);
 
 
-/// @notyetdoced
-/// @see
-/// - multiplyCompMatr1()
-/// - applyDiagMatrPower()
-void multiplyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMatr matrix, qcomp exponent);
-
-
-/// @notyettested
-/// @notyetvalidated
-/// @notyetdoced
-void postMultiplyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMatr matrix, qcomp exponent);
-
-
 /** @notyetdoced
  *
  * @formulae
@@ -967,22 +783,6 @@ void applyMultiStateControlledDiagMatrPower(Qureg qureg, int* controls, int* sta
 #ifdef __cplusplus
 
 
-/// @notyettested
-/// @notyetvalidated
-/// @notyetdoced
-/// @cppvectoroverload
-/// @see multiplyDiagMatr()
-void multiplyDiagMatr(Qureg qureg, std::vector<int> targets, DiagMatr matrix);
-
-
-/// @notyettested
-/// @notyetvalidated
-/// @notyetdoced
-/// @cppvectoroverload
-/// @see postMultiplyDiagMatr()
-void postMultiplyDiagMatr(Qureg qureg, std::vector<int> targets, DiagMatr matrix);
-
-
 /// @notyettested
 /// @notyetvalidated
 /// @notyetdoced
@@ -1015,22 +815,6 @@ void applyMultiControlledDiagMatr(Qureg qureg, std::vector<int> controls, std::v
 void applyMultiStateControlledDiagMatr(Qureg qureg, std::vector<int> controls, std::vector<int> states, std::vector<int> targets, DiagMatr matrix);
 
 
-/// @notyettested
-/// @notyetvalidated
-/// @notyetdoced
-/// @cppvectoroverload
-/// @see multiplyDiagMatrPower()
-void multiplyDiagMatrPower(Qureg qureg, std::vector<int> targets, DiagMatr matrix, qcomp exponent);
-
-
-/// @notyettested
-/// @notyetvalidated
-/// @notyetdoced
-/// @cppvectoroverload
-/// @see postMultiplyDiagMatrPower()
-void postMultiplyDiagMatrPower(Qureg qureg, std::vector<int> targets, DiagMatr matrix, qcomp exponent);
-
-
 /// @notyettested
 /// @notyetvalidated
 /// @notyetdoced
@@ -1081,33 +865,6 @@ extern "C" {
 #endif
 
 
-/// @notyetdoced
-/// @notyetvalidated
-/// @see
-/// - multiplyCompMatr1
-void multiplyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix);
-
-
-/// @notyetdoced
-/// @notyetvalidated
-/// @see
-/// - multiplyCompMatr1
-/// - applyDiagMatrPower
-void multiplyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp exponent);
-
-
-/// @notyetdoced
-/// @notyettested
-/// @notyetvalidated
-void postMultiplyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix);
-
-
-/// @notyetdoced
-/// @notyettested
-/// @notyetvalidated
-void postMultiplyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp exponent);
-
-
 /// @notyetdoced
 /// @notyetvalidated
 void applyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix);
@@ -1267,17 +1024,6 @@ extern "C" {
 #endif
 
 
-/// @notyetdoced
-/// @see multiplyCompMatr1()
-void multiplySwap(Qureg qureg, int qubit1, int qubit2);
-
-
-/// @notyetdoced
-/// @notyettested
-/// @notyetvalidated
-void postMultiplySwap(Qureg qureg, int qubit1, int qubit2);
-
-
 /** Applies a SWAP gate between @p qubit1 and @p qubit2 of @p qureg.
  * 
  * @diagram
@@ -1398,42 +1144,6 @@ extern "C" {
 #endif
 
 
-/// @notyetdoced
-/// @notyettested
-/// @see multiplyCompMatr1()
-void multiplyPauliX(Qureg qureg, int target);
-
-
-/// @notyetdoced
-/// @notyettested
-/// @see multiplyCompMatr1()
-void multiplyPauliY(Qureg qureg, int target);
-
-
-/// @notyetdoced
-/// @notyettested
-/// @see multiplyCompMatr1()
-void multiplyPauliZ(Qureg qureg, int target);
-
-
-/// @notyetdoced
-/// @notyettested
-/// @see postMultiplyCompMatr1()
-void postMultiplyPauliX(Qureg qureg, int target);
-
-
-/// @notyetdoced
-/// @notyettested
-/// @see postMultiplyCompMatr1()
-void postMultiplyPauliY(Qureg qureg, int target);
-
-
-/// @notyetdoced
-/// @notyettested
-/// @see postMultiplyCompMatr1()
-void postMultiplyPauliZ(Qureg qureg, int target);
-
-
 /// @notyetdoced
 void applyPauliX(Qureg qureg, int target);
 
@@ -1559,17 +1269,6 @@ extern "C" {
 #endif
 
 
-/// @notyetdoced
-/// @see multiplyCompMatr1()
-void multiplyPauliStr(Qureg qureg, PauliStr str);
-
-
-/// @notyetdoced
-/// @notyettested
-/// @notyetvalidated
-void postMultiplyPauliStr(Qureg qureg, PauliStr str);
-
-
 /// @notyetdoced
 void applyPauliStr(Qureg qureg, PauliStr str);
 
@@ -1940,7 +1639,7 @@ void applyMultiStateControlledRotateAroundAxis(Qureg qureg, std::vector<int> ctr
 
 
 /** 
- * @defgroup op_pauligadget Pauli gadgets
+ * @defgroup op_pauligadget PauliStr gadgets
  * @brief Functions for applying many-qubit rotations around arbitrary PauliStr.
  * @{
  */
@@ -1951,19 +1650,6 @@ extern "C" {
 #endif
 
 
-/// @notyetdoced
-/// @see 
-/// - multiplyCompMatr1()
-/// - applyPauliGadget()
-void multiplyPauliGadget(Qureg qureg, PauliStr str, qreal angle);
-
-
-/// @notyetdoced
-/// @notyettested
-/// @notyetvalidated
-void postMultiplyPauliGadget(Qureg qureg, PauliStr str, qreal angle);
-
-
 /** @notyetdoced
  * 
  * @formulae
@@ -2090,19 +1776,6 @@ extern "C" {
 #endif
 
 
-/// @notyetdoced
-/// @see 
-/// - multiplyCompMatr1()
-/// - applyPhaseGadget
-void multiplyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal angle);
-
-
-/// @notyetdoced
-/// @notyettested
-/// @notyetvalidated
-void postMultiplyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal angle);
-
-
 /** @notyetdoced
  * 
  * @formulae
@@ -2367,22 +2040,6 @@ void applyMultiQubitPhaseShift(Qureg qureg, int* targets, int numTargets, qreal
 #ifdef __cplusplus
 
 
-/// @notyettested
-/// @notyetvalidated
-/// @notyetdoced
-/// @cppvectoroverload
-/// @see multiplyPhaseGadget()
-void multiplyPhaseGadget(Qureg qureg, std::vector<int> targets, qreal angle);
-
-
-/// @notyettested
-/// @notyetvalidated
-/// @notyetdoced
-/// @cppvectoroverload
-/// @see postMultiplyPhaseGadget()
-void postMultiplyPhaseGadget(Qureg qureg, std::vector<int> targets, qreal angle);
-
-
 /// @notyettested
 /// @notyetvalidated
 /// @notyetdoced
@@ -2438,8 +2095,8 @@ void applyMultiQubitPhaseShift(Qureg qureg, std::vector<int> targets, qreal angl
 
 
 /** 
- * @defgroup op_paulistrsum PauliStrSum
- * @brief Functions for applying, exponentiating or Trotterising a weigthed sum of Pauli tensors.
+ * @defgroup op_paulistrsum PauliStrSum gadgets
+ * @brief Functions for applying Trotterised exponentials of weighted sums of Pauli tensors.
  * @{
  */
 
@@ -2449,18 +2106,6 @@ extern "C" {
 #endif
 
 
-/// @notyetdoced
-/// @notyetvalidated
-/// @see multiplyCompMatr1()
-void multiplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace);
-
-
-/// @notyetdoced
-/// @notyettested
-/// @notyetvalidated
-void postMultiplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace);
-
-
 /** @notyettested
  * 
  * Effects (an approximation to) the exponential of @p sum, weighted by @p angle, upon @p qureg,
@@ -2741,17 +2386,6 @@ extern "C" {
 #endif
 
 
-/// @notyetdoced
-/// @see multiplyCompMatr1()
-void multiplyMultiQubitNot(Qureg qureg, int* targets, int numTargets);
-
-
-/// @notyetdoced
-/// @notyettested
-/// @notyetvalidated
-void postMultiplyMultiQubitNot(Qureg qureg, int* targets, int numTargets);
-
-
 /// @notyetdoced
 void applyMultiQubitNot(Qureg qureg, int* targets, int numTargets);
 
@@ -2778,22 +2412,6 @@ void applyMultiStateControlledMultiQubitNot(Qureg qureg, int* controls, int* sta
 #ifdef __cplusplus
 
 
-/// @notyettested
-/// @notyetvalidated
-/// @notyetdoced
-/// @cppvectoroverload
-/// @see multiplyMultiQubitNot()
-void multiplyMultiQubitNot(Qureg qureg, std::vector<int> targets);
-
-
-/// @notyettested
-/// @notyetvalidated
-/// @notyetdoced
-/// @cppvectoroverload
-/// @see postMultiplyMultiQubitNot()
-void postMultiplyMultiQubitNot(Qureg qureg, std::vector<int> targets);
-
-
 /// @notyettested
 /// @notyetvalidated
 /// @notyetdoced
diff --git a/quest/include/quest.h b/quest/include/quest.h
index fcc49ab76..afcb316be 100644
--- a/quest/include/quest.h
+++ b/quest/include/quest.h
@@ -50,6 +50,7 @@
 #include "quest/include/environment.h"
 #include "quest/include/initialisations.h"
 #include "quest/include/channels.h"
+#include "quest/include/multiplication.h"
 #include "quest/include/operations.h"
 #include "quest/include/paulis.h"
 #include "quest/include/qureg.h"
diff --git a/quest/src/api/CMakeLists.txt b/quest/src/api/CMakeLists.txt
index a6d3137b6..d02797506 100644
--- a/quest/src/api/CMakeLists.txt
+++ b/quest/src/api/CMakeLists.txt
@@ -8,6 +8,7 @@ target_sources(QuEST
   initialisations.cpp
   matrices.cpp
   modes.cpp
+  multiplication.cpp
   operations.cpp
   paulis.cpp
   qureg.cpp
diff --git a/quest/src/api/multiplication.cpp b/quest/src/api/multiplication.cpp
new file mode 100644
index 000000000..0bb0a6b72
--- /dev/null
+++ b/quest/src/api/multiplication.cpp
@@ -0,0 +1,624 @@
+/** @file
+ * API definitions for directly pre- and post-multiplying 
+ * operators upon density matrices, likely constituting 
+ * non-physical operations which break state normalisation.
+ * 
+ * @author Tyson Jones
+ */
+
+#include "quest/include/qureg.h"
+#include "quest/include/paulis.h"
+#include "quest/include/matrices.h"
+#include "quest/include/multiplication.h"
+
+#include "quest/src/core/validation.hpp"
+#include "quest/src/core/utilities.hpp"
+#include "quest/src/core/localiser.hpp"
+
+#include <vector>
+
+using std::vector;
+
+
+
+/*
+ * CompMatr1
+ */
+
+extern "C" {
+
+void multiplyCompMatr1(Qureg qureg, int target, CompMatr1 matrix) {
+    validate_quregFields(qureg, __func__);
+    validate_target(qureg, target, __func__);
+    validate_matrixFields(matrix, __func__);
+
+    bool conj = false;
+    bool transp = false;
+    localiser_statevec_anyCtrlOneTargDenseMatr(qureg, {}, {}, target, matrix, conj, transp);
+}
+
+void postMultiplyCompMatr1(Qureg qureg, int target, CompMatr1 matrix) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_target(qureg, target, __func__);
+    validate_matrixFields(matrix, __func__);
+    
+    // rho matrix ~ transpose(matrix) (x) I ||rho>>
+    bool conj = false;
+    bool transp = true;
+    int qubit = util_getBraQubit(target, qureg);
+    localiser_statevec_anyCtrlOneTargDenseMatr(qureg, {}, {}, qubit, matrix, conj, transp);
+}
+
+} // end de-mangler
+
+
+
+/*
+ * CompMatr2
+ */
+
+extern "C" {
+
+void multiplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matrix) {
+    validate_quregFields(qureg, __func__);
+    validate_twoTargets(qureg, target1, target2, __func__);
+    validate_matrixFields(matrix, __func__);
+    validate_mixedAmpsFitInNode(qureg, 2, __func__);
+
+    bool conj = false;
+    bool transp = false;
+    localiser_statevec_anyCtrlTwoTargDenseMatr(qureg, {}, {}, target1, target2, matrix, conj, transp);
+}
+
+void postMultiplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matrix) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_twoTargets(qureg, target1, target2, __func__);
+    validate_matrixFields(matrix, __func__);
+    validate_mixedAmpsFitInNode(qureg, 2, __func__);
+
+    // rho matrix ~ transpose(matrix) (x) I ||rho>>
+    bool conj = false;
+    bool transp = true;
+    int qubit1 = util_getBraQubit(target1, qureg);
+    int qubit2 = util_getBraQubit(target2, qureg);
+    localiser_statevec_anyCtrlTwoTargDenseMatr(qureg, {}, {}, qubit1, qubit2, matrix, conj, transp);
+}
+
+} // end de-mangler
+
+
+
+/*
+ * CompMatr
+ */
+
+extern "C" {
+
+void multiplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matrix) {
+    validate_quregFields(qureg, __func__);
+    validate_targets(qureg, targets, numTargets, __func__);
+    validate_matrixDimMatchesTargets(matrix, numTargets, __func__); // also validates fields and is-sync
+    validate_mixedAmpsFitInNode(qureg, numTargets, __func__);
+
+    bool conj = false;
+    bool transp = false;
+    localiser_statevec_anyCtrlAnyTargDenseMatr(qureg, {}, {}, util_getVector(targets, numTargets), matrix, conj, transp);
+}
+
+void postMultiplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matrix) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_targets(qureg, targets, numTargets, __func__);
+    validate_matrixDimMatchesTargets(matrix, numTargets, __func__); // also validates fields and is-sync
+    validate_mixedAmpsFitInNode(qureg, numTargets, __func__);
+
+    // rho matrix ~ transpose(matrix) (x) I ||rho>>
+    bool conj = false;
+    bool transp = true;
+    auto qubits = util_getBraQubits(util_getVector(targets, numTargets), qureg);
+    localiser_statevec_anyCtrlAnyTargDenseMatr(qureg, {}, {}, qubits, matrix, conj, transp);
+}
+
+} // end de-mangler
+
+void multiplyCompMatr(Qureg qureg, vector<int> targets, CompMatr matr) {
+
+    multiplyCompMatr(qureg, targets.data(), targets.size(), matr);
+}
+
+void postMultiplyCompMatr(Qureg qureg, vector<int> targets, CompMatr matr) {
+
+    postMultiplyCompMatr(qureg, targets.data(), targets.size(), matr);
+}
+
+
+
+/*
+ * DiagMatr1
+ */
+
+extern "C" {
+
+void multiplyDiagMatr1(Qureg qureg, int target, DiagMatr1 matrix) {
+    validate_quregFields(qureg, __func__);
+    validate_target(qureg, target, __func__);
+    validate_matrixFields(matrix, __func__);
+
+    bool conj = false;
+    localiser_statevec_anyCtrlOneTargDiagMatr(qureg, {}, {}, target, matrix, conj);
+}
+
+void postMultiplyDiagMatr1(Qureg qureg, int target, DiagMatr1 matrix) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_target(qureg, target, __func__);
+    validate_matrixFields(matrix, __func__);
+
+    bool conj = false;
+    int qubit = util_getBraQubit(target, qureg);
+    localiser_statevec_anyCtrlOneTargDiagMatr(qureg, {}, {}, qubit, matrix, conj);
+}
+
+} // end de-mangler
+
+
+
+/*
+ * DiagMatr2
+ */
+
+extern "C" {
+
+void multiplyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matrix) {
+    validate_quregFields(qureg, __func__);
+    validate_twoTargets(qureg, target1, target2, __func__);
+    validate_matrixFields(matrix, __func__);
+
+    bool conj = false;
+    localiser_statevec_anyCtrlTwoTargDiagMatr(qureg, {}, {}, target1, target2, matrix, conj);
+}
+
+void postMultiplyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matrix) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_twoTargets(qureg, target1, target2, __func__);
+    validate_matrixFields(matrix, __func__);
+
+    bool conj = false;
+    int qubit1 = util_getBraQubit(target1, qureg);
+    int qubit2 = util_getBraQubit(target2, qureg);
+    localiser_statevec_anyCtrlTwoTargDiagMatr(qureg, {}, {}, qubit1, qubit2, matrix, conj);
+}
+
+} // end de-mangler
+
+
+
+/*
+ * DiagMatr
+ */
+
+extern "C" {
+
+void multiplyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matrix) {
+    validate_quregFields(qureg, __func__);
+    validate_targets(qureg, targets, numTargets, __func__);
+    validate_matrixDimMatchesTargets(matrix, numTargets, __func__); // also validates fields and is-sync
+
+    bool conj = false;
+    qcomp exponent = 1;
+    auto qubits = util_getVector(targets, numTargets);
+    localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, {}, {}, qubits, matrix, exponent, conj);
+}
+
+void postMultiplyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matrix) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_targets(qureg, targets, numTargets, __func__);
+    validate_matrixDimMatchesTargets(matrix, numTargets, __func__); // also validates fields and is-sync
+
+    bool conj = false;
+    qcomp exponent = 1;
+    auto qubits = util_getBraQubits(util_getVector(targets, numTargets), qureg);
+    localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, {}, {}, qubits, matrix, exponent, conj);
+}
+
+} // end de-mangler
+
+void multiplyDiagMatr(Qureg qureg, vector<int> targets, DiagMatr matrix) {
+
+    multiplyDiagMatr(qureg, targets.data(), targets.size(), matrix);
+}
+
+void postMultiplyDiagMatr(Qureg qureg, vector<int> targets, DiagMatr matrix) {
+
+    postMultiplyDiagMatr(qureg, targets.data(), targets.size(), matrix);
+}
+
+
+
+/*
+ * DiagMatrPower
+ */
+
+extern "C" {
+
+void multiplyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMatr matrix, qcomp exponent) {
+    validate_quregFields(qureg, __func__);
+    validate_targets(qureg, targets, numTargets, __func__);
+    validate_matrixDimMatchesTargets(matrix, numTargets, __func__); // also validates fields and is-sync, but not unitarity
+    validate_matrixExpIsNonDiverging(matrix, exponent, __func__); // harmlessly re-validates fields and is-sync
+
+    bool conj = false;
+    auto qubits = util_getVector(targets, numTargets);
+    localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, {}, {}, qubits, matrix, exponent, conj);
+}
+
+void postMultiplyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMatr matrix, qcomp exponent) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_targets(qureg, targets, numTargets, __func__);
+    validate_matrixDimMatchesTargets(matrix, numTargets, __func__); // also validates fields and is-sync, but not unitarity
+    validate_matrixExpIsNonDiverging(matrix, exponent, __func__); // harmlessly re-validates fields and is-sync
+
+    bool conj = false;
+    auto qubits = util_getBraQubits(util_getVector(targets, numTargets), qureg);
+    localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, {}, {}, qubits, matrix, exponent, conj);
+}
+
+} // end de-mangler
+
+void multiplyDiagMatrPower(Qureg qureg, vector<int> targets, DiagMatr matrix, qcomp exponent) {
+
+    multiplyDiagMatrPower(qureg, targets.data(), targets.size(), matrix, exponent);
+}
+
+void postMultiplyDiagMatrPower(Qureg qureg, vector<int> targets, DiagMatr matrix, qcomp exponent) {
+
+    postMultiplyDiagMatrPower(qureg, targets.data(), targets.size(), matrix, exponent);
+}
+
+
+
+/*
+ * FullStateDiagMatr (and power)
+ */
+
+extern "C" {
+
+void multiplyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix) {
+    validate_quregFields(qureg, __func__);
+    validate_matrixFields(matrix, __func__);
+    validate_matrixAndQuregAreCompatible(matrix, qureg, false, __func__); // matrix can be non-unitary
+
+    multiplyFullStateDiagMatrPower(qureg, matrix, 1); // harmlessly re-validates
+}
+
+void multiplyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp exponent) {
+    validate_quregFields(qureg, __func__);
+    validate_matrixFields(matrix, __func__);
+    validate_matrixAndQuregAreCompatible(matrix, qureg, false, __func__); // matrix can be non-unitary
+    validate_matrixExpIsNonDiverging(matrix, exponent, __func__);
+
+    // rho -> matrix^exponent rho
+    bool leftMultiply = true;
+    bool rightMultiply = false;
+    bool rightConj = false;
+
+    (qureg.isDensityMatrix)?
+        localiser_densmatr_allTargDiagMatr(qureg, matrix, exponent, leftMultiply, rightMultiply, rightConj):
+        localiser_statevec_allTargDiagMatr(qureg, matrix, exponent);
+}
+
+void postMultiplyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_matrixFields(matrix, __func__);
+    validate_matrixAndQuregAreCompatible(matrix, qureg, false, __func__); // matrix can be non-unitary
+
+    postMultiplyFullStateDiagMatrPower(qureg, matrix, 1); // harmlessly re-validates
+}
+
+void postMultiplyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp exponent) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_matrixFields(matrix, __func__);
+    validate_matrixAndQuregAreCompatible(matrix, qureg, false, __func__); // matrix can be non-unitary
+    validate_matrixExpIsNonDiverging(matrix, exponent, __func__);
+
+    // rho -> rho matrix^exponent
+    bool leftMultiply = false;
+    bool rightMultiply = true;
+    bool rightConj = false;
+    localiser_densmatr_allTargDiagMatr(qureg, matrix, exponent, leftMultiply, rightMultiply, rightConj);
+}
+
+} // end de-mangler
+
+
+
+/*
+ * swap
+ */
+
+extern "C" {
+
+void multiplySwap(Qureg qureg, int qubit1, int qubit2) {
+    validate_quregFields(qureg, __func__);
+    validate_twoTargets(qureg, qubit1, qubit2, __func__);
+
+    localiser_statevec_anyCtrlSwap(qureg, {}, {}, qubit1, qubit2);
+}
+
+void postMultiplySwap(Qureg qureg, int qubit1, int qubit2) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_twoTargets(qureg, qubit1, qubit2, __func__);
+
+    qubit1 = util_getBraQubit(qubit1, qureg);
+    qubit2 = util_getBraQubit(qubit2, qureg);
+    localiser_statevec_anyCtrlSwap(qureg, {}, {}, qubit1, qubit2);
+}
+
+} // end de-mangler
+
+
+
+/*
+ * individual Paulis
+ */
+
+extern PauliStr paulis_getShiftedPauliStr(PauliStr str, int pauliShift);
+
+extern "C" {
+
+void multiplyPauliX(Qureg qureg, int target) {
+    validate_quregFields(qureg, __func__);
+    validate_target(qureg, target, __func__);
+
+    PauliStr str = getPauliStr("X", {target});
+    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
+}
+
+void multiplyPauliY(Qureg qureg, int target) {
+    validate_quregFields(qureg, __func__);
+    validate_target(qureg, target, __func__);
+
+    PauliStr str = getPauliStr("Y", {target});
+    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
+}
+
+void multiplyPauliZ(Qureg qureg, int target) {
+    validate_quregFields(qureg, __func__);
+    validate_target(qureg, target, __func__);
+
+    PauliStr str = getPauliStr("Z", {target});
+    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
+}
+
+void postMultiplyPauliX(Qureg qureg, int target) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_target(qureg, target, __func__);
+
+    PauliStr str = getPauliStr("X", {target});
+    str = paulis_getShiftedPauliStr(str, qureg.numQubits);
+    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
+}
+
+void postMultiplyPauliY(Qureg qureg, int target) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_target(qureg, target, __func__);
+
+    qcomp factor = -1; // undo transpose
+    PauliStr str = getPauliStr("Y", {target});
+    str = paulis_getShiftedPauliStr(str, qureg.numQubits);
+    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str, factor);
+}
+
+void postMultiplyPauliZ(Qureg qureg, int target) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_target(qureg, target, __func__);
+
+    PauliStr str = getPauliStr("Z", {target});
+    str = paulis_getShiftedPauliStr(str, qureg.numQubits);
+    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
+}
+
+} // end de-mangler
+
+
+
+/*
+ * Pauli strings
+ */
+
+extern bool paulis_hasOddNumY(PauliStr str);
+
+extern "C" {
+
+void multiplyPauliStr(Qureg qureg, PauliStr str) {
+    validate_quregFields(qureg, __func__);
+    validate_pauliStrTargets(qureg, str, __func__);
+
+    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
+}
+
+void postMultiplyPauliStr(Qureg qureg, PauliStr str) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_pauliStrTargets(qureg, str, __func__);
+
+    qcomp factor = paulis_hasOddNumY(str)? -1 : 1; // undo transpose
+    str = paulis_getShiftedPauliStr(str, qureg.numQubits);
+    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str, factor);
+}
+
+} // end de-mangler
+
+
+
+/*
+ * Pauli gadgets
+ */
+
+extern "C" {
+
+void multiplyPauliGadget(Qureg qureg, PauliStr str, qreal angle) {
+    validate_quregFields(qureg, __func__);
+    validate_pauliStrTargets(qureg, str, __func__);
+
+    qreal phase = util_getPhaseFromGateAngle(angle);
+    localiser_statevec_anyCtrlPauliGadget(qureg, {}, {}, str, phase);
+}
+
+void postMultiplyPauliGadget(Qureg qureg, PauliStr str, qreal angle) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_pauliStrTargets(qureg, str, __func__);
+
+    qreal factor = paulis_hasOddNumY(str)? -1 : 1;
+    qreal phase = factor * util_getPhaseFromGateAngle(angle);
+    str = paulis_getShiftedPauliStr(str, qureg.numQubits);
+    localiser_statevec_anyCtrlPauliGadget(qureg, {}, {}, str, phase);
+}
+
+} // end de-mangler
+
+
+
+/*
+ * phase gadgets
+ */
+
+extern "C" {
+
+void multiplyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal angle) {
+    validate_quregFields(qureg, __func__);
+    validate_targets(qureg, targets, numTargets, __func__);
+
+    qreal phase = util_getPhaseFromGateAngle(angle);
+    auto qubits = util_getVector(targets, numTargets);
+    localiser_statevec_anyCtrlPhaseGadget(qureg, {}, {}, qubits, phase);
+}
+
+void postMultiplyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal angle) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_targets(qureg, targets, numTargets, __func__);
+
+    qreal phase = util_getPhaseFromGateAngle(angle);
+    auto qubits = util_getBraQubits(util_getVector(targets, numTargets), qureg);
+    localiser_statevec_anyCtrlPhaseGadget(qureg, {}, {}, qubits, phase);
+}
+
+} // end de-mangler
+
+void multiplyPhaseGadget(Qureg qureg, vector<int> targets, qreal angle) {
+
+    multiplyPhaseGadget(qureg, targets.data(), targets.size(), angle);
+}
+
+void postMultiplyPhaseGadget(Qureg qureg, vector<int> targets, qreal angle) {
+
+    postMultiplyPhaseGadget(qureg, targets.data(), targets.size(), angle);
+}
+
+
+
+/*
+ * many-qubit NOTs
+ */
+
+extern "C" {
+
+void multiplyMultiQubitNot(Qureg qureg, int* targets, int numTargets) {
+    validate_quregFields(qureg, __func__);
+    validate_targets(qureg, targets, numTargets, __func__);
+
+    // harmlessly re-validates
+    PauliStr str = getPauliStr(std::string(numTargets, 'X'), targets, numTargets);
+    multiplyPauliStr(qureg, str);
+}
+
+void postMultiplyMultiQubitNot(Qureg qureg, int* targets, int numTargets) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_targets(qureg, targets, numTargets, __func__);
+
+    // harmlessly re-validates
+    PauliStr str = getPauliStr(std::string(numTargets, 'X'), targets, numTargets);
+    postMultiplyPauliStr(qureg, str);
+}
+
+} // end de-mangler
+
+void multiplyMultiQubitNot(Qureg qureg, vector<int> targets) {
+
+    multiplyMultiQubitNot(qureg, targets.data(), targets.size());
+}
+
+void postMultiplyMultiQubitNot(Qureg qureg, vector<int> targets) {
+
+    postMultiplyMultiQubitNot(qureg, targets.data(), targets.size());
+}
+
+
+
+/*
+ * Pauli string sums
+ */
+
+extern "C" {
+
+void multiplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace) {
+    validate_quregFields(qureg, __func__);
+    validate_quregFields(workspace, __func__);
+    validate_quregCanBeWorkspace(qureg, workspace, __func__);
+    validate_pauliStrSumFields(sum, __func__);
+    validate_pauliStrSumTargets(sum, qureg, __func__);
+
+    // clone qureg to workspace, set qureg to blank
+    localiser_statevec_setQuregToSuperposition(0, workspace, 1, qureg, 0, qureg);
+    localiser_statevec_initUniformState(qureg, 0);
+
+    // left-multiply each term in-turn, mixing into output qureg, then undo (Pauli strings are self-inverse)
+    for (qindex i=0; i<sum.numTerms; i++) {
+        localiser_statevec_anyCtrlPauliTensor(workspace, {}, {}, sum.strings[i]);
+        localiser_statevec_setQuregToSuperposition(1, qureg, sum.coeffs[i], workspace, 0, workspace);
+        localiser_statevec_anyCtrlPauliTensor(workspace, {}, {}, sum.strings[i]);
+    }
+
+    // workspace -> qureg, and qureg -> sum * qureg
+}
+
+void postMultiplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace) {
+    validate_quregFields(qureg, __func__);
+    validate_quregFields(workspace, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_quregCanBeWorkspace(qureg, workspace, __func__);
+    validate_pauliStrSumFields(sum, __func__);
+    validate_pauliStrSumTargets(sum, qureg, __func__);
+
+    // clone qureg to workspace, set qureg to blank
+    localiser_statevec_setQuregToSuperposition(0, workspace, 1, qureg, 0, qureg);
+    localiser_statevec_initUniformState(qureg, 0);
+
+    // post-multiply each term in-turn, mixing into output qureg, then undo (Pauli strings are self-inverse)
+    for (qindex i=0; i<sum.numTerms; i++) {
+        PauliStr str =  paulis_getShiftedPauliStr(sum.strings[i], qureg.numQubits);
+        qcomp factor = paulis_hasOddNumY(str)? -1 : 1; // undoes transpose
+
+        localiser_statevec_anyCtrlPauliTensor(workspace, {}, {}, str, factor);
+        localiser_statevec_setQuregToSuperposition(1, qureg, sum.coeffs[i], workspace, 0, workspace);
+        localiser_statevec_anyCtrlPauliTensor(workspace, {}, {}, str, factor);
+    }
+
+    // workspace -> qureg, and qureg -> qureg * sum
+}
+
+} // end de-mangler
\ No newline at end of file
diff --git a/quest/src/api/operations.cpp b/quest/src/api/operations.cpp
index 1b57f7d2a..bf5b4c5b3 100644
--- a/quest/src/api/operations.cpp
+++ b/quest/src/api/operations.cpp
@@ -8,6 +8,7 @@
  */
 
 #include "quest/include/qureg.h"
+#include "quest/include/paulis.h"
 #include "quest/include/matrices.h"
 #include "quest/include/operations.h"
 #include "quest/include/calculations.h"
@@ -75,29 +76,6 @@ void validateAndApplyAnyCtrlAnyTargUnitaryMatrix(Qureg qureg, int* ctrls, int* s
 
 extern "C" {
 
-void multiplyCompMatr1(Qureg qureg, int target, CompMatr1 matrix) {
-    validate_quregFields(qureg, __func__);
-    validate_target(qureg, target, __func__);
-    validate_matrixFields(matrix, __func__); // matrix can be non-unitary
-
-    bool conj = false;
-    bool transp = false;
-    localiser_statevec_anyCtrlOneTargDenseMatr(qureg, {}, {}, target, matrix, conj, transp);
-}
-
-void postMultiplyCompMatr1(Qureg qureg, int target, CompMatr1 matrix) {
-    validate_quregFields(qureg, __func__);
-    validate_quregIsDensityMatrix(qureg, __func__);
-    validate_target(qureg, target, __func__);
-    validate_matrixFields(matrix, __func__); // matrix can be non-unitary
-    
-    // rho matrix ~ transpose(rho) (x) I ||rho>>
-    bool conj = false;
-    bool transp = true;
-    int qubit = util_getBraQubit(target, qureg);
-    localiser_statevec_anyCtrlOneTargDenseMatr(qureg, {}, {}, qubit, matrix, conj, transp);
-}
-
 void applyCompMatr1(Qureg qureg, int target, CompMatr1 matrix) {
 
     validateAndApplyAnyCtrlAnyTargUnitaryMatrix(qureg, nullptr, nullptr, 0, &target, 1, matrix, __func__);
@@ -139,32 +117,6 @@ void applyMultiStateControlledCompMatr1(Qureg qureg, vector<int> controls, vecto
 
 extern "C" {
 
-void multiplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matrix) {
-    validate_quregFields(qureg, __func__);
-    validate_twoTargets(qureg, target1, target2, __func__);
-    validate_matrixFields(matrix, __func__); // matrix can be non-unitary
-    validate_mixedAmpsFitInNode(qureg, 2, __func__);
-
-    bool conj = false;
-    bool transp = false;
-    localiser_statevec_anyCtrlTwoTargDenseMatr(qureg, {}, {}, target1, target2, matrix, conj, transp);
-}
-
-void postMultiplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matrix) {
-    validate_quregFields(qureg, __func__);
-    validate_quregIsDensityMatrix(qureg, __func__);
-    validate_twoTargets(qureg, target1, target2, __func__);
-    validate_matrixFields(matrix, __func__); // matrix can be non-unitary
-    validate_mixedAmpsFitInNode(qureg, 2, __func__);
-
-    // rho matrix ~ transpose(rho) (x) I ||rho>>
-    bool conj = false;
-    bool transp = true;
-    int qubit1 = util_getBraQubit(target1, qureg);
-    int qubit2 = util_getBraQubit(target2, qureg);
-    localiser_statevec_anyCtrlTwoTargDenseMatr(qureg, {}, {}, qubit1, qubit2, matrix, conj, transp);
-}
-
 void applyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matrix) {
 
     int targs[] = {target1, target2};
@@ -210,31 +162,6 @@ void applyMultiStateControlledCompMatr2(Qureg qureg, vector<int> controls, vecto
 
 extern "C" {
 
-void multiplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matrix) {
-    validate_quregFields(qureg, __func__);
-    validate_targets(qureg, targets, numTargets, __func__);
-    validate_matrixDimMatchesTargets(matrix, numTargets, __func__); // also validates fields and is-sync, but not unitarity
-    validate_mixedAmpsFitInNode(qureg, numTargets, __func__);
-
-    bool conj = false;
-    bool transp = false;
-    localiser_statevec_anyCtrlAnyTargDenseMatr(qureg, {}, {}, util_getVector(targets, numTargets), matrix, conj, transp);
-}
-
-void postMultiplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matrix) {
-    validate_quregFields(qureg, __func__);
-    validate_quregIsDensityMatrix(qureg, __func__);
-    validate_targets(qureg, targets, numTargets, __func__);
-    validate_matrixDimMatchesTargets(matrix, numTargets, __func__); // also validates fields and is-sync, but not unitarity
-    validate_mixedAmpsFitInNode(qureg, numTargets, __func__);
-
-    // rho matrix ~ transpose(rho) (x) I ||rho>>
-    bool conj = false;
-    bool transp = true;
-    auto qubits = util_getBraQubits(util_getVector(targets, numTargets), qureg);
-    localiser_statevec_anyCtrlAnyTargDenseMatr(qureg, {}, {}, qubits, matrix, conj, transp);
-}
-
 void applyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matrix) {
 
     validateAndApplyAnyCtrlAnyTargUnitaryMatrix(qureg, nullptr, nullptr, 0, targets, numTargets, matrix, __func__);
@@ -257,16 +184,6 @@ void applyMultiStateControlledCompMatr(Qureg qureg, int* controls, int* states,
 
 } // end de-mangler
 
-void multiplyCompMatr(Qureg qureg, vector<int> targets, CompMatr matr) {
-
-    multiplyCompMatr(qureg, targets.data(), targets.size(), matr);
-}
-
-void postMultiplyCompMatr(Qureg qureg, vector<int> targets, CompMatr matr) {
-
-    postMultiplyCompMatr(qureg, targets.data(), targets.size(), matr);
-}
-
 void applyCompMatr(Qureg qureg, vector<int> targets, CompMatr matr) {
 
     applyCompMatr(qureg, targets.data(), targets.size(), matr);
@@ -296,26 +213,6 @@ void applyMultiStateControlledCompMatr(Qureg qureg, vector<int> controls, vector
 
 extern "C" {
 
-void multiplyDiagMatr1(Qureg qureg, int target, DiagMatr1 matrix) {
-    validate_quregFields(qureg, __func__);
-    validate_target(qureg, target, __func__);
-    validate_matrixFields(matrix, __func__); // matrix can be non-unitary
-
-    bool conj = false;
-    localiser_statevec_anyCtrlOneTargDiagMatr(qureg, {}, {}, target, matrix, conj);
-}
-
-void postMultiplyDiagMatr1(Qureg qureg, int target, DiagMatr1 matrix) {
-    validate_quregFields(qureg, __func__);
-    validate_quregIsDensityMatrix(qureg, __func__);
-    validate_target(qureg, target, __func__);
-    validate_matrixFields(matrix, __func__); // matrix can be non-unitary
-
-    bool conj = false;
-    int qubit = util_getBraQubit(target, qureg);
-    localiser_statevec_anyCtrlOneTargDiagMatr(qureg, {}, {}, qubit, matrix, conj);
-}
-
 void applyDiagMatr1(Qureg qureg, int target, DiagMatr1 matrix) {
 
     validateAndApplyAnyCtrlAnyTargUnitaryMatrix(qureg, nullptr, nullptr, 0, &target, 1, matrix, __func__);
@@ -357,27 +254,6 @@ void applyMultiStateControlledDiagMatr1(Qureg qureg, vector<int> controls, vecto
 
 extern "C" {
 
-void multiplyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matrix) {
-    validate_quregFields(qureg, __func__);
-    validate_twoTargets(qureg, target1, target2, __func__);
-    validate_matrixFields(matrix, __func__); // matrix can be non-unitary
-
-    bool conj = false;
-    localiser_statevec_anyCtrlTwoTargDiagMatr(qureg, {}, {}, target1, target2, matrix, conj);
-}
-
-void postMultiplyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matrix) {
-    validate_quregFields(qureg, __func__);
-    validate_quregIsDensityMatrix(qureg, __func__);
-    validate_twoTargets(qureg, target1, target2, __func__);
-    validate_matrixFields(matrix, __func__); // matrix can be non-unitary
-
-    bool conj = false;
-    int qubit1 = util_getBraQubit(target1, qureg);
-    int qubit2 = util_getBraQubit(target2, qureg);
-    localiser_statevec_anyCtrlTwoTargDiagMatr(qureg, {}, {}, qubit1, qubit2, matrix, conj);
-}
-
 void applyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matrix) {
 
     int targs[] = {target1, target2};
@@ -423,29 +299,6 @@ void applyMultiStateControlledDiagMatr2(Qureg qureg, vector<int> controls, vecto
 
 extern "C" {
 
-void multiplyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matrix) {
-    validate_quregFields(qureg, __func__);
-    validate_targets(qureg, targets, numTargets, __func__);
-    validate_matrixDimMatchesTargets(matrix, numTargets, __func__); // also validates fields and is-sync, but not unitarity
-
-    bool conj = false;
-    qcomp exponent = 1;
-    auto qubits = util_getVector(targets, numTargets);
-    localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, {}, {}, qubits, matrix, exponent, conj);
-}
-
-void postMultiplyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matrix) {
-    validate_quregFields(qureg, __func__);
-    validate_quregIsDensityMatrix(qureg, __func__);
-    validate_targets(qureg, targets, numTargets, __func__);
-    validate_matrixDimMatchesTargets(matrix, numTargets, __func__); // also validates fields and is-sync, but not unitarity
-
-    bool conj = false;
-    qcomp exponent = 1;
-    auto qubits = util_getBraQubits(util_getVector(targets, numTargets), qureg);
-    localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, {}, {}, qubits, matrix, exponent, conj);
-}
-
 void applyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matrix) {
 
     validateAndApplyAnyCtrlAnyTargUnitaryMatrix(qureg, nullptr, nullptr, 0, targets, numTargets, matrix, __func__);
@@ -468,16 +321,6 @@ void applyMultiStateControlledDiagMatr(Qureg qureg, int* controls, int* states,
 
 } // end de-mangler
 
-void multiplyDiagMatr(Qureg qureg, vector<int> targets, DiagMatr matrix) {
-
-    multiplyDiagMatr(qureg, targets.data(), targets.size(), matrix);
-}
-
-void postMultiplyDiagMatr(Qureg qureg, vector<int> targets, DiagMatr matrix) {
-
-    postMultiplyDiagMatr(qureg, targets.data(), targets.size(), matrix);
-}
-
 void applyDiagMatr(Qureg qureg, vector<int> targets, DiagMatr matrix) {
 
     applyDiagMatr(qureg, targets.data(), targets.size(), matrix);
@@ -504,35 +347,12 @@ void applyMultiStateControlledDiagMatr(Qureg qureg, vector<int> controls, vector
 /*
  * DiagMatrPower
  *
- * which still (except for multiply) assert unitarity,
- * even though a non-real exponent is possible
+ * which still assert unitarity, even though passing
+ * a non-real exponent is permitted
  */
 
 extern "C" {
 
-void multiplyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMatr matrix, qcomp exponent) {
-    validate_quregFields(qureg, __func__);
-    validate_targets(qureg, targets, numTargets, __func__);
-    validate_matrixDimMatchesTargets(matrix, numTargets, __func__); // also validates fields and is-sync, but not unitarity
-    validate_matrixExpIsNonDiverging(matrix, exponent, __func__); // harmlessly re-validates fields and is-sync
-
-    bool conj = false;
-    auto qubits = util_getVector(targets, numTargets);
-    localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, {}, {}, qubits, matrix, exponent, conj);
-}
-
-void postMultiplyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMatr matrix, qcomp exponent) {
-    validate_quregFields(qureg, __func__);
-    validate_quregIsDensityMatrix(qureg, __func__);
-    validate_targets(qureg, targets, numTargets, __func__);
-    validate_matrixDimMatchesTargets(matrix, numTargets, __func__); // also validates fields and is-sync, but not unitarity
-    validate_matrixExpIsNonDiverging(matrix, exponent, __func__); // harmlessly re-validates fields and is-sync
-
-    bool conj = false;
-    auto qubits = util_getBraQubits(util_getVector(targets, numTargets), qureg);
-    localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, {}, {}, qubits, matrix, exponent, conj);
-}
-
 void applyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMatr matrix, qcomp exponent)  {
     validate_quregFields(qureg, __func__);
     validate_targets(qureg, targets, numTargets, __func__);
@@ -610,16 +430,6 @@ void applyMultiStateControlledDiagMatrPower(Qureg qureg, int* controls, int* sta
 
 } // end de-mangler
 
-void multiplyDiagMatrPower(Qureg qureg, vector<int> targets, DiagMatr matrix, qcomp exponent) {
-
-    multiplyDiagMatrPower(qureg, targets.data(), targets.size(), matrix, exponent);
-}
-
-void postMultiplyDiagMatrPower(Qureg qureg, vector<int> targets, DiagMatr matrix, qcomp exponent) {
-
-    postMultiplyDiagMatrPower(qureg, targets.data(), targets.size(), matrix, exponent);
-}
-
 void applyDiagMatrPower(Qureg qureg, vector<int> targets, DiagMatr matrix, qcomp exponent) {
 
     applyDiagMatrPower(qureg, targets.data(), targets.size(), matrix, exponent);
@@ -649,53 +459,6 @@ void applyMultiStateControlledDiagMatrPower(Qureg qureg, vector<int> controls, v
 
 extern "C" {
 
-void multiplyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix) {
-    validate_quregFields(qureg, __func__);
-    validate_matrixFields(matrix, __func__);
-    validate_matrixAndQuregAreCompatible(matrix, qureg, false, __func__); // matrix can be non-unitary
-
-    multiplyFullStateDiagMatrPower(qureg, matrix, 1); // harmlessly re-validates
-}
-
-void multiplyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp exponent) {
-    validate_quregFields(qureg, __func__);
-    validate_matrixFields(matrix, __func__);
-    validate_matrixAndQuregAreCompatible(matrix, qureg, false, __func__); // matrix can be non-unitary
-    validate_matrixExpIsNonDiverging(matrix, exponent, __func__);
-
-    // rho -> matrix^exponent rho
-    bool leftMultiply = true;
-    bool rightMultiply = false;
-    bool rightConj = false;
-
-    (qureg.isDensityMatrix)?
-        localiser_densmatr_allTargDiagMatr(qureg, matrix, exponent, leftMultiply, rightMultiply, rightConj):
-        localiser_statevec_allTargDiagMatr(qureg, matrix, exponent);
-}
-
-void postMultiplyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix) {
-    validate_quregFields(qureg, __func__);
-    validate_quregIsDensityMatrix(qureg, __func__);
-    validate_matrixFields(matrix, __func__);
-    validate_matrixAndQuregAreCompatible(matrix, qureg, false, __func__); // matrix can be non-unitary
-
-    postMultiplyFullStateDiagMatrPower(qureg, matrix, 1); // harmlessly re-validates
-}
-
-void postMultiplyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp exponent) {
-    validate_quregFields(qureg, __func__);
-    validate_quregIsDensityMatrix(qureg, __func__);
-    validate_matrixFields(matrix, __func__);
-    validate_matrixAndQuregAreCompatible(matrix, qureg, false, __func__); // matrix can be non-unitary
-    validate_matrixExpIsNonDiverging(matrix, exponent, __func__);
-
-    // rho -> rho matrix^exponent
-    bool leftMultiply = false;
-    bool rightMultiply = true;
-    bool rightConj = false;
-    localiser_densmatr_allTargDiagMatr(qureg, matrix, exponent, leftMultiply, rightMultiply, rightConj);
-}
-
 void applyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix) {
     validate_quregFields(qureg, __func__);
     validate_matrixFields(matrix, __func__);
@@ -890,23 +653,6 @@ void applyMultiStateControlledHadamard(Qureg qureg, vector<int> controls, vector
 
 extern "C" {
 
-void multiplySwap(Qureg qureg, int qubit1, int qubit2) {
-    validate_quregFields(qureg, __func__);
-    validate_twoTargets(qureg, qubit1, qubit2, __func__);
-
-    localiser_statevec_anyCtrlSwap(qureg, {}, {}, qubit1, qubit2);
-}
-
-void postMultiplySwap(Qureg qureg, int qubit1, int qubit2) {
-    validate_quregFields(qureg, __func__);
-    validate_quregIsDensityMatrix(qureg, __func__);
-    validate_twoTargets(qureg, qubit1, qubit2, __func__);
-
-    qubit1 = util_getBraQubit(qubit1, qureg);
-    qubit2 = util_getBraQubit(qubit2, qureg);
-    localiser_statevec_anyCtrlSwap(qureg, {}, {}, qubit1, qubit2);
-}
-
 void applySwap(Qureg qureg, int qubit1, int qubit2) {
     validate_quregFields(qureg, __func__);
     validate_twoTargets(qureg, qubit1, qubit2, __func__);
@@ -1040,61 +786,6 @@ void applyMultiStateControlledSqrtSwap(Qureg qureg, vector<int> controls, vector
 
 extern "C" {
 
-void multiplyPauliX(Qureg qureg, int target) {
-    validate_quregFields(qureg, __func__);
-    validate_target(qureg, target, __func__);
-
-    PauliStr str = getPauliStr("X", {target});
-    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
-}
-
-void multiplyPauliY(Qureg qureg, int target) {
-    validate_quregFields(qureg, __func__);
-    validate_target(qureg, target, __func__);
-
-    PauliStr str = getPauliStr("Y", {target});
-    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
-}
-
-void multiplyPauliZ(Qureg qureg, int target) {
-    validate_quregFields(qureg, __func__);
-    validate_target(qureg, target, __func__);
-
-    PauliStr str = getPauliStr("Z", {target});
-    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
-}
-
-void postMultiplyPauliX(Qureg qureg, int target) {
-    validate_quregFields(qureg, __func__);
-    validate_quregIsDensityMatrix(qureg, __func__);
-    validate_target(qureg, target, __func__);
-
-    PauliStr str = getPauliStr("X", {target});
-    str = paulis_getShiftedPauliStr(str, qureg.numQubits);
-    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
-}
-
-void postMultiplyPauliY(Qureg qureg, int target) {
-    validate_quregFields(qureg, __func__);
-    validate_quregIsDensityMatrix(qureg, __func__);
-    validate_target(qureg, target, __func__);
-
-    qcomp factor = -1; // undo transpose
-    PauliStr str = getPauliStr("Y", {target});
-    str = paulis_getShiftedPauliStr(str, qureg.numQubits);
-    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str, factor);
-}
-
-void postMultiplyPauliZ(Qureg qureg, int target) {
-    validate_quregFields(qureg, __func__);
-    validate_quregIsDensityMatrix(qureg, __func__);
-    validate_target(qureg, target, __func__);
-
-    PauliStr str = getPauliStr("Z", {target});
-    str = paulis_getShiftedPauliStr(str, qureg.numQubits);
-    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
-}
-
 void applyPauliX(Qureg qureg, int target) {
     validate_quregFields(qureg, __func__);
     validate_target(qureg, target, __func__);
@@ -1238,23 +929,6 @@ void applyMultiStateControlledPauliZ(Qureg qureg, vector<int> controls, vector<i
 
 extern "C" {
 
-void multiplyPauliStr(Qureg qureg, PauliStr str) {
-    validate_quregFields(qureg, __func__);
-    validate_pauliStrTargets(qureg, str, __func__);
-
-    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
-}
-
-void postMultiplyPauliStr(Qureg qureg, PauliStr str) {
-    validate_quregFields(qureg, __func__);
-    validate_quregIsDensityMatrix(qureg, __func__);
-    validate_pauliStrTargets(qureg, str, __func__);
-
-    qcomp factor = paulis_hasOddNumY(str)? -1 : 1; // undo transpose
-    str = paulis_getShiftedPauliStr(str, qureg.numQubits);
-    localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str, factor);
-}
-
 void applyPauliStr(Qureg qureg, PauliStr str) {
     validate_quregFields(qureg, __func__);
     validate_pauliStrTargets(qureg, str, __func__);
@@ -1330,52 +1004,6 @@ void applyMultiStateControlledPauliStr(Qureg qureg, vector<int> controls, vector
 
 extern "C" {
 
-void multiplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace) {
-    validate_quregFields(qureg, __func__);
-    validate_quregFields(workspace, __func__);
-    validate_quregCanBeWorkspace(qureg, workspace, __func__);
-    validate_pauliStrSumFields(sum, __func__);
-    validate_pauliStrSumTargets(sum, qureg, __func__);
-
-    // clone qureg to workspace, set qureg to blank
-    localiser_statevec_setQuregToSuperposition(0, workspace, 1, qureg, 0, qureg);
-    localiser_statevec_initUniformState(qureg, 0);
-
-    // left-multiply each term in-turn, mixing into output qureg, then undo using idempotency
-    for (qindex i=0; i<sum.numTerms; i++) {
-        localiser_statevec_anyCtrlPauliTensor(workspace, {}, {}, sum.strings[i]);
-        localiser_statevec_setQuregToSuperposition(1, qureg, sum.coeffs[i], workspace, 0, workspace);
-        localiser_statevec_anyCtrlPauliTensor(workspace, {}, {}, sum.strings[i]);
-    }
-
-    // workspace -> qureg, and qureg -> sum * qureg
-}
-
-void postMultiplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace) {
-    validate_quregFields(qureg, __func__);
-    validate_quregFields(workspace, __func__);
-    validate_quregIsDensityMatrix(qureg, __func__);
-    validate_quregCanBeWorkspace(qureg, workspace, __func__);
-    validate_pauliStrSumFields(sum, __func__);
-    validate_pauliStrSumTargets(sum, qureg, __func__);
-
-    // clone qureg to workspace, set qureg to blank
-    localiser_statevec_setQuregToSuperposition(0, workspace, 1, qureg, 0, qureg);
-    localiser_statevec_initUniformState(qureg, 0);
-
-    // post-multiply each term in-turn, mixing into output qureg, then undo using idempotency
-    for (qindex i=0; i<sum.numTerms; i++) {
-        PauliStr str =  paulis_getShiftedPauliStr(sum.strings[i], qureg.numQubits);
-        qcomp factor = paulis_hasOddNumY(str)? -1 : 1; // undoes transpose
-
-        localiser_statevec_anyCtrlPauliTensor(workspace, {}, {}, str, factor);
-        localiser_statevec_setQuregToSuperposition(1, qureg, sum.coeffs[i], workspace, 0, workspace);
-        localiser_statevec_anyCtrlPauliTensor(workspace, {}, {}, str, factor);
-    }
-
-    // workspace -> qureg, and qureg -> sum * qureg
-}
-
 void internal_applyFirstOrderTrotterRepetition(
     Qureg qureg, vector<int>& ketCtrls, vector<int>& braCtrls,
     vector<int>& states, PauliStrSum sum, qcomp angle, bool reverse
@@ -1523,8 +1151,6 @@ void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, vector<i
 
 extern "C" {
 
-// don't think users will ever want to left-multiply only
-
 void applyRotateX(Qureg qureg, int target, qreal angle) {
     validate_quregFields(qureg, __func__);
     validate_target(qureg, target, __func__);
@@ -1734,25 +1360,6 @@ void applyMultiStateControlledRotateAroundAxis(Qureg qureg, vector<int> ctrls, v
 
 extern "C" {
 
-void multiplyPauliGadget(Qureg qureg, PauliStr str, qreal angle) {
-    validate_quregFields(qureg, __func__);
-    validate_pauliStrTargets(qureg, str, __func__);
-
-    qreal phase = util_getPhaseFromGateAngle(angle);
-    localiser_statevec_anyCtrlPauliGadget(qureg, {}, {}, str, phase);
-}
-
-void postMultiplyPauliGadget(Qureg qureg, PauliStr str, qreal angle) {
-    validate_quregFields(qureg, __func__);
-    validate_quregIsDensityMatrix(qureg, __func__);
-    validate_pauliStrTargets(qureg, str, __func__);
-
-    qreal factor = paulis_hasOddNumY(str)? -1 : 1;
-    qreal phase = factor * util_getPhaseFromGateAngle(angle);
-    str = paulis_getShiftedPauliStr(str, qureg.numQubits);
-    localiser_statevec_anyCtrlPauliGadget(qureg, {}, {}, str, phase);
-}
-
 void applyPauliGadget(Qureg qureg, PauliStr str, qreal angle) {
     validate_quregFields(qureg, __func__);
     validate_pauliStrTargets(qureg, str, __func__);
@@ -1841,25 +1448,6 @@ void applyMultiStateControlledPauliGadget(Qureg qureg, vector<int> controls, vec
 
 extern "C" {
 
-void multiplyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal angle) {
-    validate_quregFields(qureg, __func__);
-    validate_targets(qureg, targets, numTargets, __func__);
-
-    qreal phase = util_getPhaseFromGateAngle(angle);
-    auto qubits = util_getVector(targets, numTargets);
-    localiser_statevec_anyCtrlPhaseGadget(qureg, {}, {}, qubits, phase);
-}
-
-void postMultiplyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal angle) {
-    validate_quregFields(qureg, __func__);
-    validate_quregIsDensityMatrix(qureg, __func__);
-    validate_targets(qureg, targets, numTargets, __func__);
-
-    qreal phase = util_getPhaseFromGateAngle(angle);
-    auto qubits = util_getBraQubits(util_getVector(targets, numTargets), qureg);
-    localiser_statevec_anyCtrlPhaseGadget(qureg, {}, {}, qubits, phase);
-}
-
 void applyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal angle) {
     validate_quregFields(qureg, __func__);
     validate_targets(qureg, targets, numTargets, __func__);
@@ -1906,16 +1494,6 @@ void applyMultiStateControlledPhaseGadget(Qureg qureg, int* controls, int* state
 
 } // end de-mangler
 
-void multiplyPhaseGadget(Qureg qureg, vector<int> targets, qreal angle) {
-
-    multiplyPhaseGadget(qureg, targets.data(), targets.size(), angle);
-}
-
-void postMultiplyPhaseGadget(Qureg qureg, vector<int> targets, qreal angle) {
-
-    postMultiplyPhaseGadget(qureg, targets.data(), targets.size(), angle);
-}
-
 void applyPhaseGadget(Qureg qureg, vector<int> targets, qreal angle) {
 
     applyPhaseGadget(qureg, targets.data(), targets.size(), angle);
@@ -2026,30 +1604,11 @@ void applyMultiQubitPhaseFlip(Qureg qureg, vector<int> targets) {
 
 
 /*
- * many-qubit CNOTs
+ * many-qubit NOTs and CNOTs
  */
 
 extern "C" {
 
-void multiplyMultiQubitNot(Qureg qureg, int* targets, int numTargets) {
-    validate_quregFields(qureg, __func__);
-    validate_targets(qureg, targets, numTargets, __func__);
-
-    // harmlessly re-validates
-    PauliStr str = getPauliStr(std::string(numTargets, 'X'), targets, numTargets);
-    multiplyPauliStr(qureg, str);
-}
-
-void postMultiplyMultiQubitNot(Qureg qureg, int* targets, int numTargets) {
-    validate_quregFields(qureg, __func__);
-    validate_quregIsDensityMatrix(qureg, __func__);
-    validate_targets(qureg, targets, numTargets, __func__);
-
-    // harmlessly re-validates
-    PauliStr str = getPauliStr(std::string(numTargets, 'X'), targets, numTargets);
-    postMultiplyPauliStr(qureg, str);
-}
-
 void applyMultiQubitNot(Qureg qureg, int* targets, int numTargets) {
     validate_quregFields(qureg, __func__);
     validate_targets(qureg, targets, numTargets, __func__);
@@ -2088,16 +1647,6 @@ void applyMultiStateControlledMultiQubitNot(Qureg qureg, int* controls, int* sta
 
 } // end de-mangler
 
-void multiplyMultiQubitNot(Qureg qureg, vector<int> targets) {
-
-    multiplyMultiQubitNot(qureg, targets.data(), targets.size());
-}
-
-void postMultiplyMultiQubitNot(Qureg qureg, vector<int> targets) {
-
-    postMultiplyMultiQubitNot(qureg, targets.data(), targets.size());
-}
-
 void applyMultiQubitNot(Qureg qureg, vector<int> targets) {
 
     applyMultiQubitNot(qureg, targets.data(), targets.size());
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 033751832..45f6d341c 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -9,6 +9,7 @@ target_sources(tests
   environment.cpp
   initialisations.cpp
   matrices.cpp
+  multiplication.cpp
   operations.cpp
   paulis.cpp
   qureg.cpp
diff --git a/tests/unit/multiplication.cpp b/tests/unit/multiplication.cpp
new file mode 100644
index 000000000..3e445dcec
--- /dev/null
+++ b/tests/unit/multiplication.cpp
@@ -0,0 +1,7 @@
+/*
+ * The multiplication API module is actually tested in
+ * operations.cpp, alongside the operations module, since
+ * the testing logic of the two modules is inextricable.
+ * This file exists only to redirect the confused reader
+ * searching for the multiplication tests.
+ */
diff --git a/tests/unit/operations.cpp b/tests/unit/operations.cpp
index b8522321a..0fdf32b67 100644
--- a/tests/unit/operations.cpp
+++ b/tests/unit/operations.cpp
@@ -1,5 +1,6 @@
 /** @file
- * Unit tests of the operations module. Beware that because the 
+ * Unit tests of both the multiplication and operations modules, since
+ * they use inextricable testing logic. Beware that because the 
  * operation functions have so much interface and test-semantic 
  * overlap (e.g. the logic of control qubits, of control-states,
  * of density matrix variants), this file has opted to make 
@@ -13,6 +14,9 @@
  * 
  * @defgroup unitops Operations
  * @ingroup unittests
+ * 
+ * @defgroup unitmult Multiplication
+ * @ingroup unittests
  */
 
 #include "quest.h"
@@ -46,9 +50,12 @@ using Catch::Matchers::ContainsSubstring;
  * UTILITIES
  */
 
-#define TEST_CATEGORY \
+#define TEST_CATEGORY_OPS \
     LABEL_UNIT_TAG "[operations]"
 
+#define TEST_CATEGORY_MULT \
+    LABEL_UNIT_TAG "[multiplication]"
+
 
 /*
  * reference operator matrices used by testing
@@ -1169,10 +1176,10 @@ void testOperation(auto operation, auto matrixRefGen) {
 // C-compatible version. Alas, to your imminent horror, this is.. erm...
 
 // #define TEST_ALL_CTRL_OPERATIONS( namesuffix, numtargs, argtype, matrixgen ) \
-//     TEST_CASE( "apply" #namesuffix,                     TEST_CATEGORY ) { testOperation<zero,     numtargs,argtype>( apply ## namesuffix,                     matrixgen); } \
-//     TEST_CASE( "applyControlled" #namesuffix,           TEST_CATEGORY ) { testOperation<one,      numtargs,argtype>( applyControlled ## namesuffix,           matrixgen); } \
-//     TEST_CASE( "applyMultiControlled" #namesuffix,      TEST_CATEGORY ) { testOperation<any,      numtargs,argtype>( applyMultiControlled ## namesuffix,      matrixgen); } \
-//     TEST_CASE( "applyMultiStateControlled" #namesuffix, TEST_CATEGORY ) { testOperation<anystates,numtargs,argtype>( applyMultiStateControlled ## namesuffix, matrixgen); } 
+//     TEST_CASE( "apply" #namesuffix,                     TEST_CATEGORY_OPS ) { testOperation<zero,     numtargs,argtype>( apply ## namesuffix,                     matrixgen); } \
+//     TEST_CASE( "applyControlled" #namesuffix,           TEST_CATEGORY_OPS ) { testOperation<one,      numtargs,argtype>( applyControlled ## namesuffix,           matrixgen); } \
+//     TEST_CASE( "applyMultiControlled" #namesuffix,      TEST_CATEGORY_OPS ) { testOperation<any,      numtargs,argtype>( applyMultiControlled ## namesuffix,      matrixgen); } \
+//     TEST_CASE( "applyMultiStateControlled" #namesuffix, TEST_CATEGORY_OPS ) { testOperation<anystates,numtargs,argtype>( applyMultiStateControlled ## namesuffix, matrixgen); } 
 
 
 /*
@@ -1267,7 +1274,7 @@ void testOperation(auto operation, auto matrixRefGen) {
 
 // defines a Catch2 test-case for the implied function
 #define TEST_CASE_OPERATION( namesuffix, numctrls, numtargs, argtype, matrixgen ) \
-    TEST_CASE( GET_FUNC_NAME_STR(numctrls, namesuffix), TEST_CATEGORY ) {         \
+    TEST_CASE( GET_FUNC_NAME_STR(numctrls, namesuffix), TEST_CATEGORY_OPS ) {         \
         testOperation<numctrls, numtargs, argtype, apply>(                        \
             GET_CASTED_FUNC(namesuffix, numctrls, numtargs, argtype),             \
             matrixgen);                                                           \
@@ -1283,7 +1290,7 @@ void testOperation(auto operation, auto matrixRefGen) {
 
 
 /** 
- * TESTS
+ * OPERATOR TESTS
  * 
  * @ingroup unitops
  * @{
@@ -1323,32 +1330,10 @@ TEST_ALL_CTRL_OPERATIONS( PhaseGadget, any, scalar, VariableSizeParameterisedMat
  * non-controlled operations with no C++ overloads
  */
 
-TEST_CASE( "applyPhaseFlip",          TEST_CATEGORY ) { testOperation<zero,one,none,apply>  (applyPhaseFlip,          VariableSizeMatrices::PF(1)); }
-TEST_CASE( "applyTwoQubitPhaseFlip",  TEST_CATEGORY ) { testOperation<zero,two,none,apply>  (applyTwoQubitPhaseFlip,  VariableSizeMatrices::PF(2)); }
-TEST_CASE( "applyPhaseShift",         TEST_CATEGORY ) { testOperation<zero,one,scalar,apply>(applyPhaseShift,         ParameterisedMatrices::PS  ); }
-TEST_CASE( "applyTwoQubitPhaseShift", TEST_CATEGORY ) { testOperation<zero,two,scalar,apply>(applyTwoQubitPhaseShift, ParameterisedMatrices::PS2 ); }
-
-TEST_CASE( "multiplySwap",            TEST_CATEGORY ) { testOperation<zero,two,none,multiply>(multiplySwap, FixedMatrices::SWAP); }
-TEST_CASE( "multiplyPauliX",          TEST_CATEGORY ) { testOperation<zero,one,none,multiply>(multiplyPauliX, FixedMatrices::X); }
-TEST_CASE( "multiplyPauliY",          TEST_CATEGORY ) { testOperation<zero,one,none,multiply>(multiplyPauliY, FixedMatrices::Y); }
-TEST_CASE( "multiplyPauliZ",          TEST_CATEGORY ) { testOperation<zero,one,none,multiply>(multiplyPauliZ, FixedMatrices::Z); }
-TEST_CASE( "multiplyPauliStr",        TEST_CATEGORY ) { testOperation<zero,any,paulistr,multiply>(multiplyPauliStr,    nullptr); }
-TEST_CASE( "multiplyPauliGadget",     TEST_CATEGORY ) { testOperation<zero,any,pauligad,multiply>(multiplyPauliGadget, nullptr); }
-TEST_CASE( "multiplyCompMatr1",       TEST_CATEGORY ) { testOperation<zero,one,compmatr,multiply>(multiplyCompMatr1,   nullptr); }
-TEST_CASE( "multiplyCompMatr2",       TEST_CATEGORY ) { testOperation<zero,two,compmatr,multiply>(multiplyCompMatr2,   nullptr); }
-TEST_CASE( "multiplyDiagMatr1",       TEST_CATEGORY ) { testOperation<zero,one,diagmatr,multiply>(multiplyDiagMatr1,   nullptr); }
-TEST_CASE( "multiplyDiagMatr2",       TEST_CATEGORY ) { testOperation<zero,two,diagmatr,multiply>(multiplyDiagMatr2,   nullptr); }
-
-TEST_CASE( "postMultiplySwap",            TEST_CATEGORY ) { testOperation<zero,two,none,postmultiply>(postMultiplySwap, FixedMatrices::SWAP); }
-TEST_CASE( "postMultiplyPauliX",          TEST_CATEGORY ) { testOperation<zero,one,none,postmultiply>(postMultiplyPauliX, FixedMatrices::X); }
-TEST_CASE( "postMultiplyPauliY",          TEST_CATEGORY ) { testOperation<zero,one,none,postmultiply>(postMultiplyPauliY, FixedMatrices::Y); }
-TEST_CASE( "postMultiplyPauliZ",          TEST_CATEGORY ) { testOperation<zero,one,none,postmultiply>(postMultiplyPauliZ, FixedMatrices::Z); }
-TEST_CASE( "postMultiplyPauliStr",        TEST_CATEGORY ) { testOperation<zero,any,paulistr,postmultiply>(postMultiplyPauliStr,    nullptr); }
-TEST_CASE( "postMultiplyPauliGadget",     TEST_CATEGORY ) { testOperation<zero,any,pauligad,postmultiply>(postMultiplyPauliGadget, nullptr); }
-TEST_CASE( "postMultiplyCompMatr1",       TEST_CATEGORY ) { testOperation<zero,one,compmatr,postmultiply>(postMultiplyCompMatr1,   nullptr); }
-TEST_CASE( "postMultiplyCompMatr2",       TEST_CATEGORY ) { testOperation<zero,two,compmatr,postmultiply>(postMultiplyCompMatr2,   nullptr); }
-TEST_CASE( "postMultiplyDiagMatr1",       TEST_CATEGORY ) { testOperation<zero,one,diagmatr,postmultiply>(postMultiplyDiagMatr1,   nullptr); }
-TEST_CASE( "postMultiplyDiagMatr2",       TEST_CATEGORY ) { testOperation<zero,two,diagmatr,postmultiply>(postMultiplyDiagMatr2,   nullptr); }
+TEST_CASE( "applyPhaseFlip",          TEST_CATEGORY_OPS ) { testOperation<zero,one,none,apply>  (applyPhaseFlip,          VariableSizeMatrices::PF(1)); }
+TEST_CASE( "applyTwoQubitPhaseFlip",  TEST_CATEGORY_OPS ) { testOperation<zero,two,none,apply>  (applyTwoQubitPhaseFlip,  VariableSizeMatrices::PF(2)); }
+TEST_CASE( "applyPhaseShift",         TEST_CATEGORY_OPS ) { testOperation<zero,one,scalar,apply>(applyPhaseShift,         ParameterisedMatrices::PS  ); }
+TEST_CASE( "applyTwoQubitPhaseShift", TEST_CATEGORY_OPS ) { testOperation<zero,two,scalar,apply>(applyTwoQubitPhaseShift, ParameterisedMatrices::PS2 ); }
 
 
 /*
@@ -1358,75 +1343,22 @@ TEST_CASE( "postMultiplyDiagMatr2",       TEST_CATEGORY ) { testOperation<zero,t
  * compiler ambiguity (spaghetti 4 lyf)
  */
 
-TEST_CASE( "applyMultiQubitPhaseFlip",  TEST_CATEGORY ) {
+TEST_CASE( "applyMultiQubitPhaseFlip",  TEST_CATEGORY_OPS ) {
     auto func = static_cast<void(*)(Qureg, int*, int)>(applyMultiQubitPhaseFlip);
     testOperation<zero,any,none,apply>(func, VariableSizeMatrices::PF);
 }
 
-TEST_CASE( "applyMultiQubitPhaseShift",  TEST_CATEGORY ) {
+TEST_CASE( "applyMultiQubitPhaseShift",  TEST_CATEGORY_OPS ) {
     auto func = static_cast<void(*)(Qureg, int*, int, qreal)>(applyMultiQubitPhaseShift);
     testOperation<zero,any,scalar,apply>(func, VariableSizeParameterisedMatrices::PS);
 }
 
 
-TEST_CASE( "multiplyCompMatr",  TEST_CATEGORY ) { 
-    auto func = static_cast<void(*)(Qureg, int*, int, CompMatr)>(multiplyCompMatr);
-    testOperation<zero,any,compmatr,multiply>(func, nullptr); 
-}
-
-TEST_CASE( "multiplyDiagMatr",  TEST_CATEGORY ) {
-    auto func = static_cast<void(*)(Qureg, int*, int, DiagMatr)>(multiplyDiagMatr);
-    testOperation<zero,any,diagmatr,multiply>(func, nullptr);
-}
-
-TEST_CASE( "multiplyDiagMatrPower",  TEST_CATEGORY ) {
-    auto func = static_cast<void(*)(Qureg, int*, int, DiagMatr, qcomp)>(multiplyDiagMatrPower);
-    testOperation<zero,any,diagpower,multiply>(func, nullptr);
-}
-
-TEST_CASE( "multiplyMultiQubitNot",  TEST_CATEGORY ) {
-    auto func = static_cast<void(*)(Qureg, int*, int)>(multiplyMultiQubitNot);
-    testOperation<zero,any,none,multiply>(func, VariableSizeMatrices::X);
-}
-
-TEST_CASE( "multiplyPhaseGadget",  TEST_CATEGORY ) {
-    auto func = static_cast<void(*)(Qureg, int*, int, qreal)>(multiplyPhaseGadget);
-    testOperation<zero,any,scalar,multiply>(func, VariableSizeParameterisedMatrices::Z);
-}
-
-
-TEST_CASE( "postMultiplyCompMatr",  TEST_CATEGORY ) { 
-    auto func = static_cast<void(*)(Qureg, int*, int, CompMatr)>(postMultiplyCompMatr);
-    testOperation<zero,any,compmatr,postmultiply>(func, nullptr); 
-}
-
-TEST_CASE( "postMultiplyDiagMatr",  TEST_CATEGORY ) {
-    auto func = static_cast<void(*)(Qureg, int*, int, DiagMatr)>(postMultiplyDiagMatr);
-    testOperation<zero,any,diagmatr,postmultiply>(func, nullptr);
-}
-
-TEST_CASE( "postMultiplyDiagMatrPower",  TEST_CATEGORY ) {
-    auto func = static_cast<void(*)(Qureg, int*, int, DiagMatr, qcomp)>(postMultiplyDiagMatrPower);
-    testOperation<zero,any,diagpower,postmultiply>(func, nullptr);
-}
-
-TEST_CASE( "postMultiplyMultiQubitNot",  TEST_CATEGORY ) {
-    auto func = static_cast<void(*)(Qureg, int*, int)>(postMultiplyMultiQubitNot);
-    testOperation<zero,any,none,postmultiply>(func, VariableSizeMatrices::X);
-}
-
-TEST_CASE( "postMultiplyPhaseGadget",  TEST_CATEGORY ) {
-    auto func = static_cast<void(*)(Qureg, int*, int, qreal)>(postMultiplyPhaseGadget);
-    testOperation<zero,any,scalar,postmultiply>(func, VariableSizeParameterisedMatrices::Z);
-}
-
-
-
 /*
  * operations which need custom logic
  */
 
-TEST_CASE( "applyQuantumFourierTransform", TEST_CATEGORY ) {
+TEST_CASE( "applyQuantumFourierTransform", TEST_CATEGORY_OPS ) {
 
     PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );
 
@@ -1474,7 +1406,7 @@ TEST_CASE( "applyQuantumFourierTransform", TEST_CATEGORY ) {
 }
 
 
-TEST_CASE( "applyFullQuantumFourierTransform", TEST_CATEGORY ) {
+TEST_CASE( "applyFullQuantumFourierTransform", TEST_CATEGORY_OPS ) {
 
     PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );
 
@@ -1519,7 +1451,7 @@ TEST_CASE( "applyFullQuantumFourierTransform", TEST_CATEGORY ) {
 }
 
 
-TEST_CASE( "applyQubitProjector", TEST_CATEGORY ) {
+TEST_CASE( "applyQubitProjector", TEST_CATEGORY_OPS ) {
 
     PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );
 
@@ -1545,7 +1477,7 @@ TEST_CASE( "applyQubitProjector", TEST_CATEGORY ) {
 }
 
 
-TEST_CASE( "applyMultiQubitProjector", TEST_CATEGORY ) {
+TEST_CASE( "applyMultiQubitProjector", TEST_CATEGORY_OPS ) {
 
     PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );
 
@@ -1571,7 +1503,7 @@ TEST_CASE( "applyMultiQubitProjector", TEST_CATEGORY ) {
 }
 
 
-TEST_CASE( "applyForcedQubitMeasurement", TEST_CATEGORY ) {
+TEST_CASE( "applyForcedQubitMeasurement", TEST_CATEGORY_OPS ) {
 
     PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );
 
@@ -1610,7 +1542,7 @@ TEST_CASE( "applyForcedQubitMeasurement", TEST_CATEGORY ) {
 }
 
 
-TEST_CASE( "applyForcedMultiQubitMeasurement", TEST_CATEGORY ) {
+TEST_CASE( "applyForcedMultiQubitMeasurement", TEST_CATEGORY_OPS ) {
 
     PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );
 
@@ -1656,7 +1588,7 @@ TEST_CASE( "applyForcedMultiQubitMeasurement", TEST_CATEGORY ) {
 }
 
 
-TEST_CASE( "applyMultiQubitMeasurement", TEST_CATEGORY ) {
+TEST_CASE( "applyMultiQubitMeasurement", TEST_CATEGORY_OPS ) {
 
     PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );
 
@@ -1693,7 +1625,7 @@ TEST_CASE( "applyMultiQubitMeasurement", TEST_CATEGORY ) {
 }
 
 
-TEST_CASE( "applyMultiQubitMeasurementAndGetProb", TEST_CATEGORY ) {
+TEST_CASE( "applyMultiQubitMeasurementAndGetProb", TEST_CATEGORY_OPS ) {
 
     PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );
 
@@ -1732,7 +1664,7 @@ TEST_CASE( "applyMultiQubitMeasurementAndGetProb", TEST_CATEGORY ) {
 }
 
 
-TEST_CASE( "applyQubitMeasurement", TEST_CATEGORY ) {
+TEST_CASE( "applyQubitMeasurement", TEST_CATEGORY_OPS ) {
 
     PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );
 
@@ -1768,7 +1700,7 @@ TEST_CASE( "applyQubitMeasurement", TEST_CATEGORY ) {
 }
 
 
-TEST_CASE( "applyQubitMeasurementAndGetProb", TEST_CATEGORY ) {
+TEST_CASE( "applyQubitMeasurementAndGetProb", TEST_CATEGORY_OPS ) {
 
     PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );
 
@@ -1806,7 +1738,7 @@ TEST_CASE( "applyQubitMeasurementAndGetProb", TEST_CATEGORY ) {
 }
 
 
-TEST_CASE( "multiplyFullStateDiagMatr", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG ) {
+TEST_CASE( "applyFullStateDiagMatr", TEST_CATEGORY_OPS LABEL_MIXED_DEPLOY_TAG ) {
 
     PREPARE_TEST( numQubits, cachedSV, cachedDM, refSV, refDM );
 
@@ -1814,21 +1746,21 @@ TEST_CASE( "multiplyFullStateDiagMatr", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG ) {
 
     SECTION( LABEL_CORRECTNESS ) {
 
-        qmatrix refMatr = getRandomDiagonalMatrix(getPow2(numQubits));
-        auto apiFunc = multiplyFullStateDiagMatr;
+        qmatrix refMatr = getRandomDiagonalUnitary(numQubits);
+        auto apiFunc = applyFullStateDiagMatr;
 
         GENERATE( range(0, getNumTestedMixedDeploymentRepetitions()) );
 
         SECTION( LABEL_STATEVEC ) {
 
-            auto refFunc = [&] (qvector& state, qmatrix matr) { multiplyReferenceOperator(state, matr); };
+            auto refFunc = [&] (qvector& state, qmatrix matr) { applyReferenceOperator(state, matr); };
 
             TEST_ON_CACHED_QUREG_AND_MATRIX( cachedSV, cachedMatrs, apiFunc, refSV, refMatr, refFunc);
         }
 
         SECTION( LABEL_DENSMATR ) {
 
-            auto refFunc = [&] (qmatrix& state, qmatrix matr) { multiplyReferenceOperator(state, matr); };
+            auto refFunc = [&] (qmatrix& state, qmatrix matr) { applyReferenceOperator(state, matr); };
 
             TEST_ON_CACHED_QUREG_AND_MATRIX( cachedDM, cachedMatrs, apiFunc, refDM, refMatr, refFunc);
         }
@@ -1838,7 +1770,7 @@ TEST_CASE( "multiplyFullStateDiagMatr", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG ) {
 }
 
 
-TEST_CASE( "multiplyFullStateDiagMatrPower", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG ) {
+TEST_CASE( "applyFullStateDiagMatrPower", TEST_CATEGORY_OPS LABEL_MIXED_DEPLOY_TAG ) {
 
     PREPARE_TEST( numQubits, cachedSV, cachedDM, refSV, refDM );
 
@@ -1846,22 +1778,31 @@ TEST_CASE( "multiplyFullStateDiagMatrPower", TEST_CATEGORY LABEL_MIXED_DEPLOY_TA
 
     SECTION( LABEL_CORRECTNESS ) {
 
-        qmatrix refMatr = getRandomDiagonalMatrix(getPow2(numQubits));
-        qcomp exponent = getRandomComplex();
+        qmatrix refMatr = getRandomDiagonalUnitary(numQubits);
+
+        // supplying a complex exponent requires disabling
+        // numerical validation to relax unitarity
+        bool testRealExp = GENERATE( true, false );
+        qcomp exponent = (testRealExp)?
+            qcomp(getRandomReal(-2, 2), 0):
+            getRandomComplex();
 
         auto apiFunc = [&](Qureg qureg, FullStateDiagMatr matr) { 
-            return multiplyFullStateDiagMatrPower(qureg, matr, exponent);
+            return applyFullStateDiagMatrPower(qureg, matr, exponent);
         };
 
         CAPTURE( exponent );
-        
+
         GENERATE( range(0, getNumTestedMixedDeploymentRepetitions()) );
 
+        if (!testRealExp)
+            setValidationEpsilon(0);
+
         SECTION( LABEL_STATEVEC ) {
 
             auto refFunc = [&] (qvector& state, qmatrix matr) { 
                 matr = getPowerOfDiagonalMatrix(matr, exponent);
-                multiplyReferenceOperator(state, matr);
+                applyReferenceOperator(state, matr);
             };
 
             TEST_ON_CACHED_QUREG_AND_MATRIX( cachedSV, cachedMatrs, apiFunc, refSV, refMatr, refFunc);
@@ -1871,18 +1812,148 @@ TEST_CASE( "multiplyFullStateDiagMatrPower", TEST_CATEGORY LABEL_MIXED_DEPLOY_TA
 
             auto refFunc = [&] (qmatrix& state, qmatrix matr) { 
                 matr = getPowerOfDiagonalMatrix(matr, exponent);
-                multiplyReferenceOperator(state, matr);
+                applyReferenceOperator(state, matr);
             };
 
             TEST_ON_CACHED_QUREG_AND_MATRIX( cachedDM, cachedMatrs, apiFunc, refDM, refMatr, refFunc);
         }
+
+        setValidationEpsilonToDefault();
+    }
+
+    /// @todo input validation
+}
+
+
+TEST_CASE( "applyNonUnitaryPauliGadget", TEST_CATEGORY_OPS ) {
+
+    PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );
+
+    SECTION( LABEL_CORRECTNESS ) {
+
+        // prepare a random Pauli string and angle
+        int numTargs = GENERATE_COPY( range(1, numQubits+1) );
+        auto targs = GENERATE_TARGS( numQubits, numTargs );
+        PauliStr str = getRandomPauliStr(targs);
+        qcomp angle = getRandomComplex();
+
+        // prepare the corresponding reference matrix exp(-i angle pauli)
+        auto matrRef = getExponentialOfPauliMatrix(angle, getMatrix(str, numQubits));
+
+        auto testFunc = [&](Qureg qureg, auto& stateRef) {
+            applyNonUnitaryPauliGadget(qureg, str, angle);
+            applyReferenceOperator(stateRef, matrRef);
+        };
+
+        CAPTURE( targs, angle );
+        SECTION( LABEL_STATEVEC ) { TEST_ON_CACHED_QUREGS(statevecQuregs, statevecRef, testFunc); }
+        SECTION( LABEL_DENSMATR ) { TEST_ON_CACHED_QUREGS(densmatrQuregs, densmatrRef, testFunc); }
     }
 
     /// @todo input validation
 }
 
 
-TEST_CASE( "postMultiplyFullStateDiagMatr", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG ) {
+/** @} (end defgroup) */
+
+
+
+/** 
+ * OPERATOR TESTS
+ * 
+ * @ingroup unitmult
+ * @{
+ */
+
+
+TEST_CASE( "multiplySwap",            TEST_CATEGORY_MULT ) { testOperation<zero,two,none,multiply>(multiplySwap, FixedMatrices::SWAP); }
+TEST_CASE( "multiplyPauliX",          TEST_CATEGORY_MULT ) { testOperation<zero,one,none,multiply>(multiplyPauliX, FixedMatrices::X); }
+TEST_CASE( "multiplyPauliY",          TEST_CATEGORY_MULT ) { testOperation<zero,one,none,multiply>(multiplyPauliY, FixedMatrices::Y); }
+TEST_CASE( "multiplyPauliZ",          TEST_CATEGORY_MULT ) { testOperation<zero,one,none,multiply>(multiplyPauliZ, FixedMatrices::Z); }
+TEST_CASE( "multiplyPauliStr",        TEST_CATEGORY_MULT ) { testOperation<zero,any,paulistr,multiply>(multiplyPauliStr,    nullptr); }
+TEST_CASE( "multiplyPauliGadget",     TEST_CATEGORY_MULT ) { testOperation<zero,any,pauligad,multiply>(multiplyPauliGadget, nullptr); }
+TEST_CASE( "multiplyCompMatr1",       TEST_CATEGORY_MULT ) { testOperation<zero,one,compmatr,multiply>(multiplyCompMatr1,   nullptr); }
+TEST_CASE( "multiplyCompMatr2",       TEST_CATEGORY_MULT ) { testOperation<zero,two,compmatr,multiply>(multiplyCompMatr2,   nullptr); }
+TEST_CASE( "multiplyDiagMatr1",       TEST_CATEGORY_MULT ) { testOperation<zero,one,diagmatr,multiply>(multiplyDiagMatr1,   nullptr); }
+TEST_CASE( "multiplyDiagMatr2",       TEST_CATEGORY_MULT ) { testOperation<zero,two,diagmatr,multiply>(multiplyDiagMatr2,   nullptr); }
+
+TEST_CASE( "postMultiplySwap",            TEST_CATEGORY_MULT ) { testOperation<zero,two,none,postmultiply>(postMultiplySwap, FixedMatrices::SWAP); }
+TEST_CASE( "postMultiplyPauliX",          TEST_CATEGORY_MULT ) { testOperation<zero,one,none,postmultiply>(postMultiplyPauliX, FixedMatrices::X); }
+TEST_CASE( "postMultiplyPauliY",          TEST_CATEGORY_MULT ) { testOperation<zero,one,none,postmultiply>(postMultiplyPauliY, FixedMatrices::Y); }
+TEST_CASE( "postMultiplyPauliZ",          TEST_CATEGORY_MULT ) { testOperation<zero,one,none,postmultiply>(postMultiplyPauliZ, FixedMatrices::Z); }
+TEST_CASE( "postMultiplyPauliStr",        TEST_CATEGORY_MULT ) { testOperation<zero,any,paulistr,postmultiply>(postMultiplyPauliStr,    nullptr); }
+TEST_CASE( "postMultiplyPauliGadget",     TEST_CATEGORY_MULT ) { testOperation<zero,any,pauligad,postmultiply>(postMultiplyPauliGadget, nullptr); }
+TEST_CASE( "postMultiplyCompMatr1",       TEST_CATEGORY_MULT ) { testOperation<zero,one,compmatr,postmultiply>(postMultiplyCompMatr1,   nullptr); }
+TEST_CASE( "postMultiplyCompMatr2",       TEST_CATEGORY_MULT ) { testOperation<zero,two,compmatr,postmultiply>(postMultiplyCompMatr2,   nullptr); }
+TEST_CASE( "postMultiplyDiagMatr1",       TEST_CATEGORY_MULT ) { testOperation<zero,one,diagmatr,postmultiply>(postMultiplyDiagMatr1,   nullptr); }
+TEST_CASE( "postMultiplyDiagMatr2",       TEST_CATEGORY_MULT ) { testOperation<zero,two,diagmatr,postmultiply>(postMultiplyDiagMatr2,   nullptr); }
+
+
+/*
+ * C++ overloads which accept qubit lists as vectors
+ * and so which require explicit casting to resolve the
+ * compiler ambiguity (spaghetti 4 lyf)
+ */
+
+
+TEST_CASE( "multiplyCompMatr",  TEST_CATEGORY_MULT ) { 
+    auto func = static_cast<void(*)(Qureg, int*, int, CompMatr)>(multiplyCompMatr);
+    testOperation<zero,any,compmatr,multiply>(func, nullptr); 
+}
+
+TEST_CASE( "multiplyDiagMatr",  TEST_CATEGORY_MULT ) {
+    auto func = static_cast<void(*)(Qureg, int*, int, DiagMatr)>(multiplyDiagMatr);
+    testOperation<zero,any,diagmatr,multiply>(func, nullptr);
+}
+
+TEST_CASE( "multiplyDiagMatrPower",  TEST_CATEGORY_MULT ) {
+    auto func = static_cast<void(*)(Qureg, int*, int, DiagMatr, qcomp)>(multiplyDiagMatrPower);
+    testOperation<zero,any,diagpower,multiply>(func, nullptr);
+}
+
+TEST_CASE( "multiplyMultiQubitNot",  TEST_CATEGORY_MULT ) {
+    auto func = static_cast<void(*)(Qureg, int*, int)>(multiplyMultiQubitNot);
+    testOperation<zero,any,none,multiply>(func, VariableSizeMatrices::X);
+}
+
+TEST_CASE( "multiplyPhaseGadget",  TEST_CATEGORY_MULT ) {
+    auto func = static_cast<void(*)(Qureg, int*, int, qreal)>(multiplyPhaseGadget);
+    testOperation<zero,any,scalar,multiply>(func, VariableSizeParameterisedMatrices::Z);
+}
+
+
+TEST_CASE( "postMultiplyCompMatr",  TEST_CATEGORY_MULT ) { 
+    auto func = static_cast<void(*)(Qureg, int*, int, CompMatr)>(postMultiplyCompMatr);
+    testOperation<zero,any,compmatr,postmultiply>(func, nullptr); 
+}
+
+TEST_CASE( "postMultiplyDiagMatr",  TEST_CATEGORY_MULT ) {
+    auto func = static_cast<void(*)(Qureg, int*, int, DiagMatr)>(postMultiplyDiagMatr);
+    testOperation<zero,any,diagmatr,postmultiply>(func, nullptr);
+}
+
+TEST_CASE( "postMultiplyDiagMatrPower",  TEST_CATEGORY_MULT ) {
+    auto func = static_cast<void(*)(Qureg, int*, int, DiagMatr, qcomp)>(postMultiplyDiagMatrPower);
+    testOperation<zero,any,diagpower,postmultiply>(func, nullptr);
+}
+
+TEST_CASE( "postMultiplyMultiQubitNot",  TEST_CATEGORY_MULT ) {
+    auto func = static_cast<void(*)(Qureg, int*, int)>(postMultiplyMultiQubitNot);
+    testOperation<zero,any,none,postmultiply>(func, VariableSizeMatrices::X);
+}
+
+TEST_CASE( "postMultiplyPhaseGadget",  TEST_CATEGORY_MULT ) {
+    auto func = static_cast<void(*)(Qureg, int*, int, qreal)>(postMultiplyPhaseGadget);
+    testOperation<zero,any,scalar,postmultiply>(func, VariableSizeParameterisedMatrices::Z);
+}
+
+
+/*
+ * operations which need custom logic
+ */
+
+
+TEST_CASE( "multiplyFullStateDiagMatr", TEST_CATEGORY_MULT LABEL_MIXED_DEPLOY_TAG ) {
 
     PREPARE_TEST( numQubits, cachedSV, cachedDM, refSV, refDM );
 
@@ -1891,13 +1962,20 @@ TEST_CASE( "postMultiplyFullStateDiagMatr", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG
     SECTION( LABEL_CORRECTNESS ) {
 
         qmatrix refMatr = getRandomDiagonalMatrix(getPow2(numQubits));
-        auto apiFunc = postMultiplyFullStateDiagMatr;
+        auto apiFunc = multiplyFullStateDiagMatr;
 
         GENERATE( range(0, getNumTestedMixedDeploymentRepetitions()) );
 
+        SECTION( LABEL_STATEVEC ) {
+
+            auto refFunc = [&] (qvector& state, qmatrix matr) { multiplyReferenceOperator(state, matr); };
+
+            TEST_ON_CACHED_QUREG_AND_MATRIX( cachedSV, cachedMatrs, apiFunc, refSV, refMatr, refFunc);
+        }
+
         SECTION( LABEL_DENSMATR ) {
 
-            auto refFunc = [&] (qmatrix& state, qmatrix matr) { postMultiplyReferenceOperator(state, matr); };
+            auto refFunc = [&] (qmatrix& state, qmatrix matr) { multiplyReferenceOperator(state, matr); };
 
             TEST_ON_CACHED_QUREG_AND_MATRIX( cachedDM, cachedMatrs, apiFunc, refDM, refMatr, refFunc);
         }
@@ -1907,7 +1985,7 @@ TEST_CASE( "postMultiplyFullStateDiagMatr", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG
 }
 
 
-TEST_CASE( "postMultiplyFullStateDiagMatrPower", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG ) {
+TEST_CASE( "postMultiplyFullStateDiagMatr", TEST_CATEGORY_MULT LABEL_MIXED_DEPLOY_TAG ) {
 
     PREPARE_TEST( numQubits, cachedSV, cachedDM, refSV, refDM );
 
@@ -1916,22 +1994,13 @@ TEST_CASE( "postMultiplyFullStateDiagMatrPower", TEST_CATEGORY LABEL_MIXED_DEPLO
     SECTION( LABEL_CORRECTNESS ) {
 
         qmatrix refMatr = getRandomDiagonalMatrix(getPow2(numQubits));
-        qcomp exponent = getRandomComplex();
-
-        auto apiFunc = [&](Qureg qureg, FullStateDiagMatr matr) { 
-            return postMultiplyFullStateDiagMatrPower(qureg, matr, exponent);
-        };
+        auto apiFunc = postMultiplyFullStateDiagMatr;
 
-        CAPTURE( exponent );
-        
         GENERATE( range(0, getNumTestedMixedDeploymentRepetitions()) );
 
         SECTION( LABEL_DENSMATR ) {
 
-            auto refFunc = [&] (qmatrix& state, qmatrix matr) { 
-                matr = getPowerOfDiagonalMatrix(matr, exponent);
-                postMultiplyReferenceOperator(state, matr);
-            };
+            auto refFunc = [&] (qmatrix& state, qmatrix matr) { postMultiplyReferenceOperator(state, matr); };
 
             TEST_ON_CACHED_QUREG_AND_MATRIX( cachedDM, cachedMatrs, apiFunc, refDM, refMatr, refFunc);
         }
@@ -1941,7 +2010,7 @@ TEST_CASE( "postMultiplyFullStateDiagMatrPower", TEST_CATEGORY LABEL_MIXED_DEPLO
 }
 
 
-TEST_CASE( "applyFullStateDiagMatr", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG ) {
+TEST_CASE( "multiplyFullStateDiagMatrPower", TEST_CATEGORY_MULT LABEL_MIXED_DEPLOY_TAG ) {
 
     PREPARE_TEST( numQubits, cachedSV, cachedDM, refSV, refDM );
 
@@ -1949,21 +2018,33 @@ TEST_CASE( "applyFullStateDiagMatr", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG ) {
 
     SECTION( LABEL_CORRECTNESS ) {
 
-        qmatrix refMatr = getRandomDiagonalUnitary(numQubits);
-        auto apiFunc = applyFullStateDiagMatr;
+        qmatrix refMatr = getRandomDiagonalMatrix(getPow2(numQubits));
+        qcomp exponent = getRandomComplex();
+
+        auto apiFunc = [&](Qureg qureg, FullStateDiagMatr matr) { 
+            return multiplyFullStateDiagMatrPower(qureg, matr, exponent);
+        };
 
+        CAPTURE( exponent );
+        
         GENERATE( range(0, getNumTestedMixedDeploymentRepetitions()) );
 
         SECTION( LABEL_STATEVEC ) {
 
-            auto refFunc = [&] (qvector& state, qmatrix matr) { applyReferenceOperator(state, matr); };
+            auto refFunc = [&] (qvector& state, qmatrix matr) { 
+                matr = getPowerOfDiagonalMatrix(matr, exponent);
+                multiplyReferenceOperator(state, matr);
+            };
 
             TEST_ON_CACHED_QUREG_AND_MATRIX( cachedSV, cachedMatrs, apiFunc, refSV, refMatr, refFunc);
         }
 
         SECTION( LABEL_DENSMATR ) {
 
-            auto refFunc = [&] (qmatrix& state, qmatrix matr) { applyReferenceOperator(state, matr); };
+            auto refFunc = [&] (qmatrix& state, qmatrix matr) { 
+                matr = getPowerOfDiagonalMatrix(matr, exponent);
+                multiplyReferenceOperator(state, matr);
+            };
 
             TEST_ON_CACHED_QUREG_AND_MATRIX( cachedDM, cachedMatrs, apiFunc, refDM, refMatr, refFunc);
         }
@@ -1973,7 +2054,7 @@ TEST_CASE( "applyFullStateDiagMatr", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG ) {
 }
 
 
-TEST_CASE( "applyFullStateDiagMatrPower", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG ) {
+TEST_CASE( "postMultiplyFullStateDiagMatrPower", TEST_CATEGORY_MULT LABEL_MIXED_DEPLOY_TAG ) {
 
     PREPARE_TEST( numQubits, cachedSV, cachedDM, refSV, refDM );
 
@@ -1981,54 +2062,33 @@ TEST_CASE( "applyFullStateDiagMatrPower", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG )
 
     SECTION( LABEL_CORRECTNESS ) {
 
-        qmatrix refMatr = getRandomDiagonalUnitary(numQubits);
-
-        // supplying a complex exponent requires disabling
-        // numerical validation to relax unitarity
-        bool testRealExp = GENERATE( true, false );
-        qcomp exponent = (testRealExp)?
-            qcomp(getRandomReal(-2, 2), 0):
-            getRandomComplex();
+        qmatrix refMatr = getRandomDiagonalMatrix(getPow2(numQubits));
+        qcomp exponent = getRandomComplex();
 
         auto apiFunc = [&](Qureg qureg, FullStateDiagMatr matr) { 
-            return applyFullStateDiagMatrPower(qureg, matr, exponent);
+            return postMultiplyFullStateDiagMatrPower(qureg, matr, exponent);
         };
 
         CAPTURE( exponent );
-
+        
         GENERATE( range(0, getNumTestedMixedDeploymentRepetitions()) );
 
-        if (!testRealExp)
-            setValidationEpsilon(0);
-
-        SECTION( LABEL_STATEVEC ) {
-
-            auto refFunc = [&] (qvector& state, qmatrix matr) { 
-                matr = getPowerOfDiagonalMatrix(matr, exponent);
-                applyReferenceOperator(state, matr);
-            };
-
-            TEST_ON_CACHED_QUREG_AND_MATRIX( cachedSV, cachedMatrs, apiFunc, refSV, refMatr, refFunc);
-        }
-
         SECTION( LABEL_DENSMATR ) {
 
             auto refFunc = [&] (qmatrix& state, qmatrix matr) { 
                 matr = getPowerOfDiagonalMatrix(matr, exponent);
-                applyReferenceOperator(state, matr);
+                postMultiplyReferenceOperator(state, matr);
             };
 
             TEST_ON_CACHED_QUREG_AND_MATRIX( cachedDM, cachedMatrs, apiFunc, refDM, refMatr, refFunc);
         }
-
-        setValidationEpsilonToDefault();
     }
 
     /// @todo input validation
 }
 
 
-TEST_CASE( "multiplyPauliStrSum", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG ) {
+TEST_CASE( "multiplyPauliStrSum", TEST_CATEGORY_MULT LABEL_MIXED_DEPLOY_TAG ) {
 
     PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );
 
@@ -2058,7 +2118,7 @@ TEST_CASE( "multiplyPauliStrSum", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG ) {
 }
 
 
-TEST_CASE( "postMultiplyPauliStrSum", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG ) {
+TEST_CASE( "postMultiplyPauliStrSum", TEST_CATEGORY_MULT LABEL_MIXED_DEPLOY_TAG ) {
 
     PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );
 
@@ -2087,35 +2147,6 @@ TEST_CASE( "postMultiplyPauliStrSum", TEST_CATEGORY LABEL_MIXED_DEPLOY_TAG ) {
 }
 
 
-TEST_CASE( "applyNonUnitaryPauliGadget", TEST_CATEGORY ) {
-
-    PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );
-
-    SECTION( LABEL_CORRECTNESS ) {
-
-        // prepare a random Pauli string and angle
-        int numTargs = GENERATE_COPY( range(1, numQubits+1) );
-        auto targs = GENERATE_TARGS( numQubits, numTargs );
-        PauliStr str = getRandomPauliStr(targs);
-        qcomp angle = getRandomComplex();
-
-        // prepare the corresponding reference matrix exp(-i angle pauli)
-        auto matrRef = getExponentialOfPauliMatrix(angle, getMatrix(str, numQubits));
-
-        auto testFunc = [&](Qureg qureg, auto& stateRef) {
-            applyNonUnitaryPauliGadget(qureg, str, angle);
-            applyReferenceOperator(stateRef, matrRef);
-        };
-
-        CAPTURE( targs, angle );
-        SECTION( LABEL_STATEVEC ) { TEST_ON_CACHED_QUREGS(statevecQuregs, statevecRef, testFunc); }
-        SECTION( LABEL_DENSMATR ) { TEST_ON_CACHED_QUREGS(densmatrQuregs, densmatrRef, testFunc); }
-    }
-
-    /// @todo input validation
-}
-
-
 /** @} (end defgroup) */
 
 

From 2110b745bddaafa4554996b537ad7b0ec2fcc9c7 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Sun, 20 Jul 2025 03:37:11 +0200
Subject: [PATCH 14/32] separated API trotterisation from operations

which has the below benefits:
- the remaining functions in operations.cpp are precise and do not need to be user-configured for accuracy (i.e. no Trotter hyperparameters)
- the remaining functions in operations.cpp merely call the backend and do not include any bespoke logic (i.e. Trotter circuit scheduling)
- incoming new Trotter functions for dynamical simulation will be more clearly delineated from the "standard" (and relatively boring) operations
- the Trotter logic is isolated in preparation for it becoming more substantial with the introduction of randomisation, commuting groups, and the logic necessary for Lindblad master equation solving
---
 quest/include/operations.h       | 280 ----------------------------
 quest/include/quest.h            |   1 +
 quest/include/trotterisation.h   | 309 +++++++++++++++++++++++++++++++
 quest/src/api/CMakeLists.txt     |   1 +
 quest/src/api/operations.cpp     | 147 ---------------
 quest/src/api/trotterisation.cpp | 175 +++++++++++++++++
 tests/unit/CMakeLists.txt        |   1 +
 tests/unit/operations.cpp        |  17 --
 tests/unit/trotterisation.cpp    |  36 ++++
 9 files changed, 523 insertions(+), 444 deletions(-)
 create mode 100644 quest/include/trotterisation.h
 create mode 100644 quest/src/api/trotterisation.cpp
 create mode 100644 tests/unit/trotterisation.cpp

diff --git a/quest/include/operations.h b/quest/include/operations.h
index bbeaf12d2..2a5c394b0 100644
--- a/quest/include/operations.h
+++ b/quest/include/operations.h
@@ -2094,286 +2094,6 @@ void applyMultiQubitPhaseShift(Qureg qureg, std::vector<int> targets, qreal angl
 
 
 
-/** 
- * @defgroup op_paulistrsum PauliStrSum gadgets
- * @brief Functions for apply Trotterised exponentials of weighted sums of Pauli tensors.
- * @{
- */
-
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-
-/** @notyettested
- * 
- * Effects (an approximation to) the exponential of @p sum, weighted by @p angle, upon @p qureg,
- * via the symmetrized Trotter-Suzuki decomposition (<a href="https://arxiv.org/abs/math-ph/0506007">arXiv</a>).
- * Increasing @p reps (the number of Trotter repetitions) or @p order (an even, positive integer or one) 
- * improves the accuracy of the approximation (reducing the "Trotter error" due to non-commuting 
- * terms of @p sum), though increases the runtime linearly and exponentially respectively.
- * 
- * @formulae 
- * 
- * Let @f$ \hat{H} = @f$ @p sum and @f$ \theta = @f$ @p angle. This function approximates the action of
- * @f[
-      \exp \left(\iu \, \theta \, \hat{H} \right)
- * @f]
- * via a Trotter-Suzuki decomposition of the specified @p order and number of repetitions (@p reps).
- * Simulation is exact, regardless of @p order or @p reps, only when all terms in @p sum commute.
- * 
- * @important
- *   Note that @f$ \theta @f$ lacks the @f$ -\frac{1}{2} @f$ prefactor present in other functions like
- *   applyPauliGadget().
- * 
- * To be precise, let @f$ r = @f$ @p reps and assume @p sum is composed of
- * @f$ T @f$-many terms of the form
- * @f[
-      \hat{H} = \sum\limits_j^T c_j \, \hat{\sigma}_j
- * @f]
- * where @f$ c_j @f$ is the coefficient of the @f$ j @f$-th PauliStr @f$ \hat{\sigma}_j @f$.
- * 
- * - When @p order=1, this function performs first-order Trotterisation, whereby
- *   @f[
-       \exp(\iu \, \theta \, \hat{H} )
-          \approx 
-        \prod\limits^{r} 
-        \prod\limits_{j=1}^{T} 
-        \exp \left( \iu \, \frac{\theta \, c_j}{r} \, \hat\sigma_j \right).
- *   @f]
- * - When @p order=2, this function performs the lowest order "symmetrized" Suzuki decomposition, whereby 
- *   @f[
-       \exp(\iu \, \theta \, \hat{H} )
-          \approx 
-        \prod\limits^{r} \left[
-             \prod\limits_{j=1}^{T} \exp \left( \iu \frac{\theta \, c_j}{2 \, r}  \hat\sigma_j \right)
-              \prod\limits_{j=T}^{1} \exp \left( \iu \frac{\theta \, c_j}{2 \, r}  \hat\sigma_j \right)
-         \right].
- *   @f]
- * - Greater, even values of @p order (denoted by symbol @f$ n @f$) invoke higher-order symmetrized decompositions 
- *   @f$ S[\theta,n,r] @f$. Letting @f$ p = \left( 4 - 4^{1/(n-1)} \right)^{-1} @f$, these satisfy
- *   @f{align*}
-        S[\theta, n, 1] &= 
-            \left( \prod\limits^2 S[p \, \theta, n-2, 1] \right)
-            S[ (1-4p)\,\theta, n-2, 1]
-            \left( \prod\limits^2 S[p \, \theta, n-2, 1] \right),
-        \\
-        S[\theta, n, r] &= 
-            \prod\limits^{r} S\left[\frac{\theta}{r}, n, 1\right].
- *   @f}
- * 
- * > These formulations are taken from 'Finding Exponential Product Formulas
- * > of Higher Orders', Naomichi Hatano and Masuo Suzuki (2005) (<a href="https://arxiv.org/abs/math-ph/0506007">arXiv</a>).
- * 
- * @equivalences
- * 
- * - Time evolution of duration @f$ t @f$ under a time-independent Hamiltonian @p sum = @f$ \hat{H} @f$, as
- *   per the unitary time evolution operator
- *   @f[
-        \hat{U}(t) = \exp(- \iu \, t  \,\hat{H} \, / \, \hbar) 
- *   @f]
- *   is approximated via @f$ \theta = - t / \hbar @f$.
- *   ```
-     qreal time = 3.14;
-     qreal angle = - time / hbar;
-     applyTrotterizedPauliStrSumGadget(qureg, sum, angle, order, reps);
- *   ```
- * - This function is equivalent to applyNonUnitaryTrotterizedPauliStrSumGadget() when passing
- *   a @p qcomp instance with a zero imaginary component as the @p angle parameter. This latter 
- *   function is useful for generalising dynamical simulation to imaginary-time evolution.
- * 
- * @constraints
- * - Unitarity of the prescribed exponential(s) requires that @p sum is Hermitian, ergo containing
- *   only real coefficients. Validation will check that @p sum is approximately Hermitian, permitting
- *   coefficients with imaginary components smaller (in magnitude) than epsilon.
- *   @f[ 
-        \max\limits_{i} \Big|c_i| \le \valeps
- *   @f]
- *   where the validation epsilon @f$ \valeps @f$ can be adjusted with setValidationEpsilon().
- *   Otherwise, use applyNonUnitaryTrotterizedPauliStrSumGadget() to permit non-Hermitian @p sum
- *   and ergo effect a non-unitary exponential(s). 
- * - The @p angle parameter is necessarily real despite the validation epsilon, but can be relaxed
- *   to an arbitrary complex scalar using applyNonUnitaryTrotterizedPauliStrSumGadget().
- * - This function only ever effects @f$ \exp \left(\iu \, \theta \, \hat{H} \right) @f$ exactly
- *   when all PauliStr in @p sum = @f$ \hat{H} @f$ commute. 
- * 
- * @param[in,out] qureg  the state to modify.
- * @param[in]     sum    a weighted sum of Pauli strings to approximately exponentiate.
- * @param[in]     angle  an effective prefactor of @p sum in the exponent.
- * @param[in]     order  the order of the Trotter-Suzuki decomposition (e.g. @p 1, @p 2, @p 4, ...)
- * @param[in]     reps   the number of Trotter repetitions
- * 
- * @throws @validationerror
- * - if @p qureg or @p sum are uninitialised.
- * - if @p sum is not approximately Hermitian.
- * - if @p sum contains non-identities on qubits beyond the size of @p qureg.
- * - if @p order is not 1 nor a positive, @b even integer.
- * - if @p reps is not a positive integer.
- * 
- * @see
- *  - applyPauliGadget()
- *  - applyNonUnitaryTrotterizedPauliStrSumGadget()
- * 
- * @author Tyson Jones
- */
-void applyTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qreal angle, int order, int reps);
-
-
-/// @notyetdoced
-/// @notyettested
-/// @see
-///  - applyTrotterizedPauliStrSumGadget()
-///  - applyControlledCompMatr1()
-void applyControlledTrotterizedPauliStrSumGadget(Qureg qureg, int control, PauliStrSum sum, qreal angle, int order, int reps);
-
-
-/// @notyetdoced
-/// @notyettested
-/// @see
-///  - applyTrotterizedPauliStrSumGadget()
-///  - applyMultiControlledCompMatr1()
-void applyMultiControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls, int numControls, PauliStrSum sum, qreal angle, int order, int reps);
-
-
-/// @notyetdoced
-/// @notyettested
-/// @see
-///  - applyTrotterizedPauliStrSumGadget()
-///  - applyMultiStateControlledCompMatr1()
-void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls, int* states, int numControls, PauliStrSum sum, qreal angle, int order, int reps);
-
-
-/** @notyettested
- * 
- * A generalisation of applyTrotterizedPauliStrSumGadget() which accepts a complex angle and permits
- * @p sum to be non-Hermitian, thereby effecting a potentially non-unitary and non-CPTP operation.
- * 
- * @formulae 
- * 
- * Let @f$ \hat{H} = @f$ @p sum and @f$ \theta = @f$ @p angle. This function approximates the action of
- * @f[
-      \exp \left(\iu \, \theta \, \hat{H} \right)
- * @f]
- * via a Trotter-Suzuki decomposition of the specified @p order and number of repetitions (@p reps). 
- * 
- * See applyTrotterizedPauliStrSumGadget() for more information about the decomposition.
- *
- * @equivalences
- * 
- * - When @p angle is set to @f$ \theta = \iu \, \tau @f$ and @p sum = @f$ \hat{H} @f$ is Hermitian,
- *   this function (approximately) evolves @p qureg in imaginary-time. That is, letting 
- *   @f$ \hat{U}(t) = \exp(-\iu \, t \, \hat{H}) @f$ be the normalised unitary evolution operator, this 
- *   function effects the imaginary-time operator
-     @f[
-        \hat{V}(\tau) = \hat{U}(t=-\iu \tau) = \exp(- \tau \hat{H}).
- *   @f]
- *   This operation drives the system toward the (unnormalised) groundstate.
- *   Let @f$ \{ \ket{\phi_i} \} @f$ and @f$ \{ \ket{\lambda_i} \} @f$ be the eigenstates and respective
- *   eigenvalues of @f$ \hat{H} @f$, which are real due to Hermiticity.
- *   @f[
-         \hat{H} = \sum \limits_i \lambda_i \ket{\phi_i}\bra{\phi_i},
-         \;\;\;\;\; \lambda_i \in \mathbb{R}.
- *   @f]
- *   
- *   - When @p qureg is a statevector @f$ \svpsi @f$ and can ergo be expressed in the basis of 
- *     @f$ \{ \ket{\phi_i} \} @f$ as @f$ \svpsi = \sum_i \alpha_i \ket{\phi_i} @f$, 
- *     this function approximates
- *     @f[
-          \svpsi \, \rightarrow  \, \hat{V}(\tau) \svpsi =
-          \sum\limits_i \alpha_i \exp(- \tau \, \lambda_i) \ket{\phi_i}.
- *     @f]
- *   - When @p qureg is a density matrix and is ergo expressible as
- *     @f$ \dmrho = \sum\limits_{ij} \alpha_{ij} \ket{\phi_i}\bra{\phi_j} @f$, this function effects
- *     @f[
-          \dmrho \, \rightarrow \, \hat{V}(\tau) \dmrho \hat{V}(\tau)^\dagger =
-          \sum\limits_{ij} \alpha_{ij} \exp(-\tau (\lambda_i + \lambda_j)) \ket{\phi_i}\bra{\phi_j}.
- *     @f]
- *
- *   As @f$ \tau \rightarrow \infty @f$, the resulting unnormalised state approaches statevector
- *   @f$ \svpsi \rightarrow \alpha_0 \exp(-\tau \lambda_0) \ket{\phi_0} @f$ or density matrix
- *   @f$ \dmrho \rightarrow \alpha_{0,0} \exp(-2 \tau \lambda_0) \ket{\phi_0}\bra{\phi_0} @f$,
- *   where @f$ \lambda_0 @f$ is the minimum eigenvalue and @f$ \ket{\phi_0} @f$ is the groundstate.
- *   Assuming the initial overlap @f$ \alpha_0 @f$ is not zero (or exponentially tiny), 
- *   subsequent renormalisation via setQuregToRenormalized() produces the pure 
- *   ground-state @f$ \ket{\phi_0} @f$.
- *
- *   ```
-     // pray for a non-zero initial overlap
-     initRandomPureState(qureg); // works even for density matrices
-
-     // minimize then renormalise
-     qreal tau = 10; // impatient infinity
-     int order = 4;
-     int reps = 100;
-     applyNonUnitaryTrotterizedPauliStrSumGadget(qureg, hamil, tau * 1i, order, reps);
-     setQuregToRenormalized(qureg);
-
-     // ground-state (phi_0)
-     reportQureg(qureg);
-
-     // lowest lying eigenvalue (lambda_0)
-     qreal expec = calcExpecPauliStrSum(qureg, hamil);
-     reportScalar("expec", expec);
- *   ```
- *
- *   Note degenerate eigenvalues will yield a pure superposition of the corresponding eigenstates, with 
- *   coefficients informed by the initial, relative populations.
- * 
- * - When @p angle is real and @p sum is Hermitian (has approximately real coefficients), this
- *   function is equivalent to applyTrotterizedPauliStrSumGadget()
- * 
- * @constraints
- * - This function only ever effects @f$ \exp \left(\iu \, \theta \, \hat{H} \right) @f$ exactly
- *   when all PauliStr in @p sum = @f$ \hat{H} @f$ commute. 
- * 
- * @param[in,out] qureg  the state to modify.
- * @param[in]     sum    a weighted sum of Pauli strings to approximately exponentiate.
- * @param[in]     angle  an effective prefactor of @p sum in the exponent.
- * @param[in]     order  the order of the Trotter-Suzuki decomposition (e.g. @p 1, @p 2, @p 4, ...)
- * @param[in]     reps   the number of Trotter repetitions
- * 
- * @throws @validationerror
- * - if @p qureg or @p sum are uninitialised.
- * - if @p sum contains non-identities on qubits beyond the size of @p qureg.
- * - if @p order is not 1 nor a positive, @b even integer.
- * - if @p reps is not a positive integer.
- * 
- * @author Tyson Jones
- */
-void applyNonUnitaryTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qcomp angle, int order, int reps);
-
-
-// end de-mangler
-#ifdef __cplusplus
-}
-#endif
-
-#ifdef __cplusplus
-
-
-/// @notyettested
-/// @notyetvalidated
-/// @notyetdoced
-/// @cppvectoroverload
-/// @see applyMultiControlledTrotterizedPauliStrSumGadget()
-void applyMultiControlledTrotterizedPauliStrSumGadget(Qureg qureg, std::vector<int> controls, PauliStrSum sum, qreal angle, int order, int reps);
-
-
-/// @notyettested
-/// @notyetvalidated
-/// @notyetdoced
-/// @cppvectoroverload
-/// @see applyMultiStateControlledTrotterizedPauliStrSumGadget()
-void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, std::vector<int> controls, std::vector<int> states, PauliStrSum sum, qreal angle, int order, int reps);
-
-
-#endif // __cplusplus
-
-/** @} */
-
-
-
 /** 
  * @defgroup op_nots Many-not gates
  * @brief Functions for effecting many-qubit NOT gates
diff --git a/quest/include/quest.h b/quest/include/quest.h
index afcb316be..c0a30ed1f 100644
--- a/quest/include/quest.h
+++ b/quest/include/quest.h
@@ -48,6 +48,7 @@
 #include "quest/include/debug.h"
 #include "quest/include/decoherence.h"
 #include "quest/include/environment.h"
+#include "quest/include/trotterisation.h"
 #include "quest/include/initialisations.h"
 #include "quest/include/channels.h"
 #include "quest/include/multiplication.h"
diff --git a/quest/include/trotterisation.h b/quest/include/trotterisation.h
new file mode 100644
index 000000000..6c16e3d9a
--- /dev/null
+++ b/quest/include/trotterisation.h
@@ -0,0 +1,309 @@
+/** @file
+ * API signatures for effecting Trotterised operators which
+ * approximate the action of exponentials of PauliStrSum
+ * 
+ * @author Tyson Jones
+ * 
+ * @defgroup trotterisation Trotterisation
+ * @ingroup api
+ * @brief Functions for Trotterising operations upon Quregs.
+ * @{
+ */
+
+#ifndef TROTTERISATION_H
+#define TROTTERISATION_H
+
+#include "quest/include/qureg.h"
+#include "quest/include/paulis.h"
+#include "quest/include/matrices.h"
+
+#ifdef __cplusplus
+    #include <vector>
+#endif
+
+
+
+/** 
+ * @defgroup trotter_paulistrsum PauliStrSum gadgets
+ * @brief Functions for using Trotterisation to approximate the action of 
+ *        exponentials of weighted sums of Pauli tensors upon Quregs.
+ * @{
+ */
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** @notyettested
+ * 
+ * Effects (an approximation to) the exponential of @p sum, weighted by @p angle, upon @p qureg,
+ * via the symmetrized Trotter-Suzuki decomposition (<a href="https://arxiv.org/abs/math-ph/0506007">arXiv</a>).
+ * Increasing @p reps (the number of Trotter repetitions) or @p order (an even, positive integer or one) 
+ * improves the accuracy of the approximation (reducing the "Trotter error" due to non-commuting 
+ * terms of @p sum), though increases the runtime linearly and exponentially respectively.
+ * 
+ * @formulae 
+ * 
+ * Let @f$ \hat{H} = @f$ @p sum and @f$ \theta = @f$ @p angle. This function approximates the action of
+ * @f[
+      \exp \left(\iu \, \theta \, \hat{H} \right)
+ * @f]
+ * via a Trotter-Suzuki decomposition of the specified @p order and number of repetitions (@p reps).
+ * Simulation is exact, regardless of @p order or @p reps, only when all terms in @p sum commute.
+ * 
+ * @important
+ *   Note that @f$ \theta @f$ lacks the @f$ -\frac{1}{2} @f$ prefactor present in other functions like
+ *   applyPauliGadget().
+ * 
+ * To be precise, let @f$ r = @f$ @p reps and assume @p sum is composed of
+ * @f$ T @f$-many terms of the form
+ * @f[
+      \hat{H} = \sum\limits_j^T c_j \, \hat{\sigma}_j
+ * @f]
+ * where @f$ c_j @f$ is the coefficient of the @f$ j @f$-th PauliStr @f$ \hat{\sigma}_j @f$.
+ * 
+ * - When @p order=1, this function performs first-order Trotterisation, whereby
+ *   @f[
+       \exp(\iu \, \theta \, \hat{H} )
+          \approx 
+        \prod\limits^{r} 
+        \prod\limits_{j=1}^{T} 
+        \exp \left( \iu \, \frac{\theta \, c_j}{r} \, \hat\sigma_j \right).
+ *   @f]
+ * - When @p order=2, this function performs the lowest order "symmetrized" Suzuki decomposition, whereby 
+ *   @f[
+       \exp(\iu \, \theta \, \hat{H} )
+          \approx 
+        \prod\limits^{r} \left[
+             \prod\limits_{j=1}^{T} \exp \left( \iu \frac{\theta \, c_j}{2 \, r}  \hat\sigma_j \right)
+              \prod\limits_{j=T}^{1} \exp \left( \iu \frac{\theta \, c_j}{2 \, r}  \hat\sigma_j \right)
+         \right].
+ *   @f]
+ * - Greater, even values of @p order (denoted by symbol @f$ n @f$) invoke higher-order symmetrized decompositions 
+ *   @f$ S[\theta,n,r] @f$. Letting @f$ p = \left( 4 - 4^{1/(n-1)} \right)^{-1} @f$, these satisfy
+ *   @f{align*}
+        S[\theta, n, 1] &= 
+            \left( \prod\limits^2 S[p \, \theta, n-2, 1] \right)
+            S[ (1-4p)\,\theta, n-2, 1]
+            \left( \prod\limits^2 S[p \, \theta, n-2, 1] \right),
+        \\
+        S[\theta, n, r] &= 
+            \prod\limits^{r} S\left[\frac{\theta}{r}, n, 1\right].
+ *   @f}
+ * 
+ * > These formulations are taken from 'Finding Exponential Product Formulas
+ * > of Higher Orders', Naomichi Hatano and Masuo Suzuki (2005) (<a href="https://arxiv.org/abs/math-ph/0506007">arXiv</a>).
+ * 
+ * @equivalences
+ * 
+ * - Time evolution of duration @f$ t @f$ under a time-independent Hamiltonian @p sum = @f$ \hat{H} @f$, as
+ *   per the unitary time evolution operator
+ *   @f[
+        \hat{U}(t) = \exp(- \iu \, t  \,\hat{H} \, / \, \hbar) 
+ *   @f]
+ *   is approximated via @f$ \theta = - t / \hbar @f$.
+ *   ```
+     qreal time = 3.14;
+     qreal angle = - time / hbar;
+     applyTrotterizedPauliStrSumGadget(qureg, sum, angle, order, reps);
+ *   ```
+ * - This function is equivalent to applyNonUnitaryTrotterizedPauliStrSumGadget() when passing
+ *   a @p qcomp instance with a zero imaginary component as the @p angle parameter. This latter 
+ *   function is useful for generalising dynamical simulation to imaginary-time evolution.
+ * 
+ * @constraints
+ * - Unitarity of the prescribed exponential(s) requires that @p sum is Hermitian, ergo containing
+ *   only real coefficients. Validation will check that @p sum is approximately Hermitian, permitting
+ *   coefficients with imaginary components smaller (in magnitude) than epsilon.
+ *   @f[ 
+        \max\limits_{i} \Big| \, \mathrm{Im}(c_i) \, \Big| \le \valeps
+ *   @f]
+ *   where the validation epsilon @f$ \valeps @f$ can be adjusted with setValidationEpsilon().
+ *   Otherwise, use applyNonUnitaryTrotterizedPauliStrSumGadget() to permit non-Hermitian @p sum
+ *   and ergo effect a non-unitary exponential(s). 
+ * - The @p angle parameter is necessarily real despite the validation epsilon, but can be relaxed
+ *   to an arbitrary complex scalar using applyNonUnitaryTrotterizedPauliStrSumGadget().
+ * - This function only ever effects @f$ \exp \left(\iu \, \theta \, \hat{H} \right) @f$ exactly
+ *   when all PauliStr in @p sum = @f$ \hat{H} @f$ commute. 
+ * 
+ * @param[in,out] qureg  the state to modify.
+ * @param[in]     sum    a weighted sum of Pauli strings to approximately exponentiate.
+ * @param[in]     angle  an effective prefactor of @p sum in the exponent.
+ * @param[in]     order  the order of the Trotter-Suzuki decomposition (e.g. @p 1, @p 2, @p 4, ...)
+ * @param[in]     reps   the number of Trotter repetitions
+ * 
+ * @throws @validationerror
+ * - if @p qureg or @p sum are uninitialised.
+ * - if @p sum is not approximately Hermitian.
+ * - if @p sum contains non-identities on qubits beyond the size of @p qureg.
+ * - if @p order is not 1 nor a positive, @b even integer.
+ * - if @p reps is not a positive integer.
+ * 
+ * @see
+ *  - applyPauliGadget()
+ *  - applyNonUnitaryTrotterizedPauliStrSumGadget()
+ * 
+ * @author Tyson Jones
+ */
+void applyTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qreal angle, int order, int reps);
+
+
+/// @notyetdoced
+/// @notyettested
+/// @see
+///  - applyTrotterizedPauliStrSumGadget()
+///  - applyControlledCompMatr1()
+void applyControlledTrotterizedPauliStrSumGadget(Qureg qureg, int control, PauliStrSum sum, qreal angle, int order, int reps);
+
+
+/// @notyetdoced
+/// @notyettested
+/// @see
+///  - applyTrotterizedPauliStrSumGadget()
+///  - applyMultiControlledCompMatr1()
+void applyMultiControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls, int numControls, PauliStrSum sum, qreal angle, int order, int reps);
+
+
+/// @notyetdoced
+/// @notyettested
+/// @see
+///  - applyTrotterizedPauliStrSumGadget()
+///  - applyMultiStateControlledCompMatr1()
+void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls, int* states, int numControls, PauliStrSum sum, qreal angle, int order, int reps);
+
+
+/** @notyettested
+ * 
+ * A generalisation of applyTrotterizedPauliStrSumGadget() which accepts a complex angle and permits
+ * @p sum to be non-Hermitian, thereby effecting a potentially non-unitary and non-CPTP operation.
+ * 
+ * @formulae 
+ * 
+ * Let @f$ \hat{H} = @f$ @p sum and @f$ \theta = @f$ @p angle. This function approximates the action of
+ * @f[
+      \exp \left(\iu \, \theta \, \hat{H} \right)
+ * @f]
+ * via a Trotter-Suzuki decomposition of the specified @p order and number of repetitions (@p reps). 
+ * 
+ * See applyTrotterizedPauliStrSumGadget() for more information about the decomposition.
+ *
+ * @equivalences
+ * 
+ * - When @p angle is set to @f$ \theta = \iu \, \tau @f$ and @p sum = @f$ \hat{H} @f$ is Hermitian,
+ *   this function (approximately) evolves @p qureg in imaginary-time. That is, letting 
+ *   @f$ \hat{U}(t) = \exp(-\iu \, t \, \hat{H}) @f$ be the normalised unitary evolution operator, this 
+ *   function effects the imaginary-time operator
+     @f[
+        \hat{V}(\tau) = \hat{U}(t=-\iu \tau) = \exp(- \tau \hat{H}).
+ *   @f]
+ *   This operation drives the system toward the (unnormalised) groundstate.
+ *   Let @f$ \{ \ket{\phi_i} \} @f$ and @f$ \{ \lambda_i \} @f$ be the eigenstates and respective
+ *   eigenvalues of @f$ \hat{H} @f$, which are real due to Hermiticity.
+ *   @f[
+         \hat{H} = \sum \limits_i \lambda_i \ket{\phi_i}\bra{\phi_i},
+         \;\;\;\;\; \lambda_i \in \mathbb{R}.
+ *   @f]
+ *   
+ *   - When @p qureg is a statevector @f$ \svpsi @f$ and can ergo be expressed in the basis of 
+ *     @f$ \{ \ket{\phi_i} \} @f$ as @f$ \svpsi = \sum_i \alpha_i \ket{\phi_i} @f$, 
+ *     this function approximates
+ *     @f[
+          \svpsi \, \rightarrow  \, \hat{V}(\tau) \svpsi =
+          \sum\limits_i \alpha_i \exp(- \tau \, \lambda_i) \ket{\phi_i}.
+ *     @f]
+ *   - When @p qureg is a density matrix and is ergo expressible as
+ *     @f$ \dmrho = \sum\limits_{ij} \alpha_{ij} \ket{\phi_i}\bra{\phi_j} @f$, this function effects
+ *     @f[
+          \dmrho \, \rightarrow \, \hat{V}(\tau) \dmrho \hat{V}(\tau)^\dagger =
+          \sum\limits_{ij} \alpha_{ij} \exp(-\tau (\lambda_i + \lambda_j)) \ket{\phi_i}\bra{\phi_j}.
+ *     @f]
+ *
+ *   As @f$ \tau \rightarrow \infty @f$, the resulting unnormalised state approaches statevector
+ *   @f$ \svpsi \rightarrow \alpha_0 \exp(-\tau \lambda_0) \ket{\phi_0} @f$ or density matrix
+ *   @f$ \dmrho \rightarrow \alpha_{0,0} \exp(-2 \tau \lambda_0) \ket{\phi_0}\bra{\phi_0} @f$,
+ *   where @f$ \lambda_0 @f$ is the minimum eigenvalue and @f$ \ket{\phi_0} @f$ is the groundstate.
+ *   Assuming the initial overlap @f$ \alpha_0 @f$ is not zero (or exponentially tiny), 
+ *   subsequent renormalisation via setQuregToRenormalized() produces the pure 
+ *   ground-state @f$ \ket{\phi_0} @f$.
+ *
+ *   ```
+     // pray for a non-zero initial overlap
+     initRandomPureState(qureg); // works even for density matrices
+
+     // minimize then renormalise
+     qreal tau = 10; // impatient infinity
+     int order = 4;
+     int reps = 100;
+     applyNonUnitaryTrotterizedPauliStrSumGadget(qureg, hamil, tau * 1i, order, reps);
+     setQuregToRenormalized(qureg);
+
+     // ground-state (phi_0)
+     reportQureg(qureg);
+
+     // lowest lying eigenvalue (lambda_0)
+     qreal expec = calcExpecPauliStrSum(qureg, hamil);
+     reportScalar("expec", expec);
+ *   ```
+ *
+ *   Note degenerate eigenvalues will yield a pure superposition of the corresponding eigenstates, with 
+ *   coefficients informed by the initial, relative populations.
+ * 
+ * - When @p angle is real and @p sum is Hermitian (has approximately real coefficients), this
+ *   function is equivalent to applyTrotterizedPauliStrSumGadget().
+ * 
+ * @constraints
+ * - This function only ever effects @f$ \exp \left(\iu \, \theta \, \hat{H} \right) @f$ exactly
+ *   when all PauliStr in @p sum = @f$ \hat{H} @f$ commute. 
+ * 
+ * @param[in,out] qureg  the state to modify.
+ * @param[in]     sum    a weighted sum of Pauli strings to approximately exponentiate.
+ * @param[in]     angle  an effective prefactor of @p sum in the exponent.
+ * @param[in]     order  the order of the Trotter-Suzuki decomposition (e.g. @p 1, @p 2, @p 4, ...)
+ * @param[in]     reps   the number of Trotter repetitions
+ * 
+ * @throws @validationerror
+ * - if @p qureg or @p sum are uninitialised.
+ * - if @p sum contains non-identities on qubits beyond the size of @p qureg.
+ * - if @p order is not 1 nor a positive, @b even integer.
+ * - if @p reps is not a positive integer.
+ * 
+ * @author Tyson Jones
+ */
+void applyNonUnitaryTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qcomp angle, int order, int reps);
+
+
+// end de-mangler
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __cplusplus
+
+
+/// @notyettested
+/// @notyetvalidated
+/// @notyetdoced
+/// @cppvectoroverload
+/// @see applyMultiControlledTrotterizedPauliStrSumGadget()
+void applyMultiControlledTrotterizedPauliStrSumGadget(Qureg qureg, std::vector<int> controls, PauliStrSum sum, qreal angle, int order, int reps);
+
+
+/// @notyettested
+/// @notyetvalidated
+/// @notyetdoced
+/// @cppvectoroverload
+/// @see applyMultiStateControlledTrotterizedPauliStrSumGadget()
+void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, std::vector<int> controls, std::vector<int> states, PauliStrSum sum, qreal angle, int order, int reps);
+
+
+#endif // __cplusplus
+
+/** @} */
+
+
+
+#endif // TROTTERISATION_H
+
+/** @} */ // (end file-wide doxygen defgroup)
diff --git a/quest/src/api/CMakeLists.txt b/quest/src/api/CMakeLists.txt
index d02797506..0979f2f6c 100644
--- a/quest/src/api/CMakeLists.txt
+++ b/quest/src/api/CMakeLists.txt
@@ -12,5 +12,6 @@ target_sources(QuEST
   operations.cpp
   paulis.cpp
   qureg.cpp
+  trotterisation.cpp
   types.cpp
 )
\ No newline at end of file
diff --git a/quest/src/api/operations.cpp b/quest/src/api/operations.cpp
index bf5b4c5b3..ede4ff011 100644
--- a/quest/src/api/operations.cpp
+++ b/quest/src/api/operations.cpp
@@ -998,153 +998,6 @@ void applyMultiStateControlledPauliStr(Qureg qureg, vector<int> controls, vector
 
 
 
-/*
- * Pauli string sums
- */
-
-extern "C" {
-
-void internal_applyFirstOrderTrotterRepetition(
-    Qureg qureg, vector<int>& ketCtrls, vector<int>& braCtrls,
-    vector<int>& states, PauliStrSum sum, qcomp angle, bool reverse
-) {
-    // apply each sum term as a gadget, in forward or reverse order
-    for (qindex i=0; i<sum.numTerms; i++) {
-        int j = reverse? sum.numTerms - i - 1 : i;
-        qcomp coeff = sum.coeffs[j];
-        PauliStr str = sum.strings[j];
-
-        // effect |psi> -> exp(i angle * sum)|psi>
-        qcomp arg = angle * coeff;
-        localiser_statevec_anyCtrlPauliGadget(qureg, ketCtrls, states, str, arg);
-
-        if (!qureg.isDensityMatrix)
-            continue;
-
-        // effect rho -> rho dagger(i angle * sum)
-        arg *= paulis_hasOddNumY(str) ? 1 : -1;
-        str = paulis_getShiftedPauliStr(str, qureg.numQubits);
-        localiser_statevec_anyCtrlPauliGadget(qureg, braCtrls, states, str, arg);
-    }
-}
-
-void internal_applyHigherOrderTrotterRepetition(
-    Qureg qureg, vector<int>& ketCtrls, vector<int>& braCtrls,
-    vector<int>& states, PauliStrSum sum, qcomp angle, int order
-) {
-    if (order == 1) {
-        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, angle, false);
-    
-    } else if (order == 2) {
-        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, angle/2, false);
-        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, angle/2, true);
-    
-    } else {
-        qreal p = 1. / (4 - std::pow(4, 1./(order-1)));
-        qcomp a = p * angle;
-        qcomp b = (1-4*p) * angle;
-
-        int lower = order - 2;
-        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, a, lower);
-        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, a, lower);
-        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, b, lower);
-        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, a, lower);
-        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, a, lower);
-    }
-}
-
-void internal_applyAllTrotterRepetitions(
-    Qureg qureg, int* controls, int* states, int numControls, 
-    PauliStrSum sum, qcomp angle, int order, int reps
-) {
-    // exp(i angle sum) = identity when angle=0
-    if (angle == qcomp(0,0))
-        return;
-
-    // prepare control-qubit lists once for all invoked gadgets below
-    auto ketCtrlsVec = util_getVector(controls, numControls);
-    auto braCtrlsVec = (qureg.isDensityMatrix)? util_getBraQubits(ketCtrlsVec, qureg) : vector<int>{};
-    auto statesVec = util_getVector(states, numControls);
-
-    qcomp arg = angle / reps;
-
-    // perform carefully-ordered sequence of gadgets
-    for (int r=0; r<reps; r++)
-        internal_applyHigherOrderTrotterRepetition(
-            qureg, ketCtrlsVec, braCtrlsVec, statesVec, sum, arg, order);
-
-    /// @todo
-    /// the accuracy of Trotterisation is greatly improved by randomisation
-    /// or (even sub-optimal) grouping into commuting terms. Should we 
-    /// implement these above or into another function?
-}
-
-void applyNonUnitaryTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qcomp angle, int order, int reps) {
-    validate_quregFields(qureg, __func__);
-    validate_pauliStrSumFields(sum, __func__);
-    validate_pauliStrSumTargets(sum, qureg, __func__);
-    validate_trotterParams(qureg, order, reps, __func__);
-    // sum is permitted to be non-Hermitian
-
-    internal_applyAllTrotterRepetitions(qureg, nullptr, nullptr, 0, sum, angle, order, reps);
-}
-
-void applyTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qreal angle, int order, int reps) {
-    validate_quregFields(qureg, __func__);
-    validate_pauliStrSumFields(sum, __func__);
-    validate_pauliStrSumTargets(sum, qureg, __func__);
-    validate_trotterParams(qureg, order, reps, __func__);
-    validate_pauliStrSumIsHermitian(sum, __func__);
-
-    internal_applyAllTrotterRepetitions(qureg, nullptr, nullptr, 0, sum, angle, order, reps);
-}
-
-void applyControlledTrotterizedPauliStrSumGadget(Qureg qureg, int control, PauliStrSum sum, qreal angle, int order, int reps) {
-    validate_quregFields(qureg, __func__);
-    validate_pauliStrSumFields(sum, __func__);
-    validate_controlAndPauliStrSumTargets(qureg, control, sum, __func__);
-    validate_trotterParams(qureg, order, reps, __func__);
-    validate_pauliStrSumIsHermitian(sum, __func__);
-
-    internal_applyAllTrotterRepetitions(qureg, &control, nullptr, 1, sum, angle, order, reps);
-}
-
-void applyMultiControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls, int numControls, PauliStrSum sum, qreal angle, int order, int reps) {
-    validate_quregFields(qureg, __func__);
-    validate_pauliStrSumFields(sum, __func__);
-    validate_controlsAndPauliStrSumTargets(qureg, controls, numControls, sum, __func__);
-    validate_trotterParams(qureg, order, reps, __func__);
-    validate_pauliStrSumIsHermitian(sum, __func__);
-
-    internal_applyAllTrotterRepetitions(qureg, controls, nullptr, numControls, sum, angle, order, reps);
-}
-
-void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls, int* states, int numControls, PauliStrSum sum, qreal angle, int order, int reps) {
-    validate_quregFields(qureg, __func__);
-    validate_pauliStrSumFields(sum, __func__);
-    validate_controlsAndPauliStrSumTargets(qureg, controls, numControls, sum, __func__);
-    validate_controlStates(states, numControls, __func__); // permits states==nullptr
-    validate_trotterParams(qureg, order, reps, __func__);
-    validate_pauliStrSumIsHermitian(sum, __func__);
-
-    internal_applyAllTrotterRepetitions(qureg, controls, states, numControls, sum, angle, order, reps);
-}
-
-} // end de-mangler
-
-void applyMultiControlledTrotterizedPauliStrSumGadget(Qureg qureg, vector<int> controls, PauliStrSum sum, qreal angle, int order, int reps) {
-
-    applyMultiControlledTrotterizedPauliStrSumGadget(qureg, controls.data(), controls.size(), sum, angle, order, reps);
-}
-
-void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, vector<int> controls, vector<int> states, PauliStrSum sum, qreal angle, int order, int reps) {
-    validate_controlsMatchStates(controls.size(), states.size(), __func__);
-
-    applyMultiStateControlledTrotterizedPauliStrSumGadget(qureg, controls.data(), states.data(), controls.size(), sum, angle, order, reps);
-}
-
-
-
 /*
  * individual axis rotations
  */
diff --git a/quest/src/api/trotterisation.cpp b/quest/src/api/trotterisation.cpp
new file mode 100644
index 000000000..43d371059
--- /dev/null
+++ b/quest/src/api/trotterisation.cpp
@@ -0,0 +1,175 @@
+/** @file
+ * API definitions for functions which involve Trotterising
+ * exponential operators, such as PauliStrSum gadgets, and
+ * so are inherently approximate.
+ * 
+ * @author Tyson Jones
+ */
+
+#include "quest/include/qureg.h"
+#include "quest/include/paulis.h"
+#include "quest/include/matrices.h"
+
+#include "quest/src/core/validation.hpp"
+#include "quest/src/core/utilities.hpp"
+#include "quest/src/core/localiser.hpp"
+
+#include <vector>
+
+using std::vector;
+
+
+
+/*
+ * INTERNAL UTILS
+ */
+
+extern bool paulis_hasOddNumY(PauliStr str);
+extern PauliStr paulis_getShiftedPauliStr(PauliStr str, int pauliShift);
+
+void internal_applyFirstOrderTrotterRepetition(
+    Qureg qureg, vector<int>& ketCtrls, vector<int>& braCtrls,
+    vector<int>& states, PauliStrSum sum, qcomp angle, bool reverse
+) {
+    // apply each sum term as a gadget, in forward or reverse order
+    for (qindex i=0; i<sum.numTerms; i++) {
+        int j = reverse? sum.numTerms - i - 1 : i;
+        qcomp coeff = sum.coeffs[j];
+        PauliStr str = sum.strings[j];
+
+        // effect |psi> -> exp(i angle * sum)|psi>
+        qcomp arg = angle * coeff;
+        localiser_statevec_anyCtrlPauliGadget(qureg, ketCtrls, states, str, arg);
+
+        if (!qureg.isDensityMatrix)
+            continue;
+
+        // effect rho -> rho dagger(exp(i angle * sum))
+        arg *= paulis_hasOddNumY(str) ? 1 : -1;
+        str = paulis_getShiftedPauliStr(str, qureg.numQubits);
+        localiser_statevec_anyCtrlPauliGadget(qureg, braCtrls, states, str, arg);
+    }
+}
+
+void internal_applyHigherOrderTrotterRepetition(
+    Qureg qureg, vector<int>& ketCtrls, vector<int>& braCtrls,
+    vector<int>& states, PauliStrSum sum, qcomp angle, int order
+) {
+    if (order == 1) {
+        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, angle, false);
+    
+    } else if (order == 2) {
+        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, angle/2, false);
+        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, angle/2, true);
+    
+    } else {
+        qreal p = 1. / (4 - std::pow(4, 1./(order-1)));
+        qcomp a = p * angle;
+        qcomp b = (1-4*p) * angle;
+
+        int lower = order - 2;
+        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, a, lower);
+        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, a, lower);
+        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, b, lower);
+        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, a, lower);
+        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, a, lower);
+    }
+}
+
+void internal_applyAllTrotterRepetitions(
+    Qureg qureg, int* controls, int* states, int numControls, 
+    PauliStrSum sum, qcomp angle, int order, int reps
+) {
+    // exp(i angle sum) = identity when angle=0
+    if (angle == qcomp(0,0))
+        return;
+
+    // prepare control-qubit lists once for all invoked gadgets below
+    auto ketCtrlsVec = util_getVector(controls, numControls);
+    auto braCtrlsVec = (qureg.isDensityMatrix)? util_getBraQubits(ketCtrlsVec, qureg) : vector<int>{};
+    auto statesVec = util_getVector(states, numControls);
+
+    qcomp arg = angle / reps;
+
+    // perform carefully-ordered sequence of gadgets
+    for (int r=0; r<reps; r++)
+        internal_applyHigherOrderTrotterRepetition(
+            qureg, ketCtrlsVec, braCtrlsVec, statesVec, sum, arg, order);
+
+    /// @todo
+    /// the accuracy of Trotterisation is greatly improved by randomisation
+    /// or (even sub-optimal) grouping into commuting terms. Should we 
+    /// implement these above or into another function?
+}
+
+
+
+/*
+ * PAULI STR SUM GADGETS
+ */
+
+extern "C" {
+
+void applyNonUnitaryTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qcomp angle, int order, int reps) {
+    validate_quregFields(qureg, __func__);
+    validate_pauliStrSumFields(sum, __func__);
+    validate_pauliStrSumTargets(sum, qureg, __func__);
+    validate_trotterParams(qureg, order, reps, __func__);
+    // sum is permitted to be non-Hermitian
+
+    internal_applyAllTrotterRepetitions(qureg, nullptr, nullptr, 0, sum, angle, order, reps);
+}
+
+void applyTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qreal angle, int order, int reps) {
+    validate_quregFields(qureg, __func__);
+    validate_pauliStrSumFields(sum, __func__);
+    validate_pauliStrSumTargets(sum, qureg, __func__);
+    validate_pauliStrSumIsHermitian(sum, __func__);
+    validate_trotterParams(qureg, order, reps, __func__);
+
+    internal_applyAllTrotterRepetitions(qureg, nullptr, nullptr, 0, sum, angle, order, reps);
+}
+
+void applyControlledTrotterizedPauliStrSumGadget(Qureg qureg, int control, PauliStrSum sum, qreal angle, int order, int reps) {
+    validate_quregFields(qureg, __func__);
+    validate_pauliStrSumFields(sum, __func__);
+    validate_pauliStrSumIsHermitian(sum, __func__);
+    validate_controlAndPauliStrSumTargets(qureg, control, sum, __func__);
+    validate_trotterParams(qureg, order, reps, __func__);
+    
+    internal_applyAllTrotterRepetitions(qureg, &control, nullptr, 1, sum, angle, order, reps);
+}
+
+void applyMultiControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls, int numControls, PauliStrSum sum, qreal angle, int order, int reps) {
+    validate_quregFields(qureg, __func__);
+    validate_pauliStrSumFields(sum, __func__);
+    validate_pauliStrSumIsHermitian(sum, __func__);
+    validate_controlsAndPauliStrSumTargets(qureg, controls, numControls, sum, __func__);
+    validate_trotterParams(qureg, order, reps, __func__);
+
+    internal_applyAllTrotterRepetitions(qureg, controls, nullptr, numControls, sum, angle, order, reps);
+}
+
+void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls, int* states, int numControls, PauliStrSum sum, qreal angle, int order, int reps) {
+    validate_quregFields(qureg, __func__);
+    validate_pauliStrSumFields(sum, __func__);
+    validate_pauliStrSumIsHermitian(sum, __func__);
+    validate_controlsAndPauliStrSumTargets(qureg, controls, numControls, sum, __func__);
+    validate_controlStates(states, numControls, __func__); // permits states==nullptr
+    validate_trotterParams(qureg, order, reps, __func__);
+
+    internal_applyAllTrotterRepetitions(qureg, controls, states, numControls, sum, angle, order, reps);
+}
+
+} // end de-mangler
+
+void applyMultiControlledTrotterizedPauliStrSumGadget(Qureg qureg, vector<int> controls, PauliStrSum sum, qreal angle, int order, int reps) {
+
+    applyMultiControlledTrotterizedPauliStrSumGadget(qureg, controls.data(), controls.size(), sum, angle, order, reps);
+}
+
+void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, vector<int> controls, vector<int> states, PauliStrSum sum, qreal angle, int order, int reps) {
+    validate_controlsMatchStates(controls.size(), states.size(), __func__);
+
+    applyMultiStateControlledTrotterizedPauliStrSumGadget(qureg, controls.data(), states.data(), controls.size(), sum, angle, order, reps);
+}
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 45f6d341c..d617ba8df 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -13,5 +13,6 @@ target_sources(tests
   operations.cpp
   paulis.cpp
   qureg.cpp
+  trotterisation.cpp
   types.cpp
 )
\ No newline at end of file
diff --git a/tests/unit/operations.cpp b/tests/unit/operations.cpp
index 0fdf32b67..872dd13ae 100644
--- a/tests/unit/operations.cpp
+++ b/tests/unit/operations.cpp
@@ -2148,20 +2148,3 @@ TEST_CASE( "postMultiplyPauliStrSum", TEST_CATEGORY_MULT LABEL_MIXED_DEPLOY_TAG
 
 
 /** @} (end defgroup) */
-
-
-
-/**
- * @todo
- * UNTESTED FUNCTIONS
- */
-
-void applyNonUnitaryTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qcomp angle, int order, int reps);
-
-void applyTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qreal angle, int order, int reps);
-
-void applyControlledTrotterizedPauliStrSumGadget(Qureg qureg, int control, PauliStrSum sum, qreal angle, int order, int reps);
-
-void applyMultiControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls, int numControls, PauliStrSum sum, qreal angle, int order, int reps);
-
-void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls, int* states, int numControls, PauliStrSum sum, qreal angle, int order, int reps);
diff --git a/tests/unit/trotterisation.cpp b/tests/unit/trotterisation.cpp
new file mode 100644
index 000000000..174b7c66e
--- /dev/null
+++ b/tests/unit/trotterisation.cpp
@@ -0,0 +1,36 @@
+/** @file
+ * Unit tests of the trotterisation module.
+ *
+ * @author Tyson Jones
+ * 
+ * @defgroup unittrotter Trotterisation
+ * @ingroup unittests
+ */
+
+#include "quest.h"
+
+
+
+/*
+ * UTILITIES
+ */
+
+#define TEST_CATEGORY \
+    LABEL_UNIT_TAG "[trotterisation]"
+
+
+
+/**
+ * @todo
+ * UNTESTED FUNCTIONS
+ */
+
+void applyNonUnitaryTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qcomp angle, int order, int reps);
+
+void applyTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qreal angle, int order, int reps);
+
+void applyControlledTrotterizedPauliStrSumGadget(Qureg qureg, int control, PauliStrSum sum, qreal angle, int order, int reps);
+
+void applyMultiControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls, int numControls, PauliStrSum sum, qreal angle, int order, int reps);
+
+void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls, int* states, int numControls, PauliStrSum sum, qreal angle, int order, int reps);

From e0a0e96bb183d6d6fc9d4221cb06f93caf0b4d65 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 21 Jul 2025 22:21:33 +0200
Subject: [PATCH 15/32] patched Trotterisation of order >= 4

Previously, the order >= 4 scenario of Trotterisation did not correctly invoke recursion, but instead called first order Trotterisation five times without symmetrisation. This meant passing order=4 erroneously excluded symmetrisation (halving the Trotter depth), and passing order>=6 merely performed unsymmetrised fourth-order Trotterisation.

Thankfully the enacted operation was still a valid Trotter approximation of the intended unitary, albeit of lower order and ergo accuracy than expected. This was not caught by the unit tests since they do not exist, as warned in the function documentation. Eep!
---
 quest/src/api/trotterisation.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/quest/src/api/trotterisation.cpp b/quest/src/api/trotterisation.cpp
index 43d371059..d8d5351c2 100644
--- a/quest/src/api/trotterisation.cpp
+++ b/quest/src/api/trotterisation.cpp
@@ -68,11 +68,11 @@ void internal_applyHigherOrderTrotterRepetition(
         qcomp b = (1-4*p) * angle;
 
         int lower = order - 2;
-        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, a, lower);
-        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, a, lower);
-        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, b, lower);
-        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, a, lower);
-        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, a, lower);
+        internal_applyHigherOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, a, lower); // angle -> a
+        internal_applyHigherOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, a, lower);
+        internal_applyHigherOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, b, lower); // angle -> b
+        internal_applyHigherOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, a, lower);
+        internal_applyHigherOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, a, lower);
     }
 }
 

From 53c68280df3ce7bfb14a6e0fd047d2a3dab71c1b Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Tue, 22 Jul 2025 15:15:59 +0200
Subject: [PATCH 16/32] added multiply projector functions

specifically:
- multiplyQubitProjector
- postMultiplyQubitProjector
- multiplyMultiQubitProjector
- postMultiplyMultiQubitProjector

Also updated multiplication doc warnings
---
 quest/include/multiplication.h    | 163 +++++++++++++++++++++---------
 quest/src/api/multiplication.cpp  |  64 ++++++++++++
 quest/src/api/operations.cpp      |   4 +-
 quest/src/core/localiser.cpp      |   1 -
 quest/src/cpu/cpu_subroutines.cpp |   4 +
 tests/unit/operations.cpp         | 102 +++++++++++++++++++
 6 files changed, 286 insertions(+), 52 deletions(-)

diff --git a/quest/include/multiplication.h b/quest/include/multiplication.h
index 8ebc7aa7f..e8cfd5912 100644
--- a/quest/include/multiplication.h
+++ b/quest/include/multiplication.h
@@ -96,9 +96,7 @@ extern "C" {
 void multiplyCompMatr1(Qureg qureg, int target, CompMatr1 matrix);
 
 
-/** @notyettested
- * 
- * Multiplies a general one-qubit dense @p matrix upon the specified @p target 
+/** Multiplies a general one-qubit dense @p matrix upon the specified @p target 
  * qubit of the density matrix @p qureg, from the right-hand side.
  *  
  * @formulae
@@ -177,10 +175,8 @@ void multiplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matr);
 
 
 /// @notyetdoced
-/// @notyettested
-/// @notyetvalidated
 /// @see
-/// - postMultiplyCompMatr1
+/// - postMultiplyCompMatr1()
 void postMultiplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matrix);
 
 
@@ -217,10 +213,8 @@ void multiplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matrix
 
 
 /// @notyetdoced
-/// @notyettested
-/// @notyetvalidated
 /// @see
-/// - postMultiplyCompMatr1
+/// - postMultiplyCompMatr1()
 void postMultiplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matrix);
 
 
@@ -272,9 +266,8 @@ extern "C" {
 void multiplyDiagMatr1(Qureg qureg, int target, DiagMatr1 matr);
 
 
-/// @notyettested
-/// @notyetvalidated
 /// @notyetdoced
+/// @see postMultiplyCompMatr1()
 void postMultiplyDiagMatr1(Qureg qureg, int target, DiagMatr1 matrix);
 
 
@@ -305,9 +298,8 @@ extern "C" {
 void multiplyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matr);
 
 
-/// @notyettested
-/// @notyetvalidated
 /// @notyetdoced
+/// @see postMultiplyCompMatr1()
 void postMultiplyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matrix);
 
 
@@ -338,9 +330,8 @@ extern "C" {
 void multiplyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matrix);
 
 
-/// @notyettested
-/// @notyetvalidated
 /// @notyetdoced
+/// @see postMultiplyCompMatr1()
 void postMultiplyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matrix);
 
 
@@ -351,9 +342,10 @@ void postMultiplyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr ma
 void multiplyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMatr matrix, qcomp exponent);
 
 
-/// @notyettested
-/// @notyetvalidated
 /// @notyetdoced
+/// @see 
+/// - postMultiplyCompMatr1()
+/// - applyDiagMatrPower()
 void postMultiplyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMatr matrix, qcomp exponent);
 
 
@@ -419,27 +411,31 @@ extern "C" {
 /// @notyetdoced
 /// @notyetvalidated
 /// @see
-/// - multiplyCompMatr1
+/// - multiplyCompMatr1()
 void multiplyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix);
 
 
 /// @notyetdoced
-/// @notyettested
 /// @notyetvalidated
+/// @see
+/// - postMultiplyCompMatr1()
+/// - applyFullStateDiagMatr()
 void postMultiplyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix);
 
 
 /// @notyetdoced
 /// @notyetvalidated
 /// @see
-/// - multiplyCompMatr1
-/// - applyDiagMatrPower
+/// - multiplyCompMatr1()
+/// - applyFullStateDiagMatr()
 void multiplyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp exponent);
 
 
 /// @notyetdoced
-/// @notyettested
 /// @notyetvalidated
+/// @see
+/// - postMultiplyCompMatr1()
+/// - applyFullStateDiagMatr()
 void postMultiplyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp exponent);
 
 
@@ -466,13 +462,16 @@ extern "C" {
 
 
 /// @notyetdoced
-/// @see multiplyCompMatr1()
+/// @see 
+/// - multiplyCompMatr1()
+/// - applySwap()
 void multiplySwap(Qureg qureg, int qubit1, int qubit2);
 
 
 /// @notyetdoced
-/// @notyettested
-/// @notyetvalidated
+/// @see 
+/// - multiplyCompMatr1()
+/// - applySwap()
 void postMultiplySwap(Qureg qureg, int qubit1, int qubit2);
 
 
@@ -499,38 +498,44 @@ extern "C" {
 
 
 /// @notyetdoced
-/// @notyettested
-/// @see multiplyCompMatr1()
+/// @see 
+/// - multiplyCompMatr1()
+/// - applyPauliX()
 void multiplyPauliX(Qureg qureg, int target);
 
 
 /// @notyetdoced
-/// @notyettested
-/// @see multiplyCompMatr1()
+/// @see 
+/// - multiplyCompMatr1()
+/// - applyPauliY()
 void multiplyPauliY(Qureg qureg, int target);
 
 
 /// @notyetdoced
-/// @notyettested
-/// @see multiplyCompMatr1()
+/// @see 
+/// - multiplyCompMatr1()
+/// - applyPauliZ()
 void multiplyPauliZ(Qureg qureg, int target);
 
 
 /// @notyetdoced
-/// @notyettested
-/// @see postMultiplyCompMatr1()
+/// @see 
+/// - postMultiplyCompMatr1()
+/// - applyPauliX()
 void postMultiplyPauliX(Qureg qureg, int target);
 
 
 /// @notyetdoced
-/// @notyettested
-/// @see postMultiplyCompMatr1()
+/// @see 
+/// - postMultiplyCompMatr1()
+/// - applyPauliY()
 void postMultiplyPauliY(Qureg qureg, int target);
 
 
 /// @notyetdoced
-/// @notyettested
-/// @see postMultiplyCompMatr1()
+/// @see 
+/// - postMultiplyCompMatr1()
+/// - applyPauliZ()
 void postMultiplyPauliZ(Qureg qureg, int target);
 
 
@@ -557,13 +562,16 @@ extern "C" {
 
 
 /// @notyetdoced
-/// @see multiplyCompMatr1()
+/// @see 
+/// - multiplyCompMatr1()
+/// - applyPauliStr()
 void multiplyPauliStr(Qureg qureg, PauliStr str);
 
 
 /// @notyetdoced
-/// @notyettested
-/// @notyetvalidated
+/// @see 
+/// - postMultiplyCompMatr1()
+/// - applyPauliStr()
 void postMultiplyPauliStr(Qureg qureg, PauliStr str);
 
 
@@ -597,8 +605,9 @@ void multiplyPauliGadget(Qureg qureg, PauliStr str, qreal angle);
 
 
 /// @notyetdoced
-/// @notyettested
-/// @notyetvalidated
+/// @see 
+/// - postMultiplyCompMatr1()
+/// - applyPauliGadget()
 void postMultiplyPauliGadget(Qureg qureg, PauliStr str, qreal angle);
 
 
@@ -627,13 +636,14 @@ extern "C" {
 /// @notyetdoced
 /// @see 
 /// - multiplyCompMatr1()
-/// - applyPhaseGadget
+/// - applyPhaseGadget()
 void multiplyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal angle);
 
 
 /// @notyetdoced
-/// @notyettested
-/// @notyetvalidated
+/// @see
+/// - postMultiplyCompMatr1()
+/// - applyPhaseGadget()
 void postMultiplyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal angle);
 
 
@@ -681,13 +691,17 @@ extern "C" {
 
 
 /// @notyetdoced
-/// @see multiplyCompMatr1()
+/// @see 
+/// - multiplyCompMatr1()
+/// - applyMultiQubitNot()
 void multiplyMultiQubitNot(Qureg qureg, int* targets, int numTargets);
 
 
 /// @notyetdoced
-/// @notyettested
 /// @notyetvalidated
+/// @see
+/// - postMultiplyCompMatr1()
+/// - applyMultiQubitNot()
 void postMultiplyMultiQubitNot(Qureg qureg, int* targets, int numTargets);
 
 
@@ -699,7 +713,6 @@ void postMultiplyMultiQubitNot(Qureg qureg, int* targets, int numTargets);
 #ifdef __cplusplus
 
 
-/// @notyettested
 /// @notyetvalidated
 /// @notyetdoced
 /// @cppvectoroverload
@@ -707,7 +720,6 @@ void postMultiplyMultiQubitNot(Qureg qureg, int* targets, int numTargets);
 void multiplyMultiQubitNot(Qureg qureg, std::vector<int> targets);
 
 
-/// @notyettested
 /// @notyetvalidated
 /// @notyetdoced
 /// @cppvectoroverload
@@ -721,6 +733,57 @@ void postMultiplyMultiQubitNot(Qureg qureg, std::vector<int> targets);
 
 
 
+/** 
+ * @defgroup mult_projectors Projectors
+ * @brief Functions for pre- or post-multiplying projectors upon density matrices.
+ * @{
+ */
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/// @notyetdoced
+/// @notyetvalidated
+/// @see
+/// - multiplyCompMatr1()
+/// - applyQubitProjector()
+void multiplyQubitProjector(Qureg qureg, int qubit, int outcome);
+
+
+/// @notyetdoced
+/// @notyetvalidated
+/// @see
+/// - multiplyCompMatr1()
+/// - applyMultiQubitProjector()
+void multiplyMultiQubitProjector(Qureg qureg, int* qubits, int* outcomes, int numQubits);
+
+
+/// @notyetdoced
+/// @notyetvalidated
+/// @see
+/// - postMultiplyCompMatr1()
+/// - applyQubitProjector()
+void postMultiplyQubitProjector(Qureg qureg, int qubit, int outcome);
+
+
+/// @notyetdoced
+/// @notyetvalidated
+/// @see
+/// - postMultiplyCompMatr1()
+/// - applyMultiQubitProjector()
+void postMultiplyMultiQubitProjector(Qureg qureg, int* qubits, int* outcomes, int numQubits);
+
+
+// end de-mangler
+#ifdef __cplusplus
+}
+#endif
+
+
+
 /** 
  * @defgroup mult_paulistrsum PauliStrSum
  * @brief Functions for pre- or post-multiplying weighted sums of Pauli 
@@ -741,8 +804,8 @@ void multiplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace);
 
 
 /// @notyetdoced
-/// @notyettested
 /// @notyetvalidated
+/// @see multiplyCompMatr1()
 void postMultiplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace);
 
 
diff --git a/quest/src/api/multiplication.cpp b/quest/src/api/multiplication.cpp
index 0bb0a6b72..e0c37d47c 100644
--- a/quest/src/api/multiplication.cpp
+++ b/quest/src/api/multiplication.cpp
@@ -569,6 +569,70 @@ void postMultiplyMultiQubitNot(Qureg qureg, vector<int> targets) {
 
 
 
+/*
+ * projectors
+ */
+
+extern "C" {
+
+void multiplyQubitProjector(Qureg qureg, int qubit, int outcome) {
+    validate_quregFields(qureg, __func__);
+    validate_target(qureg, qubit, __func__);
+    validate_measurementOutcomeIsValid(outcome, __func__); 
+
+    qreal prob = 1;
+    localiser_statevec_multiQubitProjector(qureg, {qubit}, {outcome}, prob);
+}
+
+void multiplyMultiQubitProjector(Qureg qureg, int* qubits, int* outcomes, int numQubits) {
+    validate_quregFields(qureg, __func__);
+    validate_targets(qureg, qubits, numQubits, __func__);
+    validate_measurementOutcomesAreValid(outcomes, numQubits, __func__);
+
+    qreal prob = 1;
+    auto qubitVec = util_getVector(qubits, numQubits);
+    auto outcomeVec = util_getVector(outcomes, numQubits);
+    localiser_statevec_multiQubitProjector(qureg, qubitVec, outcomeVec, prob);
+}
+
+void postMultiplyQubitProjector(Qureg qureg, int qubit, int outcome) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_target(qureg, qubit, __func__);
+    validate_measurementOutcomeIsValid(outcome, __func__); 
+    
+    qreal prob = 1;
+    localiser_statevec_multiQubitProjector(qureg, {util_getBraQubit(qubit,qureg)}, {outcome}, prob);
+}
+
+void postMultiplyMultiQubitProjector(Qureg qureg, int* qubits, int* outcomes, int numQubits) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_targets(qureg, qubits, numQubits, __func__);
+    validate_measurementOutcomesAreValid(outcomes, numQubits, __func__);
+
+    qreal prob = 1;
+    auto qubitVec = util_getBraQubits(util_getVector(qubits, numQubits), qureg);
+    auto outcomeVec = util_getVector(outcomes, numQubits);
+    localiser_statevec_multiQubitProjector(qureg, qubitVec, outcomeVec, prob);
+}
+
+} // end de-mangler
+
+void multiplyMultiQubitProjector(Qureg qureg, vector<int> qubits, vector<int> outcomes) {
+    validate_measurementOutcomesMatchTargets(qubits.size(), outcomes.size(), __func__);
+
+    multiplyMultiQubitProjector(qureg, qubits.data(), outcomes.data(), outcomes.size());
+}
+
+void postMultiplyMultiQubitProjector(Qureg qureg, vector<int> qubits, vector<int> outcomes) {
+    validate_measurementOutcomesMatchTargets(qubits.size(), outcomes.size(), __func__);
+
+    postMultiplyMultiQubitProjector(qureg, qubits.data(), outcomes.data(), outcomes.size());
+}
+
+
+
 /*
  * Pauli string sums
  */
diff --git a/quest/src/api/operations.cpp b/quest/src/api/operations.cpp
index ede4ff011..ce5edb579 100644
--- a/quest/src/api/operations.cpp
+++ b/quest/src/api/operations.cpp
@@ -1534,8 +1534,9 @@ void applyQubitProjector(Qureg qureg, int target, int outcome) {
     validate_target(qureg, target, __func__);
     validate_measurementOutcomeIsValid(outcome, __func__); 
     
-    // we permit the outcome to be negligibly likely, leaving state = null
     qreal prob = 1;
+
+    // density matrix has an optimised func in lieu of calling the statevector func twice
     (qureg.isDensityMatrix)?
         localiser_densmatr_multiQubitProjector(qureg, {target}, {outcome}, prob):
         localiser_statevec_multiQubitProjector(qureg, {target}, {outcome}, prob);
@@ -1550,6 +1551,7 @@ void applyMultiQubitProjector(Qureg qureg, int* qubits, int* outcomes, int numQu
     auto qubitVec = util_getVector(qubits, numQubits);
     auto outcomeVec = util_getVector(outcomes, numQubits);
 
+    // density matrix has an optimised func in lieu of calling the statevector func twice
     (qureg.isDensityMatrix)?
         localiser_densmatr_multiQubitProjector(qureg, qubitVec, outcomeVec, prob):
         localiser_statevec_multiQubitProjector(qureg, qubitVec, outcomeVec, prob);
diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp
index 71cb8f211..2fa09c9f4 100644
--- a/quest/src/core/localiser.cpp
+++ b/quest/src/core/localiser.cpp
@@ -2305,7 +2305,6 @@ qreal localiser_densmatr_calcHilbertSchmidtDistance(Qureg quregA, Qureg quregB)
 
 
 void localiser_statevec_multiQubitProjector(Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal prob) {
-    assert_localiserGivenStateVec(qureg);
 
     // this routine is always embarrassingly parallel; however, we handle the
     // prefix-qubits here so that the backend can receive only the suffix qubits
diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp
index ce51f2b78..c519d2007 100644
--- a/quest/src/cpu/cpu_subroutines.cpp
+++ b/quest/src/cpu/cpu_subroutines.cpp
@@ -2345,6 +2345,10 @@ void cpu_statevec_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vecto
 template <int NumQubits>
 void cpu_densmatr_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, vector<int> outcomes, qreal prob) {
 
+    // this function is merely an optimisation to avoid calling the above
+    // cpu_statevec_multiQubitProjector_sub() twice upon a density matrix;
+    // pre- and post-multiply projector versions DO just call above.
+
     // qubits are unconstrained, and can include prefix qubits
     assert_numTargsMatchesTemplateParam(qubits.size(), NumQubits);
 
diff --git a/tests/unit/operations.cpp b/tests/unit/operations.cpp
index 872dd13ae..3ceec40ee 100644
--- a/tests/unit/operations.cpp
+++ b/tests/unit/operations.cpp
@@ -2088,6 +2088,108 @@ TEST_CASE( "postMultiplyFullStateDiagMatrPower", TEST_CATEGORY_MULT LABEL_MIXED_
 }
 
 
+TEST_CASE( "multiplyQubitProjector", TEST_CATEGORY_OPS ) {
+
+    PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );
+
+    SECTION( LABEL_CORRECTNESS ) {
+
+        GENERATE( range(0,10) );
+        int target = GENERATE_COPY( range(0,numQubits) );
+        int outcome = GENERATE( 0, 1 );
+
+        qmatrix projector = getProjector(outcome);
+
+        auto testFunc = [&](Qureg qureg, auto& ref) {
+            multiplyQubitProjector(qureg, target, outcome);
+            multiplyReferenceOperator(ref, {target}, projector);
+        };
+
+        CAPTURE( target, outcome );
+        SECTION( LABEL_STATEVEC ) { TEST_ON_CACHED_QUREGS(statevecQuregs, statevecRef, testFunc); }
+        SECTION( LABEL_DENSMATR ) { TEST_ON_CACHED_QUREGS(densmatrQuregs, densmatrRef, testFunc); }
+    }
+
+    /// @todo input validation
+}
+
+
+TEST_CASE( "postMultiplyQubitProjector", TEST_CATEGORY_OPS ) {
+
+    PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );
+
+    SECTION( LABEL_CORRECTNESS ) {
+
+        GENERATE( range(0,10) );
+        int target = GENERATE_COPY( range(0,numQubits) );
+        int outcome = GENERATE( 0, 1 );
+
+        qmatrix projector = getProjector(outcome);
+
+        auto testFunc = [&](Qureg qureg, auto& ref) {
+            postMultiplyQubitProjector(qureg, target, outcome);
+            postMultiplyReferenceOperator(ref, {target}, projector);
+        };
+
+        CAPTURE( target, outcome );
+        SECTION( LABEL_DENSMATR ) { TEST_ON_CACHED_QUREGS(densmatrQuregs, densmatrRef, testFunc); }
+    }
+
+    /// @todo input validation
+}
+
+
+TEST_CASE( "multiplyMultiQubitProjector", TEST_CATEGORY_OPS ) {
+
+    PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );
+
+    SECTION( LABEL_CORRECTNESS ) {
+
+        int numTargs = GENERATE_COPY( range(1,numQubits+1) );
+        auto targets = GENERATE_TARGS( numQubits, numTargs );
+        auto outcomes = getRandomOutcomes(numTargs);
+
+        qmatrix projector = getProjector(targets, outcomes, numQubits);
+
+        auto testFunc = [&](Qureg qureg, auto& ref) {
+            multiplyMultiQubitProjector(qureg, targets.data(), outcomes.data(), numTargs);
+            multiplyReferenceOperator(ref, projector);
+        };
+
+        CAPTURE( targets, outcomes );
+        SECTION( LABEL_STATEVEC ) { TEST_ON_CACHED_QUREGS(statevecQuregs, statevecRef, testFunc); }
+        SECTION( LABEL_DENSMATR ) { TEST_ON_CACHED_QUREGS(densmatrQuregs, densmatrRef, testFunc); }
+    }
+
+    /// @todo input validation
+}
+
+
+TEST_CASE( "postMultiplyMultiQubitProjector", TEST_CATEGORY_OPS ) {
+
+    PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );
+
+    SECTION( LABEL_CORRECTNESS ) {
+
+        int numTargs = GENERATE_COPY( range(1,numQubits+1) );
+        auto targets = GENERATE_TARGS( numQubits, numTargs );
+        auto outcomes = getRandomOutcomes(numTargs);
+
+        qmatrix projector = getProjector(targets, outcomes, numQubits);
+
+        auto testFunc = [&](Qureg qureg, auto& ref) {
+            postMultiplyMultiQubitProjector(qureg, targets.data(), outcomes.data(), numTargs);
+            postMultiplyReferenceOperator(ref, projector);
+        };
+
+        CAPTURE( targets, outcomes );
+        SECTION( LABEL_DENSMATR ) { TEST_ON_CACHED_QUREGS(densmatrQuregs, densmatrRef, testFunc); }
+    }
+
+    /// @todo input validation
+}
+
+
 TEST_CASE( "multiplyPauliStrSum", TEST_CATEGORY_MULT LABEL_MIXED_DEPLOY_TAG ) {
 
     PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );

From 2db00e55a376972d4d92a066dc9d7deaddb27af8 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Wed, 13 Aug 2025 23:09:25 +0200
Subject: [PATCH 17/32] renamed multiply and postmultiply functions to
 leftapply and rightapply (#668)

as discussed in issue #663
---
 README.md                         |   4 +-
 docs/tutorial.md                  |   4 +-
 quest/include/deprecated.h        |  30 ++---
 quest/include/multiplication.h    | 214 +++++++++++++++---------------
 quest/include/operations.h        |  16 +--
 quest/src/api/multiplication.cpp  | 136 +++++++++----------
 quest/src/core/accelerator.cpp    |  20 +--
 quest/src/core/accelerator.hpp    |   4 +-
 quest/src/core/errors.cpp         |   8 +-
 quest/src/core/errors.hpp         |   2 +-
 quest/src/core/localiser.cpp      |   6 +-
 quest/src/core/localiser.hpp      |   2 +-
 quest/src/cpu/cpu_subroutines.cpp |   6 +-
 quest/src/cpu/cpu_subroutines.hpp |   2 +-
 quest/src/gpu/gpu_kernels.cuh     |   6 +-
 quest/src/gpu/gpu_subroutines.cpp |   4 +-
 quest/src/gpu/gpu_subroutines.hpp |   2 +-
 tests/unit/operations.cpp         | 180 ++++++++++++-------------
 tests/utils/evolve.cpp            |  40 +++---
 tests/utils/evolve.hpp            |  46 +++----
 20 files changed, 366 insertions(+), 366 deletions(-)

diff --git a/README.md b/README.md
index 9b03e0578..39e193f13 100644
--- a/README.md
+++ b/README.md
@@ -116,8 +116,8 @@ applyMultiQubitProjector(qureg, targs, outcomes, ntargs);
 applyControlledPauliGadget(qureg, ctrl, paulistr, angle);
 applyMultiStateControlledSwap(qureg, ctrls, states, nctrls, targ1, targ2);
 
-multiplyCompMatr1(qureg, targ, getInlineCompMatr1( {{1,2i},{3i,4}} ));
-multiplyDiagMatrPower(qureg, targs, ntargs, diagmatr, exponent);
+leftapplyCompMatr1(qureg, targ, getInlineCompMatr1( {{1,2i},{3i,4}} ));
+leftapplyDiagMatrPower(qureg, targs, ntargs, diagmatr, exponent);
 ```
 and extremely powerful
 ```cpp
diff --git a/docs/tutorial.md b/docs/tutorial.md
index 68b3fbf1b..9c6fcf20e 100644
--- a/docs/tutorial.md
+++ b/docs/tutorial.md
@@ -713,9 +713,9 @@ We can even directy mix density matrices together
 mixQureg(rho1, rho2, prob);
 ```
 
-Sometimes we wish to left-multiply general operators upon density matrices without also right-multiplying their adjoint - i.e. our operators should _not_ be effected as unitaries. We can do this with the `multiply*()` functions.
+Sometimes we wish to left-multiply general operators upon density matrices without also right-multiplying their adjoint - i.e. our operators should _not_ be effected as unitaries. We can do this with the `leftapply*()` and `rightapply*()` functions.
 ```cpp
-multiplyDiagMatrPower(rho, fullmatrix, 0.5);
+leftapplyDiagMatrPower(rho, fullmatrix, 0.5);
 ```
 
 
diff --git a/quest/include/deprecated.h b/quest/include/deprecated.h
index efd7afd9d..47df35ad5 100644
--- a/quest/include/deprecated.h
+++ b/quest/include/deprecated.h
@@ -370,7 +370,7 @@ typedef enum pauliOpType _NoWarnPauliOpType;
 
 
 #define applyMultiControlledMatrixN(...) \
-    _ERROR_FUNC_REMOVED("applyMultiControlledMatrixN()") // our new multiplyCompMatr doesn't accept controls
+    _ERROR_FUNC_REMOVED("applyMultiControlledMatrixN()") // our new leftapplyCompMatr doesn't accept controls
 
 
 #define syncQuESTSuccess(...) \
@@ -800,8 +800,8 @@ static inline QuESTEnv _createQuESTEnv() {
     createFullStateDiagMatrFromPauliStrSumFile(fn)
 
 #define applyDiagonalOp(...) \
-    _WARN_FUNC_RENAMED("applyDiagonalOp()", "multiplyFullStateDiagMatr()") \
-    multiplyFullStateDiagMatr(__VA_ARGS__)
+    _WARN_FUNC_RENAMED("applyDiagonalOp()", "leftapplyFullStateDiagMatr()") \
+    leftapplyFullStateDiagMatr(__VA_ARGS__)
 
 #define calcExpecDiagonalOp(...) \
     _WARN_FUNC_RENAMED("calcExpecDiagonalOp()", "calcExpecNonHermitianFullStateDiagMatr()") \
@@ -822,8 +822,8 @@ static inline QuESTEnv _createQuESTEnv() {
     applyDiagMatr(__VA_ARGS__)
 
 #define applySubDiagonalOp(...) \
-    _WARN_FUNC_RENAMED("applySubDiagonalOp()", "multiplyDiagMatr()") \
-    multiplyDiagMatr(__VA_ARGS__)
+    _WARN_FUNC_RENAMED("applySubDiagonalOp()", "leftapplyDiagMatr()") \
+    leftapplyDiagMatr(__VA_ARGS__)
 
 static inline void _applyGateSubDiagonalOp(Qureg qureg, int* targets, int numTargets, DiagMatr op) {
     qreal eps = getValidationEpsilon();
@@ -1029,21 +1029,21 @@ static inline qreal _calcExpecPauliSum(Qureg qureg, _NoWarnPauliOpType* allPauli
 static inline void _applyPauliSum(Qureg inQureg, _NoWarnPauliOpType* allPauliCodes, qreal* termCoeffs, int numSumTerms, Qureg outQureg) {
     PauliStrSum sum = _createPauliStrSumFromCodes(inQureg.numQubits, allPauliCodes, termCoeffs, numSumTerms);
     setQuregToClone(outQureg, inQureg); 
-    multiplyPauliStrSum(outQureg, sum, inQureg);
+    leftapplyPauliStrSum(outQureg, sum, inQureg);
     destroyPauliStrSum(sum);
 }
 
 #define applyPauliSum(...) \
-    _WARN_FUNC_RENAMED("applyPauliSum(inQureg, ..., outQureg)", "multiplyPauliStrSum(outQureg, PauliStrSum)") \
+    _WARN_FUNC_RENAMED("applyPauliSum(inQureg, ..., outQureg)", "leftapplyPauliStrSum(outQureg, PauliStrSum)") \
     _applyPauliSum(__VA_ARGS__)
 
 static inline void _applyPauliHamil(Qureg inQureg, PauliStrSum hamil, Qureg outQureg) {
     setQuregToClone(outQureg, inQureg); 
-    multiplyPauliStrSum(outQureg, hamil, inQureg);
+    leftapplyPauliStrSum(outQureg, hamil, inQureg);
 }
 
 #define applyPauliHamil(...) \
-    _WARN_FUNC_RENAMED("applyPauliHamil(inQureg, PauliHamil, outQureg)", "multiplyPauliStrSum(qureg, PauliStrSum, workspace)") \
+    _WARN_FUNC_RENAMED("applyPauliHamil(inQureg, PauliHamil, outQureg)", "leftapplyPauliStrSum(qureg, PauliStrSum, workspace)") \
     _applyPauliHamil(__VA_ARGS__)
 
 
@@ -1109,16 +1109,16 @@ static inline void _applyPauliHamil(Qureg inQureg, PauliStrSum hamil, Qureg outQ
 
 
 #define applyMatrix2(qureg, targ, ...) \
-    _WARN_FUNC_RENAMED("applyMatrix2()", "multiplyCompMatr1()") \
-    multiplyCompMatr1(qureg, targ, _GET_COMP_MATR_1_FROM_COMPLEX_MATRIX_2(__VA_ARGS__))
+    _WARN_FUNC_RENAMED("applyMatrix2()", "leftapplyCompMatr1()") \
+    leftapplyCompMatr1(qureg, targ, _GET_COMP_MATR_1_FROM_COMPLEX_MATRIX_2(__VA_ARGS__))
 
 #define applyMatrix4(qureg, targ1, targ2, ...) \
-    _WARN_FUNC_RENAMED("applyMatrix4()", "multiplyCompMatr2()") \
-    multiplyCompMatr2(qureg, targ1, targ2, _GET_COMP_MATR_2_FROM_COMPLEX_MATRIX_4(__VA_ARGS__))
+    _WARN_FUNC_RENAMED("applyMatrix4()", "leftapplyCompMatr2()") \
+    leftapplyCompMatr2(qureg, targ1, targ2, _GET_COMP_MATR_2_FROM_COMPLEX_MATRIX_4(__VA_ARGS__))
 
 #define applyMatrixN(...) \
-    _WARN_FUNC_RENAMED("applyMatrixN()", "multiplyCompMatr()") \
-    multiplyCompMatr(__VA_ARGS__)
+    _WARN_FUNC_RENAMED("applyMatrixN()", "leftapplyCompMatr()") \
+    leftapplyCompMatr(__VA_ARGS__)
 
 
 
diff --git a/quest/include/multiplication.h b/quest/include/multiplication.h
index e8cfd5912..81d27a500 100644
--- a/quest/include/multiplication.h
+++ b/quest/include/multiplication.h
@@ -7,8 +7,8 @@
  * 
  * @defgroup multiplication Multiplication
  * @ingroup api
- * @brief Functions for directly multiplying operators upon 
- *        density matrices.
+ * @brief Functions for directly pre- or post-multiplying operators 
+ *        upon density matrices.
  * @{
  */
 
@@ -75,12 +75,12 @@ extern "C" {
         {0.3i, 0.4i}
     });
 
-    multiplyCompMatr1(qureg, 2, matrix); 
+    leftapplyCompMatr1(qureg, 2, matrix); 
  * ```
  *
  * @param[in,out] qureg  the state to modify.
  * @param[in]     target the index of the target qubit.
- * @param[in]     matrix the Z-basis matrix to multiply.
+ * @param[in]     matrix the Z-basis matrix to multiply upon the left.
  * @throws @validationerror
  * - if @p qureg or @p matrix are uninitialised.
  * - if @p target is an invalid qubit index.
@@ -88,12 +88,12 @@ extern "C" {
  * - getCompMatr1()
  * - getInlineCompMatr1()
  * - applyCompMatr1()
- * - postMultiplyCompMatr1()
+ * - rightapplyCompMatr1()
  * - applyQubitProjector()
- * - multiplyCompMatr()
+ * - leftapplyCompMatr()
  * @author Tyson Jones
  */
-void multiplyCompMatr1(Qureg qureg, int target, CompMatr1 matrix);
+void leftapplyCompMatr1(Qureg qureg, int target, CompMatr1 matrix);
 
 
 /** Multiplies a general one-qubit dense @p matrix upon the specified @p target 
@@ -124,7 +124,7 @@ void multiplyCompMatr1(Qureg qureg, int target, CompMatr1 matrix);
         {0.3i, 0.4i}
     });
 
-    postMultiplyCompMatr1(qureg, 2, matrix); 
+    rightapplyCompMatr1(qureg, 2, matrix); 
  * ```
  *
  * @param[in,out] qureg  the state to modify.
@@ -138,11 +138,11 @@ void multiplyCompMatr1(Qureg qureg, int target, CompMatr1 matrix);
  * - getCompMatr1()
  * - getInlineCompMatr1()
  * - applyCompMatr1()
- * - multiplyCompMatr1()
- * - multiplyCompMatr()
+ * - leftapplyCompMatr1()
+ * - leftapplyCompMatr()
  * @author Tyson Jones
  */
-void postMultiplyCompMatr1(Qureg qureg, int target, CompMatr1 matrix);
+void rightapplyCompMatr1(Qureg qureg, int target, CompMatr1 matrix);
 
 
 // end de-mangler
@@ -170,14 +170,14 @@ extern "C" {
 /// @notyetdoced
 /// @see
 /// - applyCompMatr2()
-/// - multiplyCompMatr1()
-void multiplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matr);
+/// - leftapplyCompMatr1()
+void leftapplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matr);
 
 
 /// @notyetdoced
 /// @see
-/// - postMultiplyCompMatr1()
-void postMultiplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matrix);
+/// - rightapplyCompMatr1()
+void rightapplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matrix);
 
 
 // end de-mangler
@@ -207,15 +207,15 @@ extern "C" {
  * 
  * @see
  * - applyCompMatr()
- * - multiplyCompMatr1()
+ * - leftapplyCompMatr1()
  */
-void multiplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matrix);
+void leftapplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matrix);
 
 
 /// @notyetdoced
 /// @see
-/// - postMultiplyCompMatr1()
-void postMultiplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matrix);
+/// - rightapplyCompMatr1()
+void rightapplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matrix);
 
 
 // end de-mangler
@@ -230,16 +230,16 @@ void postMultiplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr ma
 /// @notyetvalidated
 /// @notyetdoced
 /// @cppvectoroverload
-/// @see multiplyCompMatr()
-void multiplyCompMatr(Qureg qureg, std::vector<int> targets, CompMatr matr);
+/// @see leftapplyCompMatr()
+void leftapplyCompMatr(Qureg qureg, std::vector<int> targets, CompMatr matr);
 
 
 /// @notyettested
 /// @notyetvalidated
 /// @notyetdoced
 /// @cppvectoroverload
-/// @see postMultiplyCompMatr()
-void postMultiplyCompMatr(Qureg qureg, std::vector<int> targets, CompMatr matr);
+/// @see rightapplyCompMatr()
+void rightapplyCompMatr(Qureg qureg, std::vector<int> targets, CompMatr matr);
 
 
 #endif 
@@ -262,13 +262,13 @@ extern "C" {
 
 
 /// @notyetdoced
-/// @see multiplyCompMatr1()
-void multiplyDiagMatr1(Qureg qureg, int target, DiagMatr1 matr);
+/// @see leftapplyCompMatr1()
+void leftapplyDiagMatr1(Qureg qureg, int target, DiagMatr1 matr);
 
 
 /// @notyetdoced
-/// @see postMultiplyCompMatr1()
-void postMultiplyDiagMatr1(Qureg qureg, int target, DiagMatr1 matrix);
+/// @see rightapplyCompMatr1()
+void rightapplyDiagMatr1(Qureg qureg, int target, DiagMatr1 matrix);
 
 
 // end de-mangler
@@ -294,13 +294,13 @@ extern "C" {
 
 
 /// @notyetdoced
-/// @see multiplyCompMatr1()
-void multiplyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matr);
+/// @see leftapplyCompMatr1()
+void leftapplyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matr);
 
 
 /// @notyetdoced
-/// @see postMultiplyCompMatr1()
-void postMultiplyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matrix);
+/// @see rightapplyCompMatr1()
+void rightapplyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matrix);
 
 
 // end de-mangler
@@ -326,27 +326,27 @@ extern "C" {
 
 
 /// @notyetdoced
-/// @see multiplyCompMatr1()
-void multiplyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matrix);
+/// @see leftapplyCompMatr1()
+void leftapplyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matrix);
 
 
 /// @notyetdoced
-/// @see postMultiplyCompMatr1()
-void postMultiplyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matrix);
+/// @see rightapplyCompMatr1()
+void rightapplyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matrix);
 
 
 /// @notyetdoced
 /// @see
-/// - multiplyCompMatr1()
+/// - leftapplyCompMatr1()
 /// - applyDiagMatrPower()
-void multiplyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMatr matrix, qcomp exponent);
+void leftapplyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMatr matrix, qcomp exponent);
 
 
 /// @notyetdoced
 /// @see 
-/// - postMultiplyCompMatr1()
+/// - rightapplyCompMatr1()
 /// - applyDiagMatrPower()
-void postMultiplyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMatr matrix, qcomp exponent);
+void rightapplyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMatr matrix, qcomp exponent);
 
 
 // end de-mangler
@@ -361,32 +361,32 @@ void postMultiplyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMa
 /// @notyetvalidated
 /// @notyetdoced
 /// @cppvectoroverload
-/// @see multiplyDiagMatr()
-void multiplyDiagMatr(Qureg qureg, std::vector<int> targets, DiagMatr matrix);
+/// @see leftapplyDiagMatr()
+void leftapplyDiagMatr(Qureg qureg, std::vector<int> targets, DiagMatr matrix);
 
 
 /// @notyettested
 /// @notyetvalidated
 /// @notyetdoced
 /// @cppvectoroverload
-/// @see postMultiplyDiagMatr()
-void postMultiplyDiagMatr(Qureg qureg, std::vector<int> targets, DiagMatr matrix);
+/// @see rightapplyDiagMatr()
+void rightapplyDiagMatr(Qureg qureg, std::vector<int> targets, DiagMatr matrix);
 
 
 /// @notyettested
 /// @notyetvalidated
 /// @notyetdoced
 /// @cppvectoroverload
-/// @see multiplyDiagMatrPower()
-void multiplyDiagMatrPower(Qureg qureg, std::vector<int> targets, DiagMatr matrix, qcomp exponent);
+/// @see leftapplyDiagMatrPower()
+void leftapplyDiagMatrPower(Qureg qureg, std::vector<int> targets, DiagMatr matrix, qcomp exponent);
 
 
 /// @notyettested
 /// @notyetvalidated
 /// @notyetdoced
 /// @cppvectoroverload
-/// @see postMultiplyDiagMatrPower()
-void postMultiplyDiagMatrPower(Qureg qureg, std::vector<int> targets, DiagMatr matrix, qcomp exponent);
+/// @see rightapplyDiagMatrPower()
+void rightapplyDiagMatrPower(Qureg qureg, std::vector<int> targets, DiagMatr matrix, qcomp exponent);
 
 
 #endif 
@@ -411,32 +411,32 @@ extern "C" {
 /// @notyetdoced
 /// @notyetvalidated
 /// @see
-/// - multiplyCompMatr1()
-void multiplyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix);
+/// - leftapplyCompMatr1()
+void leftapplyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix);
 
 
 /// @notyetdoced
 /// @notyetvalidated
 /// @see
-/// - postMultiplyCompMatr1()
+/// - rightapplyCompMatr1()
 /// - applyFullStateDiagMatr()
-void postMultiplyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix);
+void rightapplyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix);
 
 
 /// @notyetdoced
 /// @notyetvalidated
 /// @see
-/// - multiplyCompMatr1()
+/// - leftapplyCompMatr1()
 /// - applyFullStateDiagMatr()
-void multiplyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp exponent);
+void leftapplyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp exponent);
 
 
 /// @notyetdoced
 /// @notyetvalidated
 /// @see
-/// - postMultiplyCompMatr1()
+/// - rightapplyCompMatr1()
 /// - applyFullStateDiagMatr()
-void postMultiplyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp exponent);
+void rightapplyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp exponent);
 
 
 // end de-mangler
@@ -463,16 +463,16 @@ extern "C" {
 
 /// @notyetdoced
 /// @see 
-/// - multiplyCompMatr1()
+/// - leftapplyCompMatr1()
 /// - applySwap()
-void multiplySwap(Qureg qureg, int qubit1, int qubit2);
+void leftapplySwap(Qureg qureg, int qubit1, int qubit2);
 
 
 /// @notyetdoced
 /// @see 
-/// - multiplyCompMatr1()
+/// - rightapplyCompMatr1()
 /// - applySwap()
-void postMultiplySwap(Qureg qureg, int qubit1, int qubit2);
+void rightapplySwap(Qureg qureg, int qubit1, int qubit2);
 
 
 // end de-mangler
@@ -499,44 +499,44 @@ extern "C" {
 
 /// @notyetdoced
 /// @see 
-/// - multiplyCompMatr1()
+/// - leftapplyCompMatr1()
 /// - applyPauliX()
-void multiplyPauliX(Qureg qureg, int target);
+void leftapplyPauliX(Qureg qureg, int target);
 
 
 /// @notyetdoced
 /// @see 
-/// - multiplyCompMatr1()
+/// - leftapplyCompMatr1()
 /// - applyPauliY()
-void multiplyPauliY(Qureg qureg, int target);
+void leftapplyPauliY(Qureg qureg, int target);
 
 
 /// @notyetdoced
 /// @see 
-/// - multiplyCompMatr1()
+/// - leftapplyCompMatr1()
 /// - applyPauliZ()
-void multiplyPauliZ(Qureg qureg, int target);
+void leftapplyPauliZ(Qureg qureg, int target);
 
 
 /// @notyetdoced
 /// @see 
-/// - postMultiplyCompMatr1()
+/// - rightapplyCompMatr1()
 /// - applyPauliX()
-void postMultiplyPauliX(Qureg qureg, int target);
+void rightapplyPauliX(Qureg qureg, int target);
 
 
 /// @notyetdoced
 /// @see 
-/// - postMultiplyCompMatr1()
+/// - rightapplyCompMatr1()
 /// - applyPauliY()
-void postMultiplyPauliY(Qureg qureg, int target);
+void rightapplyPauliY(Qureg qureg, int target);
 
 
 /// @notyetdoced
 /// @see 
-/// - postMultiplyCompMatr1()
+/// - rightapplyCompMatr1()
 /// - applyPauliZ()
-void postMultiplyPauliZ(Qureg qureg, int target);
+void rightapplyPauliZ(Qureg qureg, int target);
 
 
 // end de-mangler
@@ -563,16 +563,16 @@ extern "C" {
 
 /// @notyetdoced
 /// @see 
-/// - multiplyCompMatr1()
+/// - leftapplyCompMatr1()
 /// - applyPauliStr()
-void multiplyPauliStr(Qureg qureg, PauliStr str);
+void leftapplyPauliStr(Qureg qureg, PauliStr str);
 
 
 /// @notyetdoced
 /// @see 
-/// - postMultiplyCompMatr1()
+/// - rightapplyCompMatr1()
 /// - applyPauliStr()
-void postMultiplyPauliStr(Qureg qureg, PauliStr str);
+void rightapplyPauliStr(Qureg qureg, PauliStr str);
 
 
 // end de-mangler
@@ -599,16 +599,16 @@ extern "C" {
 
 /// @notyetdoced
 /// @see 
-/// - multiplyCompMatr1()
+/// - leftapplyCompMatr1()
 /// - applyPauliGadget()
-void multiplyPauliGadget(Qureg qureg, PauliStr str, qreal angle);
+void leftapplyPauliGadget(Qureg qureg, PauliStr str, qreal angle);
 
 
 /// @notyetdoced
 /// @see 
-/// - postMultiplyCompMatr1()
+/// - rightapplyCompMatr1()
 /// - applyPauliGadget()
-void postMultiplyPauliGadget(Qureg qureg, PauliStr str, qreal angle);
+void rightapplyPauliGadget(Qureg qureg, PauliStr str, qreal angle);
 
 
 // end de-mangler
@@ -635,16 +635,16 @@ extern "C" {
 
 /// @notyetdoced
 /// @see 
-/// - multiplyCompMatr1()
+/// - leftapplyCompMatr1()
 /// - applyPhaseGadget()
-void multiplyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal angle);
+void leftapplyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal angle);
 
 
 /// @notyetdoced
 /// @see
-/// - postMultiplyCompMatr1()
+/// - rightapplyCompMatr1()
 /// - applyPhaseGadget()
-void postMultiplyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal angle);
+void rightapplyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal angle);
 
 
 // end de-mangler
@@ -659,16 +659,16 @@ void postMultiplyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal an
 /// @notyetvalidated
 /// @notyetdoced
 /// @cppvectoroverload
-/// @see multiplyPhaseGadget()
-void multiplyPhaseGadget(Qureg qureg, std::vector<int> targets, qreal angle);
+/// @see leftapplyPhaseGadget()
+void leftapplyPhaseGadget(Qureg qureg, std::vector<int> targets, qreal angle);
 
 
 /// @notyettested
 /// @notyetvalidated
 /// @notyetdoced
 /// @cppvectoroverload
-/// @see postMultiplyPhaseGadget()
-void postMultiplyPhaseGadget(Qureg qureg, std::vector<int> targets, qreal angle);
+/// @see rightapplyPhaseGadget()
+void rightapplyPhaseGadget(Qureg qureg, std::vector<int> targets, qreal angle);
 
 
 #endif
@@ -692,17 +692,17 @@ extern "C" {
 
 /// @notyetdoced
 /// @see 
-/// - multiplyCompMatr1()
+/// - leftapplyCompMatr1()
 /// - applyMultiQubitNot()
-void multiplyMultiQubitNot(Qureg qureg, int* targets, int numTargets);
+void leftapplyMultiQubitNot(Qureg qureg, int* targets, int numTargets);
 
 
 /// @notyetdoced
 /// @notyetvalidated
 /// @see
-/// - postMultiplyCompMatr1()
+/// - rightapplyCompMatr1()
 /// - applyMultiQubitNot()
-void postMultiplyMultiQubitNot(Qureg qureg, int* targets, int numTargets);
+void rightapplyMultiQubitNot(Qureg qureg, int* targets, int numTargets);
 
 
 // end de-mangler
@@ -716,15 +716,15 @@ void postMultiplyMultiQubitNot(Qureg qureg, int* targets, int numTargets);
 /// @notyetvalidated
 /// @notyetdoced
 /// @cppvectoroverload
-/// @see multiplyMultiQubitNot()
-void multiplyMultiQubitNot(Qureg qureg, std::vector<int> targets);
+/// @see leftapplyMultiQubitNot()
+void leftapplyMultiQubitNot(Qureg qureg, std::vector<int> targets);
 
 
 /// @notyetvalidated
 /// @notyetdoced
 /// @cppvectoroverload
-/// @see postMultiplyMultiQubitNot()
-void postMultiplyMultiQubitNot(Qureg qureg, std::vector<int> targets);
+/// @see rightapplyMultiQubitNot()
+void rightapplyMultiQubitNot(Qureg qureg, std::vector<int> targets);
 
 
 #endif
@@ -748,33 +748,33 @@ extern "C" {
 /// @notyetdoced
 /// @notyetvalidated
 /// @see
-/// - multiplyCompMatr1()
+/// - leftapplyCompMatr1()
 /// - applyQubitProjector()
-void multiplyQubitProjector(Qureg qureg, int qubit, int outcome);
+void leftapplyQubitProjector(Qureg qureg, int qubit, int outcome);
 
 
 /// @notyetdoced
 /// @notyetvalidated
 /// @see
-/// - multiplyCompMatr1()
+/// - leftapplyCompMatr1()
 /// - applyMultiQubitProjector()
-void multiplyMultiQubitProjector(Qureg qureg, int* qubits, int* outcomes, int numQubits);
+void leftapplyMultiQubitProjector(Qureg qureg, int* qubits, int* outcomes, int numQubits);
 
 
 /// @notyetdoced
 /// @notyetvalidated
 /// @see
-/// - postMultiplyCompMatr1()
+/// - rightapplyCompMatr1()
 /// - applyQubitProjector()
-void postMultiplyQubitProjector(Qureg qureg, int qubit, int outcome);
+void rightapplyQubitProjector(Qureg qureg, int qubit, int outcome);
 
 
 /// @notyetdoced
 /// @notyetvalidated
 /// @see
-/// - postMultiplyCompMatr1()
+/// - rightapplyCompMatr1()
 /// - applyMultiQubitProjector()
-void postMultiplyMultiQubitProjector(Qureg qureg, int* qubits, int* outcomes, int numQubits);
+void rightapplyMultiQubitProjector(Qureg qureg, int* qubits, int* outcomes, int numQubits);
 
 
 // end de-mangler
@@ -799,14 +799,14 @@ extern "C" {
 
 /// @notyetdoced
 /// @notyetvalidated
-/// @see multiplyCompMatr1()
-void multiplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace);
+/// @see leftapplyCompMatr1()
+void leftapplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace);
 
 
 /// @notyetdoced
 /// @notyetvalidated
-/// @see multiplyCompMatr1()
-void postMultiplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace);
+/// @see rightapplyCompMatr1()
+void rightapplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace);
 
 
 // end de-mangler
diff --git a/quest/include/operations.h b/quest/include/operations.h
index 2a5c394b0..075ab6689 100644
--- a/quest/include/operations.h
+++ b/quest/include/operations.h
@@ -115,8 +115,8 @@ digraph {
  * @see
  * - getCompMatr1()
  * - getInlineCompMatr1()
- * - multiplyCompMatr1()
- * - postMultiplyCompMatr1()
+ * - leftapplyCompMatr1()
+ * - rightapplyCompMatr1()
  * - applyControlledCompMatr1()
  * - applyCompMatr2()
  * - applyCompMatr()
@@ -322,8 +322,8 @@ digraph {
  *
  * @see
  * - applyCompMatr1()
- * - multiplyCompMatr2()
- * - postMultiplyCompMatr2()
+ * - leftapplyCompMatr2()
+ * - rightapplyCompMatr2()
  */
 void applyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matrix);
 
@@ -515,8 +515,8 @@ extern "C" {
  *
  * @see
  * - applyCompMatr1()
- * - multiplyCompMatr()
- * - postMultiplyCompMatr()
+ * - leftapplyCompMatr()
+ * - rightapplyCompMatr()
  */
 void applyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matr);
 
@@ -600,8 +600,8 @@ extern "C" {
 /** @notyetdoced
  * @see 
  * - applyCompMatr1()
- * - multiplyCompMatr2()
- * - postMultiplyCompMatr2()
+ * - leftapplyCompMatr2()
+ * - rightapplyCompMatr2()
  */
 void applyDiagMatr1(Qureg qureg, int target, DiagMatr1 matr);
 
diff --git a/quest/src/api/multiplication.cpp b/quest/src/api/multiplication.cpp
index e0c37d47c..c4b508e0c 100644
--- a/quest/src/api/multiplication.cpp
+++ b/quest/src/api/multiplication.cpp
@@ -27,7 +27,7 @@ using std::vector;
 
 extern "C" {
 
-void multiplyCompMatr1(Qureg qureg, int target, CompMatr1 matrix) {
+void leftapplyCompMatr1(Qureg qureg, int target, CompMatr1 matrix) {
     validate_quregFields(qureg, __func__);
     validate_target(qureg, target, __func__);
     validate_matrixFields(matrix, __func__);
@@ -37,7 +37,7 @@ void multiplyCompMatr1(Qureg qureg, int target, CompMatr1 matrix) {
     localiser_statevec_anyCtrlOneTargDenseMatr(qureg, {}, {}, target, matrix, conj, transp);
 }
 
-void postMultiplyCompMatr1(Qureg qureg, int target, CompMatr1 matrix) {
+void rightapplyCompMatr1(Qureg qureg, int target, CompMatr1 matrix) {
     validate_quregFields(qureg, __func__);
     validate_quregIsDensityMatrix(qureg, __func__);
     validate_target(qureg, target, __func__);
@@ -60,7 +60,7 @@ void postMultiplyCompMatr1(Qureg qureg, int target, CompMatr1 matrix) {
 
 extern "C" {
 
-void multiplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matrix) {
+void leftapplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matrix) {
     validate_quregFields(qureg, __func__);
     validate_twoTargets(qureg, target1, target2, __func__);
     validate_matrixFields(matrix, __func__);
@@ -71,7 +71,7 @@ void multiplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matrix)
     localiser_statevec_anyCtrlTwoTargDenseMatr(qureg, {}, {}, target1, target2, matrix, conj, transp);
 }
 
-void postMultiplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matrix) {
+void rightapplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matrix) {
     validate_quregFields(qureg, __func__);
     validate_quregIsDensityMatrix(qureg, __func__);
     validate_twoTargets(qureg, target1, target2, __func__);
@@ -96,7 +96,7 @@ void postMultiplyCompMatr2(Qureg qureg, int target1, int target2, CompMatr2 matr
 
 extern "C" {
 
-void multiplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matrix) {
+void leftapplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matrix) {
     validate_quregFields(qureg, __func__);
     validate_targets(qureg, targets, numTargets, __func__);
     validate_matrixDimMatchesTargets(matrix, numTargets, __func__); // also validates fields and is-sync
@@ -107,7 +107,7 @@ void multiplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matrix
     localiser_statevec_anyCtrlAnyTargDenseMatr(qureg, {}, {}, util_getVector(targets, numTargets), matrix, conj, transp);
 }
 
-void postMultiplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matrix) {
+void rightapplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr matrix) {
     validate_quregFields(qureg, __func__);
     validate_quregIsDensityMatrix(qureg, __func__);
     validate_targets(qureg, targets, numTargets, __func__);
@@ -123,14 +123,14 @@ void postMultiplyCompMatr(Qureg qureg, int* targets, int numTargets, CompMatr ma
 
 } // end de-mangler
 
-void multiplyCompMatr(Qureg qureg, vector<int> targets, CompMatr matr) {
+void leftapplyCompMatr(Qureg qureg, vector<int> targets, CompMatr matr) {
 
-    multiplyCompMatr(qureg, targets.data(), targets.size(), matr);
+    leftapplyCompMatr(qureg, targets.data(), targets.size(), matr);
 }
 
-void postMultiplyCompMatr(Qureg qureg, vector<int> targets, CompMatr matr) {
+void rightapplyCompMatr(Qureg qureg, vector<int> targets, CompMatr matr) {
 
-    postMultiplyCompMatr(qureg, targets.data(), targets.size(), matr);
+    rightapplyCompMatr(qureg, targets.data(), targets.size(), matr);
 }
 
 
@@ -141,7 +141,7 @@ void postMultiplyCompMatr(Qureg qureg, vector<int> targets, CompMatr matr) {
 
 extern "C" {
 
-void multiplyDiagMatr1(Qureg qureg, int target, DiagMatr1 matrix) {
+void leftapplyDiagMatr1(Qureg qureg, int target, DiagMatr1 matrix) {
     validate_quregFields(qureg, __func__);
     validate_target(qureg, target, __func__);
     validate_matrixFields(matrix, __func__);
@@ -150,7 +150,7 @@ void multiplyDiagMatr1(Qureg qureg, int target, DiagMatr1 matrix) {
     localiser_statevec_anyCtrlOneTargDiagMatr(qureg, {}, {}, target, matrix, conj);
 }
 
-void postMultiplyDiagMatr1(Qureg qureg, int target, DiagMatr1 matrix) {
+void rightapplyDiagMatr1(Qureg qureg, int target, DiagMatr1 matrix) {
     validate_quregFields(qureg, __func__);
     validate_quregIsDensityMatrix(qureg, __func__);
     validate_target(qureg, target, __func__);
@@ -171,7 +171,7 @@ void postMultiplyDiagMatr1(Qureg qureg, int target, DiagMatr1 matrix) {
 
 extern "C" {
 
-void multiplyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matrix) {
+void leftapplyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matrix) {
     validate_quregFields(qureg, __func__);
     validate_twoTargets(qureg, target1, target2, __func__);
     validate_matrixFields(matrix, __func__);
@@ -180,7 +180,7 @@ void multiplyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matrix)
     localiser_statevec_anyCtrlTwoTargDiagMatr(qureg, {}, {}, target1, target2, matrix, conj);
 }
 
-void postMultiplyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matrix) {
+void rightapplyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matrix) {
     validate_quregFields(qureg, __func__);
     validate_quregIsDensityMatrix(qureg, __func__);
     validate_twoTargets(qureg, target1, target2, __func__);
@@ -202,7 +202,7 @@ void postMultiplyDiagMatr2(Qureg qureg, int target1, int target2, DiagMatr2 matr
 
 extern "C" {
 
-void multiplyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matrix) {
+void leftapplyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matrix) {
     validate_quregFields(qureg, __func__);
     validate_targets(qureg, targets, numTargets, __func__);
     validate_matrixDimMatchesTargets(matrix, numTargets, __func__); // also validates fields and is-sync
@@ -213,7 +213,7 @@ void multiplyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matrix
     localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, {}, {}, qubits, matrix, exponent, conj);
 }
 
-void postMultiplyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matrix) {
+void rightapplyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr matrix) {
     validate_quregFields(qureg, __func__);
     validate_quregIsDensityMatrix(qureg, __func__);
     validate_targets(qureg, targets, numTargets, __func__);
@@ -227,14 +227,14 @@ void postMultiplyDiagMatr(Qureg qureg, int* targets, int numTargets, DiagMatr ma
 
 } // end de-mangler
 
-void multiplyDiagMatr(Qureg qureg, vector<int> targets, DiagMatr matrix) {
+void leftapplyDiagMatr(Qureg qureg, vector<int> targets, DiagMatr matrix) {
 
-    multiplyDiagMatr(qureg, targets.data(), targets.size(), matrix);
+    leftapplyDiagMatr(qureg, targets.data(), targets.size(), matrix);
 }
 
-void postMultiplyDiagMatr(Qureg qureg, vector<int> targets, DiagMatr matrix) {
+void rightapplyDiagMatr(Qureg qureg, vector<int> targets, DiagMatr matrix) {
 
-    postMultiplyDiagMatr(qureg, targets.data(), targets.size(), matrix);
+    rightapplyDiagMatr(qureg, targets.data(), targets.size(), matrix);
 }
 
 
@@ -245,7 +245,7 @@ void postMultiplyDiagMatr(Qureg qureg, vector<int> targets, DiagMatr matrix) {
 
 extern "C" {
 
-void multiplyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMatr matrix, qcomp exponent) {
+void leftapplyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMatr matrix, qcomp exponent) {
     validate_quregFields(qureg, __func__);
     validate_targets(qureg, targets, numTargets, __func__);
     validate_matrixDimMatchesTargets(matrix, numTargets, __func__); // also validates fields and is-sync, but not unitarity
@@ -256,7 +256,7 @@ void multiplyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMatr m
     localiser_statevec_anyCtrlAnyTargDiagMatr(qureg, {}, {}, qubits, matrix, exponent, conj);
 }
 
-void postMultiplyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMatr matrix, qcomp exponent) {
+void rightapplyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMatr matrix, qcomp exponent) {
     validate_quregFields(qureg, __func__);
     validate_quregIsDensityMatrix(qureg, __func__);
     validate_targets(qureg, targets, numTargets, __func__);
@@ -270,14 +270,14 @@ void postMultiplyDiagMatrPower(Qureg qureg, int* targets, int numTargets, DiagMa
 
 } // end de-mangler
 
-void multiplyDiagMatrPower(Qureg qureg, vector<int> targets, DiagMatr matrix, qcomp exponent) {
+void leftapplyDiagMatrPower(Qureg qureg, vector<int> targets, DiagMatr matrix, qcomp exponent) {
 
-    multiplyDiagMatrPower(qureg, targets.data(), targets.size(), matrix, exponent);
+    leftapplyDiagMatrPower(qureg, targets.data(), targets.size(), matrix, exponent);
 }
 
-void postMultiplyDiagMatrPower(Qureg qureg, vector<int> targets, DiagMatr matrix, qcomp exponent) {
+void rightapplyDiagMatrPower(Qureg qureg, vector<int> targets, DiagMatr matrix, qcomp exponent) {
 
-    postMultiplyDiagMatrPower(qureg, targets.data(), targets.size(), matrix, exponent);
+    rightapplyDiagMatrPower(qureg, targets.data(), targets.size(), matrix, exponent);
 }
 
 
@@ -288,15 +288,15 @@ void postMultiplyDiagMatrPower(Qureg qureg, vector<int> targets, DiagMatr matrix
 
 extern "C" {
 
-void multiplyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix) {
+void leftapplyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix) {
     validate_quregFields(qureg, __func__);
     validate_matrixFields(matrix, __func__);
     validate_matrixAndQuregAreCompatible(matrix, qureg, false, __func__); // matrix can be non-unitary
 
-    multiplyFullStateDiagMatrPower(qureg, matrix, 1); // harmlessly re-validates
+    leftapplyFullStateDiagMatrPower(qureg, matrix, 1); // harmlessly re-validates
 }
 
-void multiplyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp exponent) {
+void leftapplyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp exponent) {
     validate_quregFields(qureg, __func__);
     validate_matrixFields(matrix, __func__);
     validate_matrixAndQuregAreCompatible(matrix, qureg, false, __func__); // matrix can be non-unitary
@@ -312,16 +312,16 @@ void multiplyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp
         localiser_statevec_allTargDiagMatr(qureg, matrix, exponent);
 }
 
-void postMultiplyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix) {
+void rightapplyFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matrix) {
     validate_quregFields(qureg, __func__);
     validate_quregIsDensityMatrix(qureg, __func__);
     validate_matrixFields(matrix, __func__);
     validate_matrixAndQuregAreCompatible(matrix, qureg, false, __func__); // matrix can be non-unitary
 
-    postMultiplyFullStateDiagMatrPower(qureg, matrix, 1); // harmlessly re-validates
+    rightapplyFullStateDiagMatrPower(qureg, matrix, 1); // harmlessly re-validates
 }
 
-void postMultiplyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp exponent) {
+void rightapplyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp exponent) {
     validate_quregFields(qureg, __func__);
     validate_quregIsDensityMatrix(qureg, __func__);
     validate_matrixFields(matrix, __func__);
@@ -345,14 +345,14 @@ void postMultiplyFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, q
 
 extern "C" {
 
-void multiplySwap(Qureg qureg, int qubit1, int qubit2) {
+void leftapplySwap(Qureg qureg, int qubit1, int qubit2) {
     validate_quregFields(qureg, __func__);
     validate_twoTargets(qureg, qubit1, qubit2, __func__);
 
     localiser_statevec_anyCtrlSwap(qureg, {}, {}, qubit1, qubit2);
 }
 
-void postMultiplySwap(Qureg qureg, int qubit1, int qubit2) {
+void rightapplySwap(Qureg qureg, int qubit1, int qubit2) {
     validate_quregFields(qureg, __func__);
     validate_quregIsDensityMatrix(qureg, __func__);
     validate_twoTargets(qureg, qubit1, qubit2, __func__);
@@ -374,7 +374,7 @@ extern PauliStr paulis_getShiftedPauliStr(PauliStr str, int pauliShift);
 
 extern "C" {
 
-void multiplyPauliX(Qureg qureg, int target) {
+void leftapplyPauliX(Qureg qureg, int target) {
     validate_quregFields(qureg, __func__);
     validate_target(qureg, target, __func__);
 
@@ -382,7 +382,7 @@ void multiplyPauliX(Qureg qureg, int target) {
     localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
 }
 
-void multiplyPauliY(Qureg qureg, int target) {
+void leftapplyPauliY(Qureg qureg, int target) {
     validate_quregFields(qureg, __func__);
     validate_target(qureg, target, __func__);
 
@@ -390,7 +390,7 @@ void multiplyPauliY(Qureg qureg, int target) {
     localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
 }
 
-void multiplyPauliZ(Qureg qureg, int target) {
+void leftapplyPauliZ(Qureg qureg, int target) {
     validate_quregFields(qureg, __func__);
     validate_target(qureg, target, __func__);
 
@@ -398,7 +398,7 @@ void multiplyPauliZ(Qureg qureg, int target) {
     localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
 }
 
-void postMultiplyPauliX(Qureg qureg, int target) {
+void rightapplyPauliX(Qureg qureg, int target) {
     validate_quregFields(qureg, __func__);
     validate_quregIsDensityMatrix(qureg, __func__);
     validate_target(qureg, target, __func__);
@@ -408,7 +408,7 @@ void postMultiplyPauliX(Qureg qureg, int target) {
     localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
 }
 
-void postMultiplyPauliY(Qureg qureg, int target) {
+void rightapplyPauliY(Qureg qureg, int target) {
     validate_quregFields(qureg, __func__);
     validate_quregIsDensityMatrix(qureg, __func__);
     validate_target(qureg, target, __func__);
@@ -419,7 +419,7 @@ void postMultiplyPauliY(Qureg qureg, int target) {
     localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str, factor);
 }
 
-void postMultiplyPauliZ(Qureg qureg, int target) {
+void rightapplyPauliZ(Qureg qureg, int target) {
     validate_quregFields(qureg, __func__);
     validate_quregIsDensityMatrix(qureg, __func__);
     validate_target(qureg, target, __func__);
@@ -441,14 +441,14 @@ extern bool paulis_hasOddNumY(PauliStr str);
 
 extern "C" {
 
-void multiplyPauliStr(Qureg qureg, PauliStr str) {
+void leftapplyPauliStr(Qureg qureg, PauliStr str) {
     validate_quregFields(qureg, __func__);
     validate_pauliStrTargets(qureg, str, __func__);
 
     localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str);
 }
 
-void postMultiplyPauliStr(Qureg qureg, PauliStr str) {
+void rightapplyPauliStr(Qureg qureg, PauliStr str) {
     validate_quregFields(qureg, __func__);
     validate_quregIsDensityMatrix(qureg, __func__);
     validate_pauliStrTargets(qureg, str, __func__);
@@ -468,7 +468,7 @@ void postMultiplyPauliStr(Qureg qureg, PauliStr str) {
 
 extern "C" {
 
-void multiplyPauliGadget(Qureg qureg, PauliStr str, qreal angle) {
+void leftapplyPauliGadget(Qureg qureg, PauliStr str, qreal angle) {
     validate_quregFields(qureg, __func__);
     validate_pauliStrTargets(qureg, str, __func__);
 
@@ -476,7 +476,7 @@ void multiplyPauliGadget(Qureg qureg, PauliStr str, qreal angle) {
     localiser_statevec_anyCtrlPauliGadget(qureg, {}, {}, str, phase);
 }
 
-void postMultiplyPauliGadget(Qureg qureg, PauliStr str, qreal angle) {
+void rightapplyPauliGadget(Qureg qureg, PauliStr str, qreal angle) {
     validate_quregFields(qureg, __func__);
     validate_quregIsDensityMatrix(qureg, __func__);
     validate_pauliStrTargets(qureg, str, __func__);
@@ -497,7 +497,7 @@ void postMultiplyPauliGadget(Qureg qureg, PauliStr str, qreal angle) {
 
 extern "C" {
 
-void multiplyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal angle) {
+void leftapplyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal angle) {
     validate_quregFields(qureg, __func__);
     validate_targets(qureg, targets, numTargets, __func__);
 
@@ -506,7 +506,7 @@ void multiplyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal angle)
     localiser_statevec_anyCtrlPhaseGadget(qureg, {}, {}, qubits, phase);
 }
 
-void postMultiplyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal angle) {
+void rightapplyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal angle) {
     validate_quregFields(qureg, __func__);
     validate_quregIsDensityMatrix(qureg, __func__);
     validate_targets(qureg, targets, numTargets, __func__);
@@ -518,14 +518,14 @@ void postMultiplyPhaseGadget(Qureg qureg, int* targets, int numTargets, qreal an
 
 } // end de-mangler
 
-void multiplyPhaseGadget(Qureg qureg, vector<int> targets, qreal angle) {
+void leftapplyPhaseGadget(Qureg qureg, vector<int> targets, qreal angle) {
 
-    multiplyPhaseGadget(qureg, targets.data(), targets.size(), angle);
+    leftapplyPhaseGadget(qureg, targets.data(), targets.size(), angle);
 }
 
-void postMultiplyPhaseGadget(Qureg qureg, vector<int> targets, qreal angle) {
+void rightapplyPhaseGadget(Qureg qureg, vector<int> targets, qreal angle) {
 
-    postMultiplyPhaseGadget(qureg, targets.data(), targets.size(), angle);
+    rightapplyPhaseGadget(qureg, targets.data(), targets.size(), angle);
 }
 
 
@@ -536,35 +536,35 @@ void postMultiplyPhaseGadget(Qureg qureg, vector<int> targets, qreal angle) {
 
 extern "C" {
 
-void multiplyMultiQubitNot(Qureg qureg, int* targets, int numTargets) {
+void leftapplyMultiQubitNot(Qureg qureg, int* targets, int numTargets) {
     validate_quregFields(qureg, __func__);
     validate_targets(qureg, targets, numTargets, __func__);
 
     // harmlessly re-validates
     PauliStr str = getPauliStr(std::string(numTargets, 'X'), targets, numTargets);
-    multiplyPauliStr(qureg, str);
+    leftapplyPauliStr(qureg, str);
 }
 
-void postMultiplyMultiQubitNot(Qureg qureg, int* targets, int numTargets) {
+void rightapplyMultiQubitNot(Qureg qureg, int* targets, int numTargets) {
     validate_quregFields(qureg, __func__);
     validate_quregIsDensityMatrix(qureg, __func__);
     validate_targets(qureg, targets, numTargets, __func__);
 
     // harmlessly re-validates
     PauliStr str = getPauliStr(std::string(numTargets, 'X'), targets, numTargets);
-    postMultiplyPauliStr(qureg, str);
+    rightapplyPauliStr(qureg, str);
 }
 
 } // end de-mangler
 
-void multiplyMultiQubitNot(Qureg qureg, vector<int> targets) {
+void leftapplyMultiQubitNot(Qureg qureg, vector<int> targets) {
 
-    multiplyMultiQubitNot(qureg, targets.data(), targets.size());
+    leftapplyMultiQubitNot(qureg, targets.data(), targets.size());
 }
 
-void postMultiplyMultiQubitNot(Qureg qureg, vector<int> targets) {
+void rightapplyMultiQubitNot(Qureg qureg, vector<int> targets) {
 
-    postMultiplyMultiQubitNot(qureg, targets.data(), targets.size());
+    rightapplyMultiQubitNot(qureg, targets.data(), targets.size());
 }
 
 
@@ -575,7 +575,7 @@ void postMultiplyMultiQubitNot(Qureg qureg, vector<int> targets) {
 
 extern "C" {
 
-void multiplyQubitProjector(Qureg qureg, int qubit, int outcome) {
+void leftapplyQubitProjector(Qureg qureg, int qubit, int outcome) {
     validate_quregFields(qureg, __func__);
     validate_target(qureg, qubit, __func__);
     validate_measurementOutcomeIsValid(outcome, __func__); 
@@ -584,7 +584,7 @@ void multiplyQubitProjector(Qureg qureg, int qubit, int outcome) {
     localiser_statevec_multiQubitProjector(qureg, {qubit}, {outcome}, prob);
 }
 
-void multiplyMultiQubitProjector(Qureg qureg, int* qubits, int* outcomes, int numQubits) {
+void leftapplyMultiQubitProjector(Qureg qureg, int* qubits, int* outcomes, int numQubits) {
     validate_quregFields(qureg, __func__);
     validate_targets(qureg, qubits, numQubits, __func__);
     validate_measurementOutcomesAreValid(outcomes, numQubits, __func__);
@@ -595,7 +595,7 @@ void multiplyMultiQubitProjector(Qureg qureg, int* qubits, int* outcomes, int nu
     localiser_statevec_multiQubitProjector(qureg, qubitVec, outcomeVec, prob);
 }
 
-void postMultiplyQubitProjector(Qureg qureg, int qubit, int outcome) {
+void rightapplyQubitProjector(Qureg qureg, int qubit, int outcome) {
     validate_quregFields(qureg, __func__);
     validate_quregIsDensityMatrix(qureg, __func__);
     validate_target(qureg, qubit, __func__);
@@ -605,7 +605,7 @@ void postMultiplyQubitProjector(Qureg qureg, int qubit, int outcome) {
     localiser_statevec_multiQubitProjector(qureg, {util_getBraQubit(qubit,qureg)}, {outcome}, prob);
 }
 
-void postMultiplyMultiQubitProjector(Qureg qureg, int* qubits, int* outcomes, int numQubits) {
+void rightapplyMultiQubitProjector(Qureg qureg, int* qubits, int* outcomes, int numQubits) {
     validate_quregFields(qureg, __func__);
     validate_quregIsDensityMatrix(qureg, __func__);
     validate_targets(qureg, qubits, numQubits, __func__);
@@ -619,16 +619,16 @@ void postMultiplyMultiQubitProjector(Qureg qureg, int* qubits, int* outcomes, in
 
 } // end de-mangler
 
-void multiplyMultiQubitProjector(Qureg qureg, vector<int> qubits, vector<int> outcomes) {
+void leftapplyMultiQubitProjector(Qureg qureg, vector<int> qubits, vector<int> outcomes) {
     validate_measurementOutcomesMatchTargets(qubits.size(), outcomes.size(), __func__);
 
-    multiplyMultiQubitProjector(qureg, qubits.data(), outcomes.data(), outcomes.size());
+    leftapplyMultiQubitProjector(qureg, qubits.data(), outcomes.data(), outcomes.size());
 }
 
-void postMultiplyMultiQubitProjector(Qureg qureg, vector<int> qubits, vector<int> outcomes) {
+void rightapplyMultiQubitProjector(Qureg qureg, vector<int> qubits, vector<int> outcomes) {
     validate_measurementOutcomesMatchTargets(qubits.size(), outcomes.size(), __func__);
 
-    postMultiplyMultiQubitProjector(qureg, qubits.data(), outcomes.data(), outcomes.size());
+    rightapplyMultiQubitProjector(qureg, qubits.data(), outcomes.data(), outcomes.size());
 }
 
 
@@ -639,7 +639,7 @@ void postMultiplyMultiQubitProjector(Qureg qureg, vector<int> qubits, vector<int
 
 extern "C" {
 
-void multiplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace) {
+void leftapplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace) {
     validate_quregFields(qureg, __func__);
     validate_quregFields(workspace, __func__);
     validate_quregCanBeWorkspace(qureg, workspace, __func__);
@@ -660,7 +660,7 @@ void multiplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace) {
     // workspace -> qureg, and qureg -> sum * qureg
 }
 
-void postMultiplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace) {
+void rightapplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace) {
     validate_quregFields(qureg, __func__);
     validate_quregFields(workspace, __func__);
     validate_quregIsDensityMatrix(qureg, __func__);
diff --git a/quest/src/core/accelerator.cpp b/quest/src/core/accelerator.cpp
index 5e3e40f55..1016f61ef 100644
--- a/quest/src/core/accelerator.cpp
+++ b/quest/src/core/accelerator.cpp
@@ -392,22 +392,22 @@ void accel_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qco
 }
 
 
-auto getDenseMatrAllTargDiagMatrFunc(bool isGpu, qcomp exponent, bool multiplyLeft, bool multiplyRight, bool conjRight) {
+auto getDenseMatrAllTargDiagMatrFunc(bool isGpu, qcomp exponent, bool applyLeft, bool applyRight, bool conjRight) {
 
     // this helper function exists, dissimilar from the function-agnostic macros used
     // by other functions, because densmatr_allTargDiagMatr_sub() does not accept every
     // possible combination of its boolean template parameters 
-    assert_fullStateDiagMatrTemplateParamsAreValid(multiplyLeft, multiplyRight, conjRight);
+    assert_fullStateDiagMatrTemplateParamsAreValid(applyLeft, applyRight, conjRight);
 
     bool hasPower = exponent != qcomp(1, 0);
 
-    if (multiplyLeft && multiplyRight && conjRight)
+    if (applyLeft && applyRight && conjRight)
         return GET_CPU_OR_GPU_FOUR_BOOL_FUNC_OPTIMISED_FOR_FIRST_BOOL( isGpu, densmatr_allTargDiagMatr_sub, hasPower, true,true,true );
 
-    if (multiplyLeft && ! multiplyRight && ! conjRight)
+    if (applyLeft && ! applyRight && ! conjRight)
         return GET_CPU_OR_GPU_FOUR_BOOL_FUNC_OPTIMISED_FOR_FIRST_BOOL( isGpu, densmatr_allTargDiagMatr_sub, hasPower, true,false,false );
 
-    if (! multiplyLeft && multiplyRight && ! conjRight)
+    if (! applyLeft && applyRight && ! conjRight)
         return GET_CPU_OR_GPU_FOUR_BOOL_FUNC_OPTIMISED_FOR_FIRST_BOOL( isGpu, densmatr_allTargDiagMatr_sub, hasPower, false,true,false );
 
     // unreachable
@@ -415,7 +415,7 @@ auto getDenseMatrAllTargDiagMatrFunc(bool isGpu, qcomp exponent, bool multiplyLe
 }
 
 
-void accel_densmatr_allTargDiagMatr_subA(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool multiplyLeft, bool multiplyRight, bool conjRight) {
+void accel_densmatr_allTargDiagMatr_subA(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool applyLeft, bool applyRight, bool conjRight) {
 
     // matr is always local, qureg can be local or distributed...
     assert_fullStateDiagMatrIsLocal(matr);
@@ -425,8 +425,8 @@ void accel_densmatr_allTargDiagMatr_subA(Qureg qureg, FullStateDiagMatr matr, qc
     bool matrGPU = matr.isGpuAccelerated;
 
     // which determines which function is called
-    auto gpuFunc = getDenseMatrAllTargDiagMatrFunc(true,  exponent, multiplyLeft, multiplyRight, conjRight);
-    auto cpuFunc = getDenseMatrAllTargDiagMatrFunc(false, exponent, multiplyLeft, multiplyRight, conjRight);
+    auto gpuFunc = getDenseMatrAllTargDiagMatrFunc(true,  exponent, applyLeft, applyRight, conjRight);
+    auto cpuFunc = getDenseMatrAllTargDiagMatrFunc(false, exponent, applyLeft, applyRight, conjRight);
 
     // when deployments match, we trivially call the common backend
     if ( quregGPU &&  matrGPU) gpuFunc(qureg, matr, exponent);
@@ -476,7 +476,7 @@ void accel_densmatr_allTargDiagMatr_subA(Qureg qureg, FullStateDiagMatr matr, qc
 }
 
 
-void accel_densmatr_allTargDiagMatr_subB(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool multiplyLeft, bool multiplyRight, bool conjRight) {
+void accel_densmatr_allTargDiagMatr_subB(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool applyLeft, bool applyRight, bool conjRight) {
 
     assert_fullStateDiagMatrIsDistributed(matr);
     assert_acceleratorQuregIsDistributed(qureg);
@@ -501,7 +501,7 @@ void accel_densmatr_allTargDiagMatr_subB(Qureg qureg, FullStateDiagMatr matr, qc
     temp.cpuElems = qureg.cpuCommBuffer;
     temp.gpuElems = qureg.gpuCommBuffer;
 
-    accel_densmatr_allTargDiagMatr_subA(qureg, temp, exponent, multiplyLeft, multiplyRight, conjRight);
+    accel_densmatr_allTargDiagMatr_subA(qureg, temp, exponent, applyLeft, applyRight, conjRight);
 }
 
 
diff --git a/quest/src/core/accelerator.hpp b/quest/src/core/accelerator.hpp
index 5480d8133..cd413d9a1 100644
--- a/quest/src/core/accelerator.hpp
+++ b/quest/src/core/accelerator.hpp
@@ -209,8 +209,8 @@ void accel_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, v
 
 void accel_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent);
 
-void accel_densmatr_allTargDiagMatr_subA(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool multiplyLeft, bool multiplyRight, bool conjRight);
-void accel_densmatr_allTargDiagMatr_subB(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool multiplyLeft, bool multiplyRight, bool conjRight);
+void accel_densmatr_allTargDiagMatr_subA(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool applyLeft, bool applyRight, bool conjRight);
+void accel_densmatr_allTargDiagMatr_subB(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool applyLeft, bool applyRight, bool conjRight);
 
 
 
diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp
index c1cfd2a02..51261ee1e 100644
--- a/quest/src/core/errors.cpp
+++ b/quest/src/core/errors.cpp
@@ -467,12 +467,12 @@ void assert_fullStateDiagMatrIsDistributed(FullStateDiagMatr matr) {
         raiseInternalError("An accelerator function received a non-distributed FullStateDiagMatr where a distributed one was expected.");
 }
 
-void assert_fullStateDiagMatrTemplateParamsAreValid(bool multiplyLeft, bool multiplyRight, bool conjRight) {
+void assert_fullStateDiagMatrTemplateParamsAreValid(bool applyLeft, bool applyRight, bool conjRight) {
 
     bool valid = (
-        (  multiplyLeft &&   multiplyRight &&   conjRight) || // matr qureg conj(matr)
-        (  multiplyLeft && ! multiplyRight && ! conjRight) || // matr qureg
-        (! multiplyLeft &&   multiplyRight && ! conjRight)    //      qureg matr
+        (  applyLeft &&   applyRight &&   conjRight) || // matr qureg conj(matr)
+        (  applyLeft && ! applyRight && ! conjRight) || // matr qureg
+        (! applyLeft &&   applyRight && ! conjRight)    //      qureg matr
     );
 
     if (!valid)
diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp
index 7097650a7..50af5e8aa 100644
--- a/quest/src/core/errors.hpp
+++ b/quest/src/core/errors.hpp
@@ -168,7 +168,7 @@ void assert_fullStateDiagMatrIsLocal(FullStateDiagMatr matr);
 
 void assert_fullStateDiagMatrIsDistributed(FullStateDiagMatr matr);
 
-void assert_fullStateDiagMatrTemplateParamsAreValid(bool multiplyLeft, bool multiplyRight, bool conjRight);
+void assert_fullStateDiagMatrTemplateParamsAreValid(bool applyLeft, bool applyRight, bool conjRight);
 
 void assert_acceleratorQuregIsDistributed(Qureg qureg);
 
diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp
index 2fa09c9f4..731c598d8 100644
--- a/quest/src/core/localiser.cpp
+++ b/quest/src/core/localiser.cpp
@@ -1171,7 +1171,7 @@ void localiser_statevec_allTargDiagMatr(Qureg qureg, FullStateDiagMatr matr, qco
 }
 
 
-void localiser_densmatr_allTargDiagMatr(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool multiplyLeft, bool multiplyRight, bool conjRight) {
+void localiser_densmatr_allTargDiagMatr(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool applyLeft, bool applyRight, bool conjRight) {
     assert_localiserGivenDensMatr(qureg);
 
     // the diagonal matr has quadratically fewer elements than the density-matrix
@@ -1197,7 +1197,7 @@ void localiser_densmatr_allTargDiagMatr(Qureg qureg, FullStateDiagMatr matr, qco
     // when the matrix is not distributed, we call the same routine despite whether qureg 
     // is distributed or not; that merely changes how many qureg columns get updated
     if (!matrDist) {
-        accel_densmatr_allTargDiagMatr_subA(qureg, matr, exponent, multiplyLeft, multiplyRight, conjRight);
+        accel_densmatr_allTargDiagMatr_subA(qureg, matr, exponent, applyLeft, applyRight, conjRight);
         return;
     }
 
@@ -1206,7 +1206,7 @@ void localiser_densmatr_allTargDiagMatr(Qureg qureg, FullStateDiagMatr matr, qco
 
     // matr elems are inside qureg buffer, but we still pass matr struct along to
     // accelerator, because it is going to perform mischief to re-use subA().
-    accel_densmatr_allTargDiagMatr_subB(qureg, matr, exponent, multiplyLeft, multiplyRight, conjRight); 
+    accel_densmatr_allTargDiagMatr_subB(qureg, matr, exponent, applyLeft, applyRight, conjRight); 
 }
 
 
diff --git a/quest/src/core/localiser.hpp b/quest/src/core/localiser.hpp
index 50413fe68..6d615919d 100644
--- a/quest/src/core/localiser.hpp
+++ b/quest/src/core/localiser.hpp
@@ -101,7 +101,7 @@ void localiser_statevec_anyCtrlTwoTargDiagMatr(Qureg qureg, vector<int> ctrls, v
 void localiser_statevec_anyCtrlAnyTargDiagMatr(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, DiagMatr matr, qcomp exponent, bool conj);
 
 void localiser_statevec_allTargDiagMatr(Qureg qureg, FullStateDiagMatr matr, qcomp exponent);
-void localiser_densmatr_allTargDiagMatr(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool multiplyLeft, bool multiplyRight, bool conjRight);
+void localiser_densmatr_allTargDiagMatr(Qureg qureg, FullStateDiagMatr matr, qcomp exponent, bool applyLeft, bool applyRight, bool conjRight);
 
 
 /*
diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp
index c519d2007..dd6f64b59 100644
--- a/quest/src/cpu/cpu_subroutines.cpp
+++ b/quest/src/cpu/cpu_subroutines.cpp
@@ -750,7 +750,7 @@ void cpu_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp
 }
 
 
-template <bool HasPower, bool MultiplyLeft, bool MultiplyRight, bool ConjRight>
+template <bool HasPower, bool ApplyLeft, bool ApplyRight, bool ConjRight>
 void cpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent) {
 
     // unlike other functions, this function handles all scenarios of...
@@ -773,7 +773,7 @@ void cpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp
         qcomp fac = 1;
 
         // update fac to effect rho -> (matr * rho) or (matr^exponent * rho)
-        if constexpr (MultiplyLeft) {
+        if constexpr (ApplyLeft) {
 
             // i = global row of nth local amp
             qindex i = fast_getQuregGlobalRowFromFlatIndex(n, matr.numElems);
@@ -789,7 +789,7 @@ void cpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp
 
         // update fac to additional include rho -> (rho * matr) or 
         // (rho * conj(matr)), or the same exponentiated
-        if constexpr (MultiplyRight) {
+        if constexpr (ApplyRight) {
 
             // m = global index corresponding to n
             qindex m = concatenateBits(qureg.rank, n, qureg.logNumAmpsPerNode);
diff --git a/quest/src/cpu/cpu_subroutines.hpp b/quest/src/cpu/cpu_subroutines.hpp
index ea570ba2b..b81e28905 100644
--- a/quest/src/cpu/cpu_subroutines.hpp
+++ b/quest/src/cpu/cpu_subroutines.hpp
@@ -82,7 +82,7 @@ template <int NumCtrls, int NumTargs, bool ApplyConj, bool HasPower> void cpu_st
 
 template <bool HasPower> void cpu_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent);
 
-template <bool HasPower, bool MultiplyLeft, bool MultiplyRight, bool ConjRight> void cpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent);
+template <bool HasPower, bool ApplyLeft, bool ApplyRight, bool ConjRight> void cpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent);
 
 
 /*
diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh
index 4fdd7ceea..a74d9ddcf 100644
--- a/quest/src/gpu/gpu_kernels.cuh
+++ b/quest/src/gpu/gpu_kernels.cuh
@@ -563,7 +563,7 @@ __global__ void kernel_statevec_anyCtrlAnyTargDiagMatr_sub(
  */
 
 
-template <bool HasPower, bool MultiplyLeft, bool MultiplyRight, bool ConjRight> 
+template <bool HasPower, bool ApplyLeft, bool ApplyRight, bool ConjRight> 
 __global__ void kernel_densmatr_allTargDiagMatr_sub(
     cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode,
     cu_qcomp* elems, qindex numElems, cu_qcomp exponent
@@ -572,7 +572,7 @@ __global__ void kernel_densmatr_allTargDiagMatr_sub(
 
     cu_qcomp fac = getCuQcomp(1, 0);
 
-    if constexpr (MultiplyLeft) {
+    if constexpr (ApplyLeft) {
 
         qindex i = fast_getQuregGlobalRowFromFlatIndex(n, numElems);
         cu_qcomp term = elems[i];
@@ -583,7 +583,7 @@ __global__ void kernel_densmatr_allTargDiagMatr_sub(
         fac = term;
     }
 
-    if constexpr (MultiplyRight) {
+    if constexpr (ApplyRight) {
 
         qindex m = concatenateBits(rank, n, logNumAmpsPerNode);
         qindex j = fast_getQuregGlobalColFromFlatIndex(m, numElems);
diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp
index 0e7bb9385..bb9688209 100644
--- a/quest/src/gpu/gpu_subroutines.cpp
+++ b/quest/src/gpu/gpu_subroutines.cpp
@@ -749,7 +749,7 @@ void gpu_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp
 }
 
 
-template <bool HasPower, bool MultiplyLeft, bool MultiplyRight, bool ConjRight>
+template <bool HasPower, bool ApplyLeft, bool ApplyRight, bool ConjRight>
 void gpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent) {
 
     assert_exponentMatchesTemplateParam(exponent, HasPower);
@@ -760,7 +760,7 @@ void gpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp
     qindex numBlocks = getNumBlocks(numThreads);
 
     kernel_densmatr_allTargDiagMatr_sub 
-        <HasPower, MultiplyLeft, MultiplyRight, ConjRight> 
+        <HasPower, ApplyLeft, ApplyRight, ConjRight> 
         <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
             toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode,
             toCuQcomps(util_getGpuMemPtr(matr)), matr.numElems, toCuQcomp(exponent)
diff --git a/quest/src/gpu/gpu_subroutines.hpp b/quest/src/gpu/gpu_subroutines.hpp
index 7ec3f6696..aac2966ff 100644
--- a/quest/src/gpu/gpu_subroutines.hpp
+++ b/quest/src/gpu/gpu_subroutines.hpp
@@ -75,7 +75,7 @@ template <int NumCtrls, int NumTargs, bool ApplyConj, bool HasPower> void gpu_st
 
 template <bool HasPower> void gpu_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent);
 
-template <bool HasPower, bool MultiplyLeft, bool MultiplyRight, bool ConjRight> void gpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent);
+template <bool HasPower, bool ApplyLeft, bool ApplyRight, bool ConjRight> void gpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp exponent);
 
 
 /*
diff --git a/tests/unit/operations.cpp b/tests/unit/operations.cpp
index 3ceec40ee..cd39ac003 100644
--- a/tests/unit/operations.cpp
+++ b/tests/unit/operations.cpp
@@ -594,17 +594,17 @@ qmatrix getReferenceMatrix(auto matrixRefGen, vector<int> targs, auto additional
  * Let |psi> be a statevector, rho be a density matrix, 
  * and matr be an operator matrix. The options perform:
  * 
- * apply:    |psi> -> matr |psi>,  rho -> matr rho adj(matr)
- * multiply: |psi> -> matr |psi>,  rho -> matr rho
- * postmultiply:                   rho -> rho matr
+ * apply:     |psi> -> matr |psi>,  rho -> matr rho adj(matr)
+ * leftapply: |psi> -> matr |psi>,  rho -> matr rho
+ * rightapply:                      rho -> rho matr
  * 
  * Note this is necessarily a template parameter (rather
  * than just a runtime parameter) only because the
- * postMultiplyReferenceOperator() function is defined
+ * rightapplyReferenceOperator() function is defined
  * only upon qmatrix (for density matrices)
  */
 
-enum ApplyFlag { apply, multiply, postmultiply };
+enum ApplyFlag { apply, leftapply, rightapply };
 
 
 /*
@@ -752,8 +752,8 @@ void testOperationCorrectness(auto operation, auto matrixRefGen) {
 
         // update reference state (ctrls & states happen to only ever be used by apply)
         if constexpr (Apply == apply)        applyReferenceOperator(       stateRef, ctrls, states, targs, matrixRef);
-        if constexpr (Apply == multiply)     multiplyReferenceOperator(    stateRef, ctrls, states, targs, matrixRef);
-        if constexpr (Apply == postmultiply) postMultiplyReferenceOperator(stateRef, ctrls, states, targs, matrixRef);
+        if constexpr (Apply == leftapply)     leftapplyReferenceOperator(    stateRef, ctrls, states, targs, matrixRef);
+        if constexpr (Apply == rightapply) rightapplyReferenceOperator(stateRef, ctrls, states, targs, matrixRef);
     };
 
     // report operation's input parameters if any subsequent test fails
@@ -761,7 +761,7 @@ void testOperationCorrectness(auto operation, auto matrixRefGen) {
 
     // test API operation on all available deployment combinations (e.g. OMP, MPI, MPI+GPU, etc),
-    // though the postMultiply*() functions do not accept statevectors
+    // though the rightapply*() functions do not accept statevectors
-    if constexpr (Apply != postmultiply) {
+    if constexpr (Apply != rightapply) {
         SECTION( LABEL_STATEVEC ) { 
             TEST_ON_CACHED_QUREGS(statevecQuregs, statevecRef, testFunc); 
         }
@@ -1125,7 +1125,7 @@ void testOperationValidation(auto operation) {
     SECTION( "qureg type" ) {
 
-        // only postMultiply*() functions discriminate Qureg
+        // only rightapply*() functions discriminate Qureg
-        if (Apply != postmultiply)
+        if (Apply != rightapply)
             return;
 
         // use any statevector
@@ -1866,27 +1866,27 @@ TEST_CASE( "applyNonUnitaryPauliGadget", TEST_CATEGORY_OPS ) {
  */
 
 
-TEST_CASE( "multiplySwap",            TEST_CATEGORY_MULT ) { testOperation<zero,two,none,multiply>(multiplySwap, FixedMatrices::SWAP); }
-TEST_CASE( "multiplyPauliX",          TEST_CATEGORY_MULT ) { testOperation<zero,one,none,multiply>(multiplyPauliX, FixedMatrices::X); }
-TEST_CASE( "multiplyPauliY",          TEST_CATEGORY_MULT ) { testOperation<zero,one,none,multiply>(multiplyPauliY, FixedMatrices::Y); }
-TEST_CASE( "multiplyPauliZ",          TEST_CATEGORY_MULT ) { testOperation<zero,one,none,multiply>(multiplyPauliZ, FixedMatrices::Z); }
-TEST_CASE( "multiplyPauliStr",        TEST_CATEGORY_MULT ) { testOperation<zero,any,paulistr,multiply>(multiplyPauliStr,    nullptr); }
-TEST_CASE( "multiplyPauliGadget",     TEST_CATEGORY_MULT ) { testOperation<zero,any,pauligad,multiply>(multiplyPauliGadget, nullptr); }
-TEST_CASE( "multiplyCompMatr1",       TEST_CATEGORY_MULT ) { testOperation<zero,one,compmatr,multiply>(multiplyCompMatr1,   nullptr); }
-TEST_CASE( "multiplyCompMatr2",       TEST_CATEGORY_MULT ) { testOperation<zero,two,compmatr,multiply>(multiplyCompMatr2,   nullptr); }
-TEST_CASE( "multiplyDiagMatr1",       TEST_CATEGORY_MULT ) { testOperation<zero,one,diagmatr,multiply>(multiplyDiagMatr1,   nullptr); }
-TEST_CASE( "multiplyDiagMatr2",       TEST_CATEGORY_MULT ) { testOperation<zero,two,diagmatr,multiply>(multiplyDiagMatr2,   nullptr); }
-
-TEST_CASE( "postMultiplySwap",            TEST_CATEGORY_MULT ) { testOperation<zero,two,none,postmultiply>(postMultiplySwap, FixedMatrices::SWAP); }
-TEST_CASE( "postMultiplyPauliX",          TEST_CATEGORY_MULT ) { testOperation<zero,one,none,postmultiply>(postMultiplyPauliX, FixedMatrices::X); }
-TEST_CASE( "postMultiplyPauliY",          TEST_CATEGORY_MULT ) { testOperation<zero,one,none,postmultiply>(postMultiplyPauliY, FixedMatrices::Y); }
-TEST_CASE( "postMultiplyPauliZ",          TEST_CATEGORY_MULT ) { testOperation<zero,one,none,postmultiply>(postMultiplyPauliZ, FixedMatrices::Z); }
-TEST_CASE( "postMultiplyPauliStr",        TEST_CATEGORY_MULT ) { testOperation<zero,any,paulistr,postmultiply>(postMultiplyPauliStr,    nullptr); }
-TEST_CASE( "postMultiplyPauliGadget",     TEST_CATEGORY_MULT ) { testOperation<zero,any,pauligad,postmultiply>(postMultiplyPauliGadget, nullptr); }
-TEST_CASE( "postMultiplyCompMatr1",       TEST_CATEGORY_MULT ) { testOperation<zero,one,compmatr,postmultiply>(postMultiplyCompMatr1,   nullptr); }
-TEST_CASE( "postMultiplyCompMatr2",       TEST_CATEGORY_MULT ) { testOperation<zero,two,compmatr,postmultiply>(postMultiplyCompMatr2,   nullptr); }
-TEST_CASE( "postMultiplyDiagMatr1",       TEST_CATEGORY_MULT ) { testOperation<zero,one,diagmatr,postmultiply>(postMultiplyDiagMatr1,   nullptr); }
-TEST_CASE( "postMultiplyDiagMatr2",       TEST_CATEGORY_MULT ) { testOperation<zero,two,diagmatr,postmultiply>(postMultiplyDiagMatr2,   nullptr); }
+TEST_CASE( "leftapplySwap",            TEST_CATEGORY_MULT ) { testOperation<zero,two,none,leftapply>(leftapplySwap, FixedMatrices::SWAP); }
+TEST_CASE( "leftapplyPauliX",          TEST_CATEGORY_MULT ) { testOperation<zero,one,none,leftapply>(leftapplyPauliX, FixedMatrices::X); }
+TEST_CASE( "leftapplyPauliY",          TEST_CATEGORY_MULT ) { testOperation<zero,one,none,leftapply>(leftapplyPauliY, FixedMatrices::Y); }
+TEST_CASE( "leftapplyPauliZ",          TEST_CATEGORY_MULT ) { testOperation<zero,one,none,leftapply>(leftapplyPauliZ, FixedMatrices::Z); }
+TEST_CASE( "leftapplyPauliStr",        TEST_CATEGORY_MULT ) { testOperation<zero,any,paulistr,leftapply>(leftapplyPauliStr,    nullptr); }
+TEST_CASE( "leftapplyPauliGadget",     TEST_CATEGORY_MULT ) { testOperation<zero,any,pauligad,leftapply>(leftapplyPauliGadget, nullptr); }
+TEST_CASE( "leftapplyCompMatr1",       TEST_CATEGORY_MULT ) { testOperation<zero,one,compmatr,leftapply>(leftapplyCompMatr1,   nullptr); }
+TEST_CASE( "leftapplyCompMatr2",       TEST_CATEGORY_MULT ) { testOperation<zero,two,compmatr,leftapply>(leftapplyCompMatr2,   nullptr); }
+TEST_CASE( "leftapplyDiagMatr1",       TEST_CATEGORY_MULT ) { testOperation<zero,one,diagmatr,leftapply>(leftapplyDiagMatr1,   nullptr); }
+TEST_CASE( "leftapplyDiagMatr2",       TEST_CATEGORY_MULT ) { testOperation<zero,two,diagmatr,leftapply>(leftapplyDiagMatr2,   nullptr); }
+
+TEST_CASE( "rightapplySwap",            TEST_CATEGORY_MULT ) { testOperation<zero,two,none,rightapply>(rightapplySwap, FixedMatrices::SWAP); }
+TEST_CASE( "rightapplyPauliX",          TEST_CATEGORY_MULT ) { testOperation<zero,one,none,rightapply>(rightapplyPauliX, FixedMatrices::X); }
+TEST_CASE( "rightapplyPauliY",          TEST_CATEGORY_MULT ) { testOperation<zero,one,none,rightapply>(rightapplyPauliY, FixedMatrices::Y); }
+TEST_CASE( "rightapplyPauliZ",          TEST_CATEGORY_MULT ) { testOperation<zero,one,none,rightapply>(rightapplyPauliZ, FixedMatrices::Z); }
+TEST_CASE( "rightapplyPauliStr",        TEST_CATEGORY_MULT ) { testOperation<zero,any,paulistr,rightapply>(rightapplyPauliStr,    nullptr); }
+TEST_CASE( "rightapplyPauliGadget",     TEST_CATEGORY_MULT ) { testOperation<zero,any,pauligad,rightapply>(rightapplyPauliGadget, nullptr); }
+TEST_CASE( "rightapplyCompMatr1",       TEST_CATEGORY_MULT ) { testOperation<zero,one,compmatr,rightapply>(rightapplyCompMatr1,   nullptr); }
+TEST_CASE( "rightapplyCompMatr2",       TEST_CATEGORY_MULT ) { testOperation<zero,two,compmatr,rightapply>(rightapplyCompMatr2,   nullptr); }
+TEST_CASE( "rightapplyDiagMatr1",       TEST_CATEGORY_MULT ) { testOperation<zero,one,diagmatr,rightapply>(rightapplyDiagMatr1,   nullptr); }
+TEST_CASE( "rightapplyDiagMatr2",       TEST_CATEGORY_MULT ) { testOperation<zero,two,diagmatr,rightapply>(rightapplyDiagMatr2,   nullptr); }
 
 
 /*
@@ -1896,55 +1896,55 @@ TEST_CASE( "postMultiplyDiagMatr2",       TEST_CATEGORY_MULT ) { testOperation<z
  */
 
 
-TEST_CASE( "multiplyCompMatr",  TEST_CATEGORY_MULT ) { 
-    auto func = static_cast<void(*)(Qureg, int*, int, CompMatr)>(multiplyCompMatr);
-    testOperation<zero,any,compmatr,multiply>(func, nullptr); 
+TEST_CASE( "leftapplyCompMatr",  TEST_CATEGORY_MULT ) { 
+    auto func = static_cast<void(*)(Qureg, int*, int, CompMatr)>(leftapplyCompMatr);
+    testOperation<zero,any,compmatr,leftapply>(func, nullptr); 
 }
 
-TEST_CASE( "multiplyDiagMatr",  TEST_CATEGORY_MULT ) {
-    auto func = static_cast<void(*)(Qureg, int*, int, DiagMatr)>(multiplyDiagMatr);
-    testOperation<zero,any,diagmatr,multiply>(func, nullptr);
+TEST_CASE( "leftapplyDiagMatr",  TEST_CATEGORY_MULT ) {
+    auto func = static_cast<void(*)(Qureg, int*, int, DiagMatr)>(leftapplyDiagMatr);
+    testOperation<zero,any,diagmatr,leftapply>(func, nullptr);
 }
 
-TEST_CASE( "multiplyDiagMatrPower",  TEST_CATEGORY_MULT ) {
-    auto func = static_cast<void(*)(Qureg, int*, int, DiagMatr, qcomp)>(multiplyDiagMatrPower);
-    testOperation<zero,any,diagpower,multiply>(func, nullptr);
+TEST_CASE( "leftapplyDiagMatrPower",  TEST_CATEGORY_MULT ) {
+    auto func = static_cast<void(*)(Qureg, int*, int, DiagMatr, qcomp)>(leftapplyDiagMatrPower);
+    testOperation<zero,any,diagpower,leftapply>(func, nullptr);
 }
 
-TEST_CASE( "multiplyMultiQubitNot",  TEST_CATEGORY_MULT ) {
-    auto func = static_cast<void(*)(Qureg, int*, int)>(multiplyMultiQubitNot);
-    testOperation<zero,any,none,multiply>(func, VariableSizeMatrices::X);
+TEST_CASE( "leftapplyMultiQubitNot",  TEST_CATEGORY_MULT ) {
+    auto func = static_cast<void(*)(Qureg, int*, int)>(leftapplyMultiQubitNot);
+    testOperation<zero,any,none,leftapply>(func, VariableSizeMatrices::X);
 }
 
-TEST_CASE( "multiplyPhaseGadget",  TEST_CATEGORY_MULT ) {
-    auto func = static_cast<void(*)(Qureg, int*, int, qreal)>(multiplyPhaseGadget);
-    testOperation<zero,any,scalar,multiply>(func, VariableSizeParameterisedMatrices::Z);
+TEST_CASE( "leftapplyPhaseGadget",  TEST_CATEGORY_MULT ) {
+    auto func = static_cast<void(*)(Qureg, int*, int, qreal)>(leftapplyPhaseGadget);
+    testOperation<zero,any,scalar,leftapply>(func, VariableSizeParameterisedMatrices::Z);
 }
 
 
-TEST_CASE( "postMultiplyCompMatr",  TEST_CATEGORY_MULT ) { 
-    auto func = static_cast<void(*)(Qureg, int*, int, CompMatr)>(postMultiplyCompMatr);
-    testOperation<zero,any,compmatr,postmultiply>(func, nullptr); 
+TEST_CASE( "rightapplyCompMatr",  TEST_CATEGORY_MULT ) { 
+    auto func = static_cast<void(*)(Qureg, int*, int, CompMatr)>(rightapplyCompMatr);
+    testOperation<zero,any,compmatr,rightapply>(func, nullptr); 
 }
 
-TEST_CASE( "postMultiplyDiagMatr",  TEST_CATEGORY_MULT ) {
-    auto func = static_cast<void(*)(Qureg, int*, int, DiagMatr)>(postMultiplyDiagMatr);
-    testOperation<zero,any,diagmatr,postmultiply>(func, nullptr);
+TEST_CASE( "rightapplyDiagMatr",  TEST_CATEGORY_MULT ) {
+    auto func = static_cast<void(*)(Qureg, int*, int, DiagMatr)>(rightapplyDiagMatr);
+    testOperation<zero,any,diagmatr,rightapply>(func, nullptr);
 }
 
-TEST_CASE( "postMultiplyDiagMatrPower",  TEST_CATEGORY_MULT ) {
-    auto func = static_cast<void(*)(Qureg, int*, int, DiagMatr, qcomp)>(postMultiplyDiagMatrPower);
-    testOperation<zero,any,diagpower,postmultiply>(func, nullptr);
+TEST_CASE( "rightapplyDiagMatrPower",  TEST_CATEGORY_MULT ) {
+    auto func = static_cast<void(*)(Qureg, int*, int, DiagMatr, qcomp)>(rightapplyDiagMatrPower);
+    testOperation<zero,any,diagpower,rightapply>(func, nullptr);
 }
 
-TEST_CASE( "postMultiplyMultiQubitNot",  TEST_CATEGORY_MULT ) {
-    auto func = static_cast<void(*)(Qureg, int*, int)>(postMultiplyMultiQubitNot);
-    testOperation<zero,any,none,postmultiply>(func, VariableSizeMatrices::X);
+TEST_CASE( "rightapplyMultiQubitNot",  TEST_CATEGORY_MULT ) {
+    auto func = static_cast<void(*)(Qureg, int*, int)>(rightapplyMultiQubitNot);
+    testOperation<zero,any,none,rightapply>(func, VariableSizeMatrices::X);
 }
 
-TEST_CASE( "postMultiplyPhaseGadget",  TEST_CATEGORY_MULT ) {
-    auto func = static_cast<void(*)(Qureg, int*, int, qreal)>(postMultiplyPhaseGadget);
-    testOperation<zero,any,scalar,postmultiply>(func, VariableSizeParameterisedMatrices::Z);
+TEST_CASE( "rightapplyPhaseGadget",  TEST_CATEGORY_MULT ) {
+    auto func = static_cast<void(*)(Qureg, int*, int, qreal)>(rightapplyPhaseGadget);
+    testOperation<zero,any,scalar,rightapply>(func, VariableSizeParameterisedMatrices::Z);
 }
 
 
@@ -1953,7 +1953,7 @@ TEST_CASE( "postMultiplyPhaseGadget",  TEST_CATEGORY_MULT ) {
  */
 
 
-TEST_CASE( "multiplyFullStateDiagMatr", TEST_CATEGORY_MULT LABEL_MIXED_DEPLOY_TAG ) {
+TEST_CASE( "leftapplyFullStateDiagMatr", TEST_CATEGORY_MULT LABEL_MIXED_DEPLOY_TAG ) {
 
     PREPARE_TEST( numQubits, cachedSV, cachedDM, refSV, refDM );
 
@@ -1962,20 +1962,20 @@ TEST_CASE( "multiplyFullStateDiagMatr", TEST_CATEGORY_MULT LABEL_MIXED_DEPLOY_TA
     SECTION( LABEL_CORRECTNESS ) {
 
         qmatrix refMatr = getRandomDiagonalMatrix(getPow2(numQubits));
-        auto apiFunc = multiplyFullStateDiagMatr;
+        auto apiFunc = leftapplyFullStateDiagMatr;
 
         GENERATE( range(0, getNumTestedMixedDeploymentRepetitions()) );
 
         SECTION( LABEL_STATEVEC ) {
 
-            auto refFunc = [&] (qvector& state, qmatrix matr) { multiplyReferenceOperator(state, matr); };
+            auto refFunc = [&] (qvector& state, qmatrix matr) { leftapplyReferenceOperator(state, matr); };
 
             TEST_ON_CACHED_QUREG_AND_MATRIX( cachedSV, cachedMatrs, apiFunc, refSV, refMatr, refFunc);
         }
 
         SECTION( LABEL_DENSMATR ) {
 
-            auto refFunc = [&] (qmatrix& state, qmatrix matr) { multiplyReferenceOperator(state, matr); };
+            auto refFunc = [&] (qmatrix& state, qmatrix matr) { leftapplyReferenceOperator(state, matr); };
 
             TEST_ON_CACHED_QUREG_AND_MATRIX( cachedDM, cachedMatrs, apiFunc, refDM, refMatr, refFunc);
         }
@@ -1985,7 +1985,7 @@ TEST_CASE( "multiplyFullStateDiagMatr", TEST_CATEGORY_MULT LABEL_MIXED_DEPLOY_TA
 }
 
 
-TEST_CASE( "postMultiplyFullStateDiagMatr", TEST_CATEGORY_MULT LABEL_MIXED_DEPLOY_TAG ) {
+TEST_CASE( "rightapplyFullStateDiagMatr", TEST_CATEGORY_MULT LABEL_MIXED_DEPLOY_TAG ) {
 
     PREPARE_TEST( numQubits, cachedSV, cachedDM, refSV, refDM );
 
@@ -1994,13 +1994,13 @@ TEST_CASE( "postMultiplyFullStateDiagMatr", TEST_CATEGORY_MULT LABEL_MIXED_DEPLO
     SECTION( LABEL_CORRECTNESS ) {
 
         qmatrix refMatr = getRandomDiagonalMatrix(getPow2(numQubits));
-        auto apiFunc = postMultiplyFullStateDiagMatr;
+        auto apiFunc = rightapplyFullStateDiagMatr;
 
         GENERATE( range(0, getNumTestedMixedDeploymentRepetitions()) );
 
         SECTION( LABEL_DENSMATR ) {
 
-            auto refFunc = [&] (qmatrix& state, qmatrix matr) { postMultiplyReferenceOperator(state, matr); };
+            auto refFunc = [&] (qmatrix& state, qmatrix matr) { rightapplyReferenceOperator(state, matr); };
 
             TEST_ON_CACHED_QUREG_AND_MATRIX( cachedDM, cachedMatrs, apiFunc, refDM, refMatr, refFunc);
         }
@@ -2010,7 +2010,7 @@ TEST_CASE( "postMultiplyFullStateDiagMatr", TEST_CATEGORY_MULT LABEL_MIXED_DEPLO
 }
 
 
-TEST_CASE( "multiplyFullStateDiagMatrPower", TEST_CATEGORY_MULT LABEL_MIXED_DEPLOY_TAG ) {
+TEST_CASE( "leftapplyFullStateDiagMatrPower", TEST_CATEGORY_MULT LABEL_MIXED_DEPLOY_TAG ) {
 
     PREPARE_TEST( numQubits, cachedSV, cachedDM, refSV, refDM );
 
@@ -2022,7 +2022,7 @@ TEST_CASE( "multiplyFullStateDiagMatrPower", TEST_CATEGORY_MULT LABEL_MIXED_DEPL
         qcomp exponent = getRandomComplex();
 
         auto apiFunc = [&](Qureg qureg, FullStateDiagMatr matr) { 
-            return multiplyFullStateDiagMatrPower(qureg, matr, exponent);
+            return leftapplyFullStateDiagMatrPower(qureg, matr, exponent);
         };
 
         CAPTURE( exponent );
@@ -2033,7 +2033,7 @@ TEST_CASE( "multiplyFullStateDiagMatrPower", TEST_CATEGORY_MULT LABEL_MIXED_DEPL
 
             auto refFunc = [&] (qvector& state, qmatrix matr) { 
                 matr = getPowerOfDiagonalMatrix(matr, exponent);
-                multiplyReferenceOperator(state, matr);
+                leftapplyReferenceOperator(state, matr);
             };
 
             TEST_ON_CACHED_QUREG_AND_MATRIX( cachedSV, cachedMatrs, apiFunc, refSV, refMatr, refFunc);
@@ -2043,7 +2043,7 @@ TEST_CASE( "multiplyFullStateDiagMatrPower", TEST_CATEGORY_MULT LABEL_MIXED_DEPL
 
             auto refFunc = [&] (qmatrix& state, qmatrix matr) { 
                 matr = getPowerOfDiagonalMatrix(matr, exponent);
-                multiplyReferenceOperator(state, matr);
+                leftapplyReferenceOperator(state, matr);
             };
 
             TEST_ON_CACHED_QUREG_AND_MATRIX( cachedDM, cachedMatrs, apiFunc, refDM, refMatr, refFunc);
@@ -2054,7 +2054,7 @@ TEST_CASE( "multiplyFullStateDiagMatrPower", TEST_CATEGORY_MULT LABEL_MIXED_DEPL
 }
 
 
-TEST_CASE( "postMultiplyFullStateDiagMatrPower", TEST_CATEGORY_MULT LABEL_MIXED_DEPLOY_TAG ) {
+TEST_CASE( "rightapplyFullStateDiagMatrPower", TEST_CATEGORY_MULT LABEL_MIXED_DEPLOY_TAG ) {
 
     PREPARE_TEST( numQubits, cachedSV, cachedDM, refSV, refDM );
 
@@ -2066,7 +2066,7 @@ TEST_CASE( "postMultiplyFullStateDiagMatrPower", TEST_CATEGORY_MULT LABEL_MIXED_
         qcomp exponent = getRandomComplex();
 
         auto apiFunc = [&](Qureg qureg, FullStateDiagMatr matr) { 
-            return postMultiplyFullStateDiagMatrPower(qureg, matr, exponent);
+            return rightapplyFullStateDiagMatrPower(qureg, matr, exponent);
         };
 
         CAPTURE( exponent );
@@ -2077,7 +2077,7 @@ TEST_CASE( "postMultiplyFullStateDiagMatrPower", TEST_CATEGORY_MULT LABEL_MIXED_
 
             auto refFunc = [&] (qmatrix& state, qmatrix matr) { 
                 matr = getPowerOfDiagonalMatrix(matr, exponent);
-                postMultiplyReferenceOperator(state, matr);
+                rightapplyReferenceOperator(state, matr);
             };
 
             TEST_ON_CACHED_QUREG_AND_MATRIX( cachedDM, cachedMatrs, apiFunc, refDM, refMatr, refFunc);
@@ -2088,7 +2088,7 @@ TEST_CASE( "postMultiplyFullStateDiagMatrPower", TEST_CATEGORY_MULT LABEL_MIXED_
 }
 
 
-TEST_CASE( "multiplyQubitProjector", TEST_CATEGORY_OPS ) {
+TEST_CASE( "leftapplyQubitProjector", TEST_CATEGORY_OPS ) {
 
     PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );
 
@@ -2101,8 +2101,8 @@ TEST_CASE( "multiplyQubitProjector", TEST_CATEGORY_OPS ) {
         qmatrix projector = getProjector(outcome);
 
         auto testFunc = [&](Qureg qureg, auto& ref) {
-            multiplyQubitProjector(qureg, target, outcome);
-            multiplyReferenceOperator(ref, {target}, projector);
+            leftapplyQubitProjector(qureg, target, outcome);
+            leftapplyReferenceOperator(ref, {target}, projector);
         };
 
         CAPTURE( target, outcome );
@@ -2114,7 +2114,7 @@ TEST_CASE( "multiplyQubitProjector", TEST_CATEGORY_OPS ) {
 }
 
 
-TEST_CASE( "postMultiplyQubitProjector", TEST_CATEGORY_OPS ) {
+TEST_CASE( "rightapplyQubitProjector", TEST_CATEGORY_OPS ) {
 
     PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );
 
@@ -2127,8 +2127,8 @@ TEST_CASE( "postMultiplyQubitProjector", TEST_CATEGORY_OPS ) {
         qmatrix projector = getProjector(outcome);
 
         auto testFunc = [&](Qureg qureg, auto& ref) {
-            postMultiplyQubitProjector(qureg, target, outcome);
-            postMultiplyReferenceOperator(ref, {target}, projector);
+            rightapplyQubitProjector(qureg, target, outcome);
+            rightapplyReferenceOperator(ref, {target}, projector);
         };
 
         CAPTURE( target, outcome );
@@ -2139,7 +2139,7 @@ TEST_CASE( "postMultiplyQubitProjector", TEST_CATEGORY_OPS ) {
 }
 
 
-TEST_CASE( "multiplyMultiQubitProjector", TEST_CATEGORY_OPS ) {
+TEST_CASE( "leftapplyMultiQubitProjector", TEST_CATEGORY_OPS ) {
 
     PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );
 
@@ -2152,8 +2152,8 @@ TEST_CASE( "multiplyMultiQubitProjector", TEST_CATEGORY_OPS ) {
         qmatrix projector = getProjector(targets, outcomes, numQubits);
 
         auto testFunc = [&](Qureg qureg, auto& ref) {
-            multiplyMultiQubitProjector(qureg, targets.data(), outcomes.data(), numTargs);
-            multiplyReferenceOperator(ref, projector);
+            leftapplyMultiQubitProjector(qureg, targets.data(), outcomes.data(), numTargs);
+            leftapplyReferenceOperator(ref, projector);
         };
 
         CAPTURE( targets, outcomes );
@@ -2165,7 +2165,7 @@ TEST_CASE( "multiplyMultiQubitProjector", TEST_CATEGORY_OPS ) {
 }
 
 
-TEST_CASE( "postMultiplyMultiQubitProjector", TEST_CATEGORY_OPS ) {
+TEST_CASE( "rightapplyMultiQubitProjector", TEST_CATEGORY_OPS ) {
 
     PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );
 
@@ -2178,8 +2178,8 @@ TEST_CASE( "postMultiplyMultiQubitProjector", TEST_CATEGORY_OPS ) {
         qmatrix projector = getProjector(targets, outcomes, numQubits);
 
         auto testFunc = [&](Qureg qureg, auto& ref) {
-            postMultiplyMultiQubitProjector(qureg, targets.data(), outcomes.data(), numTargs);
-            postMultiplyReferenceOperator(ref, projector);
+            rightapplyMultiQubitProjector(qureg, targets.data(), outcomes.data(), numTargs);
+            rightapplyReferenceOperator(ref, projector);
         };
 
         CAPTURE( targets, outcomes );
@@ -2190,7 +2190,7 @@ TEST_CASE( "postMultiplyMultiQubitProjector", TEST_CATEGORY_OPS ) {
 }
 
 
-TEST_CASE( "multiplyPauliStrSum", TEST_CATEGORY_MULT LABEL_MIXED_DEPLOY_TAG ) {
+TEST_CASE( "leftapplyPauliStrSum", TEST_CATEGORY_MULT LABEL_MIXED_DEPLOY_TAG ) {
 
     PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );
 
@@ -2205,7 +2205,7 @@ TEST_CASE( "multiplyPauliStrSum", TEST_CATEGORY_MULT LABEL_MIXED_DEPLOY_TAG ) {
 
             // must use (and ergo make) an identically-deployed workspace
             Qureg workspace = createCloneQureg(qureg);
-            multiplyPauliStrSum(qureg, sum, workspace);
+            leftapplyPauliStrSum(qureg, sum, workspace);
             destroyQureg(workspace);
 
             ref = getMatrix(sum, numQubits) * ref;
@@ -2220,7 +2220,7 @@ TEST_CASE( "multiplyPauliStrSum", TEST_CATEGORY_MULT LABEL_MIXED_DEPLOY_TAG ) {
 }
 
 
-TEST_CASE( "postMultiplyPauliStrSum", TEST_CATEGORY_MULT LABEL_MIXED_DEPLOY_TAG ) {
+TEST_CASE( "rightapplyPauliStrSum", TEST_CATEGORY_MULT LABEL_MIXED_DEPLOY_TAG ) {
 
     PREPARE_TEST( numQubits, statevecQuregs, densmatrQuregs, statevecRef, densmatrRef );
 
@@ -2235,7 +2235,7 @@ TEST_CASE( "postMultiplyPauliStrSum", TEST_CATEGORY_MULT LABEL_MIXED_DEPLOY_TAG
 
             // must use (and ergo make) an identically-deployed workspace
             Qureg workspace = createCloneQureg(qureg);
-            postMultiplyPauliStrSum(qureg, sum, workspace);
+            rightapplyPauliStrSum(qureg, sum, workspace);
             destroyQureg(workspace);
 
             ref = ref * getMatrix(sum, numQubits);
diff --git a/tests/utils/evolve.cpp b/tests/utils/evolve.cpp
index 38b43cc9c..6c8729efb 100644
--- a/tests/utils/evolve.cpp
+++ b/tests/utils/evolve.cpp
@@ -166,21 +166,21 @@ void applyReferenceOperator(qmatrix& state, qmatrix matrix) {
     state = matrix * state * getConjugateTranspose(matrix);
 }
 
-void multiplyReferenceOperator(qvector& state, qmatrix matrix) {
+void leftapplyReferenceOperator(qvector& state, qmatrix matrix) {
     DEMAND( state.size() == matrix.size() );
 
     // for statevectors, multiplying is the same as applying
     applyReferenceOperator(state, matrix);
 }
 
-void multiplyReferenceOperator(qmatrix& state, qmatrix matrix) {
+void leftapplyReferenceOperator(qmatrix& state, qmatrix matrix) {
     DEMAND( state.size() == matrix.size() );
 
     // we left-multiply upon density matrices only
     state = matrix * state;
 }
 
-void postMultiplyReferenceOperator(qmatrix& state, qmatrix matrix) {
+void rightapplyReferenceOperator(qmatrix& state, qmatrix matrix) {
     DEMAND( state.size() == matrix.size() );
 
     // we right-multiply upon density matrices only
@@ -202,21 +202,21 @@ void applyReferenceOperator(qmatrix& state, vector<int> ctrls, vector<int> ctrlS
     applyReferenceOperator(state, fullOp);
 }
 
-void multiplyReferenceOperator(qvector& state, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, qmatrix matrix) {
+void leftapplyReferenceOperator(qvector& state, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, qmatrix matrix) {
     
     applyReferenceOperator(state, ctrls, ctrlStates, targs, matrix);
 }
 
-void multiplyReferenceOperator(qmatrix& state, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, qmatrix matrix) {
+void leftapplyReferenceOperator(qmatrix& state, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, qmatrix matrix) {
     
     qmatrix left = getFullStateOperator(ctrls, ctrlStates, targs, matrix, getLog2(state.size()));
-    multiplyReferenceOperator(state, left);
+    leftapplyReferenceOperator(state, left);
 }
 
-void postMultiplyReferenceOperator(qmatrix& state, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, qmatrix matrix) {
+void rightapplyReferenceOperator(qmatrix& state, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, qmatrix matrix) {
     
     qmatrix left = getFullStateOperator(ctrls, ctrlStates, targs, matrix, getLog2(state.size()));
-    postMultiplyReferenceOperator(state, left);
+    rightapplyReferenceOperator(state, left);
 }
 
 
@@ -230,17 +230,17 @@ void applyReferenceOperator(qmatrix& state, vector<int> ctrls, vector<int> targs
     
     applyReferenceOperator(state, ctrls, {}, targs, matrix);
 }
-void multiplyReferenceOperator(qvector& state, vector<int> ctrls, vector<int> targs, qmatrix matrix) {
+void leftapplyReferenceOperator(qvector& state, vector<int> ctrls, vector<int> targs, qmatrix matrix) {
     
-    multiplyReferenceOperator(state, ctrls, {}, targs, matrix);
+    leftapplyReferenceOperator(state, ctrls, {}, targs, matrix);
 }
-void multiplyReferenceOperator(qmatrix& state, vector<int> ctrls, vector<int> targs, qmatrix matrix) {
+void leftapplyReferenceOperator(qmatrix& state, vector<int> ctrls, vector<int> targs, qmatrix matrix) {
     
-    multiplyReferenceOperator(state, ctrls, {}, targs, matrix);
+    leftapplyReferenceOperator(state, ctrls, {}, targs, matrix);
 }
-void postMultiplyReferenceOperator(qmatrix& state, vector<int> ctrls, vector<int> targs, qmatrix matrix) {
+void rightapplyReferenceOperator(qmatrix& state, vector<int> ctrls, vector<int> targs, qmatrix matrix) {
     
-    postMultiplyReferenceOperator(state, ctrls, {}, targs, matrix);
+    rightapplyReferenceOperator(state, ctrls, {}, targs, matrix);
 }
 
 
@@ -254,17 +254,17 @@ void applyReferenceOperator(qmatrix& state, vector<int> targs, qmatrix matrix) {
 
     applyReferenceOperator(state, {}, {}, targs, matrix);
 }
-void multiplyReferenceOperator(qvector& state, vector<int> targs, qmatrix matrix) {
+void leftapplyReferenceOperator(qvector& state, vector<int> targs, qmatrix matrix) {
 
-    multiplyReferenceOperator(state, {}, {}, targs, matrix);
+    leftapplyReferenceOperator(state, {}, {}, targs, matrix);
 }
-void multiplyReferenceOperator(qmatrix& state, vector<int> targs, qmatrix matrix) {
+void leftapplyReferenceOperator(qmatrix& state, vector<int> targs, qmatrix matrix) {
 
-    multiplyReferenceOperator(state, {}, {}, targs, matrix);
+    leftapplyReferenceOperator(state, {}, {}, targs, matrix);
 }
-void postMultiplyReferenceOperator(qmatrix& state, vector<int> targs, qmatrix matrix) {
+void rightapplyReferenceOperator(qmatrix& state, vector<int> targs, qmatrix matrix) {
 
-    postMultiplyReferenceOperator(state, {}, {}, targs, matrix);
+    rightapplyReferenceOperator(state, {}, {}, targs, matrix);
 }
 
 
diff --git a/tests/utils/evolve.hpp b/tests/utils/evolve.hpp
index 876130167..75c0db4c1 100644
--- a/tests/utils/evolve.hpp
+++ b/tests/utils/evolve.hpp
@@ -21,29 +21,29 @@
 using std::vector;
 
 
-void applyReferenceOperator(       qvector& state, vector<int> ctrls, vector<int> states, vector<int> targs, qmatrix matrix);
-void applyReferenceOperator(       qmatrix& state, vector<int> ctrls, vector<int> states, vector<int> targs, qmatrix matrix);
-void multiplyReferenceOperator(    qvector& state, vector<int> ctrls, vector<int> states, vector<int> targs, qmatrix matrix);
-void multiplyReferenceOperator(    qmatrix& state, vector<int> ctrls, vector<int> states, vector<int> targs, qmatrix matrix);
-void postMultiplyReferenceOperator(qmatrix& state, vector<int> ctrls, vector<int> states, vector<int> targs, qmatrix matrix);
-
-void applyReferenceOperator(       qvector& state, vector<int> ctrls, vector<int> targs, qmatrix matrix);
-void applyReferenceOperator(       qmatrix& state, vector<int> ctrls, vector<int> targs, qmatrix matrix);
-void multiplyReferenceOperator(    qvector& state, vector<int> ctrls, vector<int> targs, qmatrix matrix);
-void multiplyReferenceOperator(    qmatrix& state, vector<int> ctrls, vector<int> targs, qmatrix matrix);
-void postMultiplyReferenceOperator(qmatrix& state, vector<int> ctrls, vector<int> targs, qmatrix matrix);
-
-void applyReferenceOperator(       qvector& state, vector<int> targs, qmatrix matrix);
-void applyReferenceOperator(       qmatrix& state, vector<int> targs, qmatrix matrix);
-void multiplyReferenceOperator(    qvector& state, vector<int> targs, qmatrix matrix);
-void multiplyReferenceOperator(    qmatrix& state, vector<int> targs, qmatrix matrix);
-void postMultiplyReferenceOperator(qmatrix& state, vector<int> targs, qmatrix matrix);
-
-void applyReferenceOperator(       qvector& state, qmatrix matrix);
-void applyReferenceOperator(       qmatrix& state, qmatrix matrix);
-void multiplyReferenceOperator(    qvector& state, qmatrix matrix);
-void multiplyReferenceOperator(    qmatrix& state, qmatrix matrix);
-void postMultiplyReferenceOperator(qmatrix& state, qmatrix matrix);
+void applyReferenceOperator     (qvector& state, vector<int> ctrls, vector<int> states, vector<int> targs, qmatrix matrix);
+void applyReferenceOperator     (qmatrix& state, vector<int> ctrls, vector<int> states, vector<int> targs, qmatrix matrix);
+void leftapplyReferenceOperator (qvector& state, vector<int> ctrls, vector<int> states, vector<int> targs, qmatrix matrix);
+void leftapplyReferenceOperator (qmatrix& state, vector<int> ctrls, vector<int> states, vector<int> targs, qmatrix matrix);
+void rightapplyReferenceOperator(qmatrix& state, vector<int> ctrls, vector<int> states, vector<int> targs, qmatrix matrix);
+
+void applyReferenceOperator     (qvector& state, vector<int> ctrls, vector<int> targs, qmatrix matrix);
+void applyReferenceOperator     (qmatrix& state, vector<int> ctrls, vector<int> targs, qmatrix matrix);
+void leftapplyReferenceOperator (qvector& state, vector<int> ctrls, vector<int> targs, qmatrix matrix);
+void leftapplyReferenceOperator (qmatrix& state, vector<int> ctrls, vector<int> targs, qmatrix matrix);
+void rightapplyReferenceOperator(qmatrix& state, vector<int> ctrls, vector<int> targs, qmatrix matrix);
+
+void applyReferenceOperator     (qvector& state, vector<int> targs, qmatrix matrix);
+void applyReferenceOperator     (qmatrix& state, vector<int> targs, qmatrix matrix);
+void leftapplyReferenceOperator (qvector& state, vector<int> targs, qmatrix matrix);
+void leftapplyReferenceOperator (qmatrix& state, vector<int> targs, qmatrix matrix);
+void rightapplyReferenceOperator(qmatrix& state, vector<int> targs, qmatrix matrix);
+
+void applyReferenceOperator     (qvector& state, qmatrix matrix);
+void applyReferenceOperator     (qmatrix& state, qmatrix matrix);
+void leftapplyReferenceOperator (qvector& state, qmatrix matrix);
+void leftapplyReferenceOperator (qmatrix& state, qmatrix matrix);
+void rightapplyReferenceOperator(qmatrix& state, qmatrix matrix);
 
 void applyReferenceOperator(qmatrix& state, vector<int> targs, vector<qmatrix> matrices);
 

From 6eec1cf662ba0d6f35acf059f7fc947ac7d1fa26 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Tue, 19 Aug 2025 17:57:26 +0200
Subject: [PATCH 18/32] added Trotterised time evolution (#674)

Specifically:
- applyTrotterizedUnitaryTimeEvolution()
- applyTrotterizedImaginaryTimeEvolution()
- applyTrotterizedNoisyTimeEvolution()
where the latter has significant novelty.

PR also
- (patch) made imaginary-time evolution assert Hermiticity (315ea41)
- (patch) patched non-unitary Trotter on density matrix (01c51e1)
- updated dynamics examples to use these new time-evol functions
- tidied some Pauli algebra (replacing paulis_hasOddNumY calls with direct paulis_getSignOfPauliStrConj)
- made createPauliStrSum validate it can fit in RAM
---
 examples/extended/dynamics.c     |   7 +-
 examples/extended/dynamics.cpp   |   7 +-
 quest/include/trotterisation.h   | 532 +++++++++++++++++++++++++------
 quest/src/api/operations.cpp     |  15 +-
 quest/src/api/paulis.cpp         | 174 ++++++++--
 quest/src/api/trotterisation.cpp | 226 +++++++++++--
 quest/src/core/errors.cpp        |  36 +++
 quest/src/core/errors.hpp        |  18 ++
 quest/src/core/localiser.cpp     |  10 +-
 quest/src/core/memory.cpp        |   9 +
 quest/src/core/memory.hpp        |   2 +
 quest/src/core/utilities.cpp     |  35 ++
 quest/src/core/utilities.hpp     |   5 +
 quest/src/core/validation.cpp    | 110 ++++++-
 quest/src/core/validation.hpp    |  18 ++
 tests/unit/paulis.cpp            |   6 +
 tests/unit/trotterisation.cpp    |   6 +
 utils/docs/Doxyfile              |  10 +-
 18 files changed, 1064 insertions(+), 162 deletions(-)

diff --git a/examples/extended/dynamics.c b/examples/extended/dynamics.c
index 8b94e0a7d..0d6763c1d 100644
--- a/examples/extended/dynamics.c
+++ b/examples/extended/dynamics.c
@@ -1,6 +1,5 @@
 /** @file
- * An example of using QuEST (primarily function
- * applyTrotterizedPauliStrSumGadget()) to perform
+ * An example of using QuEST to perform closed
  * dynamical simulation via Trotterisation of the
  * unitary-time evolution operator.
  * 
@@ -158,7 +157,7 @@ int main() {
     for (int i=0; i<steps; i++) {
 
         // evolve qureg under (approx) exp(-i dt H)
-        applyTrotterizedPauliStrSumGadget(qureg, hamil, -dt, order, reps);
+        applyTrotterizedUnitaryTimeEvolution(qureg, hamil, dt, order, reps);
 
         // calculate and report <O>
         qreal time = dt * (i+1);
@@ -188,7 +187,7 @@ int main() {
 
     // verify results by uninterrupted higher-order simulation to target time
     initPlusState(qureg);
-    applyTrotterizedPauliStrSumGadget(qureg, hamil, -dt*steps, order+2, reps*steps);
+    applyTrotterizedUnitaryTimeEvolution(qureg, hamil, dt*steps, order+2, reps*steps);
     reportScalar("final <O>", calcExpecPauliStrSum(qureg, observ));
 
     // clean up
diff --git a/examples/extended/dynamics.cpp b/examples/extended/dynamics.cpp
index ae232e348..95245fb87 100644
--- a/examples/extended/dynamics.cpp
+++ b/examples/extended/dynamics.cpp
@@ -1,6 +1,5 @@
 /** @file
- * An example of using QuEST (primarily function
- * applyTrotterizedPauliStrSumGadget()) to perform
+ * An example of using QuEST to perform closed
  * dynamical simulation via Trotterisation of the
  * unitary-time evolution operator.
  * 
@@ -155,7 +154,7 @@ int main() {
     for (int i=0; i<steps; i++) {
 
         // evolve qureg under (approx) exp(-i dt H)
-        applyTrotterizedPauliStrSumGadget(qureg, hamil, -dt, order, reps);
+        applyTrotterizedUnitaryTimeEvolution(qureg, hamil, dt, order, reps);
 
         // calculate and report <O>
         qreal time = dt * (i+1);
@@ -182,7 +181,7 @@ int main() {
 
     // verify results by uninterrupted higher-order simulation to target time
     initPlusState(qureg);
-    applyTrotterizedPauliStrSumGadget(qureg, hamil, -dt*steps, order+2, reps*steps);
+    applyTrotterizedUnitaryTimeEvolution(qureg, hamil, dt*steps, order+2, reps*steps);
     reportScalar("final <O>", calcExpecPauliStrSum(qureg, observ));
 
     // clean up
diff --git a/quest/include/trotterisation.h b/quest/include/trotterisation.h
index 6c16e3d9a..ca234c5d5 100644
--- a/quest/include/trotterisation.h
+++ b/quest/include/trotterisation.h
@@ -38,15 +38,16 @@ extern "C" {
 
 /** @notyettested
  * 
- * Effects (an approximation to) the exponential of @p sum, weighted by @p angle, upon @p qureg,
+ * Effects an approximation to the exponential of @p sum, weighted by @p angle times @f$ \iu @f$, upon @p qureg,
  * via the symmetrized Trotter-Suzuki decomposition (<a href="https://arxiv.org/abs/math-ph/0506007">arXiv</a>).
  * Increasing @p reps (the number of Trotter repetitions) or @p order (an even, positive integer or one) 
- * improves the accuracy of the approximation (reducing the "Trotter error" due to non-commuting 
- * terms of @p sum), though increases the runtime linearly and exponentially respectively.
+ * improves the accuracy of the approximation by reducing the "Trotter error" due to non-commuting 
+ * terms of @p sum, though increases the runtime linearly and exponentially respectively.
  * 
  * @formulae 
  * 
- * Let @f$ \hat{H} = @f$ @p sum and @f$ \theta = @f$ @p angle. This function approximates the action of
+ * Let @f$ \hat{H} = @f$ @p sum and @f$ \theta = @f$ @p angle @f$ \in \mathbb{R} @f$. This function approximates 
+ * the action of
  * @f[
       \exp \left(\iu \, \theta \, \hat{H} \right)
  * @f]
@@ -54,7 +55,7 @@ extern "C" {
  * Simulation is exact, regardless of @p order or @p reps, only when all terms in @p sum commute.
  * 
  * @important
- *   Note that @f$ \theta @f$ lacks the @f$ -\frac{1}{2} @f$ prefactor present in other functions like
+ *   Observe that @f$ \theta @f$ lacks the @f$ -\frac{1}{2} @f$ prefactor present in other functions like
  *   applyPauliGadget().
  * 
  * To be precise, let @f$ r = @f$ @p reps and assume @p sum is composed of
@@ -64,7 +65,8 @@ extern "C" {
  * @f]
  * where @f$ c_j @f$ is the coefficient of the @f$ j @f$-th PauliStr @f$ \hat{\sigma}_j @f$.
  * 
- * - When @p order=1, this function performs first-order Trotterisation, whereby
+ * - When @p order=1, this function performs first-order Trotterisation, where the terms of @p sum
+ *   are effected in a repeated, arbitrary but fixed order.
  *   @f[
        \exp(\iu \, \theta \, \hat{H} )
           \approx 
@@ -72,7 +74,9 @@ extern "C" {
         \prod\limits_{j=1}^{T} 
         \exp \left( \iu \, \frac{\theta \, c_j}{r} \, \hat\sigma_j \right).
  *   @f]
- * - When @p order=2, this function performs the lowest order "symmetrized" Suzuki decomposition, whereby 
+ *
+ * - When @p order=2, this function performs the lowest order "symmetrized" Suzuki decomposition, whereby
+ *   each repetition effects the terms of @p sum forward then in reverse.
  *   @f[
        \exp(\iu \, \theta \, \hat{H} )
           \approx 
@@ -81,8 +85,11 @@ extern "C" {
               \prod\limits_{j=T}^{1} \exp \left( \iu \frac{\theta \, c_j}{2 \, r}  \hat\sigma_j \right)
          \right].
  *   @f]
+ *
  * - Greater, even values of @p order (denoted by symbol @f$ n @f$) invoke higher-order symmetrized decompositions 
- *   @f$ S[\theta,n,r] @f$. Letting @f$ p = \left( 4 - 4^{1/(n-1)} \right)^{-1} @f$, these satisfy
+ *   @f$ S[\theta,n,r] @f$. These see the lower order Trotter circuits repeated twice forward, then reversed, then 
+ *   twice forward again, recursively. To be precise, letting @f$ p = \left( 4 - 4^{1/(n-1)} \right)^{-1} @f$, these
+ *   satisfy
  *   @f{align*}
         S[\theta, n, 1] &= 
             \left( \prod\limits^2 S[p \, \theta, n-2, 1] \right)
@@ -98,41 +105,43 @@ extern "C" {
  * 
  * @equivalences
  * 
- * - Time evolution of duration @f$ t @f$ under a time-independent Hamiltonian @p sum = @f$ \hat{H} @f$, as
- *   per the unitary time evolution operator
+ * - By passing @f$ \theta = - \Delta t / \hbar @f$, this function approximates unitary time evolution of a closed 
+ *   system under the time-independent Hamiltonian @p sum = @f$ \hat{H} @f$ over a duration of @f$ \Delta t @f$, as
+ *   described by the propagator
  *   @f[
-        \hat{U}(t) = \exp(- \iu \, t  \,\hat{H} \, / \, \hbar) 
+        \hat{U}(\Delta t) = \exp(- \iu \, \Delta t  \,\hat{H} \, / \, \hbar),
  *   @f]
- *   is approximated via @f$ \theta = - t / \hbar @f$.
- *   ```
-     qreal time = 3.14;
-     qreal angle = - time / hbar;
-     applyTrotterizedPauliStrSumGadget(qureg, sum, angle, order, reps);
- *   ```
+ *   as utilised by the function applyTrotterizedUnitaryTimeEvolution().
+ * 
  * - This function is equivalent to applyNonUnitaryTrotterizedPauliStrSumGadget() when passing
  *   a @p qcomp instance with a zero imaginary component as the @p angle parameter. This latter 
  *   function is useful for generalising dynamical simulation to imaginary-time evolution.
  * 
  * @constraints
+ * 
  * - Unitarity of the prescribed exponential(s) requires that @p sum is Hermitian, ergo containing
  *   only real coefficients. Validation will check that @p sum is approximately Hermitian, permitting
  *   coefficients with imaginary components smaller (in magnitude) than epsilon.
  *   @f[ 
-        \max\limits_{i} \Big|c_i| \le \valeps
+        \max\limits_{i} \left| \mathrm{Im}(c_i) \right| \le \valeps
  *   @f]
  *   where the validation epsilon @f$ \valeps @f$ can be adjusted with setValidationEpsilon().
  *   Otherwise, use applyNonUnitaryTrotterizedPauliStrSumGadget() to permit non-Hermitian @p sum
  *   and ergo effect a non-unitary exponential(s). 
- * - The @p angle parameter is necessarily real despite the validation epsilon, but can be relaxed
- *   to an arbitrary complex scalar using applyNonUnitaryTrotterizedPauliStrSumGadget().
+ * 
+ * - The @p angle parameter is necessarily real to retain unitarity, but can be relaxed to an arbitrary 
+ *   complex scalar (i.e. a @p qcomp) using applyNonUnitaryTrotterizedPauliStrSumGadget(). This permits
+ *   cancelling the complex unit @f$ i @f$ to effect non-unitary @f$ \exp(\theta \, \hat{H}) @f$ as
+ *   is useful for imaginary-time evolution.
+ * 
  * - This function only ever effects @f$ \exp \left(\iu \, \theta \, \hat{H} \right) @f$ exactly
- *   when all PauliStr in @p sum = @f$ \hat{H} @f$ commute. 
+ *   when all PauliStr in @p sum = @f$ \hat{H} @f$ commute, or @p reps @f$ \rightarrow \infty @f$.
  * 
  * @param[in,out] qureg  the state to modify.
  * @param[in]     sum    a weighted sum of Pauli strings to approximately exponentiate.
- * @param[in]     angle  an effective prefactor of @p sum in the exponent.
- * @param[in]     order  the order of the Trotter-Suzuki decomposition (e.g. @p 1, @p 2, @p 4, ...)
- * @param[in]     reps   the number of Trotter repetitions
+ * @param[in]     angle  the prefactor of @p sum times @f$ i @f$ in the exponent.
+ * @param[in]     order  the order of the Trotter-Suzuki decomposition (e.g. @p 1, @p 2, @p 4, ...).
+ * @param[in]     reps   the number of Trotter repetitions.
  * 
  * @throws @validationerror
  * - if @p qureg or @p sum are uninitialised.
@@ -144,6 +153,7 @@ extern "C" {
  * @see
  *  - applyPauliGadget()
  *  - applyNonUnitaryTrotterizedPauliStrSumGadget()
+ *  - applyTrotterizedUnitaryTimeEvolution()
  * 
  * @author Tyson Jones
  */
@@ -176,92 +186,43 @@ void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* con
 
 /** @notyettested
  * 
- * A generalisation of applyTrotterizedPauliStrSumGadget() which accepts a complex angle and permits
+ * A generalisation of applyTrotterizedPauliStrSumGadget() which accepts a complex @p angle and permits
  * @p sum to be non-Hermitian, thereby effecting a potentially non-unitary and non-CPTP operation.
  * 
  * @formulae 
  * 
- * Let @f$ \hat{H} = @f$ @p sum and @f$ \theta = @f$ @p angle. This function approximates the action of
+ * Let @f$ \hat{H} = @f$ @p sum and @f$ \theta = @f$ @p angle @f$ \in \mathbb{C} @f$. This function 
+ * approximates the action of
  * @f[
       \exp \left(\iu \, \theta \, \hat{H} \right)
  * @f]
  * via a Trotter-Suzuki decomposition of the specified @p order and number of repetitions (@p reps). 
  * 
- * See applyTrotterizedPauliStrSumGadget() for more information about the decomposition.
+ * > See applyTrotterizedPauliStrSumGadget() for more information about the decomposition.
  *
  * @equivalences
  * 
- * - When @p angle is set to @f$ \theta = \iu \, \tau @f$ and @p sum = @f$ \hat{H} @f$ is Hermitian,
- *   this function (approximately) evolves @p qureg in imaginary-time. That is, letting 
- *   @f$ \hat{U}(t) = \exp(-\iu \, t \, \hat{H}) @f$ be the normalised unitary evolution operator, this 
- *   function effects the imaginary-time operator
-     @f[
-        \hat{V}(\tau) = \hat{U}(t=-\iu \tau) = \exp(- \tau \hat{H}).
- *   @f]
- *   This operation drives the system toward the (unnormalised) groundstate.
- *   Let @f$ \{ \ket{\phi_i} \} @f$ and @f$ \{ \ket{\lambda_i} \} @f$ be the eigenstates and respective
- *   eigenvalues of @f$ \hat{H} @f$, which are real due to Hermiticity.
- *   @f[
-         \hat{H} = \sum \limits_i \lambda_i \ket{\phi_i}\bra{\phi_i},
-         \;\;\;\;\; \lambda_i \in \mathbb{R}.
- *   @f]
- *   
- *   - When @p qureg is a statevector @f$ \svpsi @f$ and can ergo be expressed in the basis of 
- *     @f$ \{ \ket{\phi_i} \} @f$ as @f$ \svpsi = \sum_i \alpha_i \ket{\phi_i} @f$, 
- *     this function approximates
- *     @f[
-          \svpsi \, \rightarrow  \, \hat{V}(\tau) \svpsi =
-          \sum\limits_i \alpha_i \exp(- \tau \, \lambda_i) \ket{\phi_i}.
- *     @f]
- *   - When @p qureg is a density matrix and is ergo expressible as
- *     @f$ \dmrho = \sum\limits_{ij} \alpha_{ij} \ket{\phi_i}\bra{\phi_j} @f$, this function effects
- *     @f[
-          \dmrho \, \rightarrow \, \hat{V}(\tau) \dmrho \hat{V}(\tau)^\dagger =
-          \sum\limits_{ij} \alpha_{ij} \exp(-\tau (\lambda_i + \lambda_j)) \ket{\phi_i}\bra{\phi_j}.
- *     @f]
- *
- *   As @f$ \tau \rightarrow \infty @f$, the resulting unnormalised state approaches statevector
- *   @f$ \svpsi \rightarrow \alpha_0 \exp(-\tau \lambda_0) \ket{\phi_0} @f$ or density matrix
- *   @f$ \dmrho \rightarrow \alpha_{0,0} \exp(-2 \tau \lambda_0) \ket{\phi_0}\bra{\phi_0} @f$,
- *   where @f$ \lambda_0 @f$ is the minimum eigenvalue and @f$ \ket{\phi_0} @f$ is the groundstate.
- *   Assuming the initial overlap @f$ \alpha_0 @f$ is not zero (or exponentially tiny), 
- *   subsequent renormalisation via setQuregToRenormalized() produces the pure 
- *   ground-state @f$ \ket{\phi_0} @f$.
- *
- *   ```
-     // pray for a non-zero initial overlap
-     initRandomPureState(qureg); // works even for density matrices
-
-     // minimize then renormalise
-     qreal tau = 10; // impatient infinity
-     int order = 4;
-     int reps = 100;
-     applyNonUnitaryTrotterizedPauliStrSumGadget(qureg, hamil, tau * 1i, order, reps);
-     setQuregToRenormalized(qureg);
-
-     // ground-state (phi_0)
-     reportQureg(qureg);
-
-     // lowest lying eigenvalue (lambda_0)
-     qreal expec = calcExpecPauliStrSum(qureg, hamil);
-     reportScalar("expec", expec);
- *   ```
- *
- *   Note degenerate eigenvalues will yield a pure superposition of the corresponding eigenstates, with 
- *   coefficients informed by the initial, relative populations.
+ * - When @p angle is set to @f$ \theta = \iu \, \Delta \tau @f$ and @p sum = @f$ \hat{H} @f$ is Hermitian,
+ *  this function (approximately) evolves @p qureg in imaginary-time for duration @f$ \Delta \tau @f$,
+ *  effecting non-unitary propagator
+    @f[
+        \exp(- \Delta \tau \hat{H})
+ *  @f]
+ *  as utilised by applyTrotterizedImaginaryTimeEvolution().
  * 
- * - When @p angle is real and @p sum is Hermitian (has approximately real coefficients), this
- *   function is equivalent to applyTrotterizedPauliStrSumGadget()
+ * - When @p angle is real and @p sum is Hermitian (i.e. has approximately real coefficients), the effected
+ *   operation is unitary and this function becomes equivalent to applyTrotterizedPauliStrSumGadget().
  * 
  * @constraints
+ * 
  * - This function only ever effects @f$ \exp \left(\iu \, \theta \, \hat{H} \right) @f$ exactly
  *   when all PauliStr in @p sum = @f$ \hat{H} @f$ commute. 
  * 
  * @param[in,out] qureg  the state to modify.
  * @param[in]     sum    a weighted sum of Pauli strings to approximately exponentiate.
  * @param[in]     angle  an effective prefactor of @p sum in the exponent.
- * @param[in]     order  the order of the Trotter-Suzuki decomposition (e.g. @p 1, @p 2, @p 4, ...)
- * @param[in]     reps   the number of Trotter repetitions
+ * @param[in]     order  the order of the Trotter-Suzuki decomposition (e.g. @p 1, @p 2, @p 4, ...).
+ * @param[in]     reps   the number of Trotter repetitions.
  * 
  * @throws @validationerror
  * - if @p qureg or @p sum are uninitialised.
@@ -304,6 +265,399 @@ void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, std::vec
 
 
 
+/** 
+ * @defgroup trotter_timeevol Time evolution
+ * @brief Functions for approximate dynamical simulation.
+ * @{
+ */
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** @notyettested
+ * 
+ * Unitarily time evolves @p qureg for the duration @p time under the time-independent Hamiltonian @p hamil, 
+ * as approximated by symmetrized Trotterisation of the specified @p order and number of cycles @p reps. 
+ * 
+ * @formulae 
+ * 
+ * Let @f$ \hat{H} = @f$ @p hamil and @f$ t = @f$ @p time @f$ \in \mathbb{R} @f$. This function approximates 
+ * the action of the unitary time-evolution operator (propagator)
+ * @f[
+      \hat{U}(t) = \exp \left(- \iu \, t \, \hat{H} \right),
+ * @f]
+ * which solves the Schrödinger equation of a time-independent Hamiltonian. When @p qureg is a statevector @f$ \svpsi @f$, the 
+ * resulting state approximates
+ * @f[
+      \approx U(t) \svpsi
+ * @f]
+ * while when @p qureg is a density matrix @f$ \dmrho @f$, the result approximates
+ * @f[
+      \approx U(t) \, \dmrho \, U(t)^\dagger.
+ * @f]
+ *
+ * > See applyTrotterizedPauliStrSumGadget() for information about the Trotter method.
+ * 
+ * @equivalences
+ * 
+ * - This function merely wraps applyTrotterizedPauliStrSumGadget() which effects @f$ \exp(\iu \theta \hat{H}) @f$,
+ *   passing @f$ \theta = - t @f$.
+ * 
+ * @constraints
+ * 
+ * - Unitarity requires that @p hamil is Hermitian and ergo contains only real coefficients. Validation will check that 
+ *   @p hamil is approximately Hermitian, permitting coefficients with imaginary components smaller (in magnitude) than 
+ *   epsilon.
+ *   @f[ 
+        \max\limits_{i} \left| \mathrm{Im}(c_i) \right| \le \valeps
+ *   @f]
+ *   where @f$ c_i @f$ are the coefficients of @p hamil, and the validation epsilon @f$ \valeps @f$ can be adjusted with setValidationEpsilon(). The imaginary components
+ *   of the Hamiltonian _are_ considered during simulation.
+ * 
+ * - The @p time parameter is necessarily real to retain unitarity. It can be substituted for a strictly imaginary
+ *   scalar to perform imaginary-time evolution (as per Wick rotation @f$ t \rightarrow - \iu \tau @f$) via 
+ *   applyTrotterizedImaginaryTimeEvolution(), or generalised to an arbitrary complex number through direct use of 
+ *   applyNonUnitaryTrotterizedPauliStrSumGadget().
+ * 
+ * - The simulated system is _closed_ with dynamics described fully by the Hamiltonian @p hamil. Open or otherwise noisy
+ *   system dynamics can be simulated with applyTrotterizedNoisyTimeEvolution().
+ * 
+ * - Simulation is exact such that the effected operation is precisely @f$ \exp(-\iu t \hat{H}) @f$ only when 
+ *   @p reps @f$ \rightarrow \infty @f$ or all terms in @p hamil commute with one another. Conveniently, Trotter error
+ *   does _not_ break normalisation of the state since the approximating circuit remains unitary.
+ * 
+ * @myexample
+ * 
+ *   ```
+     Qureg qureg = createDensityQureg(10);
+     PauliStrSum hamil =  createInlinePauliStrSum(R"(
+         1   ZZI
+         2   IZZ
+         3   ZIZ
+         1.5 XII
+         2.5 IXI
+         3.5 IIX
+     )");
+
+     qreal time = 0.8 * hbar;
+     int order = 4;
+     int reps = 20;
+     applyTrotterizedUnitaryTimeEvolution(qureg, hamil, time, order, reps);
+ *   ```
+ *
+ * @see
+ *  - applyTrotterizedImaginaryTimeEvolution()
+ *  - applyTrotterizedNoisyTimeEvolution()
+ *  - applyNonUnitaryTrotterizedPauliStrSumGadget()
+ * 
+ * @param[in,out] qureg  the state to modify.
+ * @param[in]     hamil  the Hamiltonian as a weighted sum of Pauli strings.
+ * @param[in]     time   the duration over which to simulate evolution.
+ * @param[in]     order  the order of the Trotter-Suzuki decomposition (e.g. @p 1, @p 2, @p 4, ...).
+ * @param[in]     reps   the number of Trotter repetitions.
+ * 
+ * @throws @validationerror
+ * - if @p qureg or @p hamil are uninitialised.
+ * - if @p hamil contains non-identities on qubits beyond the size of @p qureg.
+ * - if @p hamil is not approximately Hermitian.
+ * - if @p order is not 1 nor a positive, @b even integer.
+ * - if @p reps is not a positive integer.
+ * 
+ * @author Tyson Jones
+ */
+void applyTrotterizedUnitaryTimeEvolution(Qureg qureg, PauliStrSum hamil, qreal time, int order, int reps);
+
+
+/** @notyettested
+ * 
+ * Simulates imaginary-time evolution of @p qureg for the duration @p tau under the time-independent 
+ * Hamiltonian @p hamil, as approximated by symmetrized Trotterisation of the specified @p order and
+ * number of cycles @p reps. 
+ * 
+ * > [!IMPORTANT]
+ * > This is a non-physical operation which breaks the normalisation of the state; it can be restored
+ * > via setQuregToRenormalized().
+ * 
+ * @formulae 
+ * 
+ * Let @f$ \hat{H} = @f$ @p hamil and @f$ \tau = @f$ @p tau @f$ \in \mathbb{R} @f$. This function 
+ * approximates the action of the non-unitary imaginary-time propagator
+ * @f[
+      \hat{V}(\tau) = \exp \left(- \tau \, \hat{H} \right),
+ * @f]
+ * as prescribed by Wick rotating (substituting time @f$ t @f$ for @f$ t \rightarrow -\iu \tau @f$)
+ * the Schrödinger equation of a time-independent Hamiltonian. When @p qureg is a statevector @f$ \svpsi @f$, the 
+ * resulting state approximates
+ * @f[
+      \approx V(\tau) \svpsi
+ * @f]
+ * while when @p qureg is a density matrix @f$ \dmrho @f$, the result approximates
+ * @f[
+      \approx V(\tau) \, \dmrho \, V(\tau)^\dagger.
+ * @f]
+ *
+ * > See applyTrotterizedPauliStrSumGadget() for information about the Trotter method.
+ * 
+ * @par Utility
+ * 
+ * Imaginary-time evolution drives the system toward the (unnormalised) groundstate of the Hamiltonian.
+ * Let @f$ \{ \ket{\phi_i} \} @f$ and @f$ \{ \lambda_i \} @f$ be the eigenstates and respective
+ * eigenvalues of @f$ \hat{H} @f$, which are real due to Hermiticity.
+ * @f[
+    \hat{H} = \sum \limits_i \lambda_i \ket{\phi_i}\bra{\phi_i},
+    \;\;\;\;\; \lambda_i \in \mathbb{R}.
+ * @f]
+ *
+ * - When @p qureg is a statevector @f$ \svpsi @f$ and can ergo be expressed in the basis of 
+ *   @f$ \{ \ket{\phi_i} \} @f$ as @f$ \svpsi = \sum_i \alpha_i \ket{\phi_i} @f$, 
+ *   this function approximates
+ *   @f[
+        \svpsi \, \rightarrow  \, \hat{V}(\tau) \svpsi =
+        \sum\limits_i \alpha_i \exp(- \tau \, \lambda_i) \ket{\phi_i}.
+ *   @f]
+ * - When @p qureg is a density matrix and is ergo expressible as
+ *   @f$ \dmrho = \sum\limits_{ij} \alpha_{ij} \ket{\phi_i}\bra{\phi_j} @f$, this function effects
+ *   @f[
+        \dmrho \, \rightarrow \, \hat{V}(\tau) \dmrho \hat{V}(\tau)^\dagger =
+        \sum\limits_{ij} \alpha_{ij} \exp(-\tau (\lambda_i + \lambda_j)) \ket{\phi_i}\bra{\phi_j}.
+ *   @f]
+ *
+ * As @f$ \tau \rightarrow \infty @f$, the resulting unnormalised state approaches statevector
+ * @f$ \svpsi \rightarrow \alpha_0 \exp(-\tau \lambda_0) \ket{\phi_0} @f$ or density matrix
+ * @f$ \dmrho \rightarrow \alpha_{0,0} \exp(-2 \tau \lambda_0) \ket{\phi_0}\bra{\phi_0} @f$,
+ * where @f$ \lambda_0 @f$ is the minimum eigenvalue and @f$ \ket{\phi_0} @f$ is the groundstate.
+ * Assuming the initial overlap @f$ \alpha_0 @f$ is not zero (or exponentially tiny), 
+ * subsequent renormalisation via setQuregToRenormalized() produces the pure 
+ * ground-state @f$ \ket{\phi_0} @f$ or @f$ \ket{\phi_0}\bra{\phi_0} @f$.
+ * 
+ * Note degenerate minimum eigenvalues will yield a pure superposition of the corresponding 
+ * eigenstates, with coefficients informed by the initial, relative populations.
+ * 
+ * @equivalences
+ * 
+ * - This function merely wraps applyNonUnitaryTrotterizedPauliStrSumGadget() which effects @f$ \exp(\iu \theta \hat{H}) @f$,
+ *   passing @f$ \theta = \tau \iu @f$.
+ * 
+ * @constraints
+ * 
+ * - While the process of imaginary-time evolution is non-unitary (and non-physical), Hermiticity of @p hamil is still
+ *   assumed, requiring it contains only real coefficients. Validation will check that @p hamil is _approximately_ Hermitian, 
+ *   permitting coefficients with imaginary components smaller (in magnitude) than epsilon.
+ *   @f[ 
+        \max\limits_{i} \left| \mathrm{Im}(c_i) \right| \le \valeps
+ *   @f]
+ *   where @f$ c_i @f$ are the coefficients of @p hamil, and the validation epsilon @f$ \valeps @f$ can be adjusted with setValidationEpsilon(). Beware however that 
+ *   imaginary-time evolution under a non-Hermitian Hamiltonian will _not_ necessarily approach the lowest lying eigenstate
+ *   (the eigenvalues may be non-real) so is likely of limited utility.
+ * 
+ * - The @p tau parameter is necessarily real such that evolution approaches the groundstate (modulo renormalisation).
+ *   It can be generalised to an arbitrary complex number through direct use of applyNonUnitaryTrotterizedPauliStrSumGadget().
+ * 
+ * - Simulation is exact such that the effected operation is precisely @f$ \exp(-\tau \hat{H}) @f$ only when 
+ *   @p reps @f$ \rightarrow \infty @f$ or all terms in @p hamil commute with one another.
+ * 
+ * @myexample
+ *
+ * ```
+   // pray for a non-zero initial overlap
+   initRandomPureState(qureg); // works even for density matrices
+
+   // minimize then renormalise
+   qreal tau = 10; // impatient infinity
+   int order = 4;
+   int reps = 100;
+   applyTrotterizedImaginaryTimeEvolution(qureg, hamil, tau, order, reps);
+   setQuregToRenormalized(qureg);
+
+   // ground-state (phi_0)
+   reportQureg(qureg);
+
+   // lowest lying eigenvalue (lambda_0)
+   qreal expec = calcExpecPauliStrSum(qureg, hamil);
+   reportScalar("expec", expec);
+ * ```
+ *
+ * @see
+ *  - applyTrotterizedUnitaryTimeEvolution()
+ *  - applyNonUnitaryTrotterizedPauliStrSumGadget()
+ * 
+ * @param[in,out] qureg  the state to modify.
+ * @param[in]     hamil  the Hamiltonian as a weighted sum of Pauli strings.
+ * @param[in]     tau    the duration over which to simulate imaginary-time evolution.
+ * @param[in]     order  the order of the Trotter-Suzuki decomposition (e.g. @p 1, @p 2, @p 4, ...).
+ * @param[in]     reps   the number of Trotter repetitions.
+ * 
+ * @throws @validationerror
+ * - if @p qureg or @p hamil are uninitialised.
+ * - if @p hamil contains non-identities on qubits beyond the size of @p qureg.
+ * - if @p hamil is not approximately Hermitian.
+ * - if @p order is not 1 nor a positive, @b even integer.
+ * - if @p reps is not a positive integer.
+ * 
+ * @author Tyson Jones
+ */
+void applyTrotterizedImaginaryTimeEvolution(Qureg qureg, PauliStrSum hamil, qreal tau, int order, int reps);
+
+
+/** @notyettested
+ * 
+ * Simulates open dynamics of @p qureg as per the Lindblad master equation, under the time-independent
+ * Hamiltonian @p hamil and jump operators @p jumps with corresponding damping rates @p damps, with 
+ * evolution approximated by symmetrized Trotterisation of the specified @p order and number of cycles
+ * @p reps.
+ * 
+ * @formulae 
+ * 
+ * Let @f$ \rho = @f$ @p qureg, @f$ \hat{H} = @f$ @p hamil, @f$ t = @f$ @p time, and denote the @f$ i @f$-th
+ * element of @p damps and @p jumps as @f$ \gamma_i @f$ and @f$ \hat{J}_i @f$ respectively. The Lindblad
+ * master equation prescribes that @f$ \rho @f$ time-evolves according to
+ * @f[
+     \frac{\mathrm{d}}{\mathrm{d}t} \rho = -\iu [\hat{H}, \rho] + \sum\limits_i \gamma_i \left(
+          \hat{J}_i \rho \hat{J}_i^\dagger - \frac{1}{2} \left\{ \hat{J}_i^\dagger \hat{J}_i, \rho \right\}
+     \right).
+ * @f]
+ * This function works by building a superoperator of the right-hand-side which acts upon the space of
+ * linearised @f$\rho@f$,
+ * @f[
+     \boldsymbol{L} = -\iu \left( \hat{\id} \otimes \hat{H} - \hat{H}^* \otimes \hat{\id} \right) +
+          \sum\limits_i \gamma_i \left(
+               \hat{J}_i^* \otimes \hat{J}_i - \frac{1}{2} \hat{\id} \otimes (\hat{J}_i^\dagger \hat{J}_i)
+               - \frac{1}{2} (\hat{J}_i^\dagger \hat{J}_i)^* \otimes \hat{\id}
+          \right),
+ * @f]
+ * as a non-Hermitian weighted sum of Pauli strings (a PauliStrSum). The superoperator @f$ \boldsymbol{L} @f$
+ * informs a superpropagator which exactly solves evolution as:
+ * @f[
+     \ket{\rho(t)} = \exp\left( t \boldsymbol{L} \right) \ket{\rho(0)}.
+ * @f]
+ * This function approximates the superpropagator @f$ \exp\left( t \boldsymbol{L} \right) @f$ using a higher-order 
+ * symmetrized Suzuki-Trotter decomposition, as informed by parameters @p order and @p reps.
+ * 
+ * > See applyTrotterizedPauliStrSumGadget() for information about the Trotter method.
+ * 
+ * @par Utility
+ * 
+ * This function simulates time evolution of an open system, where the jump operators model interactions with
+ * the environment. This can capture sophisticated decoherence processes of the quantum state which are untenable
+ * to model as discrete operations with functions like mixKrausMap(). This function also proves useful for
+ * preparing realistic, physical input states to quantum metrological circuits, or the general high-performance
+ * simulation of digital time evolution of condensed matter systems.
+ *
+ * @equivalences
+ * 
+ * - When `numJumps = 0`, evolution is unitary and the Lindblad master equation simplifies to the Liouville–von Neumann 
+ *   equation, which is equivalently (and more efficiently) simulated via applyTrotterizedUnitaryTimeEvolution().
+ * 
+ * @constraints
+ * 
+ * - Each damping rate in @p damps is expected to be a zero or positive number, in order for evolution to be trace 
+ *   preserving. Validation will assert that each damping rate @f$ \gamma_i @f$ satisfies
+ *   @f[
+          \min\limits_{i} \gamma_i \ge - \valeps
+ *   @f]
+ *   where the validation epsilon @f$ \valeps @f$ can be adjusted with setValidationEpsilon(). Non-trace-preserving,
+ *   negative damping rates can be simulated by disabling numerical validation via `setValidationEpsilon(0)`.
+ * 
+ * - The @p time parameter is necessarily real, and cannot be generalised to imaginary or complex like in other
+ *   functions. Generalisation is trivially numerically possible, but has no established physical meaning and so
+ *   is not exposed in the API. Please open an issue on GitHub for advice on complex-time simulation.
+ * 
+ * - Simulation is exact only when @p reps @f$ \rightarrow \infty @f$ or all terms in the superoperator 
+ *   @f$ \boldsymbol{L} @f$ incidentally commute with one another, and otherwise incorporates Trotter error.
+ *   Unlike for unitary evolution, Trotter error _does_ break normalisation of the state and so this function
+ *   is generally non-trace-preserving. In theory, normalisation can be restored with setQuregToRenormalized()
+ *   though noticeable norm-breaking indicates evolution was inaccurate, and should instead be repeated with 
+ *   increased @p order or @p reps parameters.
+ * 
+ * - The function instantiates superoperator @f$ \boldsymbol{L} @f$ above as a temporary PauliStrSum, incurring a 
+ *   memory and time overhead which grows quadratically with the number of terms in @p hamil, plus quadratically
+ *   with the number in each jump operator. These overheads may prove prohibitively costly for PauliStrSum
+ *   containing very many terms.
+ * 
+ * @myexample
+ *
+ * ```
+    // |+><+|
+    Qureg qureg = createDensityQureg(3);
+    initPlusState(qureg);
+
+    PauliStrSum hamil = createInlinePauliStrSum(R"(
+        1  IIX
+        2  IYI
+        3  ZZZ
+    )");
+
+    // |0><0|
+    PauliStrSum jump1 = createInlinePauliStrSum(R"(
+        0.5  I
+        0.5  Z
+    )");
+
+    // |1><0|
+    PauliStrSum jump2 = createInlinePauliStrSum(R"(
+         0.5  X
+        -0.5i Y
+    )");
+
+    // "noisiness"
+    qreal damps[] = {.3, .4};
+    PauliStrSum jumps[] = {jump1, jump2};
+    int numJumps = 2;
+
+    reportScalar("initial energy", calcExpecPauliStrSum(qureg, hamil));
+
+    // time and accuracy
+    qreal time = 0.5;
+    int order = 4;
+    int reps = 100;
+    applyTrotterizedNoisyTimeEvolution(qureg, hamil, damps, jumps, numJumps, time, order, reps);
+
+    reportScalar("final energy", calcExpecPauliStrSum(qureg, hamil));
+ * ```
+ * 
+ * @see
+ *  - applyTrotterizedUnitaryTimeEvolution()
+ *  - applyTrotterizedImaginaryTimeEvolution()
+ * 
+ * @param[in,out] qureg     the density-matrix state to evolve and modify.
+ * @param[in]     hamil     the Hamiltonian of the qubit system (excludes any environment).
+ * @param[in]     damps     the damping rates of each jump operator in @p jumps.
+ * @param[in]     jumps     the jump operators specified as PauliStrSum.
+ * @param[in]     numJumps  the length of list @p jumps (and @p damps).
+ * @param[in]     time      the duration through which to evolve the state.
+ * @param[in]     order     the order of the Trotter-Suzuki decomposition (e.g. @p 1, @p 2, @p 4, ...).
+ * @param[in]     reps      the number of Trotter repetitions.
+ * 
+ * @throws @validationerror
+ * - if @p qureg, @p hamil or any element of @p jumps are uninitialised.
+ * - if @p qureg is not a density matrix.
+ * - if @p hamil or any element of @p jumps contains non-identities on qubits beyond the size of @p qureg.
+ * - if @p hamil is not approximately Hermitian.
+ * - if @p numJumps is negative.
+ * - if any element of @p damps is negative (beyond the validation epsilon).
+ * - if the total number of Lindbladian superoperator terms overflows the `qindex` type.
+ * - if all Lindbladian superoperator terms cannot simultaneously fit into CPU memory.
+ * - if memory allocation of the Lindbladian superoperator terms unexpectedly fails.
+ * - if @p order is not 1 nor a positive, @b even integer.
+ * - if @p reps is not a positive integer.
+ * 
+ * @author Tyson Jones
+ */
+void applyTrotterizedNoisyTimeEvolution(Qureg qureg, PauliStrSum hamil, qreal* damps, PauliStrSum* jumps, int numJumps, qreal time, int order, int reps);
+
+
+// end de-mangler
+#ifdef __cplusplus
+}
+#endif
+
+/** @} */
+
+
+
 #endif // TROTTERISATION_H
 
 /** @} */ // (end file-wide doxygen defgroup)
diff --git a/quest/src/api/operations.cpp b/quest/src/api/operations.cpp
index ce5edb579..5c415fbc5 100644
--- a/quest/src/api/operations.cpp
+++ b/quest/src/api/operations.cpp
@@ -30,8 +30,9 @@ using std::vector;
  * PRVIATE UTILITIES
  */
 
+extern int paulis_getSignOfPauliStrConj(PauliStr str);
+
 extern bool paulis_isIdentity(PauliStr str);
-extern bool paulis_hasOddNumY(PauliStr str);
 extern PauliStr paulis_getShiftedPauliStr(PauliStr str, int pauliShift);
 extern PauliStr paulis_getKetAndBraPauliStr(PauliStr str, Qureg qureg);
 
@@ -966,7 +967,7 @@ void applyMultiStateControlledPauliStr(Qureg qureg, int* controls, int* states,
     // operation sinto a single tensor, i.e. +- (shift(str) (x) str), to 
     // avoid superfluous re-enumeration of the state
     if (qureg.isDensityMatrix && numControls == 0) {
-        factor = paulis_hasOddNumY(str)? -1 : 1;
+        factor = paulis_getSignOfPauliStrConj(str);
         ctrlVec = util_getConcatenated(ctrlVec, util_getBraQubits(ctrlVec, qureg));
         stateVec = util_getConcatenated(stateVec, stateVec); 
         str = paulis_getKetAndBraPauliStr(str, qureg);
@@ -976,7 +977,7 @@ void applyMultiStateControlledPauliStr(Qureg qureg, int* controls, int* states,
 
     // but density-matrix control qubits require two distinct operations
     if (qureg.isDensityMatrix && numControls > 0) {
-        factor = paulis_hasOddNumY(str)? -1 : 1;
+        factor = paulis_getSignOfPauliStrConj(str);
         ctrlVec = util_getBraQubits(ctrlVec, qureg);
         str = paulis_getShiftedPauliStr(str, qureg.numQubits);
         localiser_statevec_anyCtrlPauliTensor(qureg, ctrlVec, stateVec, str, factor);
@@ -1230,8 +1231,8 @@ void applyNonUnitaryPauliGadget(Qureg qureg, PauliStr str, qcomp angle) {
     if (!qureg.isDensityMatrix)
         return;
 
-    // conj(e^i(a)XZ) = e^(-i conj(a)XZ) but conj(Y)=-Y, so odd-Y undoes phase negation
-    phase = std::conj(phase) * (paulis_hasOddNumY(str) ? 1 : -1);
+    // conj(e^i(a)P) = e^(-i s conj(a) P)
+    phase = - std::conj(phase) * paulis_getSignOfPauliStrConj(str);
     str = paulis_getShiftedPauliStr(str, qureg.numQubits);
     localiser_statevec_anyCtrlPauliGadget(qureg, {}, {}, str, phase);
 }
@@ -1273,8 +1274,8 @@ void applyMultiStateControlledPauliGadget(Qureg qureg, int* controls, int* state
     if (!qureg.isDensityMatrix)
         return;
 
-    // conj(e^iXZ) = e^(-iXZ), but conj(Y)=-Y, so odd-Y undoes phase negation
-    phase *= paulis_hasOddNumY(str) ? 1 : -1;
+    // conj(e^(i a P)) = e^(-i s a P)
+    phase *= - paulis_getSignOfPauliStrConj(str);
     ctrlVec = util_getBraQubits(ctrlVec, qureg);
     str = paulis_getShiftedPauliStr(str, qureg.numQubits);
     localiser_statevec_anyCtrlPauliGadget(qureg, ctrlVec, stateVec, str, phase);
diff --git a/quest/src/api/paulis.cpp b/quest/src/api/paulis.cpp
index c770e5fb9..b07c065fd 100644
--- a/quest/src/api/paulis.cpp
+++ b/quest/src/api/paulis.cpp
@@ -20,6 +20,7 @@
 #include "quest/src/comm/comm_routines.hpp"
 
 #include <iostream>
+#include <utility>
 #include <vector>
 #include <string>
 #include <array>
@@ -87,7 +88,7 @@ void freeAllMemoryIfAnyAllocsFailed(PauliStrSum sum) {
 
 
 /*
- * INTERNAL UTILITIES
+ * INTERNAL PauliStr UTILITIES
  *
  * callable by other internal files but which are not exposed in the header
  * because we do not wish to make them visible to users. Ergo other internal
@@ -139,12 +140,6 @@ int paulis_getIndOfLefmostNonIdentityPauli(PauliStr* strings, qindex numStrings)
 }
 
 
-int paulis_getIndOfLefmostNonIdentityPauli(PauliStrSum sum) {
-
-    return paulis_getIndOfLefmostNonIdentityPauli(sum.strings, sum.numTerms);
-}
-
-
 bool paulis_containsXOrY(PauliStr str) {
 
     int maxInd = paulis_getIndOfLefmostNonIdentityPauli(str);
@@ -160,16 +155,6 @@ bool paulis_containsXOrY(PauliStr str) {
 }
 
 
-bool paulis_containsXOrY(PauliStrSum sum) {
-
-    for (qindex i=0; i<sum.numTerms; i++)
-        if (paulis_containsXOrY(sum.strings[i]))
-            return true;
-
-    return false;
-}
-
-
 bool paulis_hasOddNumY(PauliStr str) {
 
     bool odd = false;
@@ -182,6 +167,13 @@ bool paulis_hasOddNumY(PauliStr str) {
 }
 
 
+int paulis_getSignOfPauliStrConj(PauliStr str) {
+    // returns the sign s = +-1 satisfying conj(str) = s * str
+    // conj(Y) = -Y, conj(YY) = YY, so s = -1 only when str has an odd number of Y
+    return paulis_hasOddNumY(str)? -1 : 1;
+}
+
+
 int paulis_getPrefixZSign(Qureg qureg, vector<int> prefixZ) {
 
     int sign = 1;
@@ -239,7 +231,7 @@ qindex paulis_getTargetBitMask(PauliStr str) {
 }
 
 
-array<vector<int>,3> paulis_getSeparateInds(PauliStr str, Qureg qureg) {
+array<vector<int>,3> paulis_getSeparateInds(PauliStr str) {
 
     vector<int> iXYZ = paulis_getTargetInds(str);
     vector<int> iX, iY, iZ;
@@ -279,18 +271,25 @@ PauliStr paulis_getShiftedPauliStr(PauliStr str, int pauliShift) {
 }
 
 
-PauliStr paulis_getKetAndBraPauliStr(PauliStr str, Qureg qureg) {
+PauliStr paulis_getTensorProdOfPauliStr(PauliStr left, PauliStr right, int numQubits) {
+
+    // computes left (tensor) right, assuming right is smaller than numQubits
+    PauliStr shifted = paulis_getShiftedPauliStr(left, numQubits);
 
-    PauliStr shifted = paulis_getShiftedPauliStr(str, qureg.numQubits);
-    
     // return a new stack PauliStr instance (avoiding C++20 initialiser)
     PauliStr out;
-    out.lowPaulis  = str.lowPaulis  | shifted.lowPaulis;
-    out.highPaulis = str.highPaulis | shifted.highPaulis;
+    out.lowPaulis  = right.lowPaulis  | shifted.lowPaulis;
+    out.highPaulis = right.highPaulis | shifted.highPaulis;
     return out;
 }
 
 
+PauliStr paulis_getKetAndBraPauliStr(PauliStr str, Qureg qureg) {
+    // returns str (x) str, where the left copy is str shifted by qureg.numQubits
+    return paulis_getTensorProdOfPauliStr(str, str, qureg.numQubits);
+}
+
+
 PAULI_MASK_TYPE paulis_getKeyOfSameMixedAmpsGroup(PauliStr str) {
 
     PAULI_MASK_TYPE key = 0;
@@ -312,6 +311,54 @@ PAULI_MASK_TYPE paulis_getKeyOfSameMixedAmpsGroup(PauliStr str) {
 }
 
 
+std::pair<qcomp,PauliStr> paulis_getPauliStrProd(PauliStr strA, PauliStr strB) {
+    // returns {coeff, str} such that the operator product strA . strB = coeff * str
+    // a . b = coeff * (a ^ b), since the bitwise XOR of two Pauli codes is the code of their product
+    PauliStr strOut;
+    strOut.lowPaulis  = strA.lowPaulis  ^ strB.lowPaulis;
+    strOut.highPaulis = strA.highPaulis ^ strB.highPaulis;
+
+    // coeff = product of the single-site product coeffs (each 1, i or -i)
+    qcomp coeff = 1;
+    for (int i=0; i<MAX_NUM_PAULIS_PER_STR; i++) {
+        int pA = paulis_getPauliAt(strA, i);
+        int pB = paulis_getPauliAt(strB, i);
+        
+        // I.P = P.I = P and P.P = I contribute factor=1 (code 0 is the identity)
+        if (pA == 0 || pB == 0 || pA == pB)
+            continue;
+
+        // cyclic XY,YZ,ZX=i (dif is 1,1,-2), anti-cyclic XZ,YX,ZY=-i (dif is 2,-1,-1)
+        int dif = pB - pA;
+        coeff *= qcomp(0, (dif == 1 || dif == -2)? 1 : -1);
+    }
+    
+    return {coeff, strOut};
+}
+
+
+
+/*
+ * INTERNAL PauliStrSum UTILITIES
+ */
+
+
+int paulis_getIndOfLefmostNonIdentityPauli(PauliStrSum sum) {
+    // defers to the array overload, passing all sum.numTerms strings of sum
+    return paulis_getIndOfLefmostNonIdentityPauli(sum.strings, sum.numTerms);
+}
+
+
+bool paulis_containsXOrY(PauliStrSum sum) {
+    // true if the PauliStr overload reports an X or Y in any term of sum
+    for (qindex i=0; i<sum.numTerms; i++)
+        if (paulis_containsXOrY(sum.strings[i]))
+            return true;
+
+    return false;
+}
+
+
 qindex paulis_getTargetBitMask(PauliStrSum sum) {
 
     qindex mask = 0;
@@ -324,6 +371,87 @@ qindex paulis_getTargetBitMask(PauliStrSum sum) {
 }
 
 
+void paulis_setPauliStrSumToScaledTensorProdOfConjWithSelf(PauliStrSum out, qreal factor, PauliStrSum in, int numQubits) {
+
+    // sets out = factor * conj(in) (x) in, where in has dim of numQubits
+    if (paulis_getIndOfLefmostNonIdentityPauli(in) >= numQubits)
+        error_pauliStrSumHasMoreQubitsThanSpecifiedInTensorProd();
+    if (out.numTerms != in.numTerms * in.numTerms)
+        error_pauliStrSumTensorProdHasIncorrectNumTerms();
+
+    // conj(in) (x) in = sum_jk conj(c_j) c_k conj(P_j) (x) P_k...
+    qindex i = 0;
+    for (qindex j=0; j<in.numTerms; j++) {
+        for (qindex k=0; k<in.numTerms; k++) {
+
+            // ... where conj(P_j) = sign_j P_j
+            out.strings[i] = paulis_getTensorProdOfPauliStr(in.strings[j], in.strings[k], numQubits);
+            out.coeffs[i] = factor * std::conj(in.coeffs[j]) * in.coeffs[k] * paulis_getSignOfPauliStrConj(in.strings[j]);
+            i++;
+        }
+    }
+}
+
+
+qindex paulis_getNumTermsInPauliStrSumProdOfAdjointWithSelf(PauliStrSum in) {
+
+    // adj(in).in has 1 + n(n-1)/2 terms, fewer than the n^2 bound, since
+    // a.a = I (causing -n and +1 below) and a.b ~ b.a (causing /2), ignoring
+    // coefficient cancellations; n is a qindex so n*n is not evaluated in int
+    qindex n = in.numTerms;
+    return 1 + (n*n - n)/2;
+}
+
+
+void paulis_setPauliStrSumToScaledProdOfAdjointWithSelf(PauliStrSum out, qreal factor, PauliStrSum in) {
+
+    // sets out = factor * adj(in) . in, permitting duplicate strings
+    if (out.numTerms != paulis_getNumTermsInPauliStrSumProdOfAdjointWithSelf(in))
+        error_pauliStrSumProdHasIncorrectNumTerms();
+
+    // since out definitely contains an identity (when neglecting coeff cancellation)
+    // which is contributed toward by all j=k iterations below, we keep it at i=0
+    out.strings[0] = getPauliStr("I");
+    out.coeffs[0] = 0;
+    qindex i = 1;
+
+    // we leverage that sum_jk a_j^* a_k P_j P_k...
+    for (qindex j=0; j<in.numTerms; j++) {
+
+        // = sum_j ( |a_j|^2 Id + sum_k<j ...)
+        out.coeffs[0] += factor * std::norm(in.coeffs[j]);
+
+        // containing sum_k<j (a_j^* a_k P_j P_k + a_k^* a_j P_k P_j)
+        for (qindex k=0; k<j; k++) {
+
+            // = (a_j^* a_k b_jk + a_k^* a_j b_jk^*) P'
+            auto [coeff, str] = paulis_getPauliStrProd(in.strings[j], in.strings[k]);
+
+            // = (x + x^*) P' = 2 Re[x] P'
+            out.strings[i] = str;
+            out.coeffs[i] = factor * 2 * std::real(std::conj(in.coeffs[j]) * in.coeffs[k] * coeff);
+            i++;
+        }
+    }
+}
+
+
+void paulis_setPauliStrSumToShiftedConj(PauliStrSum out, PauliStrSum in, int numQubits) {
+
+    // sets out = conj(in) (x) I
+    if (paulis_getIndOfLefmostNonIdentityPauli(in) >= numQubits)
+        error_pauliStrSumHasMoreQubitsThanSpecifiedInConjShift();
+    if (out.numTerms != in.numTerms)
+        error_pauliStrSumConjHasIncorrectNumTerms();
+
+    // where conj(c P) = conj(c) sign P
+    for (qindex i=0; i<out.numTerms; i++) {
+        out.strings[i] = paulis_getShiftedPauliStr(in.strings[i], numQubits);
+        out.coeffs[i] = std::conj(in.coeffs[i]) * paulis_getSignOfPauliStrConj(in.strings[i]);
+    }
+}
+
+
 
 /*
  * PAULI STRING INITIALISATION
diff --git a/quest/src/api/trotterisation.cpp b/quest/src/api/trotterisation.cpp
index d8d5351c2..760432ee7 100644
--- a/quest/src/api/trotterisation.cpp
+++ b/quest/src/api/trotterisation.cpp
@@ -13,6 +13,7 @@
 #include "quest/src/core/validation.hpp"
 #include "quest/src/core/utilities.hpp"
 #include "quest/src/core/localiser.hpp"
+#include "quest/src/core/errors.hpp"
 
 #include <vector>
 
@@ -24,12 +25,16 @@ using std::vector;
  * INTERNAL UTILS
  */
 
-extern bool paulis_hasOddNumY(PauliStr str);
+extern int paulis_getSignOfPauliStrConj(PauliStr str);
 extern PauliStr paulis_getShiftedPauliStr(PauliStr str, int pauliShift);
+extern void paulis_setPauliStrSumToScaledTensorProdOfConjWithSelf(PauliStrSum out, qreal factor, PauliStrSum in, int numQubits);
+extern void paulis_setPauliStrSumToScaledProdOfAdjointWithSelf(PauliStrSum out, qreal factor, PauliStrSum in);
+extern void paulis_setPauliStrSumToShiftedConj(PauliStrSum out, PauliStrSum in, int numQubits);
+extern qindex paulis_getNumTermsInPauliStrSumProdOfAdjointWithSelf(PauliStrSum in);
 
 void internal_applyFirstOrderTrotterRepetition(
     Qureg qureg, vector<int>& ketCtrls, vector<int>& braCtrls,
-    vector<int>& states, PauliStrSum sum, qcomp angle, bool reverse
+    vector<int>& states, PauliStrSum sum, qcomp angle, bool onlyLeftApply, bool reverse
 ) {
     // apply each sum term as a gadget, in forward or reverse order
     for (qindex i=0; i<sum.numTerms; i++) {
@@ -37,15 +42,22 @@ void internal_applyFirstOrderTrotterRepetition(
         qcomp coeff = sum.coeffs[j];
         PauliStr str = sum.strings[j];
 
-        // effect |psi> -> exp(i angle * sum)|psi>
+        // effect |psi> -> exp(i angle * coeff * term)|psi>
         qcomp arg = angle * coeff;
         localiser_statevec_anyCtrlPauliGadget(qureg, ketCtrls, states, str, arg);
 
+        // term finished upon statevector 
         if (!qureg.isDensityMatrix)
             continue;
 
-        // effect rho -> rho dagger(i angle * sum)
-        arg *= paulis_hasOddNumY(str) ? 1 : -1;
+        // Lindbladian propagator is only ever pre-multiplied
+        if (onlyLeftApply)
+            continue;
+
+        // effect rho -> rho exp(i angle * coeff * term)^dagger via linearised
+        //    ||rho>> -> conj(exp(i angle * coeff * term)) (x) I ||rho>>
+        //             = exp(- i conj(angle coeff) sign term) (x) I ||rho>>
+        arg = - std::conj(arg) * paulis_getSignOfPauliStrConj(str);
         str = paulis_getShiftedPauliStr(str, qureg.numQubits);
         localiser_statevec_anyCtrlPauliGadget(qureg, braCtrls, states, str, arg);
     }
@@ -53,14 +65,14 @@ void internal_applyFirstOrderTrotterRepetition(
 
 void internal_applyHigherOrderTrotterRepetition(
     Qureg qureg, vector<int>& ketCtrls, vector<int>& braCtrls,
-    vector<int>& states, PauliStrSum sum, qcomp angle, int order
+    vector<int>& states, PauliStrSum sum, qcomp angle, int order, bool onlyLeftApply
 ) {
     if (order == 1) {
-        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, angle, false);
+        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, angle, onlyLeftApply, false);
     
     } else if (order == 2) {
-        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, angle/2, false);
-        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, angle/2, true);
+        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, angle/2, onlyLeftApply, false);
+        internal_applyFirstOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, angle/2, onlyLeftApply, true);
     
     } else {
         qreal p = 1. / (4 - std::pow(4, 1./(order-1)));
@@ -68,17 +80,17 @@ void internal_applyHigherOrderTrotterRepetition(
         qcomp b = (1-4*p) * angle;
 
         int lower = order - 2;
-        internal_applyHigherOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, a, lower); // angle -> a
-        internal_applyHigherOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, a, lower);
-        internal_applyHigherOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, b, lower); // angle -> b
-        internal_applyHigherOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, a, lower);
-        internal_applyHigherOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, a, lower);
+        internal_applyHigherOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, a, lower, onlyLeftApply); // angle -> a
+        internal_applyHigherOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, a, lower, onlyLeftApply);
+        internal_applyHigherOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, b, lower, onlyLeftApply); // angle -> b
+        internal_applyHigherOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, a, lower, onlyLeftApply);
+        internal_applyHigherOrderTrotterRepetition(qureg, ketCtrls, braCtrls, states, sum, a, lower, onlyLeftApply);
     }
 }
 
 void internal_applyAllTrotterRepetitions(
     Qureg qureg, int* controls, int* states, int numControls, 
-    PauliStrSum sum, qcomp angle, int order, int reps
+    PauliStrSum sum, qcomp angle, int order, int reps, bool onlyLeftApply
 ) {
     // exp(i angle sum) = identity when angle=0
     if (angle == qcomp(0,0))
@@ -94,7 +106,7 @@ void internal_applyAllTrotterRepetitions(
     // perform carefully-ordered sequence of gadgets
     for (int r=0; r<reps; r++)
         internal_applyHigherOrderTrotterRepetition(
-            qureg, ketCtrlsVec, braCtrlsVec, statesVec, sum, arg, order);
+            qureg, ketCtrlsVec, braCtrlsVec, statesVec, sum, arg, order, onlyLeftApply);
 
     /// @todo
     /// the accuracy of Trotterisation is greatly improved by randomisation
@@ -102,6 +114,37 @@ void internal_applyAllTrotterRepetitions(
     /// implement these above or into another function?
 }
 
+qindex internal_getNumTotalSuperPropagatorTerms(PauliStrSum hamil, PauliStrSum* jumps, int numJumps) {
+
+    // this function returns 0 to indicate an overflow, which will never
+    // be confused for the correct non-overflowed output because hamil.numTerms>0
+    int OVERFLOW_FLAG = 0;
+
+    if (util_willProdOverflow({2,hamil.numTerms}))
+        return OVERFLOW_FLAG;
+        
+    // I (x) H + conj(H) (x) I
+    qindex numTerms = 2 * hamil.numTerms;
+
+    for (int i=0; i<numJumps; i++) {
+        qindex n = jumps[i].numTerms;
+
+        if (util_willProdOverflow({n,n,3}))
+            return OVERFLOW_FLAG;
+        if (util_willSumOverflow({numTerms, 3*n*n}))
+            return OVERFLOW_FLAG;
+
+        // conj(J) (x) J has n^2 terms
+        numTerms += n * n;
+
+        // I (x) (adj(J) . J ) + conj(...) (x) I is bounded by 2*n^2 terms
+        numTerms += 2 * paulis_getNumTermsInPauliStrSumProdOfAdjointWithSelf(jumps[i]);
+    }
+
+    // indicate no overflow
+    return numTerms;
+}
+
 
 
 /*
@@ -117,7 +160,9 @@ void applyNonUnitaryTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, q
     validate_trotterParams(qureg, order, reps, __func__);
     // sum is permitted to be non-Hermitian
 
-    internal_applyAllTrotterRepetitions(qureg, nullptr, nullptr, 0, sum, angle, order, reps);
+    // |psi> -> U |psi>, rho -> U rho U^dagger
+    bool onlyLeftApply = false;
+    internal_applyAllTrotterRepetitions(qureg, nullptr, nullptr, 0, sum, angle, order, reps, onlyLeftApply);
 }
 
 void applyTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qreal angle, int order, int reps) {
@@ -127,7 +172,8 @@ void applyTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qreal angle
     validate_pauliStrSumIsHermitian(sum, __func__);
     validate_trotterParams(qureg, order, reps, __func__);
 
-    internal_applyAllTrotterRepetitions(qureg, nullptr, nullptr, 0, sum, angle, order, reps);
+    bool onlyLeftApply = false;
+    internal_applyAllTrotterRepetitions(qureg, nullptr, nullptr, 0, sum, angle, order, reps, onlyLeftApply);
 }
 
 void applyControlledTrotterizedPauliStrSumGadget(Qureg qureg, int control, PauliStrSum sum, qreal angle, int order, int reps) {
@@ -137,7 +183,8 @@ void applyControlledTrotterizedPauliStrSumGadget(Qureg qureg, int control, Pauli
     validate_controlAndPauliStrSumTargets(qureg, control, sum, __func__);
     validate_trotterParams(qureg, order, reps, __func__);
     
-    internal_applyAllTrotterRepetitions(qureg, &control, nullptr, 1, sum, angle, order, reps);
+    bool onlyLeftApply = false;
+    internal_applyAllTrotterRepetitions(qureg, &control, nullptr, 1, sum, angle, order, reps, onlyLeftApply);
 }
 
 void applyMultiControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls, int numControls, PauliStrSum sum, qreal angle, int order, int reps) {
@@ -147,7 +194,8 @@ void applyMultiControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls
     validate_controlsAndPauliStrSumTargets(qureg, controls, numControls, sum, __func__);
     validate_trotterParams(qureg, order, reps, __func__);
 
-    internal_applyAllTrotterRepetitions(qureg, controls, nullptr, numControls, sum, angle, order, reps);
+    bool onlyLeftApply = false;
+    internal_applyAllTrotterRepetitions(qureg, controls, nullptr, numControls, sum, angle, order, reps, onlyLeftApply);
 }
 
 void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls, int* states, int numControls, PauliStrSum sum, qreal angle, int order, int reps) {
@@ -158,7 +206,8 @@ void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* con
     validate_controlStates(states, numControls, __func__); // permits states==nullptr
     validate_trotterParams(qureg, order, reps, __func__);
 
-    internal_applyAllTrotterRepetitions(qureg, controls, states, numControls, sum, angle, order, reps);
+    bool onlyLeftApply = false;
+    internal_applyAllTrotterRepetitions(qureg, controls, states, numControls, sum, angle, order, reps, onlyLeftApply);
 }
 
 } // end de-mangler
@@ -173,3 +222,138 @@ void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, vector<i
 
     applyMultiStateControlledTrotterizedPauliStrSumGadget(qureg, controls.data(), states.data(), controls.size(), sum, angle, order, reps);
 }
+
+
+
+/*
+ * CLOSED TIME EVOLUTION
+ */
+
+extern "C" {
+
+void applyTrotterizedUnitaryTimeEvolution(Qureg qureg, PauliStrSum hamil, qreal time, int order, int reps) {
+    validate_quregFields(qureg, __func__);
+    validate_pauliStrSumFields(hamil, __func__);
+    validate_pauliStrSumTargets(hamil, qureg, __func__);
+    validate_pauliStrSumIsHermitian(hamil, __func__);
+    validate_trotterParams(qureg, order, reps, __func__);
+
+    // exp(-i t H) = exp(x i H) | x=-t
+    qcomp angle = - time;
+    bool onlyLeftApply = false;
+    internal_applyAllTrotterRepetitions(qureg, nullptr, nullptr, 0, hamil, angle, order, reps, onlyLeftApply);
+}
+
+void applyTrotterizedImaginaryTimeEvolution(Qureg qureg, PauliStrSum hamil, qreal tau, int order, int reps) {
+    validate_quregFields(qureg, __func__);
+    validate_pauliStrSumFields(hamil, __func__);
+    validate_pauliStrSumTargets(hamil, qureg, __func__);
+    validate_pauliStrSumIsHermitian(hamil, __func__);
+    validate_trotterParams(qureg, order, reps, __func__);
+
+    // exp(-tau H) = exp(x i H) | x=tau*i
+    qcomp angle = qcomp(0, tau);
+    bool onlyLeftApply = false;
+    internal_applyAllTrotterRepetitions(qureg, nullptr, nullptr, 0, hamil, angle, order, reps, onlyLeftApply);
+}
+
+} // end de-mangler
+
+
+
+/*
+ * OPEN TIME EVOLUTION
+ */
+
+extern "C" {
+
+void applyTrotterizedNoisyTimeEvolution(Qureg qureg, PauliStrSum hamil, qreal* damps, PauliStrSum* jumps, int numJumps, qreal time, int order, int reps) {
+    validate_quregFields(qureg, __func__);
+    validate_quregIsDensityMatrix(qureg, __func__);
+    validate_pauliStrSumFields(hamil, __func__);
+    validate_pauliStrSumTargets(hamil, qureg, __func__);
+    validate_pauliStrSumIsHermitian(hamil, __func__);
+    validate_trotterParams(qureg, order, reps, __func__);
+    validate_lindbladJumpOps(jumps, numJumps, qureg, __func__);
+    validate_lindbladDampingRates(damps, numJumps, __func__);
+    // below, we gather every weighted Pauli string of the superpropagator S, then Trotterise exp(time S)
+    qindex numSuperTerms = internal_getNumTotalSuperPropagatorTerms(hamil, jumps, numJumps); // 0 indicates overflow
+    validate_numLindbladSuperPropagatorTerms(numSuperTerms, __func__);
+
+    // validate memory allocations for all super-propagator terms
+    vector<PauliStr> superStrings;
+    vector<qcomp> superCoeffs;
+    auto callbackString = [&]() { validate_tempAllocSucceeded(false, numSuperTerms, sizeof(PauliStr), __func__); };
+    auto callbackCoeff  = [&]() { validate_tempAllocSucceeded(false, numSuperTerms, sizeof(qcomp),    __func__); };
+    util_tryAllocVector(superStrings, numSuperTerms, callbackString);
+    util_tryAllocVector(superCoeffs,  numSuperTerms, callbackCoeff);
+
+    qindex superTermInd = 0;
+
+    // collect -i[H,rho] terms
+    for (qindex n=0; n<hamil.numTerms; n++) {
+        PauliStr oldStr = hamil.strings[n];
+        qcomp oldCoeff = hamil.coeffs[n];
+
+        // term of -i Id (x) H
+        superStrings[superTermInd] = oldStr;
+        superCoeffs [superTermInd] = -1_i * oldCoeff;
+        superTermInd++;
+
+        // term of i conj(H) (x) I
+        superStrings[superTermInd] = paulis_getShiftedPauliStr(oldStr, qureg.numQubits);
+        superCoeffs [superTermInd] = 1_i * paulis_getSignOfPauliStrConj(oldStr) * std::conj(oldCoeff);
+        superTermInd++;
+    }
+
+    // below we bind superStrings/Coeffs to a spoofed PauliStrSum to pass to paulis functions
+    PauliStrSum temp;
+    int flagForDebugSafety = -1;
+    temp.isApproxHermitian = &flagForDebugSafety;
+
+    // collect jump terms
+    for (int n=0; n<numJumps; n++) {
+
+        // damp  conj(J) (x) J
+        temp.strings = &superStrings[superTermInd];
+        temp.coeffs = &superCoeffs[superTermInd];
+        temp.numTerms = jumps[n].numTerms * jumps[n].numTerms;
+        superTermInd += temp.numTerms;
+        paulis_setPauliStrSumToScaledTensorProdOfConjWithSelf(temp, damps[n], jumps[n], qureg.numQubits);
+
+        // -damp/2  I (x) (adj(J) . J)
+        temp.strings = &superStrings[superTermInd];
+        temp.coeffs = &superCoeffs[superTermInd];
+        temp.numTerms = paulis_getNumTermsInPauliStrSumProdOfAdjointWithSelf(jumps[n]);
+        superTermInd += temp.numTerms;
+        paulis_setPauliStrSumToScaledProdOfAdjointWithSelf(temp, -damps[n]/2, jumps[n]);
+
+        // -damp/2 conj(adj(J) . J) (x) I = conj(above) when damp is real
+        PauliStrSum temp2 = temp; // copied so that no field (e.g. isApproxHermitian) is passed uninitialised
+        temp2.strings = &superStrings[superTermInd];
+        temp2.coeffs = &superCoeffs[superTermInd];
+        temp2.numTerms = temp.numTerms;
+        superTermInd += temp2.numTerms;
+        paulis_setPauliStrSumToShiftedConj(temp2, temp, qureg.numQubits);
+    }
+
+    // defensively check we didn't write too few (or too many, though that'd segfault
+    // above) Lindblad terms, in case the above code changes when jump ops are generalised
+    if (superTermInd != numSuperTerms)
+        error_unexpectedNumLindbladSuperpropTerms();
+
+    // pass superpropagator terms as temporary PauliStrSum
+    PauliStrSum superSum;
+    superSum.numTerms = numSuperTerms;
+    superSum.strings = superStrings.data();
+    superSum.coeffs = superCoeffs.data();
+    superSum.isApproxHermitian = nullptr; // will not be queried
+
+    // effect exp(t S) = exp(x i S) | x=-i*time, left-multiplying only
+    qcomp angle = qcomp(0, -time);
+    bool onlyLeftApply = true;
+    internal_applyAllTrotterRepetitions(qureg, nullptr, nullptr, 0, superSum, angle, order, reps, onlyLeftApply);
+}
+
+} // end de-mangler
+
diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp
index 51261ee1e..8261b0ad5 100644
--- a/quest/src/core/errors.cpp
+++ b/quest/src/core/errors.cpp
@@ -742,6 +742,31 @@ void error_pauliStrShiftedByIllegalAmount() {
     raiseInternalError("A PauliStr was attemptedly shifted (likely invoked by its application upon a density matrix) by an illegal amount (e.g. negative, or that exceeding the PauliStr bitmask length).");
 }
 
+void error_pauliStrSumHasMoreQubitsThanSpecifiedInTensorProd() {
+
+    raiseInternalError("Attempted to calculate the tensor product of a PauliStrSum with itself, but it contained non-identity Paulis on qubits beyond the number specified.");
+}
+
+void error_pauliStrSumHasMoreQubitsThanSpecifiedInConjShift() {
+
+    raiseInternalError("Attempted to calculate the tensor product of a (conjugated) PauliStrSum with identity, but it contained non-identity Paulis on qubits beyond the number specified in the identity.");
+}
+
+void error_pauliStrSumTensorProdHasIncorrectNumTerms() {
+
+    raiseInternalError("The tensor product of a (conjugated) PauliStrSum with itself was attemptedly written to output PauliStrSum with an incompatible number of terms.");
+}
+
+void error_pauliStrSumProdHasIncorrectNumTerms() {
+
+    raiseInternalError("The product of a (conjugate transposed) PauliStrSum with itself was attemptedly written to an output PauliStrSum with an incompatible number of terms.");
+}
+
+void error_pauliStrSumConjHasIncorrectNumTerms() {
+
+    raiseInternalError("Attempted to calculate the conjugate of a PauliStrSum but the output PauliStrSum had a differing (and ergo invalid) number of terms.");
+}
+
 
 
 /*
@@ -892,3 +917,14 @@ void error_envVarsAlreadyLoaded() {
 
     raiseInternalError("All environment variables were already loaded and validated yet re-loading was attempted.");
 }
+
+
+
+/*
+ * TROTTERISATION ERRORS
+ */
+
+void error_unexpectedNumLindbladSuperpropTerms() {
+
+    raiseInternalError("A different number of Lindblad superpropagator terms were prepared than expected.");
+}
diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp
index 50af5e8aa..c31d8df9a 100644
--- a/quest/src/core/errors.hpp
+++ b/quest/src/core/errors.hpp
@@ -294,6 +294,16 @@ void error_cuQuantumTempCpuAllocFailed();
 
 void error_pauliStrShiftedByIllegalAmount();
 
+void error_pauliStrSumHasMoreQubitsThanSpecifiedInTensorProd();
+
+void error_pauliStrSumHasMoreQubitsThanSpecifiedInConjShift();
+
+void error_pauliStrSumTensorProdHasIncorrectNumTerms();
+
+void error_pauliStrSumProdHasIncorrectNumTerms();
+
+void error_pauliStrSumConjHasIncorrectNumTerms();
+
 
 
 /*
@@ -370,4 +380,12 @@ void error_envVarsAlreadyLoaded();
 
 
 
+/*
+ * TROTTERISATION ERRORS
+ */
+
+void error_unexpectedNumLindbladSuperpropTerms();
+
+
+
 #endif // ERRORS_HPP
\ No newline at end of file
diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp
index 731c598d8..5b107e763 100644
--- a/quest/src/core/localiser.cpp
+++ b/quest/src/core/localiser.cpp
@@ -1250,7 +1250,7 @@ template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, vector<int>, vecto
 
 extern bool paulis_containsXOrY(PauliStr str);
 extern vector<int> paulis_getTargetInds(PauliStr str);
-extern std::array<vector<int>,3> paulis_getSeparateInds(PauliStr str, Qureg qureg);
+extern std::array<vector<int>,3> paulis_getSeparateInds(PauliStr str);
 extern int paulis_getPrefixZSign(Qureg qureg, vector<int> prefixZ) ;
 extern qcomp paulis_getPrefixPaulisElem(Qureg qureg, vector<int> prefixY, vector<int> prefixZ);
 
@@ -1298,7 +1298,7 @@ void anyCtrlPauliTensorOrGadget(Qureg qureg, vector<int> ctrls, vector<int> ctrl
     // - prefix X,Y determine communication, because they apply bit-not to rank
     // - prefix Y,Z determine node-wide coefficient, because they contain rank-determined !=1 elements
     // - suffix X,Y,Z determine local amp coefficients
-    auto [targsX, targsY, targsZ] = paulis_getSeparateInds(str, qureg);
+    auto [targsX, targsY, targsZ] = paulis_getSeparateInds(str);
     auto [prefixX, suffixX] = util_getPrefixAndSuffixQubits(targsX, qureg);
     auto [prefixY, suffixY] = util_getPrefixAndSuffixQubits(targsY, qureg);
     auto [prefixZ, suffixZ] = util_getPrefixAndSuffixQubits(targsZ, qureg);
@@ -1994,7 +1994,7 @@ qcomp getDensMatrExpecPauliStrTermOfOnlyThisNode(Qureg qureg, PauliStr str) {
     // caller must reduce the returned value between nodes if necessary
 
     // all ket-paulis are in the suffix state
-    auto [targsX, targsY, targsZ] = paulis_getSeparateInds(str, qureg);
+    auto [targsX, targsY, targsZ] = paulis_getSeparateInds(str);
 
     // optimised scenario when str = I
     if (targsX.empty() && targsY.empty() && targsZ.empty())
@@ -2019,7 +2019,7 @@ qcomp localiser_statevec_calcExpecPauliStr(Qureg qureg, PauliStr str) {
     // - prefix Y,Z determine node-wide coefficient, because they contain rank-determined !=1 elements
     // - suffix X,Y,Z determine local amp coefficients
     // noting that when !qureg.isDistributed, all paulis will be in suffix
-    auto [targsX, targsY, targsZ] = paulis_getSeparateInds(str, qureg);
+    auto [targsX, targsY, targsZ] = paulis_getSeparateInds(str);
     auto [prefixX, suffixX] = util_getPrefixAndSuffixQubits(targsX, qureg);
     auto [prefixY, suffixY] = util_getPrefixAndSuffixQubits(targsY, qureg);
     auto [prefixZ, suffixZ] = util_getPrefixAndSuffixQubits(targsZ, qureg);
@@ -2107,7 +2107,7 @@ qcomp localiser_statevec_calcExpecPauliStrSum(Qureg qureg, PauliStrSum sum) {
 
         // for each term within the current group...
         for (auto& [str, coeff] : terms) {
-            auto [targsX, targsY, targsZ] = paulis_getSeparateInds(str, qureg);
+            auto [targsX, targsY, targsZ] = paulis_getSeparateInds(str);
             auto [prefixX, suffixX] = util_getPrefixAndSuffixQubits(targsX, qureg);
             auto [prefixY, suffixY] = util_getPrefixAndSuffixQubits(targsY, qureg);
             auto [prefixZ, suffixZ] = util_getPrefixAndSuffixQubits(targsZ, qureg);
diff --git a/quest/src/core/memory.cpp b/quest/src/core/memory.cpp
index c8b81fc88..7f11494a1 100644
--- a/quest/src/core/memory.cpp
+++ b/quest/src/core/memory.cpp
@@ -388,6 +388,15 @@ bool mem_canSuperOpFitInMemory(int numQubits, qindex numBytesPerNode) {
 }
 
 
+bool mem_canPauliStrSumFitInMemory(qindex numTerms, qindex numBytesPerNode) {
+
+    // awkwardly arranged to avoid overflow when numTerms is too large
+    size_t numBytesPerTerm = sizeof(PauliStr) + sizeof(qcomp);
+    qindex maxNumTerms = numBytesPerNode / numBytesPerTerm; // floors
+    return numTerms <= maxNumTerms;
+}
+
+
 
 /*
  * MEMORY ALLOCATION SUCCESS
diff --git a/quest/src/core/memory.hpp b/quest/src/core/memory.hpp
index 7b112b027..b624d6c03 100644
--- a/quest/src/core/memory.hpp
+++ b/quest/src/core/memory.hpp
@@ -101,6 +101,8 @@ bool mem_canMatrixFitInMemory(int numQubits, bool isDense, int numNodes, qindex
 
 bool mem_canSuperOpFitInMemory(int numQubits, qindex numBytesPerNode);
 
+bool mem_canPauliStrSumFitInMemory(qindex numTerms, qindex numBytesPerNode);
+
 
 
 /*
diff --git a/quest/src/core/utilities.cpp b/quest/src/core/utilities.cpp
index e30aa86d6..207b76d7f 100644
--- a/quest/src/core/utilities.cpp
+++ b/quest/src/core/utilities.cpp
@@ -28,6 +28,7 @@
 #include <algorithm>
 #include <utility>
 #include <complex>
+#include <limits>
 #include <cmath>
 #include <vector>
 #include <array>
@@ -339,6 +340,39 @@ qcomp util_getPowerOfI(size_t exponent) {
     return values[exponent % 4];
 }
 
+bool util_willProdOverflow(vector<qindex> terms) {
+
+    // returns whether the product of terms (assumed non-negative) exceeds the qindex max
+    qindex max = std::numeric_limits<qindex>::max();
+    qindex prod = 1;
+    for (auto x : terms) {
+
+        // a zero term cannot overflow and must never be divided by; otherwise,
+        // because division floors, prod*x overflows iff prod strictly exceeds max/x
+        if (x != 0 && prod > max / x)
+            return true;
+
+        prod *= x;
+    }
+    return false;
+}
+
+bool util_willSumOverflow(vector<qindex> terms) {
+
+    // returns whether the sum of terms (assumed non-negative) exceeds the
+    // qindex max; a sum which exactly equals the max is representable and fine
+    qindex max = std::numeric_limits<qindex>::max();
+    qindex sum = 0;
+    for (auto x : terms) {
+
+        if (sum > max - x)
+            return true;
+
+        sum += x;
+    }
+    return false;
+}
+
 
 
 /*
@@ -1107,6 +1141,7 @@ void util_tryAllocVector(vector<qreal>    &vec, qindex size, std::function<void(
 void util_tryAllocVector(vector<qcomp>    &vec, qindex size, std::function<void()> errFunc) { tryAllocVector(vec, size, errFunc); }
 void util_tryAllocVector(vector<qcomp*>   &vec, qindex size, std::function<void()> errFunc) { tryAllocVector(vec, size, errFunc); }
 void util_tryAllocVector(vector<unsigned> &vec, qindex size, std::function<void()> errFunc) { tryAllocVector(vec, size, errFunc); }
+void util_tryAllocVector(vector<PauliStr> &vec, qindex size, std::function<void()> errFunc) { tryAllocVector(vec, size, errFunc); }
 
 // cuQuantum needs a vector<double> overload, which we additionally define when qreal!=double. Gross!
 #if FLOAT_PRECISION != 2
diff --git a/quest/src/core/utilities.hpp b/quest/src/core/utilities.hpp
index c514821bf..a55e94ab5 100644
--- a/quest/src/core/utilities.hpp
+++ b/quest/src/core/utilities.hpp
@@ -105,6 +105,10 @@ bool util_isApproxReal(qcomp num, qreal epsilon);
 
 qcomp util_getPowerOfI(size_t exponent);
 
+bool util_willProdOverflow(vector<qindex> terms);
+
+bool util_willSumOverflow(vector<qindex> terms);
+
 
 
 /*
@@ -401,6 +405,7 @@ void util_tryAllocVector(vector<qreal>    &vec, qindex size, std::function<void(
 void util_tryAllocVector(vector<qcomp>    &vec, qindex size, std::function<void()> errFunc);
 void util_tryAllocVector(vector<qcomp*>   &vec, qindex size, std::function<void()> errFunc);
 void util_tryAllocVector(vector<unsigned> &vec, qindex size, std::function<void()> errFunc);
+void util_tryAllocVector(vector<PauliStr> &vec, qindex size, std::function<void()> errFunc);
 
 // cuQuantum needs a vector<double> overload, which we additionally define when qreal!=double. Gross!
 #if FLOAT_PRECISION != 2
diff --git a/quest/src/core/validation.cpp b/quest/src/core/validation.cpp
index a8223cb3f..51b61b7fc 100644
--- a/quest/src/core/validation.cpp
+++ b/quest/src/core/validation.cpp
@@ -693,6 +693,9 @@ namespace report {
     string NEW_PAULI_STR_SUM_DIFFERENT_NUM_STRINGS_AND_COEFFS =
         "Given a different number of Pauli strings (${NUM_STRS}) and coefficients ${NUM_COEFFS}.";
 
+    string NEW_PAULI_STR_SUM_CANNOT_FIT_INTO_CPU_MEM =
+        "A PauliStrSum containing ${NUM_TERMS} terms cannot fit in the available RAM of ${NUM_BYTES} bytes.";
+
     string NEW_PAULI_STR_SUM_STRINGS_ALLOC_FAILED = 
         "Attempted allocation of the PauliStrSum's ${NUM_TERMS}-term array of Pauli strings (${NUM_BYTES} bytes total) unexpectedly failed.";
 
@@ -874,6 +877,10 @@ namespace report {
         "Cannot perform this ${NUM_TARGS}-target operation upon a ${NUM_QUREG_QUBITS}-qubit density-matrix distributed between ${NUM_NODES} nodes, since each node's communication buffer (with capacity for ${NUM_QUREG_AMPS_PER_NODE} amps) cannot simultaneously store the ${NUM_TARG_AMPS} mixed remote amplitudes.";
 
 
+    /*
+    * TROTTERISATION PARAMETERS
+    */
+
     string INVALID_TROTTER_ORDER =
         "Invalid Trotter order (${ORDER}). The order parameter must be positive and even, or unity.";
 
@@ -881,6 +888,23 @@ namespace report {
         "Invalid number of Trotter repetitions (${REPS}). The number of repetitions must be positive.";
 
 
+    /*
+    * TIME EVOLUTION PARAMETERS
+    */
+
+    string NEGATIVE_NUM_LINDBLAD_JUMP_OPS =
+        "The number of jump operators must be zero or positive.";
+
+    string NEGATIVE_LINDBLAD_DAMPING_RATE = 
+        "One or more damping rates were negative, beyond the tolerance set by the validation epsilon.";
+
+    string NUM_LINDBLAD_SUPER_PROPAGATOR_TERMS_OVERFLOWED =
+        "The given Hamiltonian and jump operators suggest a Lindbladian superpropagator with more weighted Pauli strings than can be stored in a qindex type.";
+
+    string NEW_LINDBLAD_SUPER_PROPAGATOR_CANNOT_FIT_INTO_CPU_MEM =
+        "The Lindbladian superpropagator would contain ${NUM_TERMS} weighted Pauli strings and exceed the available RAM of ${NUM_BYTES} bytes.";
+
+
     /*
      * CHANNEL PARAMETERS 
      */
@@ -3190,14 +3214,25 @@ void validate_controlAndPauliStrTargets(Qureg qureg, int ctrl, PauliStr str, con
 
 void validate_newPauliStrSumParams(qindex numTerms, const char* caller) {
 
-    // note we do not bother checking whether RAM has enough memory to contain
-    // the new Pauli sum, because the caller to this function has already
-    // been passed data of the same size (and it's unlikely the user is about
-    // to max RAM), and the memory requirements scale only linearly with the
-    // parameters (e.g. numTerms), unlike the exponential scaling of the memory
-    // of Qureg and CompMatr, for example
-
     assertThat(numTerms > 0, report::NEW_PAULI_STR_SUM_NON_POSITIVE_NUM_STRINGS, {{"${NUM_TERMS}", numTerms}}, caller);
+
+    // attempt to fetch RAM, and simply return if we fail; if we unknowingly
+    // didn't have enough RAM, then alloc validation will trigger later
+    size_t memPerNode = 0;
+    try {
+        memPerNode = mem_tryGetLocalRamCapacityInBytes();
+    } catch(const mem::COULD_NOT_QUERY_RAM&) {
+        return;
+    }
+
+    // pedantically check whether the PauliStrSum fits in memory. This seems
+    // ridiculous/pointless because the user is expected to have already prepared
+    // data of an equivalent size (which is passed), but checking means we catch
+    // when the user has passed an erroneous 'numTerms' which is way too large,
+    // avoiding a seg fault
+
+    bool fits = mem_canPauliStrSumFitInMemory(numTerms, memPerNode);
+    assertThat(fits, report::NEW_PAULI_STR_SUM_CANNOT_FIT_INTO_CPU_MEM, {{"${NUM_TERMS}", numTerms}, {"${NUM_BYTES}", memPerNode}}, caller);
 }
 
 void validate_newPauliStrSumMatchingListLens(qindex numStrs, qindex numCoeffs, const char* caller) {
@@ -3751,6 +3786,12 @@ void validate_mixedAmpsFitInNode(Qureg qureg, int numTargets, const char* caller
     assertThat(qureg.numAmpsPerNode >= numTargAmps, msg, vars, caller);
 }
 
+
+
+/*
+ * TROTTERISATION PARAMETERS
+ */
+
 void validate_trotterParams(Qureg qureg, int order, int reps, const char* caller) {
 
     bool isEven = (order % 2) == 0;
@@ -3760,6 +3801,61 @@ void validate_trotterParams(Qureg qureg, int order, int reps, const char* caller
 
 
 
+/*
+ * TIME EVOLUTION PARAMETERS
+ */
+
+void validate_lindbladJumpOps(PauliStrSum* jumps, int numJumps, Qureg qureg, const char* caller) {
+
+    assertThat(numJumps >= 0, report::NEGATIVE_NUM_LINDBLAD_JUMP_OPS, caller);
+
+    // @todo
+    // these error messages report as if each jump operator was "the" PauliStrSum
+    // to a function expecting one, and should be tailored to them being "a" jump op
+    for (int n=0; n<numJumps; n++) {
+        validate_pauliStrSumFields(jumps[n], caller);
+        validate_pauliStrSumTargets(jumps[n], qureg, caller);
+    }
+
+    // separate validation checks whether there is sufficient memory to translate 
+    // all jump operators into terms of a super-propagator (and guards overflow)
+}
+
+void validate_lindbladDampingRates(qreal* damps, int numJumps, const char* caller) {
+
+    // possibly repeated from jump op validation, for safety
+    assertThat(numJumps >= 0, report::NEGATIVE_NUM_LINDBLAD_JUMP_OPS, caller);
+
+    if (isNumericalValidationDisabled())
+        return;
+
+    // in lieu of asserting positivity, we somewhat unusually permit small negative
+    // damping rates just for consistency with other numerical validation tolerances
+    for (int n=0; n<numJumps; n++)
+        assertThat(damps[n] >= - global_validationEpsilon, report::NEGATIVE_LINDBLAD_DAMPING_RATE, caller);
+}
+
+void validate_numLindbladSuperPropagatorTerms(qindex numSuperTerms, const char* caller) {
+
+    // a term count of zero is rejected and reported as an overflow of the count
+    assertThat(numSuperTerms != 0, report::NUM_LINDBLAD_SUPER_PROPAGATOR_TERMS_OVERFLOWED, caller);
+
+    // attempt to fetch RAM, and simply return if we fail; if we unknowingly
+    // didn't have enough RAM, then alloc validation will trigger later
+    size_t memPerNode = 0;
+    try {
+        memPerNode = mem_tryGetLocalRamCapacityInBytes();
+    } catch(const mem::COULD_NOT_QUERY_RAM&) {
+        return;
+    }
+
+    // check whether the superpropagator fits in memory
+    bool fits = mem_canPauliStrSumFitInMemory(numSuperTerms, memPerNode);
+    assertThat(fits, report::NEW_LINDBLAD_SUPER_PROPAGATOR_CANNOT_FIT_INTO_CPU_MEM, {{"${NUM_TERMS}", numSuperTerms}, {"${NUM_BYTES}", memPerNode}}, caller);
+}
+
+
+
 /*
  * CHANNEL PARAMETERS 
  */
diff --git a/quest/src/core/validation.hpp b/quest/src/core/validation.hpp
index 92baac843..e15bca4e9 100644
--- a/quest/src/core/validation.hpp
+++ b/quest/src/core/validation.hpp
@@ -414,10 +414,28 @@ void validate_rotationAxisNotZeroVector(qreal x, qreal y, qreal z, const char* c
 
 void validate_mixedAmpsFitInNode(Qureg qureg, int numTargets, const char* caller);
 
+
+
+/*
+ * TROTTERISATION PARAMETERS
+ */
+
 void validate_trotterParams(Qureg qureg, int order, int reps, const char* caller);
 
 
 
+/*
+ * TIME EVOLUTION PARAMETERS
+ */
+
+void validate_lindbladJumpOps(PauliStrSum* jumps, int numJumps, Qureg qureg, const char* caller);
+
+void validate_lindbladDampingRates(qreal* damps, int numJumps, const char* caller);
+
+void validate_numLindbladSuperPropagatorTerms(qindex numSuperTerms, const char* caller);
+
+
+
 /*
  * DECOHERENCE 
  */
diff --git a/tests/unit/paulis.cpp b/tests/unit/paulis.cpp
index 255d30c4b..8b6fc4f90 100644
--- a/tests/unit/paulis.cpp
+++ b/tests/unit/paulis.cpp
@@ -328,6 +328,12 @@ TEST_CASE( "createPauliStrSum", TEST_CATEGORY ) {
             REQUIRE_THROWS_WITH( createPauliStrSum(nullptr, nullptr, numTerms), ContainsSubstring("number of terms must be a positive integer") );
         }
 
+        SECTION( "exceeds memory" ) {
+
+            // can choose even a number of terms so large that its size (in bytes) overflows
+            REQUIRE_THROWS_WITH( createPauliStrSum(nullptr, nullptr, 1LL << 60), ContainsSubstring("cannot fit in the available RAM") );
+        }
+
         SECTION( "mismatching lengths" ) {
 
             // specific to the C++ interface
diff --git a/tests/unit/trotterisation.cpp b/tests/unit/trotterisation.cpp
index 174b7c66e..bc2c7aab6 100644
--- a/tests/unit/trotterisation.cpp
+++ b/tests/unit/trotterisation.cpp
@@ -34,3 +34,9 @@ void applyControlledTrotterizedPauliStrSumGadget(Qureg qureg, int control, Pauli
 void applyMultiControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls, int numControls, PauliStrSum sum, qreal angle, int order, int reps);
 
 void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls, int* states, int numControls, PauliStrSum sum, qreal angle, int order, int reps);
+
+void applyTrotterizedUnitaryTimeEvolution(Qureg qureg, PauliStrSum hamil, qreal time, int order, int reps);
+
+void applyTrotterizedImaginaryTimeEvolution(Qureg qureg, PauliStrSum hamil, qreal tau, int order, int reps);
+
+void applyTrotterizedNoisyTimeEvolution(Qureg qureg, PauliStrSum hamil, qreal* damps, PauliStrSum* jumps, int numJumps, qreal time, int order, int reps);
diff --git a/utils/docs/Doxyfile b/utils/docs/Doxyfile
index daa1f0143..1d1086fe3 100644
--- a/utils/docs/Doxyfile
+++ b/utils/docs/Doxyfile
@@ -1927,7 +1927,10 @@ MATHJAX_RELPATH        =
 # MATHJAX_EXTENSIONS = ams
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
-MATHJAX_EXTENSIONS     =
+### Tyson:
+### these were added to attemptedly fix \mathcal rendering
+### but they did nothing, alas! I retain for posterity
+MATHJAX_EXTENSIONS     = TeX/AMSmath TeX/AMSsymbols
 
 # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
 # of code that will be used on startup of the MathJax code. See the MathJax site
@@ -2101,7 +2104,10 @@ PAPER_TYPE             = a4
 # If left blank no extra packages will be included.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
-EXTRA_PACKAGES         = times, amsmath, xr, amsfonts, tikz
+### Tyson:
+### \mathcal{L} does not render despite amssymb appearing below
+### (and MathJax given the AMSsymbols extension elsewhere), grr!
+EXTRA_PACKAGES         = xr times amsmath amssymb amsfonts tikz
 
 # The LATEX_HEADER tag can be used to specify a user-defined LaTeX header for
 # the generated LaTeX document. The header should contain everything until the

From 0ecad9ca9f8d18728907b176c84a1cb513e2a4d7 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Wed, 20 Aug 2025 21:13:00 +0200
Subject: [PATCH 19/32] separated pauli logic from paulis.cpp

since paulis.cpp is an "API" file while the pauli logic previously therein was used by numerous core files.

A final inelegance remains; some unit test utilities leverage the internal pauli logic functions using extern, which will break when we eventually switch to private namespacing. Alas!
---
 quest/src/api/matrices.cpp       |   4 +-
 quest/src/api/multiplication.cpp |  11 +-
 quest/src/api/operations.cpp     |   7 +-
 quest/src/api/paulis.cpp         | 393 +------------------------------
 quest/src/api/trotterisation.cpp |   9 +-
 quest/src/core/CMakeLists.txt    |   1 +
 quest/src/core/localiser.cpp     |   8 +-
 quest/src/core/paulilogic.cpp    | 388 ++++++++++++++++++++++++++++++
 quest/src/core/paulilogic.hpp    |  91 +++++++
 quest/src/core/printer.cpp       |   7 +-
 quest/src/core/validation.cpp    |   8 +-
 tests/unit/operations.cpp        |   8 +-
 tests/utils/convert.cpp          |  15 +-
 13 files changed, 509 insertions(+), 441 deletions(-)
 create mode 100644 quest/src/core/paulilogic.cpp
 create mode 100644 quest/src/core/paulilogic.hpp

diff --git a/quest/src/api/matrices.cpp b/quest/src/api/matrices.cpp
index 019b70f79..2c817253c 100644
--- a/quest/src/api/matrices.cpp
+++ b/quest/src/api/matrices.cpp
@@ -16,6 +16,7 @@
 #include "quest/src/core/autodeployer.hpp"
 #include "quest/src/core/utilities.hpp"
 #include "quest/src/core/localiser.hpp"
+#include "quest/src/core/paulilogic.hpp"
 #include "quest/src/core/printer.hpp"
 #include "quest/src/core/bitwise.hpp"
 #include "quest/src/core/fastmath.hpp"
@@ -629,9 +630,6 @@ extern "C" {
  */
 
 
-extern int paulis_getIndOfLefmostNonIdentityPauli(PauliStrSum sum);
-
-
 extern "C" void setFullStateDiagMatrFromPauliStrSum(FullStateDiagMatr out, PauliStrSum in) {
     validate_matrixFields(out, __func__);
     validate_pauliStrSumFields(in, __func__);
diff --git a/quest/src/api/multiplication.cpp b/quest/src/api/multiplication.cpp
index c4b508e0c..460a45895 100644
--- a/quest/src/api/multiplication.cpp
+++ b/quest/src/api/multiplication.cpp
@@ -14,6 +14,7 @@
 #include "quest/src/core/validation.hpp"
 #include "quest/src/core/utilities.hpp"
 #include "quest/src/core/localiser.hpp"
+#include "quest/src/core/paulilogic.hpp"
 
 #include <vector>
 
@@ -370,8 +371,6 @@ void rightapplySwap(Qureg qureg, int qubit1, int qubit2) {
  * individual Paulis
  */
 
-extern PauliStr paulis_getShiftedPauliStr(PauliStr str, int pauliShift);
-
 extern "C" {
 
 void leftapplyPauliX(Qureg qureg, int target) {
@@ -437,8 +436,6 @@ void rightapplyPauliZ(Qureg qureg, int target) {
  * Pauli strings
  */
 
-extern bool paulis_hasOddNumY(PauliStr str);
-
 extern "C" {
 
 void leftapplyPauliStr(Qureg qureg, PauliStr str) {
@@ -453,7 +450,7 @@ void rightapplyPauliStr(Qureg qureg, PauliStr str) {
     validate_quregIsDensityMatrix(qureg, __func__);
     validate_pauliStrTargets(qureg, str, __func__);
 
-    qcomp factor = paulis_hasOddNumY(str)? -1 : 1; // undo transpose
+    qcomp factor = paulis_getSignOfPauliStrConj(str); // undo transpose
     str = paulis_getShiftedPauliStr(str, qureg.numQubits);
     localiser_statevec_anyCtrlPauliTensor(qureg, {}, {}, str, factor);
 }
@@ -481,7 +478,7 @@ void rightapplyPauliGadget(Qureg qureg, PauliStr str, qreal angle) {
     validate_quregIsDensityMatrix(qureg, __func__);
     validate_pauliStrTargets(qureg, str, __func__);
 
-    qreal factor = paulis_hasOddNumY(str)? -1 : 1;
+    qreal factor = paulis_getSignOfPauliStrConj(str);
     qreal phase = factor * util_getPhaseFromGateAngle(angle);
     str = paulis_getShiftedPauliStr(str, qureg.numQubits);
     localiser_statevec_anyCtrlPauliGadget(qureg, {}, {}, str, phase);
@@ -675,7 +672,7 @@ void rightapplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace) {
     // post-multiply each term in-turn, mixing into output qureg, then undo using idempotency
     for (qindex i=0; i<sum.numTerms; i++) {
         PauliStr str =  paulis_getShiftedPauliStr(sum.strings[i], qureg.numQubits);
-        qcomp factor = paulis_hasOddNumY(str)? -1 : 1; // undoes transpose
+        qcomp factor = paulis_getSignOfPauliStrConj(str); // undoes transpose
 
         localiser_statevec_anyCtrlPauliTensor(workspace, {}, {}, str, factor);
         localiser_statevec_setQuregToSuperposition(1, qureg, sum.coeffs[i], workspace, 0, workspace);
diff --git a/quest/src/api/operations.cpp b/quest/src/api/operations.cpp
index 5c415fbc5..a2b09d84e 100644
--- a/quest/src/api/operations.cpp
+++ b/quest/src/api/operations.cpp
@@ -19,6 +19,7 @@
 #include "quest/src/core/localiser.hpp"
 #include "quest/src/core/bitwise.hpp"
 #include "quest/src/core/constants.hpp"
+#include "quest/src/core/paulilogic.hpp"
 
 #include <vector>
 
@@ -30,12 +31,6 @@ using std::vector;
  * PRVIATE UTILITIES
  */
 
-extern int paulis_getSignOfPauliStrConj(PauliStr str);
-
-extern bool paulis_isIdentity(PauliStr str);
-extern PauliStr paulis_getShiftedPauliStr(PauliStr str, int pauliShift);
-extern PauliStr paulis_getKetAndBraPauliStr(PauliStr str, Qureg qureg);
-
 // T can be CompMatr, CompMatr1, CompMatr2, DiagMatr, DiagMatr1, DiagMatr2
 template <class T>
 void validateAndApplyAnyCtrlAnyTargUnitaryMatrix(Qureg qureg, int* ctrls, int* states, int numCtrls, int* targs, int numTargs, T matr, const char* caller) {
diff --git a/quest/src/api/paulis.cpp b/quest/src/api/paulis.cpp
index b07c065fd..5a2a122c2 100644
--- a/quest/src/api/paulis.cpp
+++ b/quest/src/api/paulis.cpp
@@ -8,36 +8,21 @@
 #include "quest/include/precision.h"
 #include "quest/include/paulis.h"
 
+#include "quest/src/core/paulilogic.hpp"
 #include "quest/src/core/validation.hpp"
-#include "quest/src/core/printer.hpp"
 #include "quest/src/core/utilities.hpp"
 #include "quest/src/core/parser.hpp"
+#include "quest/src/core/printer.hpp"
 #include "quest/src/core/memory.hpp"
-#include "quest/src/core/errors.hpp"
-#include "quest/src/core/bitwise.hpp"
-#include "quest/src/cpu/cpu_config.hpp"
 #include "quest/src/comm/comm_config.hpp"
 #include "quest/src/comm/comm_routines.hpp"
+#include "quest/src/cpu/cpu_config.hpp"
 
-#include <iostream>
-#include <utility>
 #include <vector>
 #include <string>
-#include <array>
 
 using std::string;
 using std::vector;
-using std::array;
-
-
-
-/*
- * PRIVATE CONSTANTS
- */
-
-
-static const int MAX_NUM_PAULIS_PER_MASK = sizeof(PAULI_MASK_TYPE) * 8 / 2;
-static const int MAX_NUM_PAULIS_PER_STR  = MAX_NUM_PAULIS_PER_MASK * 2;
 
 
 
@@ -46,12 +31,6 @@ static const int MAX_NUM_PAULIS_PER_STR  = MAX_NUM_PAULIS_PER_MASK * 2;
  */
 
 
-int getPauliFromMaskAt(PAULI_MASK_TYPE mask, int ind) {
-
-    return getTwoAdjacentBits(mask, 2*ind); // bits at (ind+1, ind)
-}
-
-
 bool didAnyAllocsFailOnAnyNode(PauliStrSum sum) {
 
     bool anyFail = (
@@ -87,372 +66,6 @@ void freeAllMemoryIfAnyAllocsFailed(PauliStrSum sum) {
 
 
 
-/*
- * INTERNAL PauliStr UTILITIES
- *
- * callable by other internal files but which are not exposed in the header
- * because we do not wish to make them visible to users. Ergo other internal
- * files must declare these functions as extern where needed. Yes, it's ugly :(
- */
-
-
-bool paulis_isIdentity(PauliStr str) {
-
-    return 
-        (str.lowPaulis  == 0) && 
-        (str.highPaulis == 0);
-}
-
-
-int paulis_getPauliAt(PauliStr str, int ind) {
-
-    return (ind < MAX_NUM_PAULIS_PER_MASK)?
-        getPauliFromMaskAt(str.lowPaulis,  ind) :
-        getPauliFromMaskAt(str.highPaulis, ind - MAX_NUM_PAULIS_PER_MASK);
-}
-
-
-int paulis_getIndOfLefmostNonIdentityPauli(PauliStr str) {
-
-    int ind   = (str.highPaulis == 0)? 0 : MAX_NUM_PAULIS_PER_MASK;
-    auto mask = (str.highPaulis == 0)? str.lowPaulis : str.highPaulis;
-
-    while (mask) {
-        mask >>= 2;
-        ind++;
-    }
-
-    return ind - 1;
-}
-
-
-int paulis_getIndOfLefmostNonIdentityPauli(PauliStr* strings, qindex numStrings) {
-
-    int maxInd = 0;
-
-    for (qindex i=0; i<numStrings; i++) {
-        int ind = paulis_getIndOfLefmostNonIdentityPauli(strings[i]);
-        if (ind > maxInd)
-            maxInd = ind;
-    }
-
-    return maxInd;
-}
-
-
-bool paulis_containsXOrY(PauliStr str) {
-
-    int maxInd = paulis_getIndOfLefmostNonIdentityPauli(str);
-
-    for (int i=0; i<=maxInd; i++) {
-        int pauli = paulis_getPauliAt(str, i);
-
-        if (pauli == 1 || pauli == 2)
-            return true;
-    }
-
-    return false;
-}
-
-
-bool paulis_hasOddNumY(PauliStr str) {
-
-    bool odd = false;
-
-    for (int targ=0; targ < MAX_NUM_PAULIS_PER_STR; targ++) 
-        if (paulis_getPauliAt(str, targ) == 2)
-            odd = !odd;
-
-    return odd;
-}
-
-
-int paulis_getSignOfPauliStrConj(PauliStr str) {
-
-    // conj(Y) = -Y, conj(YY) = YY
-    return paulis_hasOddNumY(str)? -1 : 1;
-}
-
-
-int paulis_getPrefixZSign(Qureg qureg, vector<int> prefixZ) {
-
-    int sign = 1;
-
-    // each Z contributes +- 1
-    for (int qubit : prefixZ)
-        sign *= util_getRankBitOfQubit(qubit, qureg)? -1 : 1;
-
-    return sign;
-}
-
-
-qcomp paulis_getPrefixPaulisElem(Qureg qureg, vector<int> prefixY, vector<int> prefixZ) {
-
-    // each Z contributes +- 1
-    qcomp elem = paulis_getPrefixZSign(qureg, prefixZ);
-
-    // each Y contributes -+ i
-    for (int qubit : prefixY)
-        elem *= 1_i * (util_getRankBitOfQubit(qubit, qureg)? 1 : -1);
-
-    return elem;
-}
-
-
-vector<int> paulis_getTargetInds(PauliStr str) {
-
-    int maxInd = paulis_getIndOfLefmostNonIdentityPauli(str);
-
-    vector<int> inds(0);
-    inds.reserve(maxInd+1);
-
-    for (int i=0; i<=maxInd; i++)
-        if (paulis_getPauliAt(str, i) != 0) // Id
-            inds.push_back(i);
-
-    return inds;
-}
-
-
-qindex paulis_getTargetBitMask(PauliStr str) {
-    
-    /// @todo 
-    /// would compile-time MAX_NUM_PAULIS_PER_STR bound be faster here,
-    /// since this function is invoked upon every PauliStrSum element?
-    int maxInd = paulis_getIndOfLefmostNonIdentityPauli(str);
-
-    qindex mask = 0;
-
-    for (int i=0; i<=maxInd; i++)
-        if (paulis_getPauliAt(str, i) != 0) // Id
-            mask = flipBit(mask, i);
-
-    return mask;
-}
-
-
-array<vector<int>,3> paulis_getSeparateInds(PauliStr str) {
-
-    vector<int> iXYZ = paulis_getTargetInds(str);
-    vector<int> iX, iY, iZ;
-
-    vector<int>* ptrs[] = {&iX, &iY, &iZ};
-
-    for (int i : iXYZ)
-        ptrs[paulis_getPauliAt(str, i) - 1]->push_back(i);
-
-    return {iX, iY, iZ};
-}
-
-
-PauliStr paulis_getShiftedPauliStr(PauliStr str, int pauliShift) {
-
-    if (pauliShift <= 0 || pauliShift >= MAX_NUM_PAULIS_PER_MASK)
-        error_pauliStrShiftedByIllegalAmount();
-
-    int numBitsPerPauli = 2;
-    int numMaskBits = numBitsPerPauli * MAX_NUM_PAULIS_PER_MASK;
-    int bitShift    = numBitsPerPauli * pauliShift;
-
-    // record the bits we will lose from lowPaulis, to move to highPaulis
-    PAULI_MASK_TYPE lostBits = getBitsLeftOfIndex(str.lowPaulis, numMaskBits - bitShift - 1);
-
-    // ensure we actually lose these bits from lowPaulis
-    PAULI_MASK_TYPE lowerBits = getBitsRightOfIndex(str.lowPaulis, numMaskBits - bitShift) << bitShift;
-
-    // and add them to highPaulis; we don't have to force lose upper bits of high paulis
-    PAULI_MASK_TYPE upperBits = concatenateBits(str.highPaulis, lostBits, bitShift);
-
-    // return a new stack PauliStr instance (avoiding C++20 initialiser)
-    PauliStr out;
-    out.lowPaulis = lowerBits;
-    out.highPaulis = upperBits;
-    return out;
-}
-
-
-PauliStr paulis_getTensorProdOfPauliStr(PauliStr left, PauliStr right, int numQubits) {
-
-    // computes left (tensor) right, assuming right is smaller than numQubits
-    PauliStr shifted = paulis_getShiftedPauliStr(left, numQubits);
-
-    // return a new stack PauliStr instance (avoiding C++20 initialiser)
-    PauliStr out;
-    out.lowPaulis  = right.lowPaulis  | shifted.lowPaulis;
-    out.highPaulis = right.highPaulis | shifted.highPaulis;
-    return out;
-}
-
-
-PauliStr paulis_getKetAndBraPauliStr(PauliStr str, Qureg qureg) {
-
-    return paulis_getTensorProdOfPauliStr(str, str, qureg.numQubits);
-}
-
-
-PAULI_MASK_TYPE paulis_getKeyOfSameMixedAmpsGroup(PauliStr str) {
-
-    PAULI_MASK_TYPE key = 0;
-
-    // in theory, we can reduce the number of involved operations by bit-shifting
-    // str left by 1, XOR'ing this with str, and retaining every 2nd bit, producing
-    // e.g. key=0110 from str=IXYZ. However, this is an insignificant speedup which
-    // risks sneaky bugs related to handling str's two masks.
-
-    int maxInd = paulis_getIndOfLefmostNonIdentityPauli(str);
-
-    for (int i=0; i<=maxInd; i++) {
-        int pauli = paulis_getPauliAt(str, i);
-        int isXY = (pauli == 1 || pauli == 2);
-        key |= (isXY << i);
-    }
-
-    return key;
-}
-
-
-std::pair<qcomp,PauliStr> paulis_getPauliStrProd(PauliStr strA, PauliStr strB) {
-
-    // a . b = coeff * (a ^ b)
-    PauliStr strOut;
-    strOut.lowPaulis  = strA.lowPaulis  ^ strB.lowPaulis;
-    strOut.highPaulis = strA.highPaulis ^ strB.highPaulis;
-
-    // coeff = product of single-site product coeffs
-    qcomp coeff = 1;
-    for (int i=0; i<MAX_NUM_PAULIS_PER_STR; i++) {
-        int pA = paulis_getPauliAt(strA, i);
-        int pB = paulis_getPauliAt(strB, i);
-        
-        // I.P = P.I = P and P.P = I contribute factor=1
-        if (pA == 0 || pB == 0 || pA == pB)
-            continue;
-
-        // XY,YZ,ZX=i, XZ,YX,ZY=-i
-        int dif = pB - pA;
-        coeff *= qcomp(0, (dif == 1 || dif == -2)? 1 : -1);
-    }
-    
-    return {coeff, strOut};
-}
-
-
-
-/*
- * INTERNAL PauliStrSum UTILITIES
- */
-
-
-int paulis_getIndOfLefmostNonIdentityPauli(PauliStrSum sum) {
-
-    return paulis_getIndOfLefmostNonIdentityPauli(sum.strings, sum.numTerms);
-}
-
-
-bool paulis_containsXOrY(PauliStrSum sum) {
-
-    for (qindex i=0; i<sum.numTerms; i++)
-        if (paulis_containsXOrY(sum.strings[i]))
-            return true;
-
-    return false;
-}
-
-
-qindex paulis_getTargetBitMask(PauliStrSum sum) {
-
-    qindex mask = 0;
-
-    // mask has 1 where any str has a != Id
-    for (int t=0; t<sum.numTerms; t++)
-        mask |= paulis_getTargetBitMask(sum.strings[t]);
-
-    return mask;
-}
-
-
-void paulis_setPauliStrSumToScaledTensorProdOfConjWithSelf(PauliStrSum out, qreal factor, PauliStrSum in, int numQubits) {
-
-    // sets out = factor * conj(in) (x) in, where in has dim of numQubits
-    if (paulis_getIndOfLefmostNonIdentityPauli(in) >= numQubits)
-        error_pauliStrSumHasMoreQubitsThanSpecifiedInTensorProd();
-    if (out.numTerms != in.numTerms * in.numTerms)
-        error_pauliStrSumTensorProdHasIncorrectNumTerms();
-
-    // conj(in) (x) in = sum_jk conj(c_j) c_k conj(P_j) (x) P_k...
-    qindex i = 0;
-    for (qindex j=0; j<in.numTerms; j++) {
-        for (qindex k=0; k<in.numTerms; k++) {
-
-            // ... where conj(P_j) = sign_j P_j
-            out.strings[i] = paulis_getTensorProdOfPauliStr(in.strings[j], in.strings[k], numQubits);
-            out.coeffs[i] = factor * std::conj(in.coeffs[j]) * in.coeffs[k] * paulis_getSignOfPauliStrConj(in.strings[j]);
-            i++;
-        }
-    }
-}
-
-
-qindex paulis_getNumTermsInPauliStrSumProdOfAdjointWithSelf(PauliStrSum in) {
-
-    // adj(in).in has fewer terms than the numTerms^2 bound, since 
-    // a.a = I (causing -n and +1 below) and a.b ~ b.a (causing /2);
-    // we do not however consider any cancellations of coefficients
-    int n = in.numTerms;
-    return 1 + (n*n - n)/2;
-}
-
-
-void paulis_setPauliStrSumToScaledProdOfAdjointWithSelf(PauliStrSum out, qreal factor, PauliStrSum in) {
-
-    // sets out = factor * adj(in) . in, permitting duplicate strings
-    if (out.numTerms != paulis_getNumTermsInPauliStrSumProdOfAdjointWithSelf(in))
-        error_pauliStrSumProdHasIncorrectNumTerms();
-
-    // since out definitely contains an identity (when neglecting coeff cancellation)
-    // which is contributed toward by all j=k iterations below, we keep it at i=0
-    out.strings[0] = getPauliStr("I");
-    out.coeffs[0] = 0;
-    qindex i = 1;
-
-    // we leverage that sum_jk a_j^* a_k P_j P_k...
-    for (qindex j=0; j<in.numTerms; j++) {
-
-        // = sum_j ( |a_j|^2 Id + sum_k<j ...)
-        out.coeffs[0] += factor * std::norm(in.coeffs[j]);
-
-        // containing sum_k<j (a_j^* a_k P_j P_k + a_k^* a_j P_k P_j)
-        for (qindex k=0; k<j; k++) {
-
-            // = (a_j^* a_k b_jk + a_k^* a_j b_jk^*) P'
-            auto [coeff, str] = paulis_getPauliStrProd(in.strings[j], in.strings[k]);
-
-            // = (x + x^*) P' = 2 Re[x] P'
-            out.strings[i] = str;
-            out.coeffs[i] = factor * 2 * std::real(std::conj(in.coeffs[j]) * in.coeffs[k] * coeff);
-            i++;
-        }
-    }
-}
-
-
-void paulis_setPauliStrSumToShiftedConj(PauliStrSum out, PauliStrSum in, int numQubits) {
-
-    // sets out = conj(in) (x) I
-    if (paulis_getIndOfLefmostNonIdentityPauli(in) >= numQubits)
-        error_pauliStrSumHasMoreQubitsThanSpecifiedInConjShift();
-    if (out.numTerms != in.numTerms)
-        error_pauliStrSumConjHasIncorrectNumTerms();
-
-    // where conj(c P) = conj(c) sign P
-    for (qindex i=0; i<out.numTerms; i++) {
-        out.strings[i] = paulis_getShiftedPauliStr(in.strings[i], numQubits);
-        out.coeffs[i] = std::conj(in.coeffs[i]) * paulis_getSignOfPauliStrConj(in.strings[i]);
-    }
-}
-
-
-
 /*
  * PAULI STRING INITIALISATION
  *
diff --git a/quest/src/api/trotterisation.cpp b/quest/src/api/trotterisation.cpp
index 760432ee7..fd371b723 100644
--- a/quest/src/api/trotterisation.cpp
+++ b/quest/src/api/trotterisation.cpp
@@ -13,6 +13,7 @@
 #include "quest/src/core/validation.hpp"
 #include "quest/src/core/utilities.hpp"
 #include "quest/src/core/localiser.hpp"
+#include "quest/src/core/paulilogic.hpp"
 #include "quest/src/core/errors.hpp"
 
 #include <vector>
@@ -25,13 +26,6 @@ using std::vector;
  * INTERNAL UTILS
  */
 
-extern int paulis_getSignOfPauliStrConj(PauliStr str);
-extern PauliStr paulis_getShiftedPauliStr(PauliStr str, int pauliShift);
-extern void paulis_setPauliStrSumToScaledTensorProdOfConjWithSelf(PauliStrSum out, qreal factor, PauliStrSum in, int numQubits);
-extern void paulis_setPauliStrSumToScaledProdOfAdjointWithSelf(PauliStrSum out, qreal factor, PauliStrSum in);
-extern void paulis_setPauliStrSumToShiftedConj(PauliStrSum out, PauliStrSum in, int numQubits);
-extern qindex paulis_getNumTermsInPauliStrSumProdOfAdjointWithSelf(PauliStrSum in);
-
 void internal_applyFirstOrderTrotterRepetition(
     Qureg qureg, vector<int>& ketCtrls, vector<int>& braCtrls,
     vector<int>& states, PauliStrSum sum, qcomp angle, bool onlyLeftApply, bool reverse
@@ -356,4 +350,3 @@ void applyTrotterizedNoisyTimeEvolution(Qureg qureg, PauliStrSum hamil, qreal* d
 }
 
 } // end de-mangler
-
diff --git a/quest/src/core/CMakeLists.txt b/quest/src/core/CMakeLists.txt
index 9d11d16d7..9dd703b30 100644
--- a/quest/src/core/CMakeLists.txt
+++ b/quest/src/core/CMakeLists.txt
@@ -9,6 +9,7 @@ target_sources(QuEST
   localiser.cpp
   memory.cpp
   parser.cpp
+  paulilogic.cpp
   printer.cpp
   randomiser.cpp
   utilities.cpp
diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp
index 5b107e763..0d826bf44 100644
--- a/quest/src/core/localiser.cpp
+++ b/quest/src/core/localiser.cpp
@@ -19,6 +19,7 @@
 #include "quest/src/core/errors.hpp"
 #include "quest/src/core/bitwise.hpp"
 #include "quest/src/core/utilities.hpp"
+#include "quest/src/core/paulilogic.hpp"
 #include "quest/src/core/localiser.hpp"
 #include "quest/src/core/accelerator.hpp"
 #include "quest/src/comm/comm_config.hpp"
@@ -1248,13 +1249,6 @@ template void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg, vector<int>, vecto
  */
 
 
-extern bool paulis_containsXOrY(PauliStr str);
-extern vector<int> paulis_getTargetInds(PauliStr str);
-extern std::array<vector<int>,3> paulis_getSeparateInds(PauliStr str);
-extern int paulis_getPrefixZSign(Qureg qureg, vector<int> prefixZ) ;
-extern qcomp paulis_getPrefixPaulisElem(Qureg qureg, vector<int> prefixY, vector<int> prefixZ);
-
-
 void anyCtrlZTensorOrGadget(Qureg qureg, vector<int> ctrls, vector<int> ctrlStates, vector<int> targs, bool isGadget, qcomp phase) {     
     assertValidCtrlStates(ctrls, ctrlStates);
     setDefaultCtrlStates(ctrls, ctrlStates);
diff --git a/quest/src/core/paulilogic.cpp b/quest/src/core/paulilogic.cpp
new file mode 100644
index 000000000..1a90dce2f
--- /dev/null
+++ b/quest/src/core/paulilogic.cpp
@@ -0,0 +1,388 @@
+/** @file
+ * Internal functions which process Pauli strings
+ * and their weighted sums
+ * 
+ * @author Tyson Jones
+ */
+
+#include "quest/include/paulis.h"
+#include "quest/include/qureg.h"
+
+#include "quest/src/core/paulilogic.hpp"
+#include "quest/src/core/utilities.hpp"
+#include "quest/src/core/bitwise.hpp"
+#include "quest/src/core/errors.hpp"
+
+#include <utility>
+#include <vector>
+#include <array>
+
+using std::vector;
+
+
+
+/*
+ * PRIVATE UTILITIES
+ */
+
+
+int getPauliFromMaskAt(PAULI_MASK_TYPE mask, int ind) {
+    // each Pauli occupies two bits of the mask, so the Pauli at index ind begins at bit 2*ind
+    return getTwoAdjacentBits(mask, 2*ind); // presumably bits (2*ind+1, 2*ind) - confirm in bitwise.hpp
+}
+
+
+
+/*
+ * PauliStr
+ */
+
+
+bool paulis_isIdentity(PauliStr str) {
+
+    return 
+        (str.lowPaulis  == 0) && 
+        (str.highPaulis == 0);
+}
+
+
+int paulis_getPauliAt(PauliStr str, int ind) {
+
+    return (ind < MAX_NUM_PAULIS_PER_MASK)?
+        getPauliFromMaskAt(str.lowPaulis,  ind) :
+        getPauliFromMaskAt(str.highPaulis, ind - MAX_NUM_PAULIS_PER_MASK);
+}
+
+
+int paulis_getIndOfLefmostNonIdentityPauli(PauliStr str) {
+
+    int ind   = (str.highPaulis == 0)? 0 : MAX_NUM_PAULIS_PER_MASK;
+    auto mask = (str.highPaulis == 0)? str.lowPaulis : str.highPaulis;
+
+    while (mask) {
+        mask >>= 2;
+        ind++;
+    }
+
+    return ind - 1;
+}
+
+
+int paulis_getIndOfLefmostNonIdentityPauli(PauliStr* strings, qindex numStrings) {
+
+    int maxInd = 0;
+
+    for (qindex i=0; i<numStrings; i++) {
+        int ind = paulis_getIndOfLefmostNonIdentityPauli(strings[i]);
+        if (ind > maxInd)
+            maxInd = ind;
+    }
+
+    return maxInd;
+}
+
+
+bool paulis_containsXOrY(PauliStr str) {
+
+    int maxInd = paulis_getIndOfLefmostNonIdentityPauli(str);
+
+    for (int i=0; i<=maxInd; i++) {
+        int pauli = paulis_getPauliAt(str, i);
+
+        if (pauli == 1 || pauli == 2)
+            return true;
+    }
+
+    return false;
+}
+
+
+int paulis_getSignOfPauliStrConj(PauliStr str) {
+
+    // determine parity of Y count in str
+    bool odd = false;
+    for (int targ=0; targ < MAX_NUM_PAULIS_PER_STR; targ++) 
+        if (paulis_getPauliAt(str, targ) == 2)
+            odd = !odd;
+
+    // conj(Y) = -Y, conj(YY) = YY
+    return odd? -1 : 1;
+}
+
+
+int paulis_getPrefixZSign(Qureg qureg, vector<int> prefixZ) {
+
+    int sign = 1;
+
+    // each Z contributes +- 1
+    for (int qubit : prefixZ)
+        sign *= util_getRankBitOfQubit(qubit, qureg)? -1 : 1;
+
+    return sign;
+}
+
+
+qcomp paulis_getPrefixPaulisElem(Qureg qureg, vector<int> prefixY, vector<int> prefixZ) {
+
+    // each Z contributes +- 1
+    qcomp elem = paulis_getPrefixZSign(qureg, prefixZ);
+
+    // each Y contributes -+ i
+    for (int qubit : prefixY)
+        elem *= 1_i * (util_getRankBitOfQubit(qubit, qureg)? 1 : -1);
+
+    return elem;
+}
+
+
+vector<int> paulis_getTargetInds(PauliStr str) {
+
+    int maxInd = paulis_getIndOfLefmostNonIdentityPauli(str);
+
+    vector<int> inds(0);
+    inds.reserve(maxInd+1);
+
+    for (int i=0; i<=maxInd; i++)
+        if (paulis_getPauliAt(str, i) != 0) // Id
+            inds.push_back(i);
+
+    return inds;
+}
+
+
+qindex paulis_getTargetBitMask(PauliStr str) {
+    
+    /// @todo 
+    /// would compile-time MAX_NUM_PAULIS_PER_STR bound be faster here,
+    /// since this function is invoked upon every PauliStrSum element?
+    int maxInd = paulis_getIndOfLefmostNonIdentityPauli(str);
+
+    qindex mask = 0;
+
+    for (int i=0; i<=maxInd; i++)
+        if (paulis_getPauliAt(str, i) != 0) // Id
+            mask = flipBit(mask, i);
+
+    return mask;
+}
+
+
+std::array<vector<int>,3> paulis_getSeparateInds(PauliStr str) {
+
+    vector<int> iXYZ = paulis_getTargetInds(str);
+    vector<int> iX, iY, iZ;
+
+    vector<int>* ptrs[] = {&iX, &iY, &iZ};
+
+    for (int i : iXYZ)
+        ptrs[paulis_getPauliAt(str, i) - 1]->push_back(i);
+
+    return {iX, iY, iZ};
+}
+
+
+PauliStr paulis_getShiftedPauliStr(PauliStr str, int pauliShift) {
+
+    if (pauliShift <= 0 || pauliShift >= MAX_NUM_PAULIS_PER_MASK)
+        error_pauliStrShiftedByIllegalAmount();
+
+    int numBitsPerPauli = 2;
+    int numMaskBits = numBitsPerPauli * MAX_NUM_PAULIS_PER_MASK;
+    int bitShift    = numBitsPerPauli * pauliShift;
+
+    // record the bits we will lose from lowPaulis, to move to highPaulis
+    PAULI_MASK_TYPE lostBits = getBitsLeftOfIndex(str.lowPaulis, numMaskBits - bitShift - 1);
+
+    // ensure we actually lose these bits from lowPaulis
+    PAULI_MASK_TYPE lowerBits = getBitsRightOfIndex(str.lowPaulis, numMaskBits - bitShift) << bitShift;
+
+    // and add them to highPaulis; we don't have to force lose upper bits of high paulis
+    PAULI_MASK_TYPE upperBits = concatenateBits(str.highPaulis, lostBits, bitShift);
+
+    // return a new stack PauliStr instance (avoiding C++20 initialiser)
+    PauliStr out;
+    out.lowPaulis = lowerBits;
+    out.highPaulis = upperBits;
+    return out;
+}
+
+
+PauliStr paulis_getTensorProdOfPauliStr(PauliStr left, PauliStr right, int numQubits) {
+
+    // computes left (tensor) right, assuming right is smaller than numQubits
+    PauliStr shifted = paulis_getShiftedPauliStr(left, numQubits);
+
+    // return a new stack PauliStr instance (avoiding C++20 initialiser)
+    PauliStr out;
+    out.lowPaulis  = right.lowPaulis  | shifted.lowPaulis;
+    out.highPaulis = right.highPaulis | shifted.highPaulis;
+    return out;
+}
+
+
+PauliStr paulis_getKetAndBraPauliStr(PauliStr str, Qureg qureg) {
+
+    return paulis_getTensorProdOfPauliStr(str, str, qureg.numQubits);
+}
+
+
+PAULI_MASK_TYPE paulis_getKeyOfSameMixedAmpsGroup(PauliStr str) {
+    // returns a mask wherein bit i is set if and only if the Pauli at index i is X (1) or Y (2)
+    PAULI_MASK_TYPE key = 0;
+
+    // in theory, we can reduce the number of involved operations by bit-shifting
+    // str left by 1, XOR'ing this with str, and retaining every 2nd bit, producing
+    // e.g. key=0110 from str=IXYZ. However, this is an insignificant speedup which
+    // risks sneaky bugs related to handling str's two masks.
+
+    int maxInd = paulis_getIndOfLefmostNonIdentityPauli(str);
+    // the flag has the mask's type so that shifting by i >= 31 does not overflow an int
+    for (int i=0; i<=maxInd; i++) {
+        int pauli = paulis_getPauliAt(str, i);
+        PAULI_MASK_TYPE isXY = (pauli == 1 || pauli == 2);
+        key |= (isXY << i);
+    }
+
+    return key;
+}
+
+
+std::pair<qcomp,PauliStr> paulis_getPauliStrProd(PauliStr strA, PauliStr strB) {
+
+    // a . b = coeff * (a ^ b)
+    PauliStr strOut;
+    strOut.lowPaulis  = strA.lowPaulis  ^ strB.lowPaulis;
+    strOut.highPaulis = strA.highPaulis ^ strB.highPaulis;
+
+    // coeff = product of single-site product coeffs
+    qcomp coeff = 1;
+    for (int i=0; i<MAX_NUM_PAULIS_PER_STR; i++) {
+        int pA = paulis_getPauliAt(strA, i);
+        int pB = paulis_getPauliAt(strB, i);
+        
+        // I.P = P.I = P and P.P = I contribute factor=1
+        if (pA == 0 || pB == 0 || pA == pB)
+            continue;
+
+        // XY,YZ,ZX=i, XZ,YX,ZY=-i
+        int dif = pB - pA;
+        coeff *= qcomp(0, (dif == 1 || dif == -2)? 1 : -1);
+    }
+    
+    return {coeff, strOut};
+}
+
+
+
+/*
+ * PauliStrSum
+ */
+
+
+int paulis_getIndOfLefmostNonIdentityPauli(PauliStrSum sum) {
+
+    return paulis_getIndOfLefmostNonIdentityPauli(sum.strings, sum.numTerms);
+}
+
+
+bool paulis_containsXOrY(PauliStrSum sum) {
+
+    for (qindex i=0; i<sum.numTerms; i++)
+        if (paulis_containsXOrY(sum.strings[i]))
+            return true;
+
+    return false;
+}
+
+
+qindex paulis_getTargetBitMask(PauliStrSum sum) {
+    // returns the bitwise OR of the target masks of every string in sum
+    qindex mask = 0;
+
+    // mask has 1 where any str has a != Id (qindex counter matches the other numTerms loops)
+    for (qindex t=0; t<sum.numTerms; t++)
+        mask |= paulis_getTargetBitMask(sum.strings[t]);
+
+    return mask;
+}
+
+
+void paulis_setPauliStrSumToScaledTensorProdOfConjWithSelf(PauliStrSum out, qreal factor, PauliStrSum in, int numQubits) {
+
+    // sets out = factor * conj(in) (x) in, where in has dim of numQubits
+    if (paulis_getIndOfLefmostNonIdentityPauli(in) >= numQubits)
+        error_pauliStrSumHasMoreQubitsThanSpecifiedInTensorProd();
+    if (out.numTerms != in.numTerms * in.numTerms)
+        error_pauliStrSumTensorProdHasIncorrectNumTerms();
+
+    // conj(in) (x) in = sum_jk conj(c_j) c_k conj(P_j) (x) P_k...
+    qindex i = 0;
+    for (qindex j=0; j<in.numTerms; j++) {
+        for (qindex k=0; k<in.numTerms; k++) {
+
+            // ... where conj(P_j) = sign_j P_j
+            out.strings[i] = paulis_getTensorProdOfPauliStr(in.strings[j], in.strings[k], numQubits);
+            out.coeffs[i] = factor * std::conj(in.coeffs[j]) * in.coeffs[k] * paulis_getSignOfPauliStrConj(in.strings[j]);
+            i++;
+        }
+    }
+}
+
+
+qindex paulis_getNumTermsInPauliStrSumProdOfAdjointWithSelf(PauliStrSum in) {
+    // n is a qindex (like the returned count) so numTerms is not narrowed and n*n cannot overflow int
+    // adj(in).in has fewer terms than the numTerms^2 bound, since 
+    // a.a = I (causing -n and +1 below) and a.b ~ b.a (causing /2);
+    // we do not however consider any cancellations of coefficients
+    qindex n = in.numTerms;
+    return 1 + (n*n - n)/2;
+}
+
+
+void paulis_setPauliStrSumToScaledProdOfAdjointWithSelf(PauliStrSum out, qreal factor, PauliStrSum in) {
+
+    // sets out = factor * adj(in) . in, permitting duplicate strings
+    if (out.numTerms != paulis_getNumTermsInPauliStrSumProdOfAdjointWithSelf(in))
+        error_pauliStrSumProdHasIncorrectNumTerms();
+
+    // since out definitely contains an identity (when neglecting coeff cancellation)
+    // which is contributed toward by all j=k iterations below, we keep it at i=0
+    out.strings[0] = getPauliStr("I");
+    out.coeffs[0] = 0;
+    qindex i = 1;
+
+    // we leverage that sum_jk a_j^* a_k P_j P_k...
+    for (qindex j=0; j<in.numTerms; j++) {
+
+        // = sum_j ( |a_j|^2 Id + sum_k<j ...)
+        out.coeffs[0] += factor * std::norm(in.coeffs[j]);
+
+        // containing sum_k<j (a_j^* a_k P_j P_k + a_k^* a_j P_k P_j)
+        for (qindex k=0; k<j; k++) {
+
+            // = (a_j^* a_k b_jk + a_k^* a_j b_jk^*) P'
+            auto [coeff, str] = paulis_getPauliStrProd(in.strings[j], in.strings[k]);
+
+            // = (x + x^*) P' = 2 Re[x] P'
+            out.strings[i] = str;
+            out.coeffs[i] = factor * 2 * std::real(std::conj(in.coeffs[j]) * in.coeffs[k] * coeff);
+            i++;
+        }
+    }
+}
+
+
+void paulis_setPauliStrSumToShiftedConj(PauliStrSum out, PauliStrSum in, int numQubits) {
+
+    // sets out = conj(in) (x) I
+    if (paulis_getIndOfLefmostNonIdentityPauli(in) >= numQubits)
+        error_pauliStrSumHasMoreQubitsThanSpecifiedInConjShift();
+    if (out.numTerms != in.numTerms)
+        error_pauliStrSumConjHasIncorrectNumTerms();
+
+    // where conj(c P) = conj(c) sign P
+    for (qindex i=0; i<out.numTerms; i++) {
+        out.strings[i] = paulis_getShiftedPauliStr(in.strings[i], numQubits);
+        out.coeffs[i] = std::conj(in.coeffs[i]) * paulis_getSignOfPauliStrConj(in.strings[i]);
+    }
+}
diff --git a/quest/src/core/paulilogic.hpp b/quest/src/core/paulilogic.hpp
new file mode 100644
index 000000000..f3748b5e8
--- /dev/null
+++ b/quest/src/core/paulilogic.hpp
@@ -0,0 +1,91 @@
+/** @file
+ * Internal signatures which process Pauli strings
+ * and their weighted sums
+ * 
+ * @author Tyson Jones
+ */
+
+#ifndef PAULILOGIC_HPP
+#define PAULILOGIC_HPP
+
+#include "quest/include/precision.h"
+#include "quest/include/paulis.h"
+#include "quest/include/qureg.h"
+
+#include <utility>
+#include <vector>
+#include <array>
+
+using std::vector;
+
+
+/*
+ * CONSTANTS
+ */
+
+static const int MAX_NUM_PAULIS_PER_MASK = sizeof(PAULI_MASK_TYPE) * 8 / 2;
+static const int MAX_NUM_PAULIS_PER_STR  = MAX_NUM_PAULIS_PER_MASK * 2;
+
+
+/*
+ * PauliStr
+ */
+
+bool paulis_isIdentity(PauliStr str);
+
+bool paulis_containsXOrY(PauliStr str);
+
+int paulis_getPauliAt(PauliStr str, int ind);
+
+int paulis_getIndOfLefmostNonIdentityPauli(PauliStr str);
+int paulis_getIndOfLefmostNonIdentityPauli(PauliStr* strings, qindex numStrings);
+
+int paulis_getSignOfPauliStrConj(PauliStr str);
+
+int paulis_getPrefixZSign(Qureg qureg, vector<int> prefixZ);
+
+qcomp paulis_getPrefixPaulisElem(Qureg qureg, vector<int> prefixY, vector<int> prefixZ);
+
+vector<int> paulis_getTargetInds(PauliStr str);
+
+std::array<vector<int>,3> paulis_getSeparateInds(PauliStr str);
+
+qindex paulis_getTargetBitMask(PauliStr str);
+
+PauliStr paulis_getShiftedPauliStr(PauliStr str, int pauliShift);
+
+PauliStr paulis_getKetAndBraPauliStr(PauliStr str, Qureg qureg);
+
+PAULI_MASK_TYPE paulis_getKeyOfSameMixedAmpsGroup(PauliStr str);
+
+
+// below are not currently used outside of paulilogic.cpp but are natural methods
+
+PauliStr paulis_getTensorProdOfPauliStr(PauliStr left, PauliStr right, int numQubits);
+
+std::pair<qcomp,PauliStr> paulis_getPauliStrProd(PauliStr strA, PauliStr strB);
+
+
+/*
+ * PauliStrSum
+ */
+
+bool paulis_containsXOrY(PauliStrSum sum);
+
+int paulis_getIndOfLefmostNonIdentityPauli(PauliStrSum sum);
+
+qindex paulis_getTargetBitMask(PauliStrSum sum);
+
+
+// below are used exclusively by Trotterisation
+
+qindex paulis_getNumTermsInPauliStrSumProdOfAdjointWithSelf(PauliStrSum in);
+
+void paulis_setPauliStrSumToScaledTensorProdOfConjWithSelf(PauliStrSum out, qreal factor, PauliStrSum in, int numQubits);
+
+void paulis_setPauliStrSumToScaledProdOfAdjointWithSelf(PauliStrSum out, qreal factor, PauliStrSum in);
+
+void paulis_setPauliStrSumToShiftedConj(PauliStrSum out, PauliStrSum in, int numQubits);
+
+
+#endif // PAULILOGIC_HPP
\ No newline at end of file
diff --git a/quest/src/core/printer.cpp b/quest/src/core/printer.cpp
index 8a7bea15b..016d05257 100644
--- a/quest/src/core/printer.cpp
+++ b/quest/src/core/printer.cpp
@@ -19,6 +19,7 @@
 #include "quest/src/core/errors.hpp"
 #include "quest/src/core/memory.hpp"
 #include "quest/src/core/bitwise.hpp"
+#include "quest/src/core/paulilogic.hpp"
 #include "quest/src/core/localiser.hpp"
 #include "quest/src/core/utilities.hpp"
 #include "quest/src/comm/comm_config.hpp"
@@ -1260,12 +1261,6 @@ void print_elems(Qureg qureg, string indent) {
  */
 
 
-// we'll make use of these internal functions from paulis.cpp
-extern int paulis_getPauliAt(PauliStr str, int ind);
-extern int paulis_getIndOfLefmostNonIdentityPauli(PauliStr str);
-extern int paulis_getIndOfLefmostNonIdentityPauli(PauliStr* strings, qindex numStrings);
-
-
 string getPauliStrAsAllQubitsString(PauliStr str, int numPaulis) {
 
     // avoid repeated allocations in below string concatenation
diff --git a/quest/src/core/validation.cpp b/quest/src/core/validation.cpp
index 51b61b7fc..38474323f 100644
--- a/quest/src/core/validation.cpp
+++ b/quest/src/core/validation.cpp
@@ -21,6 +21,7 @@
 #include "quest/src/core/bitwise.hpp"
 #include "quest/src/core/memory.hpp"
 #include "quest/src/core/utilities.hpp"
+#include "quest/src/core/paulilogic.hpp"
 #include "quest/src/core/parser.hpp"
 #include "quest/src/core/printer.hpp"
 #include "quest/src/core/envvars.hpp"
@@ -3177,9 +3178,6 @@ void validate_newPauliStrNumChars(int numPaulis, int numIndices, const char* cal
  * EXISTING PAULI STRING
  */
 
-extern int paulis_getPauliAt(PauliStr str, int ind);
-extern int paulis_getIndOfLefmostNonIdentityPauli(PauliStr str);
-
 void validate_pauliStrTargets(Qureg qureg, PauliStr str, const char* caller) {
 
     // avoid producing a list of targets which requires enumerating all bits
@@ -3306,10 +3304,6 @@ void validate_parsedStringIsNotEmpty(bool stringIsNotEmpty, const char* caller)
  * EXISTING PAULI STRING SUMS
  */
 
-extern bool paulis_containsXOrY(PauliStrSum sum);
-extern qindex paulis_getTargetBitMask(PauliStrSum sum);
-extern int paulis_getIndOfLefmostNonIdentityPauli(PauliStrSum sum);
-
 bool areQubitsDisjoint(qindex qubitsMaskA, int* qubitsB, int numQubitsB);
 
 void validate_pauliStrSumFields(PauliStrSum sum, const char* caller) {
diff --git a/tests/unit/operations.cpp b/tests/unit/operations.cpp
index cd39ac003..05d46418e 100644
--- a/tests/unit/operations.cpp
+++ b/tests/unit/operations.cpp
@@ -45,6 +45,12 @@ using std::tuple;
 using Catch::Matchers::ContainsSubstring;
 
 
+/*
+ * INTERNAL QUEST FUNCTIONS
+ */
+
+extern int paulis_getPauliAt(PauliStr str, int ind);
+
 
 /*
  * UTILITIES
@@ -629,8 +635,6 @@ std::string toString(vector<int> list) {
     return out;
 }
 
-extern int paulis_getPauliAt(PauliStr str, int ind);
-
 std::string toString(PauliStr str, vector<int> targs) {
 
     std::string labels = "IXYZ";
diff --git a/tests/utils/convert.cpp b/tests/utils/convert.cpp
index 7d425d50b..50c34292b 100644
--- a/tests/utils/convert.cpp
+++ b/tests/utils/convert.cpp
@@ -22,6 +22,16 @@ using std::vector;
 
 
 
+/*
+ * INTERNAL QUEST FUNCTIONS
+ */
+
+extern int paulis_getPauliAt(PauliStr str, int ind);
+extern int paulis_getIndOfLefmostNonIdentityPauli(PauliStr str);
+extern int paulis_getIndOfLefmostNonIdentityPauli(PauliStrSum sum);
+
+
+
 /*
  * TO QUREG
  */
@@ -136,11 +146,6 @@ qmatrix getMatrix(SuperOp   m) { return getMatrixInner(m); }
  */
 
 
-extern int paulis_getPauliAt(PauliStr str, int ind);
-extern int paulis_getIndOfLefmostNonIdentityPauli(PauliStr str);
-extern int paulis_getIndOfLefmostNonIdentityPauli(PauliStrSum sum);
-
-
 qmatrix getMatrix(PauliStr str, vector<int> targs) {
     DEMAND( targs.size() >= 1 );
 

From 211d4e8a0030501bf71bc6258b22eb256ba47f23 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Wed, 20 Aug 2025 21:14:36 +0200
Subject: [PATCH 20/32] renamed Trotter functions

in order to make the applyTrotterized prefix consistent, as considered in #669.

Specifically, renamed:
- applyNonUnitaryTrotterizedPauliStrSumGadget -> applyTrotterizedNonUnitaryPauliStrSumGadget
- applyControlledTrotterizedPauliStrSumGadget -> applyTrotterizedControlledPauliStrSumGadget
- applyMultiControlledTrotterizedPauliStrSumGadget -> applyTrotterizedMultiControlledPauliStrSumGadget
- applyMultiStateControlledTrotterizedPauliStrSumGadget -> applyTrotterizedMultiStateControlledPauliStrSumGadget
---
 quest/include/trotterisation.h   | 34 ++++++++++++++++----------------
 quest/src/api/trotterisation.cpp | 16 +++++++--------
 tests/unit/trotterisation.cpp    |  8 ++++----
 3 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/quest/include/trotterisation.h b/quest/include/trotterisation.h
index ca234c5d5..667eedd91 100644
--- a/quest/include/trotterisation.h
+++ b/quest/include/trotterisation.h
@@ -113,7 +113,7 @@ extern "C" {
  *   @f]
  *   as utilised by the function applyTrotterizedUnitaryTimeEvolution().
  * 
- * - This function is equivalent to applyNonUnitaryTrotterizedPauliStrSumGadget() when passing
+ * - This function is equivalent to applyTrotterizedNonUnitaryPauliStrSumGadget() when passing
  *   a @p qcomp instance with a zero imaginary component as the @p angle parameter. This latter 
  *   function is useful for generalising dynamical simulation to imaginary-time evolution.
  * 
@@ -126,11 +126,11 @@ extern "C" {
         \max\limits_{i} |c_i| \le \valeps
  *   @f]
  *   where the validation epsilon @f$ \valeps @f$ can be adjusted with setValidationEpsilon().
- *   Otherwise, use applyNonUnitaryTrotterizedPauliStrSumGadget() to permit non-Hermitian @p sum
+ *   Otherwise, use applyTrotterizedNonUnitaryPauliStrSumGadget() to permit non-Hermitian @p sum
  *   and ergo effect a non-unitary exponential(s). 
  * 
  * - The @p angle parameter is necessarily real to retain unitarity, but can be relaxed to an arbitrary 
- *   complex scalar (i.e. a @p qcomp) using applyNonUnitaryTrotterizedPauliStrSumGadget(). This permits
+ *   complex scalar (i.e. a @p qcomp) using applyTrotterizedNonUnitaryPauliStrSumGadget(). This permits
  *   cancelling the complex unit @f$ i @f$ to effect non-unitary @f$ \exp(\theta \, \hat{H}) @f$ as
  *   is useful for imaginary-time evolution.
  * 
@@ -152,7 +152,7 @@ extern "C" {
  * 
  * @see
  *  - applyPauliGadget()
- *  - applyNonUnitaryTrotterizedPauliStrSumGadget()
+ *  - applyTrotterizedNonUnitaryPauliStrSumGadget()
  *  - applyTrotterizedUnitaryTimeEvolution()
  * 
  * @author Tyson Jones
@@ -165,7 +165,7 @@ void applyTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qreal angle
 /// @see
 ///  - applyTrotterizedPauliStrSumGadget()
 ///  - applyControlledCompMatr1()
-void applyControlledTrotterizedPauliStrSumGadget(Qureg qureg, int control, PauliStrSum sum, qreal angle, int order, int reps);
+void applyTrotterizedControlledPauliStrSumGadget(Qureg qureg, int control, PauliStrSum sum, qreal angle, int order, int reps);
 
 
 /// @notyetdoced
@@ -173,7 +173,7 @@ void applyControlledTrotterizedPauliStrSumGadget(Qureg qureg, int control, Pauli
 /// @see
 ///  - applyTrotterizedPauliStrSumGadget()
 ///  - applyMultiControlledCompMatr1()
-void applyMultiControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls, int numControls, PauliStrSum sum, qreal angle, int order, int reps);
+void applyTrotterizedMultiControlledPauliStrSumGadget(Qureg qureg, int* controls, int numControls, PauliStrSum sum, qreal angle, int order, int reps);
 
 
 /// @notyetdoced
@@ -181,7 +181,7 @@ void applyMultiControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls
 /// @see
 ///  - applyTrotterizedPauliStrSumGadget()
 ///  - applyMultiStateControlledCompMatr1()
-void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls, int* states, int numControls, PauliStrSum sum, qreal angle, int order, int reps);
+void applyTrotterizedMultiStateControlledPauliStrSumGadget(Qureg qureg, int* controls, int* states, int numControls, PauliStrSum sum, qreal angle, int order, int reps);
 
 
 /** @notyettested
@@ -232,7 +232,7 @@ void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* con
  * 
  * @author Tyson Jones
  */
-void applyNonUnitaryTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qcomp angle, int order, int reps);
+void applyTrotterizedNonUnitaryPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qcomp angle, int order, int reps);
 
 
 // end de-mangler
@@ -247,16 +247,16 @@ void applyNonUnitaryTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, q
 /// @notyetvalidated
 /// @notyetdoced
 /// @cppvectoroverload
-/// @see applyMultiControlledTrotterizedPauliStrSumGadget()
-void applyMultiControlledTrotterizedPauliStrSumGadget(Qureg qureg, std::vector<int> controls, PauliStrSum sum, qreal angle, int order, int reps);
+/// @see applyTrotterizedMultiControlledPauliStrSumGadget()
+void applyTrotterizedMultiControlledPauliStrSumGadget(Qureg qureg, std::vector<int> controls, PauliStrSum sum, qreal angle, int order, int reps);
 
 
 /// @notyettested
 /// @notyetvalidated
 /// @notyetdoced
 /// @cppvectoroverload
-/// @see applyMultiStateControlledTrotterizedPauliStrSumGadget()
-void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, std::vector<int> controls, std::vector<int> states, PauliStrSum sum, qreal angle, int order, int reps);
+/// @see applyTrotterizedMultiStateControlledPauliStrSumGadget()
+void applyTrotterizedMultiStateControlledPauliStrSumGadget(Qureg qureg, std::vector<int> controls, std::vector<int> states, PauliStrSum sum, qreal angle, int order, int reps);
 
 
 #endif // __cplusplus
@@ -320,7 +320,7 @@ extern "C" {
  * - The @p time parameter is necessarily real to retain unitarity. It can be substituted for a strictly imaginary
  *   scalar to perform imaginary-time evolution (as per Wick rotation @f$ t \rightarrow - \iu \tau @f$) via 
  *   applyTrotterizedImaginaryTimeEvolution(), or generalised to an arbitrary complex number through direct use of 
- *   applyNonUnitaryTrotterizedPauliStrSumGadget().
+ *   applyTrotterizedNonUnitaryPauliStrSumGadget().
  * 
  * - The simulated system is _closed_ with dynamics described fully by the Hamiltonian @p hamil. Open or otherwise noisy
  *   system dynamics can be simulated with applyTrotterizedNoisyTimeEvolution().
@@ -351,7 +351,7 @@ extern "C" {
  * @see
  *  - applyTrotterizedImaginaryTimeEvolution()
  *  - applyTrotterizedNoisyTimeEvolution()
- *  - applyNonUnitaryTrotterizedPauliStrSumGadget()
+ *  - applyTrotterizedNonUnitaryPauliStrSumGadget()
  * 
  * @param[in,out] qureg  the state to modify.
  * @param[in]     hamil  the Hamiltonian as a a weighted sum of Pauli strings.
@@ -438,7 +438,7 @@ void applyTrotterizedUnitaryTimeEvolution(Qureg qureg, PauliStrSum hamil, qreal
  * 
  * @equivalences
  * 
- * - This function merely wraps applyNonUnitaryTrotterizedPauliStrSumGadget() which effects @f$ \exp(\iu \theta \hat{H}) @f$,
+ * - This function merely wraps applyTrotterizedNonUnitaryPauliStrSumGadget() which effects @f$ \exp(\iu \theta \hat{H}) @f$,
  *   passing @f$ \theta = \tau \iu @f$.
  * 
  * @constraints
@@ -454,7 +454,7 @@ void applyTrotterizedUnitaryTimeEvolution(Qureg qureg, PauliStrSum hamil, qreal
  *   (the eigenvalues may be non-real) so is likely of limited utility.
  * 
  * - The @p tau parameter is necessarily real such that evolution approaches the groundstate (modulo renormalisation).
- *   It can generalised to an arbitrary complex number through direct use of applyNonUnitaryTrotterizedPauliStrSumGadget().
+ *   It can be generalised to an arbitrary complex number through direct use of applyTrotterizedNonUnitaryPauliStrSumGadget().
  * 
  * - Simulation is exact such that the effected operation is precisely @f$ \exp(-\tau \hat{H}) @f$ only when 
  *   @p reps @f$ \rightarrow \infty @f$ or all terms in @p hamil commute with one another.
@@ -482,7 +482,7 @@ void applyTrotterizedUnitaryTimeEvolution(Qureg qureg, PauliStrSum hamil, qreal
  *
  * @see
  *  - applyTrotterizedUnitaryTimeEvolution()
- *  - applyNonUnitaryTrotterizedPauliStrSumGadget()
+ *  - applyTrotterizedNonUnitaryPauliStrSumGadget()
  * 
  * @param[in,out] qureg  the state to modify.
  * @param[in]     hamil  the Hamiltonian as a a weighted sum of Pauli strings.
diff --git a/quest/src/api/trotterisation.cpp b/quest/src/api/trotterisation.cpp
index fd371b723..af25b9615 100644
--- a/quest/src/api/trotterisation.cpp
+++ b/quest/src/api/trotterisation.cpp
@@ -147,7 +147,7 @@ qindex internal_getNumTotalSuperPropagatorTerms(PauliStrSum hamil, PauliStrSum*
 
 extern "C" {
 
-void applyNonUnitaryTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qcomp angle, int order, int reps) {
+void applyTrotterizedNonUnitaryPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qcomp angle, int order, int reps) {
     validate_quregFields(qureg, __func__);
     validate_pauliStrSumFields(sum, __func__);
     validate_pauliStrSumTargets(sum, qureg, __func__);
@@ -170,7 +170,7 @@ void applyTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qreal angle
     internal_applyAllTrotterRepetitions(qureg, nullptr, nullptr, 0, sum, angle, order, reps, onlyLeftApply);
 }
 
-void applyControlledTrotterizedPauliStrSumGadget(Qureg qureg, int control, PauliStrSum sum, qreal angle, int order, int reps) {
+void applyTrotterizedControlledPauliStrSumGadget(Qureg qureg, int control, PauliStrSum sum, qreal angle, int order, int reps) {
     validate_quregFields(qureg, __func__);
     validate_pauliStrSumFields(sum, __func__);
     validate_pauliStrSumIsHermitian(sum, __func__);
@@ -181,7 +181,7 @@ void applyControlledTrotterizedPauliStrSumGadget(Qureg qureg, int control, Pauli
     internal_applyAllTrotterRepetitions(qureg, &control, nullptr, 1, sum, angle, order, reps, onlyLeftApply);
 }
 
-void applyMultiControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls, int numControls, PauliStrSum sum, qreal angle, int order, int reps) {
+void applyTrotterizedMultiControlledPauliStrSumGadget(Qureg qureg, int* controls, int numControls, PauliStrSum sum, qreal angle, int order, int reps) {
     validate_quregFields(qureg, __func__);
     validate_pauliStrSumFields(sum, __func__);
     validate_pauliStrSumIsHermitian(sum, __func__);
@@ -192,7 +192,7 @@ void applyMultiControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls
     internal_applyAllTrotterRepetitions(qureg, controls, nullptr, numControls, sum, angle, order, reps, onlyLeftApply);
 }
 
-void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls, int* states, int numControls, PauliStrSum sum, qreal angle, int order, int reps) {
+void applyTrotterizedMultiStateControlledPauliStrSumGadget(Qureg qureg, int* controls, int* states, int numControls, PauliStrSum sum, qreal angle, int order, int reps) {
     validate_quregFields(qureg, __func__);
     validate_pauliStrSumFields(sum, __func__);
     validate_pauliStrSumIsHermitian(sum, __func__);
@@ -206,15 +206,15 @@ void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* con
 
 } // end de-mangler
 
-void applyMultiControlledTrotterizedPauliStrSumGadget(Qureg qureg, vector<int> controls, PauliStrSum sum, qreal angle, int order, int reps) {
+void applyTrotterizedMultiControlledPauliStrSumGadget(Qureg qureg, vector<int> controls, PauliStrSum sum, qreal angle, int order, int reps) {
 
-    applyMultiControlledTrotterizedPauliStrSumGadget(qureg, controls.data(), controls.size(), sum, angle, order, reps);
+    applyTrotterizedMultiControlledPauliStrSumGadget(qureg, controls.data(), controls.size(), sum, angle, order, reps);
 }
 
-void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, vector<int> controls, vector<int> states, PauliStrSum sum, qreal angle, int order, int reps) {
+void applyTrotterizedMultiStateControlledPauliStrSumGadget(Qureg qureg, vector<int> controls, vector<int> states, PauliStrSum sum, qreal angle, int order, int reps) {
     validate_controlsMatchStates(controls.size(), states.size(), __func__);
 
-    applyMultiStateControlledTrotterizedPauliStrSumGadget(qureg, controls.data(), states.data(), controls.size(), sum, angle, order, reps);
+    applyTrotterizedMultiStateControlledPauliStrSumGadget(qureg, controls.data(), states.data(), controls.size(), sum, angle, order, reps);
 }
 
 
diff --git a/tests/unit/trotterisation.cpp b/tests/unit/trotterisation.cpp
index bc2c7aab6..cc6914027 100644
--- a/tests/unit/trotterisation.cpp
+++ b/tests/unit/trotterisation.cpp
@@ -25,15 +25,15 @@
  * UNTESTED FUNCTIONS
  */
 
-void applyNonUnitaryTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qcomp angle, int order, int reps);
+void applyTrotterizedNonUnitaryPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qcomp angle, int order, int reps);
 
 void applyTrotterizedPauliStrSumGadget(Qureg qureg, PauliStrSum sum, qreal angle, int order, int reps);
 
-void applyControlledTrotterizedPauliStrSumGadget(Qureg qureg, int control, PauliStrSum sum, qreal angle, int order, int reps);
+void applyTrotterizedControlledPauliStrSumGadget(Qureg qureg, int control, PauliStrSum sum, qreal angle, int order, int reps);
 
-void applyMultiControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls, int numControls, PauliStrSum sum, qreal angle, int order, int reps);
+void applyTrotterizedMultiControlledPauliStrSumGadget(Qureg qureg, int* controls, int numControls, PauliStrSum sum, qreal angle, int order, int reps);
 
-void applyMultiStateControlledTrotterizedPauliStrSumGadget(Qureg qureg, int* controls, int* states, int numControls, PauliStrSum sum, qreal angle, int order, int reps);
+void applyTrotterizedMultiStateControlledPauliStrSumGadget(Qureg qureg, int* controls, int* states, int numControls, PauliStrSum sum, qreal angle, int order, int reps);
 
 void applyTrotterizedUnitaryTimeEvolution(Qureg qureg, PauliStrSum hamil, qreal time, int order, int reps);
 

From d4a5714e76f00d760f146b7013de2e570b278e92 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Wed, 20 Aug 2025 21:12:07 +0200
Subject: [PATCH 21/32] added setQuregToWeightedSum (and ToMixture)

---
 quest/include/initialisations.h   |  24 ++
 quest/src/api/decoherence.cpp     |   2 +-
 quest/src/api/initialisations.cpp |  40 +++
 quest/src/core/accelerator.cpp    |  17 ++
 quest/src/core/accelerator.hpp    |   8 +-
 quest/src/core/localiser.cpp      |  13 +
 quest/src/core/localiser.hpp      |   2 +
 quest/src/core/utilities.cpp      |  28 +-
 quest/src/core/utilities.hpp      |   9 +-
 quest/src/core/validation.cpp     | 131 ++++++++-
 quest/src/core/validation.hpp     |  14 +-
 quest/src/cpu/cpu_subroutines.cpp |  26 ++
 quest/src/cpu/cpu_subroutines.hpp |   2 +
 quest/src/gpu/gpu_kernels.cuh     |  23 +-
 quest/src/gpu/gpu_subroutines.cpp |  32 +++
 quest/src/gpu/gpu_subroutines.hpp |   2 +
 quest/src/gpu/gpu_thrust.cuh      |  22 +-
 tests/unit/initialisations.cpp    | 427 +++++++++++++++++++++++++++++-
 18 files changed, 796 insertions(+), 26 deletions(-)

diff --git a/quest/include/initialisations.h b/quest/include/initialisations.h
index 06599fdda..05de64588 100644
--- a/quest/include/initialisations.h
+++ b/quest/include/initialisations.h
@@ -116,6 +116,16 @@ void setDensityQuregFlatAmps(Qureg qureg, qindex startInd, qcomp* amps, qindex n
 void setQuregToClone(Qureg targetQureg, Qureg copyQureg);
 
 
+/// @notyetdoced
+/// @notyettested
+void setQuregToWeightedSum(Qureg out, qcomp* coeffs, Qureg* in, int numIn);
+
+
+/// @notyetdoced
+/// @notyettested
+void setQuregToMixture(Qureg out, qreal* probs, Qureg* in, int numIn);
+
+
 /** @notyetdoced
  * @notyettested
  * 
@@ -227,6 +237,20 @@ void setQuregToPartialTrace(Qureg out, Qureg in, std::vector<int> traceOutQubits
 void setQuregToReducedDensityMatrix(Qureg out, Qureg in, std::vector<int> retainQubits);
 
 
+/// @ingroup init_amps
+/// @notyetdoced
+/// @cpponly
+/// @see setQuregToWeightedSum()
+void setQuregToWeightedSum(Qureg out, std::vector<qcomp> coeffs, std::vector<Qureg> in);
+
+
+/// @ingroup init_amps
+/// @notyetdoced
+/// @cpponly
+/// @see setQuregToMixture()
+void setQuregToMixture(Qureg out, std::vector<qreal> probs, std::vector<Qureg> in);
+
+
 #endif // __cplusplus
 
 
diff --git a/quest/src/api/decoherence.cpp b/quest/src/api/decoherence.cpp
index 02f2f0950..d2fadf621 100644
--- a/quest/src/api/decoherence.cpp
+++ b/quest/src/api/decoherence.cpp
@@ -134,7 +134,7 @@ void mixQureg(Qureg outQureg, Qureg inQureg, qreal inProb) {
     validate_quregFields(outQureg, __func__);
     validate_quregFields(inQureg, __func__);
     validate_probability(inProb, __func__);
-    validate_quregsCanBeMixed(outQureg, inQureg, __func__); // checks outQureg is densmatr
+    validate_quregPairCanBeMixed(outQureg, inQureg, __func__); // checks outQureg is densmatr
 
     qreal outProb = 1 - inProb;
     localiser_densmatr_mixQureg(outProb, outQureg, inProb, inQureg);
diff --git a/quest/src/api/initialisations.cpp b/quest/src/api/initialisations.cpp
index ce3a5949d..c3f6bb236 100644
--- a/quest/src/api/initialisations.cpp
+++ b/quest/src/api/initialisations.cpp
@@ -252,6 +252,34 @@ void setQuregToReducedDensityMatrix(Qureg out, Qureg in, int* retainQubits, int
 }
 
 
+void setQuregToWeightedSum(Qureg out, qcomp* coeffs, Qureg* in, int numIn) {
+    validate_quregFields(out, __func__);
+    validate_numQuregsInSum(numIn, __func__);
+    validate_quregsCanBeSummed(out, in, numIn, __func__); // also validates all init
+
+    auto coeffVec = util_getVector(coeffs, numIn);
+    auto inVec = util_getVector(in, numIn);
+    localiser_statevec_setQuregToWeightedSum(out, coeffVec, inVec);
+}
+
+
+void setQuregToMixture(Qureg out, qreal* probs, Qureg* in, int numIn) {
+    validate_quregFields(out, __func__);
+    validate_quregIsDensityMatrix(out, __func__);
+    validate_numQuregsInSum(numIn, __func__);
+    validate_quregsCanBeMixed(out, in, numIn, __func__); // also validates all init & densmatr
+    validate_probabilities(probs, numIn, __func__);
+
+    // convert probs to complex (assume this alloc never fails)
+    vector<qcomp> coeffVec(numIn);
+    for (int i=0; i<numIn; i++)
+        coeffVec[i] = getQcomp(probs[i], 0);
+
+    auto inVec = util_getVector(in, numIn);
+    localiser_statevec_setQuregToWeightedSum(out, coeffVec, inVec);
+}
+
+
 } // end de-mangler
 
 
@@ -296,3 +324,15 @@ void setQuregToPartialTrace(Qureg out, Qureg in, vector<int> traceOutQubits) {
 void setQuregToReducedDensityMatrix(Qureg out, Qureg in, vector<int> retainQubits) {
     setQuregToReducedDensityMatrix(out, in, retainQubits.data(), retainQubits.size());
 }
+
+void setQuregToWeightedSum(Qureg out, vector<qcomp> coeffs, vector<Qureg> in) {
+    validate_numQuregsMatchesCoeffs(in.size(), coeffs.size(), __func__);
+
+    setQuregToWeightedSum(out, coeffs.data(), in.data(), in.size());
+}
+
+void setQuregToMixture(Qureg out, vector<qreal> probs, vector<Qureg> in) {
+    validate_numQuregsMatchesProbs(in.size(), probs.size(), __func__);
+
+    setQuregToMixture(out, probs.data(), in.data(), in.size());
+}
diff --git a/quest/src/core/accelerator.cpp b/quest/src/core/accelerator.cpp
index 1016f61ef..812e2cf25 100644
--- a/quest/src/core/accelerator.cpp
+++ b/quest/src/core/accelerator.cpp
@@ -74,6 +74,10 @@ using std::min;
 #endif
 
 
+#define GET_FUNC_OPTIMISED_FOR_NUM_QUREGS(f, numquregs) \
+    (vector <decltype(&f<0>)> {&f<0>, &f<1>, &f<2>, &f<3>, &f<4>, &f<5>, &f<-1>}) \
+    [std::min((int) numquregs, MAX_OPTIMISED_NUM_QUREGS + 1)]
+
 #define GET_FUNC_OPTIMISED_FOR_NUM_CTRLS(f, numctrls) \
     (vector <decltype(&f<0>)> {&f<0>, &f<1>, &f<2>, &f<3>, &f<4>, &f<5>, &f<-1>}) \
     [std::min((int) numctrls, MAX_OPTIMISED_NUM_CTRLS + 1)]
@@ -97,6 +101,11 @@ using std::min;
 #define ARR(f) vector<decltype(&f<0,0>)>
 
 
+#define GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_QUREGS(funcsuffix, qureg, numquregs) \
+    ((qureg.isGpuAccelerated)? \
+        GET_FUNC_OPTIMISED_FOR_NUM_QUREGS( gpu_##funcsuffix, numquregs ) : \
+        GET_FUNC_OPTIMISED_FOR_NUM_QUREGS( cpu_##funcsuffix, numquregs ))
+
 #define GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_CTRLS(funcsuffix, qureg, numctrls) \
     ((qureg.isGpuAccelerated)? \
         GET_FUNC_OPTIMISED_FOR_NUM_CTRLS( gpu_##funcsuffix, numctrls ) : \
@@ -539,6 +548,14 @@ void accel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, vector<int>
  */
 
 
+void accel_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector<qcomp> coeffs, vector<Qureg> inQuregs) {
+
+    // consult outQureg's deployment since others are prior validated to match
+    auto func = GET_CPU_OR_GPU_FUNC_OPTIMISED_FOR_NUM_QUREGS( statevec_setQuregToWeightedSum_sub, outQureg, inQuregs.size() );
+    func(outQureg, coeffs, inQuregs);
+}
+
+
 void accel_statevec_setQuregToSuperposition_sub(qcomp facOut, Qureg outQureg, qcomp fac1, Qureg inQureg1, qcomp fac2, Qureg inQureg2) {
 
     // consult outQureg's deployment (other quregs should match, though we dangerously do not assert this post-validation)
diff --git a/quest/src/core/accelerator.hpp b/quest/src/core/accelerator.hpp
index cd413d9a1..600fe4655 100644
--- a/quest/src/core/accelerator.hpp
+++ b/quest/src/core/accelerator.hpp
@@ -35,7 +35,7 @@ using std::vector;
  * used by cpu_subroutines.cpp and gpu_subroutines to force the compiler
  * to instantiate and compile their template definitions with the given
  * explicit parameters below. Notice the final parameter is always -1, 
- * to handle when the number of controls or targets is not known at 
+ * to handle when the number of parameters (e.g. controls) is not known at 
  * compile-time (it is larger than a bespoke, optimised instantiations), 
  * causing the optimised function to fallback to a suboptimal but general 
  * implementation.
@@ -44,6 +44,7 @@ using std::vector;
 // must match the macros below, and those in accelerator.cpp
 #define MAX_OPTIMISED_NUM_CTRLS 5
 #define MAX_OPTIMISED_NUM_TARGS 5
+#define MAX_OPTIMISED_NUM_QUREGS 5
 
 
 #define INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS(returntype, funcname, args) \
@@ -58,6 +59,9 @@ using std::vector;
 #define INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS(returntype, funcname, args) \
     INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS(returntype, funcname, args)
 
+#define INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_QUREGS(returntype, funcname, args) \
+    INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_TARGS(returntype, funcname, args)
+
 
 #define INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS_AND_TARGS(returntype, funcname, args) \
     private_INSTANTIATE(returntype, funcname, 0, args) \
@@ -228,6 +232,8 @@ void accel_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, vector<int>
  * QUREG COMBINATION
  */
 
+void accel_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector<qcomp> coeffs, vector<Qureg> inQuregs);
+
 void accel_statevec_setQuregToSuperposition_sub(qcomp facOut, Qureg outQureg, qcomp fac1, Qureg inQureg1, qcomp fac2, Qureg inQureg2);
 
 void accel_densmatr_mixQureg_subA(qreal outProb, Qureg out, qreal inProb, Qureg in);
diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp
index 0d826bf44..f62f33e8a 100644
--- a/quest/src/core/localiser.cpp
+++ b/quest/src/core/localiser.cpp
@@ -1371,6 +1371,19 @@ void localiser_statevec_anyCtrlPauliGadget(Qureg qureg, vector<int> ctrls, vecto
  */
 
 
+void localiser_statevec_setQuregToWeightedSum(Qureg outQureg, vector<qcomp> coeffs, vector<Qureg> inQuregs) {
+
+    /// @todo
+    /// this function requires (as validated) distributions are identical.
+    /// It would be trivial to generalise this so that Qureg distributions
+    /// can differ (we merely spoof local Quregs, offsetting their memory).
+    /// They must still however be identically GPU-accelerated; this is a
+    /// low priority because this situation is non-sensical
+
+    accel_statevec_setQuregToWeightedSum_sub(outQureg, coeffs, inQuregs);
+}
+
+
 void localiser_statevec_setQuregToSuperposition(qcomp facOut, Qureg outQureg, qcomp fac1, Qureg inQureg1, qcomp fac2, Qureg inQureg2) {
 
     /// @todo
diff --git a/quest/src/core/localiser.hpp b/quest/src/core/localiser.hpp
index 6d615919d..7f4565322 100644
--- a/quest/src/core/localiser.hpp
+++ b/quest/src/core/localiser.hpp
@@ -127,6 +127,8 @@ void localiser_statevec_anyCtrlPhaseGadget(Qureg qureg, vector<int> ctrls, vecto
  * QUREG COMBINATION
  */
 
+void localiser_statevec_setQuregToWeightedSum(Qureg outQureg, vector<qcomp> coeffs, vector<Qureg> inQuregs);
+
 void localiser_statevec_setQuregToSuperposition(qcomp facOut, Qureg outQureg, qcomp fac1, Qureg inQureg1, qcomp fac2, Qureg inQureg2);
 
 void localiser_densmatr_mixQureg(qreal outProb, Qureg out, qreal inProb, Qureg in);
diff --git a/quest/src/core/utilities.cpp b/quest/src/core/utilities.cpp
index 207b76d7f..aba32ef09 100644
--- a/quest/src/core/utilities.cpp
+++ b/quest/src/core/utilities.cpp
@@ -218,15 +218,6 @@ qindex util_getBitMask(vector<int> ctrls, vector<int> ctrlStates, vector<int> ta
     return util_getBitMask(qubits, states);
 }
 
-vector<int> util_getVector(int* qubits, int numQubits) {
-
-    // permit qubits=nullptr, overriding numQubits (might be non-zero)
-    if (qubits == nullptr)
-        return {};
-
-    return vector<int> (qubits, qubits + numQubits);
-}
-
 
 
 /*
@@ -384,7 +375,7 @@ qreal util_getSum(vector<qreal> list) {
     qreal sum = 0;
     qreal y, t, c=0;
     
-    // complex Kahan summation
+    // Kahan summation
     for (auto& x : list) {
         y = x - c;
         t = sum + y;
@@ -1118,6 +1109,23 @@ qreal util_getMaxProbOfTwoQubitDepolarising() {
  * TEMPORARY MEMORY ALLOCATION
  */
 
+template <typename T>
+vector<T> getVector(T* ptr, int length) {
+
+    // permit nullptr to indicate empty list, regardless of length
+    if (ptr == nullptr)
+        return {};
+
+    // assumes memory alloc failure is impossible
+    return vector<T> (ptr, ptr + length);
+}
+
+vector<int>   util_getVector(int*   ptr, int length) { return getVector(ptr, length); }
+vector<qreal> util_getVector(qreal* ptr, int length) { return getVector(ptr, length); }
+vector<qcomp> util_getVector(qcomp* ptr, int length) { return getVector(ptr, length); }
+vector<Qureg> util_getVector(Qureg* ptr, int length) { return getVector(ptr, length); }
+
+
 template <typename T>
 void tryAllocVector(vector<T> &vec, qindex size, std::function<void()> errFunc) {
 
diff --git a/quest/src/core/utilities.hpp b/quest/src/core/utilities.hpp
index a55e94ab5..e3a92a12c 100644
--- a/quest/src/core/utilities.hpp
+++ b/quest/src/core/utilities.hpp
@@ -59,8 +59,6 @@ vector<int> util_getBraQubits(vector<int> ketQubits, Qureg qureg);
 
 vector<int> util_getNonTargetedQubits(int* targets, int numTargets, int numQubits);
 
-vector<int> util_getVector(int* qubits, int numQubits);
-
 vector<int> util_getConcatenated(vector<int> list1, vector<int> list2);
 
 vector<int> util_getSorted(vector<int> list);
@@ -401,6 +399,13 @@ qreal util_getMaxProbOfTwoQubitDepolarising();
  * TEMPORARY MEMORY ALLOCATION
  */
 
+// alloc assumed to never fail
+vector<int>   util_getVector(int*   ptr, int length);
+vector<qreal> util_getVector(qreal* ptr, int length);
+vector<qcomp> util_getVector(qcomp* ptr, int length);
+vector<Qureg> util_getVector(Qureg* ptr, int length);
+
+// calls errFunc when alloc fails
 void util_tryAllocVector(vector<qreal>    &vec, qindex size, std::function<void()> errFunc);
 void util_tryAllocVector(vector<qcomp>    &vec, qindex size, std::function<void()> errFunc);
 void util_tryAllocVector(vector<qcomp*>   &vec, qindex size, std::function<void()> errFunc);
diff --git a/quest/src/core/validation.cpp b/quest/src/core/validation.cpp
index 38474323f..330aef0c0 100644
--- a/quest/src/core/validation.cpp
+++ b/quest/src/core/validation.cpp
@@ -910,9 +910,14 @@ namespace report {
      * CHANNEL PARAMETERS 
      */
 
-    string INVALID_PROBABILITY =
+    string INVALID_PROB =
         "The given probability is invalid, and must instead be between 0 and 1 (both inclusive).";
 
+    string INVALID_PROBS =
+        "One or more given probabilities are invalid. Each must be between 0 and 1 (both inclusive).";
+
+    string PROBS_DO_NOT_SUM_TO_ONE =
+        "The given probabilities do not sum to (within epsilon of) one.";
 
     string ONE_QUBIT_DEPHASING_PROB_EXCEEDS_MAXIMAL_MIXING =
         "The given one-qubit dephasing probability exceeds that which induces maximal mixing, i.e. 1/2.";
@@ -937,6 +942,30 @@ namespace report {
      * QUREG COMBINATION
      */
 
+    string NON_POSITIVE_NUM_QUREGS_IN_SUM =
+        "The number of passed Quregs (${NUM_QUREGS}) is invalid. Must pass one or more.";
+
+    string SUMMED_QUREGS_HAVE_INCONSISTENT_MEM_LAYOUTS =
+        "The given list of Quregs have inconsistent attributes. They must all be the same size, all statevectors or density matrices, and be identically distributed or GPU-accelerated.";
+
+    string DIFFERENT_NUM_QUREGS_AND_COEFFS =
+        "A different number of coefficients (${NUM_COEFFS}) than Quregs (${NUM_QUREGS}) were passed.";
+
+
+    // relates to mixing-in multiple Quregs
+
+    string MIXED_QUREGS_NOT_ALL_DENSITY_MATRICES =
+        "One or more Quregs were statevectors though only density matrices are supported. To mix a single statevector, use mixQureg().";
+
+    string MIXED_QUREGS_HAVE_INCONSISTENT_MEM_LAYOUTS =
+        "The given list of Quregs have inconsistent attributes. They must all be the same size and be identically distributed or GPU-accelerated.";
+
+    string DIFFERENT_NUM_QUREGS_AND_PROBS =
+        "A different number of probabilities (${NUM_PROBS}) than Quregs (${NUM_QUREGS}) were passed.";
+        
+
+    // relates to mixing-in single Qureg (more permissive than above)
+
     string MIXED_QUREG_NOT_DENSITY_MATRIX =
         "The first Qureg, which will undergo mixing, must be a density matrix.";
 
@@ -1356,6 +1385,19 @@ bool isIndexListUnique(int* list, int len) {
     return true;
 }
 
+bool doQuregsHaveIdenticalMemoryLayouts(Qureg a, Qureg b) {
+
+    // same #dims, same size, same distribution and GPU status
+    return (
+        (a.numQubits        == b.numQubits       ) &&
+        (a.isDensityMatrix  == b.isDensityMatrix ) &&
+        (a.isDistributed    == b.isDistributed   ) &&
+        (a.isGpuAccelerated == b.isGpuAccelerated)
+    );
+
+    // note that multithreading does not affect memory layout
+}
+
 
 
 /*
@@ -3858,7 +3900,32 @@ void validate_probability(qreal prob, const char* caller) {
 
     /// @todo report 'prob' once validation reporting can handle floats
 
-    assertThat(prob >= 0 && prob <= 1, report::INVALID_PROBABILITY, caller);
+    /// @todo 
+    ///     should we permit -eps <= prob <= 1+eps so that this validation
+    ///     can be skipped by disabling only numerical validation?
+
+    assertThat(prob >= 0 && prob <= 1, report::INVALID_PROB, caller);
+}
+
+void validate_probabilities(qreal* probs, int numProbs, const char* caller) {
+
+    // we assume that numProbs>0 was prior validated
+
+    /// @todo like above, should we permit -eps <= prob <= 1+eps?
+
+    for (int i=0; i<numProbs; i++)
+        assertThat(probs[i] >= 0 && probs[i] <= 1, report::INVALID_PROBS, caller);
+
+    if (isNumericalValidationDisabled())
+        return;
+    
+    // check sum=1 using numerically stable sum, because our users deserve the best ;)
+    // note numProbs is expected small (the caller accepts just as many Quregs) so we
+    // are safe to allocate this vector without internal checks
+    qreal total = util_getSum(util_getVector(probs, numProbs));
+
+    qreal dist = std::abs(total - 1);
+    assertThat(dist <= global_validationEpsilon, report::PROBS_DO_NOT_SUM_TO_ONE, caller);
 }
 
 void validate_oneQubitDepashingProb(qreal prob, const char* caller) {
@@ -3938,14 +4005,46 @@ void validate_oneQubitPauliChannelProbs(qreal pX, qreal pY, qreal pZ, const char
 void validate_quregCanBeWorkspace(Qureg qureg, Qureg workspace, const char* caller) {
 
     assertThat(
-        (qureg.numQubits        == workspace.numQubits       ) &&
-        (qureg.isDensityMatrix  == workspace.isDensityMatrix ) &&
-        (qureg.isDistributed    == workspace.isDistributed   ) &&
-        (qureg.isGpuAccelerated == workspace.isGpuAccelerated),
+        doQuregsHaveIdenticalMemoryLayouts(qureg, workspace),
         report::QUREG_IS_INCOMPATIBLE_WITH_WORKSPACE, caller);
 }
 
-void validate_quregsCanBeMixed(Qureg quregOut, Qureg quregIn, const char* caller) {
+void validate_numQuregsInSum(int numQuregs, const char* caller) {
+
+    assertThat(numQuregs > 0, report::NON_POSITIVE_NUM_QUREGS_IN_SUM, {{"${NUM_QUREGS}", numQuregs}}, caller);
+}
+
+void validate_quregsCanBeSummed(Qureg out, Qureg* in, int numIn, const char* caller) {
+
+    for (int i=0; i<numIn; i++)
+        validate_quregFields(in[i], caller);
+
+    bool valid = true;
+    for (int i=0; i<numIn && valid; i++)
+        valid = valid && doQuregsHaveIdenticalMemoryLayouts(out, in[i]);
+
+    assertThat(valid, report::SUMMED_QUREGS_HAVE_INCONSISTENT_MEM_LAYOUTS, caller);
+}
+
+void validate_quregsCanBeMixed(Qureg out, Qureg* in, int numIn, const char* caller) {
+
+    // mixing in multiple quregs (done here) is much stricter than when 
+    // only one pair is being mixed in, which is handled below
+
+    for (int i=0; i<numIn; i++)
+        validate_quregFields(in[i], caller);
+
+    for (int i=0; i<numIn; i++)
+        assertThat(in[i].isDensityMatrix, report::MIXED_QUREGS_NOT_ALL_DENSITY_MATRICES, caller);
+
+    bool valid = true;
+    for (int i=0; i<numIn && valid; i++)
+        valid = valid && doQuregsHaveIdenticalMemoryLayouts(out, in[i]);
+
+    assertThat(valid, report::MIXED_QUREGS_HAVE_INCONSISTENT_MEM_LAYOUTS, caller);
+}
+
+void validate_quregPairCanBeMixed(Qureg quregOut, Qureg quregIn, const char* caller) {
 
     // mixing must be mathematically possible; dims are compatible, but quregIn can be a statevector
     assertThat(quregOut.isDensityMatrix, report::MIXED_QUREG_NOT_DENSITY_MATRIX, caller);
@@ -3963,6 +4062,24 @@ void validate_quregsCanBeMixed(Qureg quregOut, Qureg quregIn, const char* caller
         assertThat(!quregIn.isDistributed, report::MIXED_DENSITY_MATRIX_LOCAL_BUT_STATEVEC_DISTRIBUTED, caller);
 }
 
+void validate_numQuregsMatchesCoeffs(size_t numQuregs, size_t numCoeffs, const char* caller) {
+
+    tokenSubs vars = {
+        {"${NUM_QUREGS}", numQuregs},
+        {"${NUM_COEFFS}", numCoeffs}
+    };
+    assertThat(numQuregs == numCoeffs, report::DIFFERENT_NUM_QUREGS_AND_COEFFS, vars, caller);
+}
+
+void validate_numQuregsMatchesProbs(size_t numQuregs, size_t numProbs, const char* caller) {
+
+    tokenSubs vars = {
+        {"${NUM_QUREGS}", numQuregs},
+        {"${NUM_PROBS}",  numProbs}
+    };
+    assertThat(numQuregs == numProbs, report::DIFFERENT_NUM_QUREGS_AND_PROBS, vars, caller);
+}
+
 void validate_quregsCanBeSuperposed(Qureg qureg1, Qureg qureg2, Qureg qureg3, const char* caller) {
 
     // all quregs must be statevectors
diff --git a/quest/src/core/validation.hpp b/quest/src/core/validation.hpp
index e15bca4e9..53b511d74 100644
--- a/quest/src/core/validation.hpp
+++ b/quest/src/core/validation.hpp
@@ -442,6 +442,8 @@ void validate_numLindbladSuperPropagatorTerms(qindex numSuperTerms, const char*
 
 void validate_probability(qreal prob, const char* caller);
 
+void validate_probabilities(qreal* probs, int numProbs, const char* caller);
+
 void validate_oneQubitDepashingProb(qreal prob, const char* caller);
 void validate_twoQubitDepashingProb(qreal prob, const char* caller);
 
@@ -458,9 +460,19 @@ void validate_oneQubitPauliChannelProbs(qreal pX, qreal pY, qreal pZ, const char
  * QUREG COMBINATION
  */
 
+void validate_numQuregsInSum(int numQuregs, const char* caller);
+
 void validate_quregCanBeWorkspace(Qureg quregA, Qureg quregB, const char* caller);
 
-void validate_quregsCanBeMixed(Qureg quregOut, Qureg quregIn, const char* caller);
+void validate_quregsCanBeSummed(Qureg out, Qureg* in, int numIn, const char* caller);
+
+void validate_quregsCanBeMixed(Qureg out, Qureg* in, int numIn, const char* caller);
+
+void validate_quregPairCanBeMixed(Qureg out, Qureg in, const char* caller);
+
+void validate_numQuregsMatchesCoeffs(size_t numQuregs, size_t numCoeffs, const char* caller);
+
+void validate_numQuregsMatchesProbs(size_t numQuregs, size_t numProbs, const char* caller);
 
 void validate_quregsCanBeSuperposed(Qureg qureg1, Qureg qureg2, Qureg qureg3, const char* caller);
 
diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp
index dd6f64b59..8db92111f 100644
--- a/quest/src/cpu/cpu_subroutines.cpp
+++ b/quest/src/cpu/cpu_subroutines.cpp
@@ -1034,6 +1034,29 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, cpu_statevector_anyCtrlAnyTargZO
  */
 
 
+template <int NumQuregs>
+void cpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector<qcomp> coeffs, vector<Qureg> inQuregs) {
+
+    qindex numIts = outQureg.numAmpsPerNode;
+
+    // use template param to compile-time unroll inner loop below
+    SET_VAR_AT_COMPILE_TIME(int, numQuregs, NumQuregs, inQuregs.size());
+
+    #pragma omp parallel for if(outQureg.isMultithreaded)
+    for (qindex n=0; n<numIts; n++) {
+
+        // unrolled when inQuregs.size() <= 5
+        qcomp amp = 0;
+        for (int q=0; q<numQuregs; q++)
+            amp += coeffs[q] * inQuregs[q].cpuAmps[n];
+
+        // must not modify cpuAmps[n] before computing the amp since
+        // outQureg can legally appear among inQuregs
+        outQureg.cpuAmps[n] = amp;
+    }
+}
+
+
 void cpu_statevec_setQuregToSuperposition_sub(qcomp facOut, Qureg outQureg, qcomp fac1, Qureg inQureg1, qcomp fac2, Qureg inQureg2) {
 
     assert_superposedQuregDimsAndDeploysMatch(outQureg, inQureg1, inQureg2);
@@ -1103,6 +1126,9 @@ void cpu_densmatr_mixQureg_subC(qreal outProb, Qureg outQureg, qreal inProb) {
 }
 
 
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_QUREGS( void, cpu_statevec_setQuregToWeightedSum_sub, (Qureg, vector<qcomp>, vector<Qureg>) )
+
+
 
 /*
  * ONE-QUBIT DEPHASING
diff --git a/quest/src/cpu/cpu_subroutines.hpp b/quest/src/cpu/cpu_subroutines.hpp
index b81e28905..ab98cd957 100644
--- a/quest/src/cpu/cpu_subroutines.hpp
+++ b/quest/src/cpu/cpu_subroutines.hpp
@@ -100,6 +100,8 @@ template <int NumCtrls> void cpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qu
  * QUREG COMBINATION
  */
 
+template <int NumQuregs> void cpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector<qcomp> coeffs, vector<Qureg> inQuregs);
+
 void cpu_statevec_setQuregToSuperposition_sub(qcomp facOut, Qureg outQureg, qcomp fac1, Qureg inQureg1, qcomp fac2, Qureg inQureg2);
 
 void cpu_densmatr_mixQureg_subA(qreal outProb, Qureg outQureg, qreal inProb, Qureg inDensMatr);
diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh
index a74d9ddcf..448073fe0 100644
--- a/quest/src/gpu/gpu_kernels.cuh
+++ b/quest/src/gpu/gpu_kernels.cuh
@@ -711,9 +711,30 @@ __global__ void kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(
  */
 
 
+template <int NumQuregs> 
+__global__ void kernel_statevec_setQuregToWeightedSum_sub(
+    cu_qcomp* outAmps, qindex numThreads,
+    cu_qcomp* coeffs, cu_qcomp** inAmps, int numQuregs
+) {
+    GET_THREAD_IND(n, numThreads);
+
+    // use template param to compile-time unroll below loop
+    SET_VAR_AT_COMPILE_TIME(int, numInner, NumQuregs, numQuregs);
+
+    cu_qcomp amp = getCuQcomp(0, 0);
+
+    for (int q=0; q<numInner; q++)
+        amp = amp + coeffs[q] * inAmps[q][n];
+
+    // must not modify outAmps[n] before computing the amp 
+    // since outAmps can legally appear among inAmps
+    outAmps[n] = amp;
+}
+
+
 // kernel_densmatr_mixQureg_subA() is avoided; we instead use
 // Thrust for this common circumstances (mixing density matrices),
-// which should be significantly more optimisex
+// which should be significantly more optimised
 
 
 __global__ void kernel_densmatr_mixQureg_subB(
diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp
index bb9688209..607ebebb6 100644
--- a/quest/src/gpu/gpu_subroutines.cpp
+++ b/quest/src/gpu/gpu_subroutines.cpp
@@ -909,6 +909,35 @@ INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS( void, gpu_statevector_anyCtrlAnyTargZO
  */
 
 
+template <int NumQuregs> 
+void gpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector<qcomp> coeffs, vector<Qureg> inQuregs) {
+
+#if COMPILE_CUDA || COMPILE_CUQUANTUM
+
+    qindex numThreads = outQureg.numAmpsPerNode;
+    qindex numBlocks = getNumBlocks(numThreads);
+
+    // extract amp ptrs from qureg list
+    vector<cu_qcomp*> ptrs;
+    ptrs.reserve(inQuregs.size());
+    for (auto& qureg : inQuregs)
+        ptrs.push_back(toCuQcomps(qureg.gpuAmps));
+    
+    // copy coeff and qureg lists into GPU memory
+    devcuqcompptrs devQuregAmps = ptrs;
+    devcomps devCoeffs = coeffs;
+
+    kernel_statevec_setQuregToWeightedSum_sub <NumQuregs> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+        toCuQcomps(outQureg.gpuAmps), numThreads,
+        getPtr(devCoeffs), getPtr(devQuregAmps), inQuregs.size()
+    );
+
+#else
+    error_gpuSimButGpuNotCompiled();
+#endif
+}
+
+
 void gpu_statevec_setQuregToSuperposition_sub(qcomp facOut, Qureg outQureg, qcomp fac1, Qureg inQureg1, qcomp fac2, Qureg inQureg2) {
 
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
@@ -969,6 +998,9 @@ void gpu_densmatr_mixQureg_subC(qreal outProb, Qureg outQureg, qreal inProb) {
 }
 
 
+INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_QUREGS( void, gpu_statevec_setQuregToWeightedSum_sub, (Qureg, vector<qcomp>, vector<Qureg>) )
+
+
 
 /*
  * ONE-QUBIT DEPHASING
diff --git a/quest/src/gpu/gpu_subroutines.hpp b/quest/src/gpu/gpu_subroutines.hpp
index aac2966ff..35cf2839d 100644
--- a/quest/src/gpu/gpu_subroutines.hpp
+++ b/quest/src/gpu/gpu_subroutines.hpp
@@ -93,6 +93,8 @@ template <int NumCtrls> void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qu
  * QUREG COMBINATION
  */
 
+template <int NumQuregs> void gpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector<qcomp> coeffs, vector<Qureg> inQuregs);
+
 void gpu_statevec_setQuregToSuperposition_sub(qcomp facOut, Qureg outQureg, qcomp fac1, Qureg inQureg1, qcomp fac2, Qureg inQureg2);
 
 void gpu_densmatr_mixQureg_subA(qreal outProb, Qureg outQureg, qreal inProb, Qureg inDensMatr);
diff --git a/quest/src/gpu/gpu_thrust.cuh b/quest/src/gpu/gpu_thrust.cuh
index 7cb45e93f..b6e48beca 100644
--- a/quest/src/gpu/gpu_thrust.cuh
+++ b/quest/src/gpu/gpu_thrust.cuh
@@ -65,7 +65,8 @@
  * are copied to device memory using thrust's device_vector's 
  * copy constructor (devicevec d_vec = hostvec). The pointer 
  * to the data (d_vec.data()) can be cast into a raw pointer
- * and passed directly to CUDA kernels
+ * and passed directly to CUDA kernels (though qcomp must be
+ * reinterpreted to cu_qcomp)
  */
 
 
@@ -105,6 +106,25 @@ devreals getDeviceRealsVec(qindex dim) {
 }
 
 
+using devcomps = thrust::device_vector<qcomp>;
+
+cu_qcomp* getPtr(devcomps& comps) {
+
+    // devcomps -> qcomp -> cu_qcomp
+    qcomp* ptr =  thrust::raw_pointer_cast(comps.data());
+    return toCuQcomps(ptr);
+}
+
+
+// father forgive me for I have sinned
+using devcuqcompptrs = thrust::device_vector<cu_qcomp*>;
+
+cu_qcomp** getPtr(devcuqcompptrs& ptrs) {
+
+    return thrust::raw_pointer_cast(ptrs.data());
+}
+
+
 
 /*
  * AMP POINTERS
diff --git a/tests/unit/initialisations.cpp b/tests/unit/initialisations.cpp
index 90a27ad75..758a863bd 100644
--- a/tests/unit/initialisations.cpp
+++ b/tests/unit/initialisations.cpp
@@ -409,6 +409,428 @@ TEST_CASE( "setQuregToPauliStrSum", TEST_CATEGORY ) {
 }
 
 
+TEST_CASE( "setQuregToWeightedSum", TEST_CATEGORY ) {
+
+    SECTION( LABEL_CORRECTNESS ) {
+
+        // @todo
+        // below, we test when every inQureg is unique and distinct
+        // from the outQureg, and so do not test the valid scenarios of:
+        // - outQureg being among inQuregs
+        // - one or more inQuregs being repeated
+        // However, both CPU and GPU implementations are sufficiently
+        // trivial to validate by inspection (eep...)
+
+        qindex quregDim = getPow2(getNumCachedQubits());
+
+        // compile-time optimisations apply for <= 5
+        int numInQuregs = GENERATE( 1, 2, 3, 4, 5, 6, 20 );
+        CAPTURE( numInQuregs );
+
+        vector<qcomp> coeffs = getRandomVector(numInQuregs);
+
+        // we must pass identically-deployed inQureg as outQureg,
+        // which itself gets tested being each possible deployment,
+        // so we defer allocation of the inQuregs
+        vector<Qureg> inQuregs(numInQuregs);
+
+        // this function generates apiFunc in a way agnostic to
+        // whether outQureg is a statevector (inRefs are qvector)
+        // or a density matrix (inRefs are qmatrix)
+        auto apiFuncGen = [&](auto& inRefs) {
+
+            return [&](Qureg outQureg) { 
+
+                // prepare input quregs
+                for (int i=0; i<numInQuregs; i++) {
+                    inQuregs[i] = createCloneQureg(outQureg);
+                    setQuregToReference(inQuregs[i], inRefs[i]);
+                }
+
+                // modify outQureg
+                setQuregToWeightedSum(outQureg, coeffs.data(), inQuregs.data(), numInQuregs);
+
+                // free input quregs
+                for (int i=0; i<numInQuregs; i++)
+                    destroyQureg(inQuregs[i]);
+            };
+        };
+
+        SECTION( LABEL_STATEVEC ) {
+
+            // generate (unnormalised) input reference vectors
+            vector<qvector> inVecRefs(numInQuregs);
+            for (int i=0; i<numInQuregs; i++)
+                inVecRefs[i] = getRandomVector(quregDim);
+                
+            // compute output reference vector
+            qvector outVecRef = getZeroVector(quregDim);
+            for (int i=0; i<numInQuregs; i++)
+                outVecRef += coeffs[i] * inVecRefs[i];
+
+            TEST_ON_CACHED_QUREGS(getCachedStatevecs(), apiFuncGen(inVecRefs), outVecRef);
+        }
+
+        SECTION( LABEL_DENSMATR ) {
+
+            // generate (unnormalised) input reference matrices
+            vector<qmatrix> inMatrRefs(numInQuregs);
+            for (int i=0; i<numInQuregs; i++)
+                inMatrRefs[i] = getRandomMatrix(quregDim);
+                
+            // compute output reference matrix
+            qmatrix outMatrRef = getZeroMatrix(quregDim);
+            for (int i=0; i<numInQuregs; i++)
+                outMatrRef += coeffs[i] * inMatrRefs[i];
+
+            TEST_ON_CACHED_QUREGS(getCachedDensmatrs(), apiFuncGen(inMatrRefs), outMatrRef);
+        }
+    }
+
+    SECTION( LABEL_VALIDATION ) {
+
+        // arbitrary existing qureg
+        Qureg qureg = getCachedStatevecs().begin()->second;
+
+        SECTION( "out qureg uninitialised" ) {
+
+            // spoof uninitialised value to be sure
+            Qureg badQureg;
+            badQureg.numQubits = -123;
+
+            REQUIRE_THROWS_WITH( 
+                setQuregToWeightedSum(badQureg, nullptr, nullptr, 1), 
+                ContainsSubstring("invalid Qureg") );
+        }
+
+        SECTION( "in qureg uninitialised" ) {
+
+            // set all inQureg to arbitrary existing qureg
+            int numIn = 5;
+            vector<Qureg> inQuregs(numIn, qureg);
+
+            // hide an uninitialised qureg among them
+            Qureg badQureg;
+            badQureg.numQubits = -123;
+            int badInd = GENERATE_COPY( range(0,numIn) );
+            inQuregs[badInd] = badQureg;
+
+            REQUIRE_THROWS_WITH( 
+                setQuregToWeightedSum(qureg, nullptr, inQuregs.data(), numIn), 
+                ContainsSubstring("invalid Qureg") );
+        }
+
+        SECTION( "invalid number of quregs" ) {
+
+            int numIn = GENERATE( -1, 0 );
+
+            REQUIRE_THROWS_WITH( 
+                setQuregToWeightedSum(qureg, nullptr, nullptr, numIn), 
+                ContainsSubstring("number of passed Quregs") && ContainsSubstring("is invalid") );
+        }
+
+        SECTION( "inconsistent qureg types" ) {
+
+            // must create new Quregs to ensure they are identically deployed
+            Qureg sv = createCustomQureg(getNumCachedQubits(), 0, 0,0,0);
+            Qureg dm = createCustomQureg(getNumCachedQubits(), 1, 0,0,0);
+
+            // set all inQureg to sv (as will be outQureg)
+            int numIn = 5;
+            vector<Qureg> inQuregs(numIn, sv);
+
+            // set one to dm
+            int badInd = GENERATE_COPY( range(0,numIn) );
+            inQuregs[badInd] = dm;
+
+            REQUIRE_THROWS_WITH( 
+                setQuregToWeightedSum(sv, nullptr, inQuregs.data(), numIn), 
+                ContainsSubstring("inconsistent attributes") );
+
+            destroyQureg(sv);
+            destroyQureg(dm);
+        }
+
+        SECTION( "inconsistent qureg sizes" ) {
+
+            // must create new Quregs to ensure they are identically deployed
+            Qureg quregA = createCustomQureg(getNumCachedQubits(),     0,0,0,0);
+            Qureg quregB = createCustomQureg(getNumCachedQubits() + 1, 0,0,0,0);
+
+            // set all inQureg to quregA (as will be outQureg)
+            int numIn = 5;
+            vector<Qureg> inQuregs(numIn, quregA);
+
+            // set one to quregB
+            int badInd = GENERATE_COPY( range(0,numIn) );
+            inQuregs[badInd] = quregB;
+
+            REQUIRE_THROWS_WITH( 
+                setQuregToWeightedSum(quregA, nullptr, inQuregs.data(), numIn), 
+                ContainsSubstring("inconsistent attributes") );
+
+            destroyQureg(quregA);
+            destroyQureg(quregB);
+        }
+
+        SECTION( "inconsistent qureg deployments" ) {
+
+            // we do not necessarily have differently-distributed/GPU Quregs at
+            // runtime, so we enumerate all deployments and test when they differ
+
+            for (auto& [label, badQureg]: getCachedStatevecs()) {
+
+                if ((badQureg.isGpuAccelerated == qureg.isGpuAccelerated) &&
+                    (badQureg.isDistributed    == qureg.isDistributed))
+                    continue;
+
+                // set all inQureg to qureg (as will be outQureg)
+                int numIn = 5;
+                vector<Qureg> inQuregs(numIn, qureg);
+
+                // set one to badQureg
+                int badInd = GENERATE_COPY( range(0,numIn) );
+                inQuregs[badInd] = badQureg;
+
+                REQUIRE_THROWS_WITH( 
+                    setQuregToWeightedSum(qureg, nullptr, inQuregs.data(), numIn), 
+                    ContainsSubstring("inconsistent attributes") );
+            }
+
+            // automatically pass when there are no differing deployments
+            SUCCEED( );
+        }
+
+        SECTION( "different number of quregs and coeffs") {
+
+            // relevant only to the C++ overload
+
+            qcomp coeff = getQcomp(0,0);
+
+            REQUIRE_THROWS_WITH( 
+                setQuregToWeightedSum(qureg, {coeff,coeff}, {qureg}), 
+                ContainsSubstring("different number of coefficients") );
+        }
+    }
+}
+
+
+TEST_CASE( "setQuregToMixture", TEST_CATEGORY ) {
+
+    SECTION( LABEL_CORRECTNESS ) {
+
+        // @todo
+        // below, we test when every inQureg is unique and distinct
+        // from the outQureg, and so do not test the valid scenarios of:
+        // - outQureg being among inQuregs
+        // - one or more inQuregs being repeated
+        // However, both CPU and GPU implementations are sufficiently
+        // trivial to validate by inspection (eep...)
+
+        // compile-time optimisations apply for <= 5
+        int numInQuregs = GENERATE( 1, 2, 3, 4, 5, 6, 20 );
+        CAPTURE( numInQuregs );
+
+        vector<qreal> probs = getRandomProbabilities(numInQuregs);
+
+        // we must pass identically-deployed inQureg as outQureg,
+        // which itself gets tested being each possible deployment,
+        // so we defer allocation of the inQuregs
+        vector<Qureg> inQuregs(numInQuregs);
+
+        SECTION( LABEL_DENSMATR ) {
+
+            // generate (unnormalised) input reference matrices
+            vector<qmatrix> inMatrRefs(numInQuregs);
+            for (int i=0; i<numInQuregs; i++)
+                inMatrRefs[i] = getRandomMatrix(getPow2(getNumCachedQubits()));
+                
+            // compute output reference matrix
+            qmatrix outMatrRef = getZeroMatrix(getPow2(getNumCachedQubits()));
+            for (int i=0; i<numInQuregs; i++)
+                outMatrRef += probs[i] * inMatrRefs[i];
+
+            auto apiFunc = [&](Qureg outQureg) { 
+
+                // prepare input quregs
+                for (int i=0; i<numInQuregs; i++) {
+                    inQuregs[i] = createCloneQureg(outQureg);
+                    setQuregToReference(inQuregs[i], inMatrRefs[i]);
+                }
+
+                // modify outQureg
+                setQuregToMixture(outQureg, probs.data(), inQuregs.data(), numInQuregs);
+
+                // free input quregs
+                for (int i=0; i<numInQuregs; i++)
+                    destroyQureg(inQuregs[i]);
+            };
+
+            TEST_ON_CACHED_QUREGS(getCachedDensmatrs(), apiFunc, outMatrRef);
+        }
+    }
+
+    SECTION( LABEL_VALIDATION ) {
+
+        // arbitrary existing qureg
+        Qureg qureg = getCachedDensmatrs().begin()->second;
+
+        SECTION( "out qureg uninitialised" ) {
+
+            // spoof uninitialised value to be sure
+            Qureg badQureg;
+            badQureg.numQubits = -123;
+
+            REQUIRE_THROWS_WITH( 
+                setQuregToMixture(badQureg, nullptr, nullptr, 1), 
+                ContainsSubstring("invalid Qureg") );
+        }
+
+        SECTION( "in qureg uninitialised" ) {
+
+            // set all inQureg to arbitrary existing qureg
+            int numIn = 5;
+            vector<Qureg> inQuregs(numIn, qureg);
+
+            // hide an uninitialised qureg among them
+            Qureg badQureg;
+            badQureg.numQubits = -123;
+            int badInd = GENERATE_COPY( range(0,numIn) );
+            inQuregs[badInd] = badQureg;
+
+            REQUIRE_THROWS_WITH( 
+                setQuregToMixture(qureg, nullptr, inQuregs.data(), numIn), 
+                ContainsSubstring("invalid Qureg") );
+        }
+
+        SECTION( "out qureg is statevector" ) {
+
+            Qureg badQureg = getCachedStatevecs().begin()->second;
+
+            REQUIRE_THROWS_WITH( 
+                setQuregToMixture(badQureg, nullptr, nullptr, 1), 
+                ContainsSubstring("received a statevector") );
+        }
+
+        SECTION( "in qureg is statevector" ) {
+
+            // set all inQureg to arbitrary existing density matrix
+            int numIn = 5;
+            vector<Qureg> inQuregs(numIn, qureg);
+
+            // hide a statevector among them (at each index in turn)
+            int badInd = GENERATE_COPY( range(0,numIn) );
+            inQuregs[badInd] = getCachedStatevecs().begin()->second;
+
+            REQUIRE_THROWS_WITH( 
+                setQuregToMixture(qureg, nullptr, inQuregs.data(), numIn), 
+                ContainsSubstring("One or more Quregs were statevectors") );
+        }
+
+        SECTION( "number of quregs" ) {
+
+            int numIn = GENERATE( -1, 0 );
+
+            REQUIRE_THROWS_WITH( 
+                setQuregToMixture(qureg, nullptr, nullptr, numIn), 
+                ContainsSubstring("number of passed Quregs") && ContainsSubstring("is invalid") );
+        }
+
+        SECTION( "inconsistent qureg sizes" ) {
+
+            // must create new Quregs to ensure they are identically deployed
+            Qureg quregA = createCustomQureg(getNumCachedQubits(),     1,0,0,0);
+            Qureg quregB = createCustomQureg(getNumCachedQubits() + 1, 1,0,0,0);
+
+            // set all inQureg to quregA (as will be outQureg)
+            int numIn = 5;
+            vector<Qureg> inQuregs(numIn, quregA);
+
+            // set one to quregB
+            int badInd = GENERATE_COPY( range(0,numIn) );
+            inQuregs[badInd] = quregB;
+
+            REQUIRE_THROWS_WITH( 
+                setQuregToMixture(quregA, nullptr, inQuregs.data(), numIn), 
+                ContainsSubstring("inconsistent attributes") );
+
+            destroyQureg(quregA);
+            destroyQureg(quregB);
+        }
+
+        SECTION( "inconsistent qureg deployments" ) {
+
+            // we do not necessarily have differently-distributed/GPU Quregs at
+            // runtime, so we enumerate all deployments and test when they differ
+
+            for (auto& [label, badQureg]: getCachedDensmatrs()) {
+
+                if ((badQureg.isGpuAccelerated == qureg.isGpuAccelerated) &&
+                    (badQureg.isDistributed    == qureg.isDistributed))
+                    continue;
+
+                // set all inQureg to qureg (as will be outQureg)
+                int numIn = 5;
+                vector<Qureg> inQuregs(numIn, qureg);
+
+                // set one to badQureg
+                int badInd = GENERATE_COPY( range(0,numIn) );
+                inQuregs[badInd] = badQureg;
+
+                REQUIRE_THROWS_WITH( 
+                    setQuregToMixture(qureg, nullptr, inQuregs.data(), numIn), 
+                    ContainsSubstring("inconsistent attributes") );
+            }
+
+            // automatically pass when there are no differing deployments
+            SUCCEED( );
+        }
+
+        SECTION( "invalid probs" ) {
+
+            // set all inQureg to arbitrary existing density matrix
+            int numIn = 5;
+            vector<Qureg> inQuregs(numIn, qureg);
+
+            // get valid probabilities then mess one up
+            int badInd = GENERATE_COPY( range(0,numIn) );
+            vector<qreal> probs = getRandomProbabilities(numIn);
+            probs[badInd] = GENERATE( -1., -0.1, 1.1, 2. );
+
+            REQUIRE_THROWS_WITH( 
+                setQuregToMixture(qureg, probs.data(), inQuregs.data(), numIn), 
+                ContainsSubstring("One or more given probabilities are invalid") );
+        }
+
+        SECTION( "unnormalised probs" ) {
+
+            // set all inQureg to arbitrary existing density matrix
+            int numIn = 5;
+            vector<Qureg> inQuregs(numIn, qureg);
+
+            // these illegal non-unity values assume eps < 0.1
+            qreal probSum = GENERATE( 0.9, 1.1 );
+            vector<qreal> probs(numIn, probSum / numIn);
+
+            REQUIRE_THROWS_WITH( 
+                setQuregToMixture(qureg, probs.data(), inQuregs.data(), numIn), 
+                ContainsSubstring("probabilities do not sum to") && ContainsSubstring("one") );
+        }
+
+        SECTION( "different number of quregs and probs") {
+
+            // relevant only to the C++ overload
+
+            qreal prob = 0;
+
+            REQUIRE_THROWS_WITH( 
+                setQuregToMixture(qureg, {prob,prob}, {qureg}), 
+                ContainsSubstring("different number of probabilities") );
+        }
+    }
+}
+
+
 /** @} (end defgroup) */
 
 
@@ -418,8 +840,9 @@ TEST_CASE( "setQuregToPauliStrSum", TEST_CATEGORY ) {
  * UNTESTED FUNCTIONS
  */
 
-// these require we deploy the Quregs differently
-// to thoroughly test all QuEST control flows
+// these require we deploy each passed Qureg distinctly
+// to thoroughly test all QuEST control flows, for which
+// we do not yet have the macros/scaffolding
 
 void initPureState(Qureg qureg, Qureg pure);
 

From ab8b5603abeea8da52949edeb43005fc37d66d9a Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Fri, 22 Aug 2025 19:02:19 +0200
Subject: [PATCH 22/32] removed setQuregToSuperposition()

since superseded by setQuregToWeightedSum().

Additionally defined internal convenience functions...
- localiser_statevec_scaleAmps
- localiser_statevec_setQuregToClone
which merely call localiser_statevec_setQuregToWeightedSum, for code clarity
---
 quest/include/deprecated.h                    | 21 ++++++----
 quest/include/initialisations.h               | 22 -----------
 quest/include/operations.h                    |  8 ++--
 quest/src/api/initialisations.cpp             | 20 ++--------
 quest/src/api/multiplication.cpp              |  8 ++--
 quest/src/core/accelerator.cpp                |  9 -----
 quest/src/core/accelerator.hpp                |  2 -
 quest/src/core/errors.cpp                     | 11 ------
 quest/src/core/errors.hpp                     |  3 --
 quest/src/core/localiser.cpp                  | 21 +++++-----
 quest/src/core/localiser.hpp                  |  6 ++-
 quest/src/core/validation.cpp                 | 38 -------------------
 quest/src/core/validation.hpp                 |  2 -
 quest/src/cpu/cpu_subroutines.cpp             | 15 --------
 quest/src/cpu/cpu_subroutines.hpp             |  2 -
 quest/src/gpu/gpu_subroutines.cpp             | 12 ------
 quest/src/gpu/gpu_subroutines.hpp             |  2 -
 quest/src/gpu/gpu_thrust.cuh                  | 24 ------------
 .../deprecated/test_state_initialisations.cpp | 18 ++++-----
 tests/unit/initialisations.cpp                |  2 -
 20 files changed, 51 insertions(+), 195 deletions(-)

diff --git a/quest/include/deprecated.h b/quest/include/deprecated.h
index 47df35ad5..1bc8bda5f 100644
--- a/quest/include/deprecated.h
+++ b/quest/include/deprecated.h
@@ -865,16 +865,23 @@ static inline void _applyGateSubDiagonalOp(Qureg qureg, int* targets, int numTar
     _WARN_FUNC_RENAMED("cloneQureg()", "setQuregToClone()") \
     setQuregToClone(__VA_ARGS__)
 
+
+
+static inline void _setWeightedQureg(qcomp f1, Qureg q1, qcomp f2, Qureg q2, qcomp fOut, Qureg qOut) {
+    qcomp coeffs[] = {fOut, f1, f2};
+    Qureg quregs[] = {qOut, q1, q2};
+    setQuregToWeightedSum(qOut, coeffs, quregs, 3);
+}
+
 #define setWeightedQureg(f1, q1, f2, q2, fOut, qOut) \
     _WARN_GENERAL_MSG( \
         "The QuEST function 'setWeightedQureg(f1,q1, f2,q2, fOut,qOut)' is deprecated, and replaced with " \
-        "'setQuregToSuperposition(fOut,qOut, f1,q1, f2,q2)' which has been automatically invoked. The new " \
-        "fucntion however accepts only statevectors, not density matrices, so may error at runtime. Beware " \
-        "that the order of the arguments has changed, so that the first supplied Qureg is modified." ) \
-    setQuregToSuperposition( \
-        getQcomp(fOut.real, fOut.imag), qOut, \
-        getQcomp(f1.real, f1.imag), q1, \
-        getQcomp(f2.real, f2.imag), q2)
+        "'setQuregToWeightedSum(qOut, {f1,f2,...}, {q1,q2,..}, len)' which has been automatically invoked. " \
+        "Beware that the order of the arguments has changed, so that the first supplied Qureg is modified." ) \
+    _setWeightedQureg( \
+        _GET_QCOMP_FROM_COMPLEX_STRUCT(f1) ,q1, \
+        _GET_QCOMP_FROM_COMPLEX_STRUCT(f2) ,q2, \
+        _GET_QCOMP_FROM_COMPLEX_STRUCT(fOut), qOut)
 
 
 
diff --git a/quest/include/initialisations.h b/quest/include/initialisations.h
index 05de64588..99f415bd3 100644
--- a/quest/include/initialisations.h
+++ b/quest/include/initialisations.h
@@ -126,28 +126,6 @@ void setQuregToWeightedSum(Qureg out, qcomp* coeffs, Qureg* in, int numIn);
 void setQuregToMixture(Qureg out, qreal* probs, Qureg* in, int numIn);
 
 
-/** @notyetdoced
- * @notyettested
- * 
- * @formulae
- * 
- * Let @f$ f_{\text{out}} = @f$ @p facOut, @f$ f_1 = @f$ @p fac1 and @f$ f_2 = @f$ @p fac2.
- * Similarly, let @f$ \psi_{\text{out}} = @f$ @p out, @f$ \psi_{1} = @f$ @p qureg1 and @f$ \psi_{2} = @f$ @p qureg2.
- * 
- * This function modifies only @p facOut to become
- * @f[
-     |\psi_{\text{out}}\rangle \; \rightarrow \;
-        f_{\text{out}} |\psi_{\text{out}}\rangle \, + \,
-        f_1 |\psi_1\rangle \, + \,
-        f_2 |\psi_2\rangle.
- * @f]
- *
- * All factors are unconstrained and are permitted to be zero, and the same @p Qureg can be duplicated among
- * all arguments.
- */
-void setQuregToSuperposition(qcomp facOut, Qureg out, qcomp fac1, Qureg qureg1, qcomp fac2, Qureg qureg2);
-
-
 /// @notyetdoced
 /// @notyetvalidated
 qreal setQuregToRenormalized(Qureg qureg);
diff --git a/quest/include/operations.h b/quest/include/operations.h
index 075ab6689..0108af431 100644
--- a/quest/include/operations.h
+++ b/quest/include/operations.h
@@ -1670,14 +1670,16 @@ extern "C" {
           - \iu  \sin\left( \frac{\theta}{2} \right) \, \hat{\sigma},
  *   @f]
  *   this function is equivalent to (but much faster than) effecting @f$ \hat{\sigma} @f$
- *   upon a clone which is subsequently superposed.
+ *   upon a clone which is subsequently combined.
  *   ```
      // prepare |temp> = str |qureg>
      Qureg temp = createCloneQureg(qureg);
      applyPauliStr(temp, str);
 
      // set |qureg> = cos(theta/2) |qureg> - i sin(theta/2) str |qureg>
-     setQuregToSuperposition(cos(theta/2), qureg, - 1.0i * sin(theta/2), temp, 0, temp);
+     qcomp coeffs[] = {cos(theta/2), -1i * sin(theta/2)};
+     Qureg quregs[] = {qureg, temp};
+     setQuregToWeightedSum(qureg, coeffs, quregs, 2);
  *   ```
  * - When @p str contains only @f$ \hat{Z} @f$ or @f$ \id @f$ Paulis, this function will
  *   automatically invoke applyPhaseGadget() which leverages an optimised implementation.
@@ -1686,7 +1688,7 @@ extern "C" {
  *   unchanged.
  *   ```
      qcomp factor = cexp(- theta / 2 * 1.i);
-     setQuregToSuperposition(factor, qureg, 0,qureg,0,qureg);
+     setQuregToWeightedSum(qureg, &factor, &qureg, 1);
  *   ```
  * - Passing @p angle=0 is equivalent to effecting the identity, leaving the state unchanged.
  *
diff --git a/quest/src/api/initialisations.cpp b/quest/src/api/initialisations.cpp
index c3f6bb236..e027ae7ed 100644
--- a/quest/src/api/initialisations.cpp
+++ b/quest/src/api/initialisations.cpp
@@ -33,7 +33,6 @@ using std::vector;
 extern "C" {
 
 
-
 /*
  * INIT
  */
@@ -72,12 +71,9 @@ void initPureState(Qureg qureg, Qureg pure) {
     validate_quregFields(pure, __func__);
     validate_quregCanBeInitialisedToPureState(qureg, pure, __func__);
 
-    // when qureg=statevec, we lazily invoke setQuregToSuperposition which
-    // invokes superfluous floating-point operations which will be happily
-    // occluded by the memory movement costs
     (qureg.isDensityMatrix)?
         localiser_densmatr_initPureState(qureg, pure):
-        localiser_statevec_setQuregToSuperposition(0, qureg, 1, pure, 0, pure);
+        localiser_statevec_setQuregToClone(qureg, pure);
 }
 
 
@@ -185,17 +181,7 @@ void setQuregToClone(Qureg targetQureg, Qureg copyQureg) {
     // appreciable slowdown since simulation is memory-bound
     (targetQureg.isDensityMatrix)?
         localiser_densmatr_mixQureg(0, targetQureg, 1, copyQureg):
-        localiser_statevec_setQuregToSuperposition(0, targetQureg, 1, copyQureg, 0, copyQureg);
-}
-
-
-void setQuregToSuperposition(qcomp facOut, Qureg out, qcomp fac1, Qureg qureg1, qcomp fac2, Qureg qureg2) {
-    validate_quregFields(out, __func__);
-    validate_quregFields(qureg1, __func__);
-    validate_quregFields(qureg2, __func__);
-    validate_quregsCanBeSuperposed(out, qureg1, qureg2, __func__); // asserts statevectors
-
-    localiser_statevec_setQuregToSuperposition(facOut, out, fac1, qureg1, fac2, qureg2);
+        localiser_statevec_setQuregToClone(targetQureg, copyQureg);
 }
 
 
@@ -207,7 +193,7 @@ qreal setQuregToRenormalized(Qureg qureg) {
 
     qreal norm = (qureg.isDensityMatrix)? prob : std::sqrt(prob);
     qreal fac = 1 / norm;
-    localiser_statevec_setQuregToSuperposition(fac, qureg, 0, qureg, 0, qureg);
+    localiser_statevec_scaleAmps(qureg, fac);
 
     return fac;
 }
diff --git a/quest/src/api/multiplication.cpp b/quest/src/api/multiplication.cpp
index 460a45895..9761735a5 100644
--- a/quest/src/api/multiplication.cpp
+++ b/quest/src/api/multiplication.cpp
@@ -644,13 +644,13 @@ void leftapplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace) {
     validate_pauliStrSumTargets(sum, qureg, __func__);
 
     // clone qureg to workspace, set qureg to blank
-    localiser_statevec_setQuregToSuperposition(0, workspace, 1, qureg, 0, qureg);
+    localiser_statevec_setQuregToClone(workspace, qureg);
     localiser_statevec_initUniformState(qureg, 0);
 
     // left-multiply each term in-turn, mixing into output qureg, then undo using idempotency
     for (qindex i=0; i<sum.numTerms; i++) {
         localiser_statevec_anyCtrlPauliTensor(workspace, {}, {}, sum.strings[i]);
-        localiser_statevec_setQuregToSuperposition(1, qureg, sum.coeffs[i], workspace, 0, workspace);
+        localiser_statevec_setQuregToWeightedSum(qureg, {1, sum.coeffs[i]}, {qureg, workspace});
         localiser_statevec_anyCtrlPauliTensor(workspace, {}, {}, sum.strings[i]);
     }
 
@@ -666,7 +666,7 @@ void rightapplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace) {
     validate_pauliStrSumTargets(sum, qureg, __func__);
 
     // clone qureg to workspace, set qureg to blank
-    localiser_statevec_setQuregToSuperposition(0, workspace, 1, qureg, 0, qureg);
+    localiser_statevec_setQuregToClone(workspace, qureg);
     localiser_statevec_initUniformState(qureg, 0);
 
     // post-multiply each term in-turn, mixing into output qureg, then undo using idempotency
@@ -675,7 +675,7 @@ void rightapplyPauliStrSum(Qureg qureg, PauliStrSum sum, Qureg workspace) {
         qcomp factor = paulis_getSignOfPauliStrConj(str); // undoes transpose
 
         localiser_statevec_anyCtrlPauliTensor(workspace, {}, {}, str, factor);
-        localiser_statevec_setQuregToSuperposition(1, qureg, sum.coeffs[i], workspace, 0, workspace);
+        localiser_statevec_setQuregToWeightedSum(qureg, {1, sum.coeffs[i]}, {qureg, workspace});
         localiser_statevec_anyCtrlPauliTensor(workspace, {}, {}, str, factor);
     }
 
diff --git a/quest/src/core/accelerator.cpp b/quest/src/core/accelerator.cpp
index 812e2cf25..7bdcc1709 100644
--- a/quest/src/core/accelerator.cpp
+++ b/quest/src/core/accelerator.cpp
@@ -556,15 +556,6 @@ void accel_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector<qcomp> coef
 }
 
 
-void accel_statevec_setQuregToSuperposition_sub(qcomp facOut, Qureg outQureg, qcomp fac1, Qureg inQureg1, qcomp fac2, Qureg inQureg2) {
-
-    // consult outQureg's deployment (other quregs should match, though we dangerously do not assert this post-validation)
-    (outQureg.isGpuAccelerated)?
-        gpu_statevec_setQuregToSuperposition_sub(facOut, outQureg, fac1, inQureg1, fac2, inQureg2):
-        cpu_statevec_setQuregToSuperposition_sub(facOut, outQureg, fac1, inQureg1, fac2, inQureg2); 
-}
-
-
 void accel_densmatr_mixQureg_subA(qreal outProb, Qureg out, qreal inProb, Qureg in) {
 
     // quregs are equally-sized density matrices and are equally-distributed... 
diff --git a/quest/src/core/accelerator.hpp b/quest/src/core/accelerator.hpp
index 600fe4655..be50e22da 100644
--- a/quest/src/core/accelerator.hpp
+++ b/quest/src/core/accelerator.hpp
@@ -234,8 +234,6 @@ void accel_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, vector<int>
 
 void accel_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector<qcomp> coeffs, vector<Qureg> inQuregs);
 
-void accel_statevec_setQuregToSuperposition_sub(qcomp facOut, Qureg outQureg, qcomp fac1, Qureg inQureg1, qcomp fac2, Qureg inQureg2);
-
 void accel_densmatr_mixQureg_subA(qreal outProb, Qureg out, qreal inProb, Qureg in);
 void accel_densmatr_mixQureg_subB(qreal outProb, Qureg out, qreal inProb, Qureg in);
 void accel_densmatr_mixQureg_subC(qreal outProb, Qureg out, qreal inProb);
diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp
index 8261b0ad5..4d9d77e16 100644
--- a/quest/src/core/errors.cpp
+++ b/quest/src/core/errors.cpp
@@ -563,17 +563,6 @@ void assert_quregDistribAndFullStateDiagMatrLocal(Qureg qureg, FullStateDiagMatr
         raiseInternalError("The FullStateDiagMatr was unexpectedly distributed.");
 }
 
-void assert_superposedQuregDimsAndDeploysMatch(Qureg facOut, Qureg in1, Qureg in2) {
-
-    if (
-        facOut.isDistributed    != in1.isDistributed    || in1.isDistributed    != in2.isDistributed    ||
-        facOut.isDensityMatrix  != in1.isDensityMatrix  || in1.isDensityMatrix  != in2.isDensityMatrix  ||
-        facOut.isGpuAccelerated != in1.isGpuAccelerated || in1.isGpuAccelerated != in2.isGpuAccelerated ||
-        facOut.numQubits        != in1.numQubits        || in1.numQubits        != in2.numQubits
-    )
-        raiseInternalError("An internal function *_setQuregToSuperposition() received Quregs of mismatching dimensions and/or deployments.");
-}
-
 
 
 /*
diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp
index c31d8df9a..dccaaf467 100644
--- a/quest/src/core/errors.hpp
+++ b/quest/src/core/errors.hpp
@@ -204,9 +204,6 @@ void assert_quregAndFullStateDiagMatrHaveSameDistrib(Qureg qureg, FullStateDiagM
 
 void assert_quregDistribAndFullStateDiagMatrLocal(Qureg qureg, FullStateDiagMatr matr);
 
-void assert_superposedQuregDimsAndDeploysMatch(Qureg facOut, Qureg in1, Qureg in2);
-
-
 
 /*
  * CPU ERRORS
diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp
index f62f33e8a..8e64814d1 100644
--- a/quest/src/core/localiser.cpp
+++ b/quest/src/core/localiser.cpp
@@ -676,6 +676,11 @@ void localiser_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, Pa
     accel_fullstatediagmatr_setElemsToPauliStrSum(out, in);
 }
 
+void localiser_statevec_scaleAmps(Qureg qureg, qcomp factor) {
+
+    localiser_statevec_setQuregToWeightedSum(qureg, {factor}, {qureg});
+}
+
 
 
 /*
@@ -1384,17 +1389,15 @@ void localiser_statevec_setQuregToWeightedSum(Qureg outQureg, vector<qcomp> coef
 }
 
 
-void localiser_statevec_setQuregToSuperposition(qcomp facOut, Qureg outQureg, qcomp fac1, Qureg inQureg1, qcomp fac2, Qureg inQureg2) {
+void localiser_statevec_setQuregToClone(Qureg out, Qureg in) {
 
     /// @todo
-    /// this function requires (as validated) distributions are identical.
-    /// It would be trivial to generalise this so that Qureg distributions
-    /// can differ (we merely spoof local Quregs, offsetting their memory).
-    /// They must still however be identically GPU-accelerated; this is a
-    /// low priority because this situation is non-sensical
+    /// we lazily re-use setQuregToWeightedSum(), inducing a gratuitous
+    /// x1 multiplication per element which we expect is completely
+    /// occluded by memory movement costs. We should check this and
+    /// potentially replace this function with (NUMA-aware?) memory copying!
 
-    // given Qureg dimensions must match, this is always embarrassingly parallel
-    accel_statevec_setQuregToSuperposition_sub(facOut, outQureg, fac1, inQureg1, fac2, inQureg2);
+    localiser_statevec_setQuregToWeightedSum(out, {1}, {in});
 }
 
 
@@ -2327,7 +2330,7 @@ void localiser_statevec_multiQubitProjector(Qureg qureg, vector<int> qubits, vec
     // all other nodes has some or all states consistent with suffix outcomes
     removePrefixQubitsAndStates(qureg, qubits, outcomes);
     (qubits.empty())?
-        accel_statevec_setQuregToSuperposition_sub(1/std::sqrt(prob), qureg,0,qureg, 0,qureg): // scale by norm
+        localiser_statevec_scaleAmps(qureg, 1/std::sqrt(prob)):
         accel_statevec_multiQubitProjector_sub(qureg, qubits, outcomes, prob);
 }
 
diff --git a/quest/src/core/localiser.hpp b/quest/src/core/localiser.hpp
index 7f4565322..b56ad92a4 100644
--- a/quest/src/core/localiser.hpp
+++ b/quest/src/core/localiser.hpp
@@ -48,6 +48,8 @@ void localiser_fullstatediagmatr_setElems(FullStateDiagMatr matr, qindex startIn
 
 void localiser_fullstatediagmatr_setElemsToPauliStrSum(FullStateDiagMatr out, PauliStrSum in);
 
+void localiser_statevec_scaleAmps(Qureg qureg, qcomp factor);
+
 
 /*
  * STATE INITIALISATION
@@ -127,9 +129,9 @@ void localiser_statevec_anyCtrlPhaseGadget(Qureg qureg, vector<int> ctrls, vecto
  * QUREG COMBINATION
  */
 
-void localiser_statevec_setQuregToWeightedSum(Qureg outQureg, vector<qcomp> coeffs, vector<Qureg> inQuregs);
+void localiser_statevec_setQuregToClone(Qureg out, Qureg in);
 
-void localiser_statevec_setQuregToSuperposition(qcomp facOut, Qureg outQureg, qcomp fac1, Qureg inQureg1, qcomp fac2, Qureg inQureg2);
+void localiser_statevec_setQuregToWeightedSum(Qureg outQureg, vector<qcomp> coeffs, vector<Qureg> inQuregs);
 
 void localiser_densmatr_mixQureg(qreal outProb, Qureg out, qreal inProb, Qureg in);
 
diff --git a/quest/src/core/validation.cpp b/quest/src/core/validation.cpp
index 330aef0c0..83935bf20 100644
--- a/quest/src/core/validation.cpp
+++ b/quest/src/core/validation.cpp
@@ -979,19 +979,6 @@ namespace report {
         "The given density matrix was local, but the statevector was distributed; this configuration is unsupported (and is ridiculous!).";
 
 
-    string SUPERPOSED_QUREGS_ARE_NOT_ALL_STATEVECTORS =
-        "Cannot superpose a density matrix. All quregs must be statevectors.";
-
-    string SUPERPOSED_QUREGS_HAVE_INCONSISTENT_NUM_QUBITS =
-        "Cannot superpose Quregs with differing numbers of qubits.";
-
-    string SUPERPOSED_QUREGS_HAVE_INCONSISTENT_GPU_DEPLOYMENT =
-        "Cannot superpose Quregs with inconsistent GPU deployments. All or no Quregs must be GPU-accelerated.";
-
-    string SUPERPOSED_QUREGS_HAVE_INCONSISTENT_DISTRIBUTION =
-        "Cannot superpose Quregs which are inconsistently distributed. All or no Quregs must be distributed.";
-
-
     string INIT_PURE_STATE_IS_DENSMATR =
         "The pure-state Qureg (the second argument) must be a statevector, not a density matrix.";
 
@@ -4080,31 +4067,6 @@ void validate_numQuregsMatchesProbs(size_t numQuregs, size_t numProbs, const cha
     assertThat(numQuregs == numProbs, report::DIFFERENT_NUM_QUREGS_AND_PROBS, vars, caller);
 }
 
-void validate_quregsCanBeSuperposed(Qureg qureg1, Qureg qureg2, Qureg qureg3, const char* caller) {
-
-    // all quregs must be statevectors
-    assertThat(
-        !qureg1.isDensityMatrix && !qureg2.isDensityMatrix && !qureg3.isDensityMatrix,
-        report::SUPERPOSED_QUREGS_ARE_NOT_ALL_STATEVECTORS, caller);
-
-    // and the same dimension
-    int nQb = qureg1.numQubits;
-    assertThat(
-        qureg2.numQubits == nQb && qureg3.numQubits == nQb, 
-        report::SUPERPOSED_QUREGS_HAVE_INCONSISTENT_NUM_QUBITS, caller);
-
-    // and all the same deployment (GPU & distribution; multithreading doesn't matter)
-    int isGpu = qureg1.isGpuAccelerated;
-    assertThat(
-        qureg2.isGpuAccelerated == isGpu && qureg3.isGpuAccelerated == isGpu, 
-        report::SUPERPOSED_QUREGS_HAVE_INCONSISTENT_GPU_DEPLOYMENT, caller);
-
-    int isDis = qureg1.isDistributed;
-    assertThat(
-        qureg2.isDistributed == isDis && qureg3.isDistributed == isDis, 
-        report::SUPERPOSED_QUREGS_HAVE_INCONSISTENT_DISTRIBUTION, caller);
-}
-
 void validateDensMatrCanBeInitialisedToPureState(Qureg qureg, Qureg pure, const char* caller) {
 
     // initPureState calls mixQureg which only additionally
diff --git a/quest/src/core/validation.hpp b/quest/src/core/validation.hpp
index 53b511d74..b0c08bc58 100644
--- a/quest/src/core/validation.hpp
+++ b/quest/src/core/validation.hpp
@@ -474,8 +474,6 @@ void validate_numQuregsMatchesCoeffs(size_t numQuregs, size_t numCoeffs, const c
 
 void validate_numQuregsMatchesProbs(size_t numQuregs, size_t numProbs, const char* caller);
 
-void validate_quregsCanBeSuperposed(Qureg qureg1, Qureg qureg2, Qureg qureg3, const char* caller);
-
 void validate_quregCanBeInitialisedToPureState(Qureg qureg, Qureg pure, const char* caller);
 
 void validate_quregsCanBeCloned(Qureg quregA, Qureg quregB, const char* caller);
diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp
index 8db92111f..26c80dabc 100644
--- a/quest/src/cpu/cpu_subroutines.cpp
+++ b/quest/src/cpu/cpu_subroutines.cpp
@@ -1057,21 +1057,6 @@ void cpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector<qcomp> coeffs
 }
 
 
-void cpu_statevec_setQuregToSuperposition_sub(qcomp facOut, Qureg outQureg, qcomp fac1, Qureg inQureg1, qcomp fac2, Qureg inQureg2) {
-
-    assert_superposedQuregDimsAndDeploysMatch(outQureg, inQureg1, inQureg2);
-
-    qindex numIts = outQureg.numAmpsPerNode;
-    qcomp* out = outQureg.cpuAmps;
-    qcomp* in1 = inQureg1.cpuAmps;
-    qcomp* in2 = inQureg2.cpuAmps;
-
-    #pragma omp parallel for if(outQureg.isMultithreaded)
-    for (qindex n=0; n<numIts; n++)
-        out[n] = (facOut * out[n]) + (fac1 * in1[n]) + (fac2 * in2[n]);
-}
-
-
 void cpu_densmatr_mixQureg_subA(qreal outProb, Qureg outQureg, qreal inProb, Qureg inDensMatr) {
 
     qindex numIts = outQureg.numAmpsPerNode;
diff --git a/quest/src/cpu/cpu_subroutines.hpp b/quest/src/cpu/cpu_subroutines.hpp
index ab98cd957..9da8fe199 100644
--- a/quest/src/cpu/cpu_subroutines.hpp
+++ b/quest/src/cpu/cpu_subroutines.hpp
@@ -102,8 +102,6 @@ template <int NumCtrls> void cpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qu
 
 template <int NumQuregs> void cpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector<qcomp> coeffs, vector<Qureg> inQuregs);
 
-void cpu_statevec_setQuregToSuperposition_sub(qcomp facOut, Qureg outQureg, qcomp fac1, Qureg inQureg1, qcomp fac2, Qureg inQureg2);
-
 void cpu_densmatr_mixQureg_subA(qreal outProb, Qureg outQureg, qreal inProb, Qureg inDensMatr);
 void cpu_densmatr_mixQureg_subB(qreal outProb, Qureg outQureg, qreal inProb, Qureg inStateVec);
 void cpu_densmatr_mixQureg_subC(qreal outProb, Qureg outQureg, qreal inProb);
diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp
index 607ebebb6..034d87f48 100644
--- a/quest/src/gpu/gpu_subroutines.cpp
+++ b/quest/src/gpu/gpu_subroutines.cpp
@@ -938,18 +938,6 @@ void gpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector<qcomp> coeffs
 }
 
 
-void gpu_statevec_setQuregToSuperposition_sub(qcomp facOut, Qureg outQureg, qcomp fac1, Qureg inQureg1, qcomp fac2, Qureg inQureg2) {
-
-#if COMPILE_CUDA || COMPILE_CUQUANTUM
-
-    thrust_statevec_setQuregToSuperposition_sub(toCuQcomp(facOut), outQureg, toCuQcomp(fac1), inQureg1, toCuQcomp(fac2), inQureg2);
-
-#else
-    error_gpuSimButGpuNotCompiled();
-#endif
-}
-
-
 void gpu_densmatr_mixQureg_subA(qreal outProb, Qureg outQureg, qreal inProb, Qureg inQureg) {
 
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
diff --git a/quest/src/gpu/gpu_subroutines.hpp b/quest/src/gpu/gpu_subroutines.hpp
index 35cf2839d..ff42c2239 100644
--- a/quest/src/gpu/gpu_subroutines.hpp
+++ b/quest/src/gpu/gpu_subroutines.hpp
@@ -95,8 +95,6 @@ template <int NumCtrls> void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qu
 
 template <int NumQuregs> void gpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector<qcomp> coeffs, vector<Qureg> inQuregs);
 
-void gpu_statevec_setQuregToSuperposition_sub(qcomp facOut, Qureg outQureg, qcomp fac1, Qureg inQureg1, qcomp fac2, Qureg inQureg2);
-
 void gpu_densmatr_mixQureg_subA(qreal outProb, Qureg outQureg, qreal inProb, Qureg inDensMatr);
 void gpu_densmatr_mixQureg_subB(qreal outProb, Qureg outQureg, qreal inProb, Qureg inStateVec);
 void gpu_densmatr_mixQureg_subC(qreal outProb, Qureg outQureg, qreal inProb);
diff --git a/quest/src/gpu/gpu_thrust.cuh b/quest/src/gpu/gpu_thrust.cuh
index b6e48beca..44a37fadd 100644
--- a/quest/src/gpu/gpu_thrust.cuh
+++ b/quest/src/gpu/gpu_thrust.cuh
@@ -386,21 +386,6 @@ struct functor_mixAmps : public thrust::binary_function<cu_qcomp,cu_qcomp,cu_qco
 };
 
 
-struct functor_superposeAmps {
-
-    // this functor linearly combines the given trio
-    // of amplitudes, weighted by fixed qcomps, and is
-    // used by setQuregToSuperposition
-
-    cu_qcomp fac0, fac1, fac2;
-    functor_superposeAmps(cu_qcomp f0, cu_qcomp f1, cu_qcomp f2) : fac0(f0), fac1(f1), fac2(f2) {}
-
-    template <typename Tuple> __host__ __device__ void operator()(Tuple t) {
-        thrust::get<0>(t) = fac0*thrust::get<0>(t) + fac1*thrust::get<1>(t) + fac2*thrust::get<2>(t);
-    }
-};
-
-
 template <bool HasPower, bool UseRealPow, bool Norm>
 struct functor_multiplyElemPowerWithAmpOrNorm : public thrust::binary_function<cu_qcomp,cu_qcomp,cu_qcomp> {
 
@@ -737,15 +722,6 @@ void thrust_densmatr_mixQureg_subA(qreal outProb, Qureg outQureg, qreal inProb,
 }
 
 
-void thrust_statevec_setQuregToSuperposition_sub(cu_qcomp facOut, Qureg outQureg, cu_qcomp fac1, Qureg inQureg1, cu_qcomp fac2, Qureg inQureg2) {
-
-    thrust::for_each(
-        thrust::make_zip_iterator(thrust::make_tuple(getStartPtr(outQureg), getStartPtr(inQureg1), getStartPtr(inQureg2))),
-        thrust::make_zip_iterator(thrust::make_tuple(getEndPtr(outQureg),   getEndPtr(inQureg1),   getEndPtr(inQureg2))),
-        functor_superposeAmps(facOut, fac1, fac2));
-}
-
-
 template <bool HasPower>
 void thrust_statevec_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, cu_qcomp exponent) {
 
diff --git a/tests/deprecated/test_state_initialisations.cpp b/tests/deprecated/test_state_initialisations.cpp
index 160721f7a..aca87f68f 100644
--- a/tests/deprecated/test_state_initialisations.cpp
+++ b/tests/deprecated/test_state_initialisations.cpp
@@ -743,14 +743,14 @@ TEST_CASE( "setWeightedQureg", "[state_initialisations]" ) {
             Complex f; f.real = 0; f.imag = 0;
             
             // two state-vecs, one density-matrix
-            REQUIRE_THROWS_WITH( setWeightedQureg(f, mat, f, vec, f, vec), ContainsSubstring("Cannot superpose a density matrix. All quregs must be statevectors") );
-            REQUIRE_THROWS_WITH( setWeightedQureg(f, vec, f, mat, f, vec), ContainsSubstring("Cannot superpose a density matrix. All quregs must be statevectors") );
-            REQUIRE_THROWS_WITH( setWeightedQureg(f, vec, f, vec, f, mat), ContainsSubstring("Cannot superpose a density matrix. All quregs must be statevectors") );
+            REQUIRE_THROWS_WITH( setWeightedQureg(f, mat, f, vec, f, vec), ContainsSubstring("inconsistent attributes") );
+            REQUIRE_THROWS_WITH( setWeightedQureg(f, vec, f, mat, f, vec), ContainsSubstring("inconsistent attributes") );
+            REQUIRE_THROWS_WITH( setWeightedQureg(f, vec, f, vec, f, mat), ContainsSubstring("inconsistent attributes") );
 
             // one state-vec, two density-matrices
-            REQUIRE_THROWS_WITH( setWeightedQureg(f, vec, f, mat, f, mat), ContainsSubstring("Cannot superpose a density matrix. All quregs must be statevectors") );
-            REQUIRE_THROWS_WITH( setWeightedQureg(f, mat, f, vec, f, mat), ContainsSubstring("Cannot superpose a density matrix. All quregs must be statevectors") );
-            REQUIRE_THROWS_WITH( setWeightedQureg(f, mat, f, mat, f, vec), ContainsSubstring("Cannot superpose a density matrix. All quregs must be statevectors") );
+            REQUIRE_THROWS_WITH( setWeightedQureg(f, vec, f, mat, f, mat), ContainsSubstring("inconsistent attributes") );
+            REQUIRE_THROWS_WITH( setWeightedQureg(f, mat, f, vec, f, mat), ContainsSubstring("inconsistent attributes") );
+            REQUIRE_THROWS_WITH( setWeightedQureg(f, mat, f, mat, f, vec), ContainsSubstring("inconsistent attributes") );
         
             destroyQureg(vec);
             destroyQureg(mat);
@@ -764,9 +764,9 @@ TEST_CASE( "setWeightedQureg", "[state_initialisations]" ) {
             Complex f; f.real = 0; f.imag = 0;
             
             // state-vecs
-            REQUIRE_THROWS_WITH( setWeightedQureg(f, vecA, f, vecB, f, vecB), ContainsSubstring("differing numbers of qubits") );
-            REQUIRE_THROWS_WITH( setWeightedQureg(f, vecB, f, vecA, f, vecB), ContainsSubstring("differing numbers of qubits") );
-            REQUIRE_THROWS_WITH( setWeightedQureg(f, vecB, f, vecB, f, vecA), ContainsSubstring("differing numbers of qubits") );
+            REQUIRE_THROWS_WITH( setWeightedQureg(f, vecA, f, vecB, f, vecB), ContainsSubstring("inconsistent attributes") );
+            REQUIRE_THROWS_WITH( setWeightedQureg(f, vecB, f, vecA, f, vecB), ContainsSubstring("inconsistent attributes") );
+            REQUIRE_THROWS_WITH( setWeightedQureg(f, vecB, f, vecB, f, vecA), ContainsSubstring("inconsistent attributes") );
             
             // v4 does not permit superposing density matrices
 
diff --git a/tests/unit/initialisations.cpp b/tests/unit/initialisations.cpp
index 758a863bd..8890c8abb 100644
--- a/tests/unit/initialisations.cpp
+++ b/tests/unit/initialisations.cpp
@@ -848,8 +848,6 @@ void initPureState(Qureg qureg, Qureg pure);
 
 void setQuregToClone(Qureg targetQureg, Qureg copyQureg);
 
-void setQuregToSuperposition(qcomp facOut, Qureg out, qcomp fac1, Qureg qureg1, qcomp fac2, Qureg qureg2);
-
 void setQuregToPartialTrace(Qureg out, Qureg in, int* traceOutQubits, int numTraceQubits);
 
 void setQuregToReducedDensityMatrix(Qureg out, Qureg in, int* retainQubits, int numRetainQubits);

From 851691dd518aaa8089cc01eff382360209269b3c Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Thu, 4 Sep 2025 15:08:36 -0400
Subject: [PATCH 23/32] optimised one-qubit Paulis to use applyCompMatr1 (#682)

As per #638. The previous use of the Pauli-specific multi-qubit backend logic was suboptimal for a single target, since it involved superfluous per-amplitude evaluation of bitstring parity. This introduced a performance regression in single-core QuEST v4 relative to v3, which used single-target matrix logic.

This affects the performance of all explicitly single-target Pauli functions. Specifically:
- applyPauliX()
- applyControlledPauliX()
- applyMultiControlledPauliX()
- applyMultiStateControlledPauliX()
- applyPauliY()
- applyControlledPauliY()
- applyMultiControlledPauliY()
- applyMultiStateControlledPauliY()
- applyPauliZ()
- applyControlledPauliZ()
- applyMultiControlledPauliZ()
- applyMultiStateControlledPauliZ()
- applyRotateX()
- applyControlledRotateX()
- applyMultiControlledRotateX()
- applyMultiStateControlledRotateX()
- applyRotateY()
- applyControlledRotateY()
- applyMultiControlledRotateY()
- applyMultiStateControlledRotateY()
- applyRotateZ()
- applyControlledRotateZ()
- applyMultiControlledRotateZ()
- applyMultiStateControlledRotateZ()

which are concisely summarised as X cX ccX csX Y cY ccY csY Z cZ ccZ csZ Rx cRx ccRx csRx Ry cRy ccRy csRy Rz cRz ccRz csRz.

Beware this does not affect the performance of functions which accept many targets, such as applyPauliStr() and applyPauliGadget(), even when they are incidentally passed a single target. Note too that further changes are expected to be necessary to recover single-core v3 performance.
---
 quest/src/api/operations.cpp | 55 +++++++++++++++++++++++++++---------
 quest/src/core/utilities.cpp | 48 +++++++++++++++++++++++++++++++
 quest/src/core/utilities.hpp |  8 ++++++
 3 files changed, 98 insertions(+), 13 deletions(-)

diff --git a/quest/src/api/operations.cpp b/quest/src/api/operations.cpp
index a2b09d84e..e458605a7 100644
--- a/quest/src/api/operations.cpp
+++ b/quest/src/api/operations.cpp
@@ -859,8 +859,18 @@ void applyMultiStateControlledPauliX(Qureg qureg, int* controls, int* states, in
     validate_controlsAndTarget(qureg, controls, numControls, target, __func__);
     validate_controlStates(states, numControls, __func__); // permits states==nullptr
 
-    // harmlessly re-validates
-    applyMultiStateControlledPauliStr(qureg, controls, states, numControls, getPauliStr("X", {target}));
+    // note that for the single-target scenario, we do not call the backend of
+    // applyMultiStateControlledPauliStr() since it contains sub-optimal logic
+    // which sees the factor of every amplitude dynamically evaluated (based on
+    // index parity, etc); the dense-matrix element lookup is faster
+
+    /// @todo
+    /// a bespoke all-pauli-X function (like in QuEST v3) will be faster still 
+    /// since it avoids all superfluous flops; check worthwhile for multi-qubit
+
+    // harmlessly re-validates, including hardcoded matrix unitarity
+    CompMatr1 matrix = util_getPauliX();
+    validateAndApplyAnyCtrlAnyTargUnitaryMatrix(qureg, controls, states, numControls, &target, 1, matrix, __func__);
 }
 
 void applyMultiStateControlledPauliY(Qureg qureg, int* controls, int* states, int numControls, int target) {
@@ -868,8 +878,9 @@ void applyMultiStateControlledPauliY(Qureg qureg, int* controls, int* states, in
     validate_controlsAndTarget(qureg, controls, numControls, target, __func__);
     validate_controlStates(states, numControls, __func__); // permits states==nullptr
 
-    // harmlessly re-validates
-    applyMultiStateControlledPauliStr(qureg, controls, states, numControls, getPauliStr("Y", {target}));
+    // harmlessly re-validates, including hardcoded matrix unitarity
+    CompMatr1 matrix = util_getPauliY();
+    validateAndApplyAnyCtrlAnyTargUnitaryMatrix(qureg, controls, states, numControls, &target, 1, matrix, __func__);
 }
 
 void applyMultiStateControlledPauliZ(Qureg qureg, int* controls, int* states, int numControls, int target)  {
@@ -877,9 +888,9 @@ void applyMultiStateControlledPauliZ(Qureg qureg, int* controls, int* states, in
     validate_controlsAndTarget(qureg, controls, numControls, target, __func__);
     validate_controlStates(states, numControls, __func__); // permits states==nullptr
 
-    // harmlessly re-validates
-    DiagMatr1 matr = getDiagMatr1({1, -1});
-    applyMultiStateControlledDiagMatr1(qureg, controls, states, numControls, target, matr);
+    // harmlessly re-validates, including hardcoded matrix unitarity
+    DiagMatr1 matrix = util_getPauliZ();
+    validateAndApplyAnyCtrlAnyTargUnitaryMatrix(qureg, controls, states, numControls, &target, 1, matrix, __func__);
 }
 
 } // end de-mangler
@@ -1077,8 +1088,14 @@ void applyMultiStateControlledRotateX(Qureg qureg, int* controls, int* states, i
     validate_controlsAndTarget(qureg, controls, numControls, target, __func__);
     validate_controlStates(states, numControls, __func__); // permits states==nullptr
 
-    // harmlessly re-validates
-    applyMultiStateControlledPauliGadget(qureg, controls, states, numControls, getPauliStr("X", {target}), angle);
+    // note that for the single-target scenario, we do not call the backend of
+    // applyMultiStateControlledPauliGadget() since it contains sub-optimal logic
+    // which sees the factor of every amplitude dynamically evaluated (based on
+    // index parity, etc); the dense-matrix element lookup is faster
+
+    // harmlessly re-validates, including hardcoded matrix unitarity
+    CompMatr1 matrix = util_getExpPauliX(angle);
+    validateAndApplyAnyCtrlAnyTargUnitaryMatrix(qureg, controls, states, numControls, &target, 1, matrix, __func__);
 }
 
 void applyMultiStateControlledRotateY(Qureg qureg, int* controls, int* states, int numControls, int target, qreal angle) {
@@ -1086,8 +1103,14 @@ void applyMultiStateControlledRotateY(Qureg qureg, int* controls, int* states, i
     validate_controlsAndTarget(qureg, controls, numControls, target, __func__);
     validate_controlStates(states, numControls, __func__); // permits states==nullptr
 
-    // harmlessly re-validates
-    applyMultiStateControlledPauliGadget(qureg, controls, states, numControls, getPauliStr("Y", {target}), angle);
+    // note that for the single-target scenario, we do not call the backend of
+    // applyMultiStateControlledPauliGadget() since it contains sub-optimal logic
+    // which sees the factor of every amplitude dynamically evaluated (based on
+    // index parity, etc); the dense-matrix element lookup is faster
+
+    // harmlessly re-validates, including hardcoded matrix unitarity
+    CompMatr1 matrix = util_getExpPauliY(angle);
+    validateAndApplyAnyCtrlAnyTargUnitaryMatrix(qureg, controls, states, numControls, &target, 1, matrix, __func__);
 }
 
 void applyMultiStateControlledRotateZ(Qureg qureg, int* controls, int* states, int numControls, int target, qreal angle) {
@@ -1095,8 +1118,14 @@ void applyMultiStateControlledRotateZ(Qureg qureg, int* controls, int* states, i
     validate_controlsAndTarget(qureg, controls, numControls, target, __func__);
     validate_controlStates(states, numControls, __func__); // permits states==nullptr
 
-    // harmlessly re-validates
-    applyMultiStateControlledPauliGadget(qureg, controls, states, numControls, getPauliStr("Z", {target}), angle);
+    // note that for the single-target scenario, we do not call the backend of
+    // applyMultiStateControlledPauliGadget() since it contains sub-optimal logic
+    // which sees the factor of every amplitude dynamically evaluated (based on
+    // index parity, etc); the dense-matrix element lookup is faster
+
+    // harmlessly re-validates, including hardcoded matrix unitarity
+    DiagMatr1 matrix = util_getExpPauliZ(angle);
+    validateAndApplyAnyCtrlAnyTargUnitaryMatrix(qureg, controls, states, numControls, &target, 1, matrix, __func__);
 }
 
 } // end de-mangler
diff --git a/quest/src/core/utilities.cpp b/quest/src/core/utilities.cpp
index aba32ef09..4966d1916 100644
--- a/quest/src/core/utilities.cpp
+++ b/quest/src/core/utilities.cpp
@@ -1007,6 +1007,54 @@ qcomp util_getPhaseFromGateAngle(qcomp angle) {
     return - angle / 2;
 }
 
+CompMatr1 util_getPauliX() {
+    return getCompMatr1({
+        {0,1},
+        {1,0}
+    });
+}
+CompMatr1 util_getPauliY() {
+    return getCompMatr1({
+        {0,qcomp(0,-1)},
+        {qcomp(0,1),0}
+    });
+}
+DiagMatr1 util_getPauliZ() {
+    return getDiagMatr1({1,-1});
+}
+
+CompMatr1 util_getExpPauliX(qreal angle) {
+
+    qreal x = util_getPhaseFromGateAngle(angle);
+    qreal c = std::cos(x);
+    qreal s = std::sin(x);
+
+    return getCompMatr1({
+        {qcomp(c,0), qcomp(0,s)},
+        {qcomp(0,s), qcomp(c,0)}
+    });
+}
+
+CompMatr1 util_getExpPauliY(qreal angle) {
+
+    qreal x = util_getPhaseFromGateAngle(angle);
+    qreal c = std::cos(x);
+    qreal s = std::sin(x);
+
+    return getCompMatr1({
+        { c, s},
+        {-s, c}
+    });
+}
+
+DiagMatr1 util_getExpPauliZ(qreal angle) {
+
+    qreal x = util_getPhaseFromGateAngle(angle);
+    qcomp y = qcomp(0, x);
+
+    return getDiagMatr1({std::exp(y), std::exp(-y)});
+}
+
 
 
 /*
diff --git a/quest/src/core/utilities.hpp b/quest/src/core/utilities.hpp
index e3a92a12c..cb2d8e713 100644
--- a/quest/src/core/utilities.hpp
+++ b/quest/src/core/utilities.hpp
@@ -365,6 +365,14 @@ std::pair<qindex, qindex> util_getBlockMultipleSubRange(qindex rangeLen, qindex
 qreal util_getPhaseFromGateAngle(qreal angle);
 qcomp util_getPhaseFromGateAngle(qcomp angle);
 
+CompMatr1 util_getPauliX();
+CompMatr1 util_getPauliY();
+DiagMatr1 util_getPauliZ();
+
+CompMatr1 util_getExpPauliX(qreal angle);
+CompMatr1 util_getExpPauliY(qreal angle);
+DiagMatr1 util_getExpPauliZ(qreal angle);
+
 
 
 /*

From 5bd864b5881ab783465bde3aebf2195216641ec1 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Thu, 4 Sep 2025 15:38:26 -0400
Subject: [PATCH 24/32] added news doc

---
 docs/README.md |   1 +
 docs/news.md   | 160 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 161 insertions(+)
 create mode 100644 docs/news.md

diff --git a/docs/README.md b/docs/README.md
index 4bf0ea018..5d164316d 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -35,6 +35,7 @@ Want to learn how what's under the hood? Read the
 - 🏆  [whitepaper](https://www.nature.com/articles/s41598-019-47174-9) which featured in Scientific Report's [Top 100 in Physics](https://www.nature.com/collections/ecehgdfcba/)
 - 📝  [preprint](https://arxiv.org/abs/2311.01512) which derives `v4`'s optimised algorithms.
 - 🧪  [tests](/tests) which compare QuEST's outputs to non-optimised calculations.
+- 📰  [news](news.md) which summarises QuEST's history and accolades.
 - 📈  [benchmarks](https://www.youtube.com/watch?v=dQw4w9WgXcQ) which are coming soon!
 
 
diff --git a/docs/news.md b/docs/news.md
new file mode 100644
index 000000000..ef7c6da1d
--- /dev/null
+++ b/docs/news.md
@@ -0,0 +1,160 @@
+# 📰  News
+
+<!--
+  Random assortment of QuEST-related news, in reverse chronology
+  (this comment must be under the title for valid doxygen rendering)
+
+  @author Tyson Jones
+-->
+
+
+## 🧭  Development
+
+
+### 2025
+
+  - Oliver Brown of EPCC leads QuEST development (`v4.3`+)
+
+
+### 2018
+
+  - Tyson Jones of QTechTheory leads QuEST development (`v1.1` - `v4.2`)
+
+
+### 2017
+
+  - Mihai Duta and Simon Benjamin begin QuEST development (`v0.1` - `v0.9`)
+  - Anna (Ania) Brown leads QuEST development (`v0.9` - `v1.0`)
+
+
+
+## 🏆  Recognitions
+
+
+### 2025
+
+  - QuEST is an invited project to [unitaryHACK 2025](https://unitaryhack.dev/) with challenges [#600](https://github.com/QuEST-Kit/QuEST/issues/600), [#594](https://github.com/QuEST-Kit/QuEST/issues/594), [#599](https://github.com/QuEST-Kit/QuEST/issues/599), [#598](https://github.com/QuEST-Kit/QuEST/issues/598), [#596](https://github.com/QuEST-Kit/QuEST/issues/596) and [#595](https://github.com/QuEST-Kit/QuEST/issues/595)
+  - QuEST is a highlighted integration on the NVIDIA [cuQuantum site](https://developer.nvidia.com/cuquantum-sdk)
+
+
+### 2024
+
+  - QuEST ranks 3rd in Quantum Insider's [Top 63 Quantum Computer Simulators for 2024](https://thequantuminsider.com/2022/06/14/top-63-quantum-computer-simulators-for-2022/)
+
+
+### 2023
+
+  - QuESTlink wins a [Wolfram Innovator Award](https://blog.wolfram.com/2023/11/02/announcing-the-2023-wolfram-innovator-award-winners/)
+
+
+### 2021
+
+  - QuEST features in the final challenge of the [ASC20-21 Student Supercomputer Challenge](https://www.businesswire.com/news/home/20210127005355/en/28-University-Teams-from-Around-the-World-Advance-to-the-Finals-of-the-ASC20-21-Student-Supercomputer-Challenge)
+
+
+### 2020
+
+  - QuEST's [whitepaper](https://www.nature.com/articles/s41598-019-47174-9) ranks 11th in Scientific Reports' [Top 100 in Physics](https://www.nature.com/collections/ecehgdfcba)
+
+
+
+## 💪  Major features
+
+
+### 2025
+
+#### [v4.2](https://github.com/QuEST-Kit/QuEST/releases/tag/v4.2.0)
+
+  - multi-controlled Trotter circuits
+  - non-unitary Trotter circuits (permitting e.g. imaginary-time evolution)
+  - noisy time evolution via the Lindbladian
+  - customisation environment variables
+  - restored NUMA awareness (woops)
+
+#### [v4.0](https://github.com/QuEST-Kit/QuEST/releases/tag/v4.0.0)
+
+  - multi-GPU deployment
+  - automatic deployment
+  - partial tracing
+  - multi-qubit projectors
+  - distance measures
+  - numerical tolerance control
+
+### 2023
+
+#### [v3.7](https://github.com/QuEST-Kit/QuEST/releases/tag/v3.7.0)
+
+  - cuQuantum integration
+
+#### [v3.6](https://github.com/QuEST-Kit/QuEST/releases/tag/v3.6.0)
+
+  - AMD GPU support
+  - diagonal matrices
+
+
+### 2021
+
+#### [v3.4](https://github.com/QuEST-Kit/QuEST/releases/tag/v3.4.0)
+
+  - MSVC (Windows) support
+
+#### [v3.3](https://github.com/QuEST-Kit/QuEST/releases/tag/v3.3.0)
+
+  - all-outcome probabilities
+  - multi-controlled Pauli rotations
+  - custom phase functions
+  - QFT
+
+
+### 2020
+
+#### [v3.2](https://github.com/QuEST-Kit/QuEST/releases/tag/v3.2.0)
+
+  - Trotter-Suzuki circuits
+  - full-state diagonal operators
+
+#### [v3.1](https://github.com/QuEST-Kit/QuEST/releases/tag/v3.1.0)
+
+  - unit tests
+  - continuous integration
+
+#### [v3.0](https://github.com/QuEST-Kit/QuEST/releases/tag/3.0.0)
+
+  - general any-sized matrices
+  - general any-sized Kraus maps
+  - inhomogeneous Pauli channels
+  - multi-Pauli expectation values
+  - multi-target Pauli rotations
+  - any-controlled operations
+  - density-matrix inner products
+  - custom error handling
+
+
+### 2019
+
+#### [v2.1](https://github.com/QuEST-Kit/QuEST/releases/tag/2.1.0)
+
+  - CMake build
+  - amplitude damping channel
+
+
+### 2018
+
+#### [v2.0](https://github.com/QuEST-Kit/QuEST/releases/tag/v2.0.0)
+
+  - density matrices
+  - dephasing channels
+  - depolarising channels
+  - purity calculation
+  - fidelity calculation
+  - QASM generation
+  - input validation
+
+#### [v1.1](https://github.com/QuEST-Kit/QuEST/releases/tag/v1.1.0)
+
+  - `C` and `C++` agnosticism
+  - unification of CPU and GPU backends
+
+#### [v1.0](https://github.com/QuEST-Kit/QuEST/releases/tag/v1.0.0)
+
+  - controlled axis rotations
\ No newline at end of file

From 347d7e55391779d4eed1fd78db6e840b22aff48c Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Fri, 5 Sep 2025 10:19:38 -0400
Subject: [PATCH 25/32] minor code cleanup (#672)

which had only the below 4 changes visible to users:
- changed the number of significant figures of reported memory from 3 to 4 (e.g. `3.23e1 KiB` becomes `32.30 KiB`)
- fixed a README doc link
- renamed the `setQuregToClone` parameters for clarity
- suppressed illegitimate unused-variable compiler warnings

and otherwise merely tidied internal code. See #672 for all changes.
---
 README.md                         |  2 +-
 quest/include/initialisations.h   |  2 +-
 quest/src/api/initialisations.cpp | 16 ++++++++--------
 quest/src/api/matrices.cpp        | 10 +++++-----
 quest/src/core/errors.cpp         |  7 +------
 quest/src/core/errors.hpp         |  2 --
 quest/src/core/localiser.cpp      |  4 ++++
 quest/src/core/printer.cpp        |  5 ++---
 quest/src/core/validation.cpp     |  3 +++
 quest/src/gpu/gpu_cuquantum.cuh   |  4 ++--
 tests/unit/initialisations.cpp    |  3 ---
 11 files changed, 27 insertions(+), 31 deletions(-)

diff --git a/README.md b/README.md
index 39e193f13..c0c510ba0 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@
   [![DOI](https://img.shields.io/badge/DOI-10.1038%2Fs41598--019--47174--9-yellow.svg)](https://doi.org/10.1038/s41598-019-47174-9)
   <br>
   [![GitHub release](https://img.shields.io/github/release/QuEST-Kit/QuEST)](https://GitHub.com/QuEST-Kit/QuEST/releases/) 
-  [![Doc](https://img.shields.io/badge/doc-Github.io-orange.svg)](https://quest-kit.github.io/QuEST/modules.html)
+  [![Doc](https://img.shields.io/badge/doc-Github.io-orange.svg)](https://quest-kit.github.io/QuEST/group__api.html)
   [![MIT license](https://img.shields.io/badge/license-MIT-lightgrey.svg)](LICENCE.txt)
 
 
diff --git a/quest/include/initialisations.h b/quest/include/initialisations.h
index 99f415bd3..468bb794f 100644
--- a/quest/include/initialisations.h
+++ b/quest/include/initialisations.h
@@ -113,7 +113,7 @@ void setDensityQuregFlatAmps(Qureg qureg, qindex startInd, qcomp* amps, qindex n
 
 /// @notyetdoced
 /// @notyettested
-void setQuregToClone(Qureg targetQureg, Qureg copyQureg);
+void setQuregToClone(Qureg outQureg, Qureg inQureg);
 
 
 /// @notyetdoced
diff --git a/quest/src/api/initialisations.cpp b/quest/src/api/initialisations.cpp
index e027ae7ed..f49ae472c 100644
--- a/quest/src/api/initialisations.cpp
+++ b/quest/src/api/initialisations.cpp
@@ -171,17 +171,17 @@ void setDensityQuregFlatAmps(Qureg qureg, qindex startInd, qcomp* amps, qindex n
 }
 
 
-void setQuregToClone(Qureg targetQureg, Qureg copyQureg) {
-    validate_quregFields(targetQureg, __func__);
-    validate_quregFields(copyQureg, __func__);
-    validate_quregsCanBeCloned(targetQureg, copyQureg, __func__);
+void setQuregToClone(Qureg outQureg, Qureg inQureg) {
+    validate_quregFields(outQureg, __func__);
+    validate_quregFields(inQureg, __func__);
+    validate_quregsCanBeCloned(outQureg, inQureg, __func__);
 
     // we invoke mixing/superposing, which involves superfluous
     // floating-point operators but is not expected to cause an
-    // appreciable slowdown since simulation is memory-bound
-    (targetQureg.isDensityMatrix)?
-        localiser_densmatr_mixQureg(0, targetQureg, 1, copyQureg):
-        localiser_statevec_setQuregToClone(targetQureg, copyQureg);
+    // appreciable slowdown since simulation is often memory-bound
+    (outQureg.isDensityMatrix)?
+        localiser_densmatr_mixQureg(0, outQureg, 1, inQureg):
+        localiser_statevec_setQuregToClone(outQureg, inQureg);
 }
 
 
diff --git a/quest/src/api/matrices.cpp b/quest/src/api/matrices.cpp
index 2c817253c..d1c118d17 100644
--- a/quest/src/api/matrices.cpp
+++ b/quest/src/api/matrices.cpp
@@ -121,14 +121,14 @@ template <class T>
 bool didAnyLocalAllocsFail(T matr) {
 
     // god help us if these single-integer malloc failed
-    if (!mem_isAllocated(matr.isApproxUnitary))     return true;
-    if (!mem_isAllocated(matr.isApproxHermitian))   return true;
-    if (!mem_isAllocated(matr.wasGpuSynced))  return true;
+    if (!mem_isAllocated(matr.isApproxUnitary))   return true;
+    if (!mem_isAllocated(matr.isApproxHermitian)) return true;
+    if (!mem_isAllocated(matr.wasGpuSynced))      return true;
 
     // only diagonal matrices (which can be raised to
     // exponents) have these addtional fields
     if constexpr (!util_isDenseMatrixType<T>()) {
-        if (!mem_isAllocated(matr.isApproxNonZero))     return true;
+        if (!mem_isAllocated(matr.isApproxNonZero))       return true;
         if (!mem_isAllocated(matr.isStrictlyNonNegative)) return true;
     }
 
@@ -280,7 +280,7 @@ FullStateDiagMatr validateAndCreateCustomFullStateDiagMatr(int numQubits, int us
     // validate parameters before passing them to autodeployer
     validate_newFullStateDiagMatrParams(numQubits, useDistrib, useGpuAccel, useMultithread, caller);
 
-    // overwrite useDistrib and useGpuAccel if they were left as AUTO_FLAG
+    // overwrite all args left as AUTO_FLAG
     autodep_chooseFullStateDiagMatrDeployment(numQubits, useDistrib, useGpuAccel, useMultithread, env);
 
     // validation ensures this never overflows
diff --git a/quest/src/core/errors.cpp b/quest/src/core/errors.cpp
index 4d9d77e16..9e72b1e0b 100644
--- a/quest/src/core/errors.cpp
+++ b/quest/src/core/errors.cpp
@@ -161,11 +161,6 @@ void error_commButEnvNotDistributed() {
     raiseInternalError("A function attempted to invoke communication despite QuEST being compiled in non-distributed mode.");
 }
 
-void error_commButQuregNotDistributed() {
-
-    raiseInternalError("A function attempted to invoke communication of a Qureg which was not distributed.");
-}
-
 void error_commOutOfBounds() {
 
     raiseInternalError("A function invoked communication which attempted to exchange amplitudes between arrays at invalid bounds.");
@@ -209,7 +204,7 @@ void assert_commPayloadIsPowerOf2(qindex numAmps) {
 void assert_commQuregIsDistributed(Qureg qureg) {
 
     if (!qureg.isDistributed)
-        error_commButQuregNotDistributed();
+        raiseInternalError("A function attempted to invoke communication of a Qureg which was not distributed.");
 }
 
 void assert_commFullStateDiagMatrIsDistributed(FullStateDiagMatr matr) {
diff --git a/quest/src/core/errors.hpp b/quest/src/core/errors.hpp
index dccaaf467..950ac17ed 100644
--- a/quest/src/core/errors.hpp
+++ b/quest/src/core/errors.hpp
@@ -77,8 +77,6 @@ void error_commAlreadyInit();
 
 void error_commButEnvNotDistributed();
 
-void error_commButQuregNotDistributed();
-
 void error_commOutOfBounds();
 
 void error_commWithSameRank();
diff --git a/quest/src/core/localiser.cpp b/quest/src/core/localiser.cpp
index 8e64814d1..9d4dbce09 100644
--- a/quest/src/core/localiser.cpp
+++ b/quest/src/core/localiser.cpp
@@ -1232,6 +1232,10 @@ void localiser_statevec_anyCtrlAnyTargAnyMatr(Qureg qureg, vector<int> ctrls, ve
     bool transp = false;
     qcomp expo = 1;
 
+    // suppress warnings since not used by all compile-time expansions below
+    (void) transp;
+    (void) expo;
+
     if constexpr (util_isDiagMatr <T>()) localiser_statevec_anyCtrlAnyTargDiagMatr(qureg,  ctrls, ctrlStates, targs,        matr, expo, conj);
     if constexpr (util_isDiagMatr1<T>()) localiser_statevec_anyCtrlOneTargDiagMatr(qureg,  ctrls, ctrlStates, targs[0],           matr, conj);
     if constexpr (util_isDiagMatr2<T>()) localiser_statevec_anyCtrlTwoTargDiagMatr(qureg,  ctrls, ctrlStates, targs[0], targs[1], matr, conj);
diff --git a/quest/src/core/printer.cpp b/quest/src/core/printer.cpp
index 016d05257..3d2a4d9f0 100644
--- a/quest/src/core/printer.cpp
+++ b/quest/src/core/printer.cpp
@@ -460,10 +460,9 @@ string printer_getMemoryWithUnitStr(size_t numBytes) {
         ind++;
     ind--;
 
-    // express numBytes in terms of new unit, forcefully rounding to 3 sig-figs max,
-    // except when the chosen unit is bytes (then we permit all 4 digits)
+    // express numBytes in terms of new unit, forcefully rounding to 4 sig-figs max
     qreal frac = numBytes / static_cast<qreal>(sizes[ind]);
-    return floatToStr(frac, false, (ind==0)? 4 : 3) + " " + units[ind];
+    return floatToStr(frac, false, 4) + " " + units[ind];
 }
 
 
diff --git a/quest/src/core/validation.cpp b/quest/src/core/validation.cpp
index 83935bf20..2639b46f8 100644
--- a/quest/src/core/validation.cpp
+++ b/quest/src/core/validation.cpp
@@ -3994,6 +3994,9 @@ void validate_quregCanBeWorkspace(Qureg qureg, Qureg workspace, const char* call
     assertThat(
         doQuregsHaveIdenticalMemoryLayouts(qureg, workspace),
         report::QUREG_IS_INCOMPATIBLE_WITH_WORKSPACE, caller);
+
+    // @todo
+    // check whether any of their memories overlap, which is forbidden
 }
 
 void validate_numQuregsInSum(int numQuregs, const char* caller) {
diff --git a/quest/src/gpu/gpu_cuquantum.cuh b/quest/src/gpu/gpu_cuquantum.cuh
index 13afaae87..f627990fd 100644
--- a/quest/src/gpu/gpu_cuquantum.cuh
+++ b/quest/src/gpu/gpu_cuquantum.cuh
@@ -260,7 +260,7 @@ void cuquantum_densmatr_oneQubitDephasing_subA(Qureg qureg, int qubit, qreal pro
     cu_qcomp a = {1,        0};
     cu_qcomp b = {1-2*prob, 0};
     cu_qcomp elems[] = {a, b, b, a};
-    vector<int> targs {qubit, qubit + qureg.numQubits};
+    vector<int> targs {qubit, util_getBraQubit(qubit,qureg)};
 
     bool conj = false;
     cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, {}, {}, targs, elems, conj);
@@ -297,7 +297,7 @@ void cuquantum_densmatr_twoQubitDephasing_subA(Qureg qureg, int qubitA, int qubi
     cu_qcomp a = {1,          0};
     cu_qcomp b = {1-4*prob/3, 0};
     cu_qcomp elems[] = {a,b,b,b, b,a,b,b, b,b,a,b, b,b,b,a};
-    vector<int> targs {qubitA, qubitB, qubitA + qureg.numQubits, qubitB + qureg.numQubits};
+    vector<int> targs {qubitA, qubitB, util_getBraQubit(qubitA,qureg), util_getBraQubit(qubitB,qureg)};
 
     bool conj = false;
     cuquantum_statevec_anyCtrlAnyTargDiagMatr_sub(qureg, {}, {}, targs, elems, conj);
diff --git a/tests/unit/initialisations.cpp b/tests/unit/initialisations.cpp
index 8890c8abb..649b92905 100644
--- a/tests/unit/initialisations.cpp
+++ b/tests/unit/initialisations.cpp
@@ -315,9 +315,6 @@ TEST_CASE( "setDensityQuregFlatAmps", TEST_CATEGORY ) {
 
 TEST_CASE( "setDensityQuregAmps", TEST_CATEGORY ) {
 
-    //void setDensityQuregAmps(Qureg qureg, qindex startRow, qindex startCol, qcomp** amps, qindex numRows, qindex numCols);
-
-
     SECTION( LABEL_CORRECTNESS ) {
 
         int numTotalRowsCols = getPow2(getNumCachedQubits());

From be7edbb1f3e2ff571cd1742db4228c334c7dc587 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Sun, 14 Sep 2025 20:31:20 -0400
Subject: [PATCH 26/32] tidied CMakeLists.txt

so I can subsequently refactor it (stop propagating options to preprocessors) without pulling out all my hair. Whitespace is free! :^)
---
 CMakeLists.txt | 180 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 143 insertions(+), 37 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e2e52d85e..32db779d6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 # @author Oliver Thomson Brown
 # @author Erich Essmann (patches including MSVC support)
-# @author Tyson Jones (patches including clang multithreading)
+# @author Tyson Jones (tidying + patches including clang multithreading)
 # @author Luc Jaulmes (NUMA awareness, patching install)
 #
 # Contributions to previous builds from:
@@ -13,17 +13,31 @@
 #  - Christopher J. Anders
 #  - Drew Silcock
 
+
+
+# ============================
+# Project
+# ============================
+
+
 cmake_minimum_required(VERSION 3.21)
 
+
 project(QuEST
   VERSION 4.1.0
   DESCRIPTION "Quantum Exact Simulation Toolkit"
   LANGUAGES CXX C
 )
 
+
 set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH})
 
-## Dependencies
+
+
+# ============================
+# Dependencies
+# ============================
+
 
 # GNUInstallDirs to provide sensible default install directory names
 cmake_path(SET ORG_INSTALL_PATH NORMALIZE "${CMAKE_INSTALL_PREFIX}")
@@ -31,12 +45,18 @@ cmake_path(APPEND CMAKE_INSTALL_PREFIX "quest")
 include(GNUInstallDirs)
 include(CMakePackageConfigHelpers)
 
+
 # Maths
 if (NOT WIN32)
   find_library(MATH_LIBRARY m REQUIRED)
 endif()
 
-## Configuration options
+
+
+# ============================
+# Declare options
+# ============================
+
 
 # Build type
 # Default to "Release"
@@ -52,11 +72,13 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
     "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
 endif()
 
+
 # Library type
 # Shared library by default
 option(BUILD_SHARED_LIBS "Build shared library. Turned ON by default." ON)
 message(STATUS "Shared library is turned ${BUILD_SHARED_LIBS}. Set BUILD_SHARED_LIBS to modify.")
 
+
 # Library naming
 set(LIB_NAME QuEST 
   CACHE 
@@ -101,6 +123,7 @@ if (VERBOSE_LIB_NAME)
   string(CONCAT LIB_NAME ${LIB_NAME} "-fp${FLOAT_PRECISION}")
 endif()
 
+
 # Examples
 option(
   BUILD_EXAMPLES
@@ -109,6 +132,7 @@ option(
 )
 message(STATUS "Examples are turned ${BUILD_EXAMPLES}. Set BUILD_EXAMPLES to modify.")
 
+
 # Testing
 option(
   ENABLE_TESTING
@@ -123,6 +147,7 @@ option(
   ON
 )
 
+
 # Multithreading
 option(
   ENABLE_MULTITHREADING 
@@ -131,6 +156,7 @@ option(
 )
 message(STATUS "Multithreading is turned ${ENABLE_MULTITHREADING}. Set ENABLE_MULTITHREADING to modify.")
 
+
 # Distribution
 option(
   ENABLE_DISTRIBUTION 
@@ -139,6 +165,7 @@ option(
 )
 message(STATUS "Distribution is turned ${ENABLE_DISTRIBUTION}. Set ENABLE_DISTRIBUTION to modify.")
 
+
 # GPU Acceleration
 option(
   ENABLE_CUDA
@@ -163,7 +190,7 @@ option(
 )
 message(STATUS "AMD GPU acceleration is turned ${ENABLE_HIP}. Set ENABLE_HIP to modify.")
 
-# Throw on disallowed combinations
+# Throw on disallowed GPU combinations
 if (ENABLE_CUDA AND ENABLE_HIP)
   message(FATAL_ERROR "QuEST cannot support CUDA and HIP simultaneously.")
 endif()
@@ -172,6 +199,7 @@ if ((ENABLE_CUDA OR ENABLE_HIP) AND FLOAT_PRECISION STREQUAL 4)
   message(FATAL_ERROR "Quad precision is not supported on GPU. Please disable GPU acceleration or lower precision.")
 endif()
 
+
 # Deprecated API
 option(
   ENABLE_DEPRECATED_API
@@ -180,6 +208,7 @@ option(
 )
 message(STATUS "Deprecated API support is turned ${ENABLE_DEPRECATED_API}. Set ENABLE_DEPRECATED_API to modify.")
 
+
 # Windows Specific Options
 if(WIN32)
   
@@ -190,13 +219,19 @@ if(WIN32)
   endif()
 endif()
 
-## Library
+
+
+# ============================
+# Library
+# ============================
+
 
 add_library(QuEST)
 
 # Add namespaced alias to support inclusion of QuEST as a subproject
 add_library(QuEST::QuEST ALIAS QuEST)
 
+
 # Set include directories
 target_include_directories(QuEST
         PUBLIC
@@ -210,6 +245,7 @@ set_target_properties(QuEST PROPERTIES
         SOVERSION   ${PROJECT_VERSION_MAJOR}
 )
 
+
 # Add required C and C++ standards.
 # Note the QuEST interface(s) require only C11 and C++14, 
 # while the source code is entirely C++ and requires C++17,
@@ -224,6 +260,7 @@ target_compile_features(QuEST
   cxx_std_17
 )
 
+
 # Turn on all compiler warnings
 if (MSVC)
   set(WARNING_FLAG /W4)
@@ -238,9 +275,16 @@ target_compile_options(QuEST
 )
 
 
-# Set user options
+
+# ============================
+# Pass options to library
+# ============================
+
+
 compile_option(FLOAT_PRECISION ${FLOAT_PRECISION})
 
+
+# OpenMP
 if (ENABLE_MULTITHREADING)
 
   # find OpenMP, but fail gracefully...
@@ -293,6 +337,8 @@ else()
   compile_option(COMPILE_OPENMP 0)
 endif()
 
+
+# MPI
 if (ENABLE_DISTRIBUTION)
   find_package(MPI REQUIRED
     COMPONENTS CXX
@@ -309,6 +355,8 @@ else()
   compile_option(COMPILE_MPI 0)
 endif()
 
+
+# CUDA
 if (ENABLE_CUDA)
 
   # make nvcc use user cxx-compiler as default host (before cuda-host is set below)
@@ -326,8 +374,14 @@ if (ENABLE_CUDA)
   if (VERBOSE_LIB_NAME)
     string(CONCAT LIB_NAME ${LIB_NAME} "+cuda")
   endif()
+
+  # beware that compile_option(COMPILE_CUDA) is deferred to below because
+  # it is triggered by both/either ENABLE_CUDA and ENABLE_HIP
+
 endif()
 
+
+# cuQuantum
 if (ENABLE_CUQUANTUM)
   find_package(CUQUANTUM REQUIRED)
   compile_option(COMPILE_CUQUANTUM 1)
@@ -340,6 +394,8 @@ else()
   compile_option(COMPILE_CUQUANTUM 0)
 endif()
 
+
+# HIP
 if (ENABLE_HIP)
 
   # if generation fails (hip::amdhip64 not found), users can try setting
@@ -365,12 +421,16 @@ if (ENABLE_HIP)
   endif()
 endif()
 
+
+# set COMPILE_CUDA
 if (ENABLE_CUDA OR ENABLE_HIP)
   compile_option(COMPILE_CUDA 1)
 else()
   compile_option(COMPILE_CUDA 0)
 endif()
 
+
+# v3 API
 if (ENABLE_DEPRECATED_API)
   target_compile_definitions(QuEST PRIVATE INCLUDE_DEPRECATED_FUNCTIONS=1)
 
@@ -383,19 +443,33 @@ else()
 endif()
 
 
+
+# ============================
+# Pass files to library
+# ============================
+
+
 # add math library
 if (NOT MSVC)
   target_link_libraries(QuEST PRIVATE ${MATH_LIBRARY})
 endif()
 
+
 # Set output name
 set_target_properties(QuEST PROPERTIES OUTPUT_NAME ${LIB_NAME})
 
+
 # Add source files
 add_subdirectory(quest)
 
-## Examples
 
+
+# ============================
+# Examples
+# ============================
+
+
+# min example is always built
 add_executable(min_example
   examples/tutorials/min_example.c
 )
@@ -406,10 +480,13 @@ install(TARGETS min_example
   DESTINATION ${CMAKE_INSTALL_BINDIR}
 )
 
+
+# all examples optionally built
 if (BUILD_EXAMPLES)
   add_subdirectory(examples)
 endif()
 
+
 ## RATH
 set(BUILD_RPATH_USE_ORIGIN ON)
 if(APPLE)
@@ -433,14 +510,24 @@ endfunction()
 
 setup_quest_rpath(QuEST)
 setup_quest_rpath(min_example)
-## User Source
 
+
+
+# ============================
+# User source
+# ============================
+
+
+# validate
 if (USER_SOURCE AND NOT OUTPUT_EXE)
     message(SEND_ERROR "USER_SOURCE specified, but not OUTPUT_EXE.")
 endif()
 if (OUTPUT_EXE AND NOT USER_SOURCE)
     message(SEND_ERROR "OUTPUT_EXE specified, but not USER_SOURCE.")
 endif()
+
+
+# compile user source
 if (USER_SOURCE AND OUTPUT_EXE)
   message(STATUS "Compiling ${USER_SOURCE} to executable ${OUTPUT_EXE}.")
 
@@ -451,14 +538,19 @@ if (USER_SOURCE AND OUTPUT_EXE)
 endif()
 
 
-## Tests
+
+# ============================
+# Tests
+# ============================
+
 
 if (ENABLE_TESTING)
 
+  # try find Catch2
   set(CatchVersion 3.8.0)
-
   find_package(Catch2 ${CatchVersion} QUIET)
 
+  # else try download Catch2
   if (NOT TARGET Catch2::Catch2 AND DOWNLOAD_CATCH2)
     message(STATUS "Catch2 not found, it will be downloaded and built in the build directory.")
     Include(FetchContent)
@@ -471,6 +563,7 @@ if (ENABLE_TESTING)
     
     FetchContent_MakeAvailable(Catch2)
   
+  # otherwise fail
   else()
     # We won't magically find it here, but this is the easiest way to
     # a) Force the build to fail, and
@@ -478,61 +571,74 @@ if (ENABLE_TESTING)
     find_package(Catch2 ${CatchVersion} REQUIRED)
   endif()
 
+  # compile tests
   include(Catch)
   enable_testing()
   add_subdirectory(tests)
 endif()
 
-## Install
+
+
+# ============================
+# Installation
+# ============================
+
 
 install(TARGETS QuEST
-        EXPORT QuESTTargets
-        LIBRARY  DESTINATION ${CMAKE_INSTALL_LIBDIR}
-        ARCHIVE  DESTINATION ${CMAKE_INSTALL_LIBDIR}
-        RUNTIME  DESTINATION ${CMAKE_INSTALL_BINDIR}
+  EXPORT QuESTTargets
+  LIBRARY  DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  ARCHIVE  DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  RUNTIME  DESTINATION ${CMAKE_INSTALL_BINDIR}
 )
 
-## Export
+
 # Write CMake version file for QuEST
 set(QuEST_INSTALL_CONFIGDIR "${CMAKE_INSTALL_LIBDIR}/cmake/QuEST")
 
+
 # Write QuESTConfigVersion.cmake
 write_basic_package_version_file(
-        "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}ConfigVersion.cmake"
-        VERSION ${PROJECT_VERSION}
-        COMPATIBILITY AnyNewerVersion
+  "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}ConfigVersion.cmake"
+  VERSION ${PROJECT_VERSION}
+  COMPATIBILITY AnyNewerVersion
 )
 
+
 # Configure QuESTConfig.cmake (from template)
 configure_package_config_file(
-        "${CMAKE_CURRENT_SOURCE_DIR}/cmake/QuESTConfig.cmake.in"
-        "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}Config.cmake"
-        INSTALL_DESTINATION "${QuEST_INSTALL_CONFIGDIR}"
+  "${CMAKE_CURRENT_SOURCE_DIR}/cmake/QuESTConfig.cmake.in"
+  "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}Config.cmake"
+  INSTALL_DESTINATION "${QuEST_INSTALL_CONFIGDIR}"
 )
 
+
 # Install them
 install(FILES
-        "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}Config.cmake"
-        "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}ConfigVersion.cmake"
-        DESTINATION "${QuEST_INSTALL_CONFIGDIR}"
+  "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}Config.cmake"
+  "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}ConfigVersion.cmake"
+  DESTINATION "${QuEST_INSTALL_CONFIGDIR}"
 )
 
-install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/quest/include/quest.h"
-        DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
+install(FILES 
+  "${CMAKE_CURRENT_SOURCE_DIR}/quest/include/quest.h"
+  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
 )
 
-install(FILES "${CMAKE_CURRENT_BINARY_DIR}/include/quest/include/config.h"
-        DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/quest/include"
+install(FILES
+  "${CMAKE_CURRENT_BINARY_DIR}/include/quest/include/config.h"
+  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/quest/include"
 )
 
-install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/quest/include"
-        DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/quest"
-        FILES_MATCHING PATTERN "*.h"
-        PATTERN "quest.h" EXCLUDE
+install(
+  DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/quest/include"
+  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/quest"
+  FILES_MATCHING PATTERN "*.h"
+  PATTERN "quest.h" EXCLUDE
 )
 
-install(EXPORT QuESTTargets
-        FILE "${LIB_NAME}Targets.cmake"
-        NAMESPACE QuEST::
-        DESTINATION "${QuEST_INSTALL_CONFIGDIR}"
+install(
+  EXPORT QuESTTargets
+  FILE "${LIB_NAME}Targets.cmake"
+  NAMESPACE QuEST::
+  DESTINATION "${QuEST_INSTALL_CONFIGDIR}"
 )

From 60f9b3a153b52430286e555d75d5defc6c41a7b5 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Mon, 15 Sep 2025 20:27:40 -0400
Subject: [PATCH 27/32] patched CPU performance

As described in issue #638, QuEST v4 contained a performance regression (from v3) only sometimes seen in CPU settings. This was due to the use of std::complex operator overloads in cpu_subroutines.cpp (whereas QuEST v3 hand-rolled complex arithmetic), and affected compilation with Clang (in both single-threaded and multithreaded settings) as well as in GCC (only in single-threaded settings) and potentially other compilers.

We tentatively patch this issue by passing additional compiler optimisation flags to cpu_subroutines.cpp which circumvent the issue. This is a rather aggravating solution to a major pitfall in the C++ standard library. After deliberation, it beat out other solutions including hand-rolling complex arithmetic, use of a custom complex type, and use of more precise and compiler-specific flags.
---
 CMakeLists.txt                    | 65 +++++++++++++++++++++++++++++++
 quest/src/cpu/cpu_subroutines.cpp | 37 ++++++++++++++++++
 2 files changed, 102 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 32db779d6..ee91a89d6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -444,6 +444,71 @@ endif()
 
 
 
+# ============================
+# Patch CPU performance
+# ============================
+
+
+# Patch performance of CPU std::complex arithmetic operator overloads.
+# The cpu_subroutines.cpp file makes extensive use of std::complex operator
+# overloads, and alas these are significantly slower than hand-rolled 
+# arithmetic, due to their NaN and inf checks, and interference with SIMD.
+# It is crucial to pass additional optimisation flags to this file to restore
+# hand-rolled performance (else QuEST v3 is faster than v4 eep). In theory,
+# we can achieve this with specific, relatively 'safe' flags such as LLVM's:
+#     -ffinite-math-only -fno-signed-zeros -ffp-contract=fast
+# However, it is a nuisance to find equivalent flags for different compilers
+# and monitor their performance vs accuracy trade-offs. So instead, we use the
+# much more aggressive and ubiquitous -Ofast flag to guarantee performance. 
+# This introduces many potentially dangerous optimisations, such as asserting
+# associativity of flops, which would break techniques like Kahan summation.
+# The cpu_subroutines.cpp must ergo be very conscious of these optimisations.
+# We here also explicitly inform the file cpu_subroutines.cpp whether or not
+# we are passing the flags, so it can detect/error when flags are forgotten.
+
+if (CMAKE_BUILD_TYPE STREQUAL "Release")
+
+  # Release build will pass -Ofast when known for the given compiler, and
+  # fallback to giving a performance warning and proceeding with compilation
+
+  if (CMAKE_CXX_COMPILER_ID MATCHES "AppleClang|Clang|Cray|CrayClang|GNU|Intel|IntelLLVM|NVHPC|NVIDIA|XL|XLClang")
+    set(patch_flags "-Ofast")
+    set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=1")
+  elseif (CMAKE_CXX_COMPILER_ID MATCHES "HP") # HP is deliberately absent from the list above so that this branch is reachable
+    set(patch_flags "+Ofast")
+    set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=1")
+  elseif (CMAKE_CXX_COMPILER_ID MATCHES "MSVC")
+    set(patch_flags "/fp:fast")
+    set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=1")
+  else()
+    message(WARNING 
+      "The compiler (${CMAKE_CXX_COMPILER_ID}) is unrecognised and so crucial optimisation flags have not been "
+      "passed to the CPU backend. These flags are necessary for full performance when performing complex algebra, "
+      "otherwise a slowdown of 3-50x may be observed. Please edit the root CMakeLists.txt to include flags which are "
+      "equivalent to GNU's -Ofast flag for your compiler (search this warning), or contact the QuEST developers for help."
+    )
+    set(patch_flags "")
+    set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=0")
+  endif()
+  
+else()
+
+  # Non-release builds (e.g. Debug) will pass no optimisation flags, and will
+  # communicate to cpu_subroutines.cpp that this is intentional via a macro
+
+  set(patch_flags "")
+  set(patch_macro "-DCOMPLEX_OVERLOADS_PATCHED=0")
+
+endif()
+
+set_source_files_properties(
+  quest/src/cpu/cpu_subroutines.cpp
+  PROPERTIES
+  COMPILE_FLAGS "${patch_flags} ${patch_macro}"
+)
+
+
+
 # ============================
 # Pass files to library
 # ============================
diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp
index 26c80dabc..a853bc9be 100644
--- a/quest/src/cpu/cpu_subroutines.cpp
+++ b/quest/src/cpu/cpu_subroutines.cpp
@@ -2,6 +2,12 @@
  * CPU OpenMP-accelerated definitions of the main backend simulation routines,
  * as mirrored by gpu_subroutines.cpp, and called by accelerator.cpp. 
  * 
+ * BEWARE that this specific file receives additional compiler optimisation flags
+ * in order to counteract a performance issue in the use of std::complex operator
+ * overloads. These flags (like -Ofast) may induce assumed associativity of qcomp
+ * algebra, breaking techniques like Kahan summation. As such, this file CANNOT
+ * assume IEEE floating-point behaviour.
+ * 
  * Some of these definitions are templated, defining multiple versions optimised 
  * (at compile-time) for handling different numbers of input qubits; such functions
  * are proceeded by macro INSTANTIATE_FUNC_OPTIMISED_FOR_NUM_CTRLS(), to force the 
@@ -40,6 +46,28 @@
 using std::vector;
 
 
+/*
+ * Beware that this file makes extensive use of std::complex (qcomp) operator
+ * overloads and so requires additional compiler flags to achieve hand-rolled
+ * arithmetic performance; otherwise a 3-50x slowdown may be observed. We here
+ * enforce that these flags were not forgotten (but may be deliberately avoided).
+ * Beware these flags may induce associativity and break e.g. Kahan summation.
+ */
+
+#if !defined(COMPLEX_OVERLOADS_PATCHED)
+    #error "Crucial, bespoke optimisation flags were not passed (or acknowledged) to cpu_subroutines.cpp which are necessary for full complex arithmetic performance."
+    
+#elif !COMPLEX_OVERLOADS_PATCHED
+
+    #if defined(_MSC_VER)
+        #pragma message("Warning: The CPU backend is being deliberately compiled without the necessary flags to obtain full complex arithmetic performance.")
+    #else
+        #warning "The CPU backend is being deliberately compiled without the necessary flags to obtain full complex arithmetic performance."
+    #endif
+
+#endif
+
+
 
 /*
  * GETTERS
@@ -568,6 +596,9 @@ void cpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector<int> ctrls, ve
                     /// qureg.cpuAmps[i] is being serially updated by only this thread,
                     /// so is a candidate for Kahan summation for improved numerical
                     /// stability. Explore whether this is time-free and worthwhile!
+                    ///
+                    /// BEWARE that Kahan summation is incompatible with the optimisation
+                    /// flags currently passed to this file
                 }
             }
         }
@@ -1758,6 +1789,9 @@ qreal cpu_statevec_calcTotalProb_sub(Qureg qureg) {
     /// final serial combination). This invokes several times
     /// as many arithmetic operations (4x?) but we are anyway
     /// memory-bandwidth bound
+    ///
+    /// BEWARE that Kahan summation is incompatible with the optimisation
+    /// flags currently passed to this file
 
     qreal prob = 0;
 
@@ -1783,6 +1817,9 @@ qreal cpu_densmatr_calcTotalProb_sub(Qureg qureg) {
     /// final serial combination). This invokes several times
     /// as many arithmetic operations (4x?) but we are anyway
     /// memory-bandwidth bound
+    ///
+    /// BEWARE that Kahan summation is incompatible with the optimisation
+    /// flags currently passed to this file
 
     qreal prob = 0;
 

From 265569090a45e1edcfa99b1345840fae16314ba8 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Sat, 20 Sep 2025 17:20:17 -0400
Subject: [PATCH 28/32] updated LICENSE.txt

---
 LICENCE.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/LICENCE.txt b/LICENCE.txt
index 96f802a6c..132a1f2f8 100644
--- a/LICENCE.txt
+++ b/LICENCE.txt
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2017 aniabrown
+Copyright (c) 2025 The QuEST Authors and Contributors
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

From cb682b239746ab239fd61ac3aaafe5af722437a3 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Wed, 8 Oct 2025 21:34:49 -0400
Subject: [PATCH 29/32] added calculation doc

---
 docs/tutorial.md               |   10 +-
 quest/include/calculations.h   | 1290 ++++++++++++++++++++++++++++++--
 quest/include/decoherence.h    |   14 +
 quest/include/operations.h     |  120 ++-
 quest/src/api/calculations.cpp |    4 +-
 utils/docs/Doxyfile            |    1 +
 utils/docs/latex/commands.tex  |    6 +-
 7 files changed, 1385 insertions(+), 60 deletions(-)

diff --git a/docs/tutorial.md b/docs/tutorial.md
index 9c6fcf20e..a006a3fcd 100644
--- a/docs/tutorial.md
+++ b/docs/tutorial.md
@@ -356,7 +356,7 @@ Qureg:
     globalTotal.......16 MiB
 ```
 
-> The spacing between the outputs of those two consecutive QuEST functions was determined by our earlier call to [`setMaxNumReportedSigFigs()`](https://quest-kit.github.io/QuEST/group__debug__reporting.html#ga29413703d609254244d6b13c663e6e06).
+> The spacing between the outputs of those two consecutive QuEST functions was determined by our earlier call to [`setNumReportedNewlines()`](https://quest-kit.github.io/QuEST/group__debug__reporting.html#ga29413703d609254244d6b13c663e6e06).
 
 
 A density matrix `Qureg` can model classical uncertainty as results from [decoherence](https://quest-kit.github.io/QuEST/group__decoherence.html), and proves useful when simulating quantum operations on a noisy quantum computer.
@@ -437,7 +437,7 @@ int targets[]  = {4,5,6};
 applyPhaseGadget(qureg, targets, 3, angle);
 ```
 
-> [!IMPORTANT]  
+> [!NOTE]  
 > Notice the type of `angle` is [`qreal`](https://quest-kit.github.io/QuEST/group__types.html#ga2d479c159621c76ca6f96abe66f2e69e) rather than the expected `double`. This is a precision agnostic alias for a floating-point, real scalar which allows you to recompile QuEST with a varying [precision](/docs/compile.md#precision) with no modifications to your code. 
 <!-- @todo the above link fails in Doxygen; it's too stupid to recognise the section ref -->
 
@@ -514,7 +514,7 @@ applyCompMatr1(qureg, target, matrix);
 > The type [`qcomp`](https://quest-kit.github.io/QuEST/group__types.html#ga4971f489e74bb185b9b2672c14301983) above is a precision agnostic complex scalar, and has beautiful arithmetic overloads!
 > ```cpp
 > qcomp x = 1.5 + 3.14i;
-> qcomp *= 1E3i - 1E-5i;
+> x *= 1E3i - 1E-5i;
 > ```
 > Beware that in `C++`, `1i` is a _double precision_ literal, so `C++` users should instead
 > use the custom precision-agnostic literal `1_i`.
@@ -574,7 +574,7 @@ PauliStrSum sum = createInlinePauliStrSum(R"(
 setFullStateDiagMatrFromPauliStrSum(fullmatrix, sum);
 ```
 > [!IMPORTANT]  
-> The argument to `createInlinePauliStrSum` is a multiline string for which the syntax differs between `C` and `C++`; we used the latter above. See examples [`initialisation.c`](/examples/paulis/initialisation.c) and [`initialisation.cpp`](/paulis/matrices/initialisation.cpp) for clarity.
+> The argument to `createInlinePauliStrSum` is a multiline string for which the syntax differs between `C` and `C++`; we used the latter above. See examples [`initialising_paulis.c`](/examples/isolated/initialising_paulis.c) and [`initialising_paulis.cpp`](/examples/isolated/initialising_paulis.cpp) for clarity.
 
 > [!CAUTION]
 > Beware that in distributed settings, because `fullmatrix` _may_ be distributed, we should must exercise extreme caution when modifying its `fullmatrix.cpuElems` directly. 
@@ -863,6 +863,6 @@ This is important because it ensures:
 - our GPU processes are killed quickly, freeing resources for other processes.
 
 > [!CAUTION]
-> After calling `finalizeQuESTEnv()`, MPI will close and each if being accessed directly by the user, will enter an undefined state. Subsequent calls to MPI routines may return gibberish, and distributed machines will have lost their ability to communicate. It is recommended to call `finalizeQuESTEnv()` immediately before exiting.
+> After calling `finalizeQuESTEnv()`, MPI will close and if being accessed directly by the user, will enter an undefined state. Subsequent calls to MPI routines may return gibberish, and distributed machines will lose their ability to communicate. It is recommended to call `finalizeQuESTEnv()` immediately before exiting.
 
 You are now a QuEST expert 🎉 though there are _many_ more functions in the [API](https://quest-kit.github.io/QuEST/group__api.html) not covered here. Go forth and simulate!
\ No newline at end of file
diff --git a/quest/include/calculations.h b/quest/include/calculations.h
index b54af4492..645f54c34 100644
--- a/quest/include/calculations.h
+++ b/quest/include/calculations.h
@@ -53,7 +53,8 @@ extern "C" {
  * state @p qureg without modifying it. 
  * 
  * @formulae
- * Let @f$ \pstr = @f$ @p str.
+ * 
+ * Let @f$ \pstr = @f$ @p str, which notates a tensor product of single-qubit Pauli operators.
  * - When @p qureg is a statevector @f$\svpsi@f$, this function returns
  *   @f[ 
     \brapsi \pstr \svpsi \in \mathbb{R}.
@@ -65,11 +66,15 @@ extern "C" {
  *   which is exact when @f$\dmrho@f$ is physical (specifically Hermitian).
  * 
  * @constraints
- * - The returned value is always real, even when @p qureg is an unnormalised density matrix, in
- *   which case the imaginary component of the above expression is neglected.
- *   The full complex value can be obtained using calcExpecNonHermitianPauliStrSum().
+ * 
+ * - Postcondition validation will check that the calculated expectation value is approximately
+ *   real (i.e. the imaginary component is smaller in size than the validation epsilon), as admitted
+ *   when @p qureg is correctly normalised. This behaviour can be adjusted using setValidationEpsilon(). 
+ * - Regardless of the validation epsilon, the returned value is always real and the imaginary component
+ *   is discarded. The full complex value can be obtained using calcExpecNonHermitianPauliStrSum().
  * 
  * @equivalences
+ * 
  * - When @p str is general, this function is equivalent to calling calcExpecPauliStrSum() with a 
  *   PauliStrSum composed of only a single PauliStr term and a unity coefficient.
  * - When @p str @f$ = \id^\otimes @f$, the output is equivalent to that of calcTotalProb().
@@ -105,6 +110,7 @@ qreal calcExpecPauliStr(Qureg qureg, PauliStr str);
  * Pauli strings - under the given state @p qureg, without modifying it. 
  * 
  * @formulae
+ * 
  * Let @f$ \hat{H} = @f$ @p sum.
  * - When @p qureg is a statevector @f$\svpsi@f$, this function returns
  *   @f[ 
@@ -114,20 +120,26 @@ qreal calcExpecPauliStr(Qureg qureg, PauliStr str);
  *   @f[ 
      \tr{ \hat{H} \dmrho }
  *   @f]
- *   which is the exact expectation value when @f$\dmrho@f$ is physical (specifically Hermitian).
+ *   which is the exact expectation value when @f$\dmrho@f$ is physical (or at least, Hermitian).
  * 
  * @constraints
+ * 
  * - Hermiticity of @p sum requires that every coefficient within is real. 
  *   Validation will check @p sum is _approximately_ Hermitian, i.e. that
  *   @f[ 
      |\im{c}| \le \valeps
  *   @f]
  *   for all @f$c \in @f$ `sum.coeffs`. Adjust @f$\valeps@f$ using setValidationEpsilon().
+ *   The sub-epsilon imaginary components of the coefficients _are_ included in calculation.
+ * - Postcondition validation will check that the calculated expectation value is approximately
+ *   real (i.e. the imaginary component is smaller in size than the validation epsilon), as should be
+ *   admitted when @p qureg is correctly normalised, and @p sum is Hermitian.
  * - The returned value is always real, and the imaginary component is neglected even when 
  *   Hermiticity validation is relaxed and/or @p qureg is an unnormalised density matrix. 
  *   The full complex value can be obtained using calcExpecNonHermitianPauliStrSum().
  * 
  * @equivalences
+ * 
  * - This function is mathematically equivalent to (albeit faster than) calling calcExpecPauliStr() upon
  *   each constituent @p PauliStr within @p sum, weighting each by its corresponding coefficient, and
  *   summing the outputs.
@@ -165,14 +177,170 @@ qreal calcExpecPauliStr(Qureg qureg, PauliStr str);
 qreal calcExpecPauliStrSum(Qureg qureg, PauliStrSum sum);
 
 
-/// @notyetdoced
-/// @notyetvalidated
+/** Calculates the expectation value of the given Hermitian observable @p matr - a diagonal,
+ * Hermitian matrix spanning the full Hilbert space - under the given state @p qureg, without 
+ * modifying it. 
+ * 
+ * @formulae
+ * 
+ * Let @f$ \hat{D} = @f$ @p matr.
+ * - When @p qureg is a statevector @f$\svpsi@f$, this function returns
+ *   @f[ 
+    \brapsi \hat{D} \svpsi \in \mathbb{R}.
+ *   @f]
+ * - When @p qureg is a density matrix @f$\dmrho@f$, this function returns the real component of
+ *   @f[ 
+     \tr{ \hat{D} \dmrho }
+ *   @f]
+ *   which is the exact expectation value when @f$\dmrho@f$ is physical (or at least, Hermitian).
+ * 
+ * @constraints
+ * 
+ * - Hermiticity of @p matr requires that every element within is real. 
+ *   Validation will check @p matr is _approximately_ Hermitian, i.e. that
+ *   @f[ 
+     |\im{c}| \le \valeps
+ *   @f]
+ *   for all @f$c \in @f$ `matr.cpuElems`. Adjust @f$\valeps@f$ using setValidationEpsilon().
+ * - Postcondition validation will check that the calculated expectation value is approximately
+ *   real (i.e. the imaginary component is smaller in size than the validation epsilon), as should be
+ *   admitted when @p qureg is correctly normalised, and @p matr is Hermitian.
+ * - The returned value is always real, and the imaginary component is neglected even when @p matr
+ *   Hermiticity validation is relaxed and/or @p qureg is an unnormalised density matrix. 
+ *   The full complex value can be obtained using calcExpecNonHermitianFullStateDiagMatr().
+ * 
+ * @equivalences
+ * 
+ * - This function is mathematically equivalent to (albeit much faster than) calling calcExpecPauliStrSum()
+ *   with a PauliStrSum consisting of all permutations of @f$\hat{I}@f$ and @f$\hat{Z}@f$ Pauli operators
+ *   with a precise, linear combination of coefficients.
+ * 
+ * @myexample
+ * 
+ * ```
+    Qureg qureg = createQureg(5);
+    initPlusState(qureg);
+
+    FullStateDiagMatr matr = createFullStateDiagMatr(qureg.numQubits);
+
+    // profanely inefficient per-element initialisation
+    for (int n=0; n<matr.numElems; n++) {
+        qcomp elem = getQcomp(n, 0);
+        setFullStateDiagMatr(matr, n, &elem, 1);
+    }
+
+    // prints "expec: 15.5"
+    qreal expec = calcExpecFullStateDiagMatr(qureg, matr);
+    reportScalar("expec", expec);
+ * ```
+ *
+ * @param[in] qureg the reference state.
+ * @param[in] matr  the observable operator.
+ * @returns The real component of the expectation value.
+ * @throws @validationerror
+ * - if @p qureg or @p matr are uninitialised.
+ * - if @p matr does not match the dimension of @p qureg
+ * - if @p matr is distributed but @p qureg is not
+ * - if @p matr is not approximately Hermitian.
+ * - if the output (with unreturned imaginary component) is not approximately real.
+ * @notyetvalidated
+ * @see
+ * - calcExpecFullStateDiagMatrPower()
+ * - calcExpecNonHermitianFullStateDiagMatr()
+ * - calcExpecPauliStrSum()
+ * @author Tyson Jones
+ */
 qreal calcExpecFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matr);
 
 
-/// @notyetdoced
-/// @notyetvalidated
-qreal calcExpecFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matr, qreal exponent);
+/** Calculates the expectation value of the given Hermitian observable @p matrix - a diagonal,
+ * Hermitian matrix spanning the full Hilbert space - when raised to the given @p exponent,
+ * under the given state @p qureg, which is not modified.
+ * 
+ * @formulae
+ * 
+ * Let @f$ \hat{D} = @f$ @p matrix and @f$x = @f$ @p exponent.
+ * - When @p qureg is a statevector @f$\svpsi@f$, this function returns
+ *   @f[ 
+    \brapsi \hat{D}^x \svpsi \in \mathbb{R}.
+ *   @f]
+ * - When @p qureg is a density matrix @f$\dmrho@f$, this function returns the real component of
+ *   @f[ 
+     \tr{ \hat{D}^x \dmrho }
+ *   @f]
+ *   which is the exact expectation value when @f$\dmrho@f$ is physical (or at least, Hermitian).
+ * 
+ * @constraints
+ * 
+ * - Hermiticity of @p matrix itself requires that every element within is real. 
+ *   Validation will check @p matrix is _approximately_ Hermitian, i.e. that
+ *   @f[ 
+     |\im{c}| \le \valeps
+ *   @f]
+ *   for all @f$c \in @f$ `matrix.cpuElems`. Adjust @f$\valeps@f$ using setValidationEpsilon().
+ * 
+ *   > [!CAUTION]
+ *   > Unlike other functions (including calcExpecFullStateDiagMatr()), this function will _NOT_
+ *   > consult the imaginary components of the elements of @p matrix, since a non-complex exponentiation
+ *   > function is used. That is, while validation permits the imaginary components to be small, they
+ *   > will be internally treated as precisely zero. This is true even when Hermiticity validation
+ *   > is disabled using setValidationOff(). To consult the imaginary components of @p matrix, use
+ *   > calcExpecNonHermitianFullStateDiagMatrPower().
+ * 
+ * - Hermiticity of @p matrix when raised to @p exponent further requires that, when @p exponent is 
+ *   a non-integer, @p matrix does not contain any negative elements which would otherwise produce 
+ *   complex elements in @f$\hat{D}^x@f$. This validation is always strict (i.e. independent of 
+ *   @f$\valeps@f$), and demands that
+ *   @f[ 
+     \min(\hat{D}) \ge 0 \text{ when } x \notin \mathbb{Z}.
+ *   @f]
+ * - Numerical stability requires that if @p exponent is negative, @p matrix does not contain any
+ *   zero elements which would otherwise create divergences in @f$\hat{D}^x@f$. Validation ergo
+ *   checks that when @p exponent is (strictly) negative, @p matrix contains no elements within 
+ *   distance @f$\valeps@f$ to zero (regardless of the magnitude of @p exponent). Adjust
+ *   @f$\valeps@f$ using setValidationEpsilon().
+ * - The passed @p exponent is always real, but can be relaxed to a general complex scalar via
+ *   calcExpecNonHermitianFullStateDiagMatrPower().
+ * - The returned value is always real, and the imaginary component is neglected even when 
+ *   Hermiticity validation is relaxed and/or @p qureg is an unnormalised density matrix. 
+ *   The full complex value can be obtained using calcExpecNonHermitianFullStateDiagMatrPower().
+ * 
+ * @myexample
+ * ```
+    Qureg qureg = createQureg(5);
+    initPlusState(qureg);
+
+    FullStateDiagMatr matrix = createFullStateDiagMatr(qureg.numQubits);
+
+    // profanely inefficient per-element initialisation
+    for (int n=0; n<matrix.numElems; n++) {
+        qcomp elem = getQcomp(n+1, 0);
+        setFullStateDiagMatr(matrix, n, &elem, 1);
+    }
+
+    // prints "expec: 0.044503"
+    qreal exponent = -2.3;
+    qreal expec = calcExpecFullStateDiagMatrPower(qureg, matrix, exponent);
+    reportScalar("expec", expec);
+ * ```
+ * @param[in] qureg     the reference state.
+ * @param[in] matrix    the observable operator.
+ * @param[in] exponent  the exponent to which to raise @p matrix
+ * @returns The real component of the expectation value of @p matrix raised to @p exponent.
+ * @throws @validationerror
+ * - if @p qureg or @p matrix are uninitialised.
+ * - if @p matrix does not match the dimension of @p qureg
+ * - if @p matrix is distributed but @p qureg is not
+ * - if @p matrix is not approximately Hermitian.
+ * - if @p exponent is (precisely) non-integer but @p matrix contains (precisely) negative elements.
+ * - if @p exponent is (precisely) negative but @p matrix contains elements which are approximately zero. 
+ * - if the output (with unreturned imaginary component) is not approximately real.
+ * @notyetvalidated
+ * @see
+ * - calcExpecNonHermitianFullStateDiagMatrPower()
+ * @author Tyson Jones
+ */
+qreal calcExpecFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qreal exponent);
 
 
 /** @} */
@@ -186,23 +354,349 @@ qreal calcExpecFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matr, qreal
  */
 
 
-/// @notyetdoced
-/// @notyetvalidated
+/** Calculates the probability of the full computational basis state of the specified
+ * @p index. This is the probability that, when measured in the @f$ \hat{Z} @f$ basis,
+ * every qubit of @p qureg is consistent with the bits of @p index.
+ * 
+ * Indexing is little-endian and from zero, such that (for example) computational basis state 
+ * @f$ \ket{0011} @f$ (where qubits at indices @f$0@f$ and @f$1@f$ are in the @f$\ket{1}@f$ state)
+ * corresponds to @p index @f$ = 3 @f$. The maximum legal @p index of an @f$N@f$-qubit
+ * register is @p index @f$ = 2^N-1 @f$.
+ *
+ * @formulae
+ * 
+ * Let @f$ i = @f$ @p index.
+ * 
+ * - When @p qureg is a statevector @f$ \svpsi @f$, this function returns
+ *   @f[ 
+      P(i) = |\braket{i}{\psi}|^2 = |\psi_i|^2
+ *   @f] 
+ *   where @f$\psi_i@f$ is the @f$i@f$-th amplitude of @f$\svpsi@f$.
+ * - When @p qureg is a density matrix @f$\dmrho@f$, this function returns
+ *   @f[ 
+      P(i) = \re{ \tr{ \ketbra{i}{i} \dmrho } } = \re{ \bra{i} \dmrho \ket{i} } = \re{ \dmrho_{ii} }
+ *   @f]
+ *   where @f$ \dmrho_{ii} @f$ is the @f$i@f$-th diagonal element of @f$\dmrho@f$, and is
+ *   real whenever @f$ \dmrho @f$ is valid (or at least, Hermitian).
+ * 
+ * When @p qureg is correctly normalised, these quantities are within @f$[0, 1]@f$, and satisfy
+ * @f[
+      \sum\limits_{i=0}^{2^N-1} P(i) = 1
+ * @f]
+ * where @f$N@f$ is the number of qubits in @p qureg.
+ * 
+ * @equivalences
+ * 
+ * - This function is equivalent to obtaining the corresponding @p qureg amplitude directly
+ *   and evaluating the probability.
+ *   ```
+     // qureg is statevector
+     qcomp amp = getQuregAmp(qureg, index);
+     qreal prob = pow(abs(amp), 2);
+
+     // qureg is a density matrix
+     qcomp amp = getDensityQuregAmp(qureg, index, index);
+     qreal prob = real(amp);
+ *   ```
+ * - This function is slightly faster than, but otherwise mathematically equivalent to, invoking
+ *   calcProbOfMultiQubitOutcome() and passing explicitly the bits of @p index. I.e.
+ *   ```
+     int qubits[qureg.numQubits];
+     int outcomes[qureg.numQubits];
+
+     for (int q=0; q<qureg.numQubits; q++) {
+         qubits[q] = q;
+         outcomes[q] = (index >> q) & 1;
+     }
+
+     qreal prob = calcProbOfMultiQubitOutcome(qureg, qubits, outcomes, qureg.numQubits);
+ *   ```
+ *   Use of calcProbOfMultiQubitOutcome() may be more convenient if only the individual qubit 
+ *   outcomes are known.
+ * - This function is significantly faster than, but mathematically equivalent to, preparing
+ *   a secondary Qureg in the basis state @p index and computing their overlap.
+ *   ```
+     Qureg alt = createCloneQureg(qureg);
+     initClassicalState(alt, index);
+     qcomp amp = calcInnerProduct(alt, qureg);
+     qreal prob = pow(abs(amp), 2);
+ *   ```
+ * 
+ * @myexample
+ * ```
+    Qureg qureg = createQureg(5);
+    initPlusState(qureg);
+
+    qreal prob = calcProbOfBasisState(qureg, 2);
+    reportScalar("prob of |00010>", prob);
+ * ```
+ *
+ * @param[in] qureg the reference state, which is unchanged.
+ * @param[in] index the index of the queried basis state among the ordered set of all basis states.
+ * @returns The probability of the basis state at @p index.
+ * @throws @validationerror
+ * - if @p qureg is uninitialised.
+ * - if @p index is less than zero or beyond (or equal to) the dimension of @p qureg.
+ * @notyetvalidated
+ * @see
+ * - calcProbOfQubitOutcome()
+ * - calcProbOfMultiQubitOutcome()
+ * - getQuregAmp()
+ * - getDensityQuregAmp()
+ * @author Tyson Jones
+ */
 qreal calcProbOfBasisState(Qureg qureg, qindex index);
 
 
-/// @notyetdoced
-/// @notyetvalidated
+/** Calculates the probability of the single qubit at index @p qubit being in the
+ * given computational basis @p outcome (`0` or `1`).
+ *
+ * @formulae
+ * 
+ * Let @f$ q = @f$ @p qubit and @f$ x = @f$ @p outcome, and let @f$\ketbra{x}{x}_q@f$
+ * notate a projector operating upon qubit @f$ q @f$. 
+ * 
+ * - When @p qureg is a statevector @f$ \svpsi @f$, this function returns
+ *   @f[
+      P_q(x) = \tr{ \ketbra{x}{x}_q \, \ketbra{\psi}{\psi} }
+         = \sum\limits_i |\psi_i|^2 \delta_{x,i_{[q]}}
+ *   @f]
+ *   where @f$\psi_i@f$ is the @f$i@f$-th amplitude of @f$\svpsi@f$, and @f$i_{[q]}@f$
+ *   notates the @f$q@f$-th bit of @f$i@f$.
+ * - When @p qureg is a density matrix @f$ \dmrho @f$, this function returns
+ *   @f[
+     P_q(x) = \tr{ \ketbra{x}{x}_q \, \dmrho }
+         = \sum\limits_i \re{ \dmrho_{ii} } \delta_{x,i_{[q]}}
+ *   @f]
+ *   where @f$ \dmrho_{ii} @f$ is the @f$i@f$-th diagonal element of @f$\dmrho@f$. This 
+ *   is real whenever @f$\dmrho@f$ is validly normalised (specifically, Hermitian).
+ * 
+ * When @p qureg is correctly normalised, these quantities are within @f$[0, 1]@f$, and
+ * satisfy
+ * @f[
+     P_q(x=0) + P_q(x=1) = 1.
+ * @f]
+ *
+ * @equivalences
+ * 
+ * - This function is a single-qubit convenience overload of calcProbOfMultiQubitOutcome(), 
+ *   which itself has optimised implementations for few-qubit outcomes.
+ *   ```
+     calcProbOfMultiQubitOutcome(qureg, &qubit, &outcome, 1);
+ *   ```
+ * - This function is much faster than, but mathematically equivalent to, summing the probability
+ *   of every computational basis state (e.g. via calcProbOfBasisState()) which is consistent
+ *   with the given qubit outcome.
+ *   ```
+     qreal prob = 0;
+     qindex dim = 1 << qureg.numQubits;
+     for (qindex i=0; i<dim; i++)
+         if (outcome == ((i >> qubit) & 1))
+            prob += calcProbOfBasisState(qureg, i);
+ *   ```
+ *
+ * @myexample
+ * ```
+    Qureg qureg = createQureg(5);
+    
+    int qubit = 2;
+    int outcome = 1;
+    qreal theta = 0.3;
+    applyRotateX(qureg, qubit, theta);
+
+    // prob = sin(theta/2)^2
+    qreal prob = calcProbOfQubitOutcome(qureg, qubit, outcome);
+ * ```
+ *
+ * @param[in] qureg   the reference state, which is unchanged.
+ * @param[in] qubit   the target qubit to query.
+ * @param[in] outcome the outcome of @p qubit to query (i.e. `0` or `1`).
+ * @returns The probability that the given qubit is in the given outcome.
+ * @throws @validationerror
+ * - if @p qureg is uninitialised.
+ * - if @p qubit is less than zero or beyond the number of qubits in @p qureg.
+ * - if @p outcome is not `0` or `1`.
+ * @notyetvalidated
+ * @see
+ * - calcProbOfMultiQubitOutcome()
+ * @author Tyson Jones
+ */
 qreal calcProbOfQubitOutcome(Qureg qureg, int qubit, int outcome);
 
 
-/// @notyetdoced
-/// @notyetvalidated
+/** Calculates the probability that the given list of @p qubits are simultaneously in the 
+ * respective single-qubit states specified in @p outcomes.
+ *
+ * @formulae
+ * 
+ * Let @f$q_j@f$ and @f$x_j@f$ notate the @f$j@f$-th qubit in @p qubits and its respective
+ * outcome in @p outcomes. 
+ * 
+ * - When @p qureg is a statevector @f$ \svpsi @f$, this function returns
+ *   @f[
+         \tr{
+            \bigotimes\limits_j \ketbra{x_j}{x_j}_{q_j} \; \ketbra{\psi}{\psi} 
+         }
+         =
+         \sum\limits_i |\psi_i|^2 \prod\limits_j \delta_{x_j, \, i_{[q_j]}}
+ *   @f]
+ *   where @f$\psi_i@f$ is the @f$i@f$-th amplitude of @f$\svpsi@f$, and 
+ *   @f$i_{[q]}@f$ notates the @f$q@f$-th bit of @f$i@f$.
+ * - When @p qureg is a density matrix @f$ \dmrho @f$, this function returns
+ *   @f[
+         \tr{
+            \bigotimes\limits_j \ketbra{x_j}{x_j}_{q_j} \; \dmrho
+         }
+         =
+         \sum\limits_i \re{\dmrho_{ii}} \prod\limits_j \delta_{x_j, \, i_{[q_j]}}
+ *   @f]
+ *   where @f$ \dmrho_{ii} @f$ is the @f$i@f$-th diagonal element of @f$\dmrho@f$. This 
+ *   is real whenever @f$\dmrho@f$ is validly normalised (specifically, Hermitian).
+ *
+ * When @p qureg is correctly normalised, these quantities are within @f$[0, 1]@f$, and their sum
+ * across all possible values of @p outcomes equals one.
+ *
+ * @equivalences
+ * 
+ * - The output of this function is equal to that found by in-turn finding the probability of each
+ *   qubit being in the specified outcome, then projecting @p qureg into it (i.e. forcing that 
+ *   measurement outcome). That approach is however slower and modifies @p qureg, whereas this
+ *   function leaves @p qureg unchanged.
+ *   ```
+     qreal prob = 1;
+     for (int j=0; j<numQubits; j++)
+         prob *= applyForcedQubitMeasurement(qureg, qubits[j], outcomes[j]);
+ *   ```
+ *
+ * - This function is much faster than, but mathematically equivalent to, summing the probability
+ *   of every computational basis state (e.g. via calcProbOfBasisState()) which is consistent
+ *   with the given qubit outcomes.
+ *
+ * @myexample
+ * ```
+    Qureg qureg = createQureg(5);
+    initRandomPureState(qureg);
+
+    int num = 3;
+    int qubits[]   = {0, 3, 4};
+    int outcomes[] = {1, 1, 0};
+
+    qreal prob = calcProbOfMultiQubitOutcome(qureg, qubits, outcomes, num);
+ * ```
+ *
+ * @param[in] qureg     the reference state, which is unchanged.
+ * @param[in] qubits    a list of target qubits to query.
+ * @param[in] outcomes  a list of corresponding qubit outcomes (each `0` or `1`).
+ * @param[in] numQubits the length of list @p qubits (and @p outcomes).
+ * @returns The probability that the given qubits are simultaneously in the specified outcomes.
+ * @throws @validationerror
+ * - if @p qureg is uninitialised.
+ * - if @p qubits contains any duplicates.
+ * - if any element of @p qubits is less than zero or beyond the number of qubits in @p qureg.
+ * - if any element of @p outcomes is not `0` or `1`.
+ * - if @p numQubits is less than one or exceeds the number of qubits in @p qureg.
+ * @throws @segfault
+ * - if either of @p qubits or @p outcomes are not lists of length @p numQubits.
+ * @notyetvalidated
+ * @see
+ * - calcProbsOfAllMultiQubitOutcomes()
+ * - calcProbOfBasisState()
+ * @author Tyson Jones
+ */
 qreal calcProbOfMultiQubitOutcome(Qureg qureg, int* qubits, int* outcomes, int numQubits);
 
 
-/// @notyetdoced
-/// @notyetvalidated
+/** Populates @p outcomeProbs with the probabilities of the specified list of @p qubits
+ * being in _all_ of their possible, simultaneous outcomes (of which there are `2^`
+ * @p numQubits).
+ * 
+ * The list @p qubits is taken to be in order of _increasing_ significance, determining 
+ * the ordering of the output @p outcomeProbs.
+ * For example, if @p qubits @f$ = \{ 1, 3 \} @f$, then @p outcomeProbs will be populated
+ * with _four_ values; the probabilities of qubits @f$(3,1)@f$ being in the respective
+ * simultaneous outcomes @f$(0,0), \, (0,1), \, (1,0) @f$ and @f$(1,1)@f$. In contrast,
+ * @p qubits @f$ = \{ 3, 1 \} @f$ would see the middle two outputs swapped.
+ * 
+ * @formulae
+ * 
+ * Let @f$ n = @f$ @p numQubits, and @f$ q_i @f$ be the @f$i@f$-th element of @p qubits,
+ * such that @p qubits = @f$ \{ q_0, q_1, \dots, q_{n-1} \} @f$. 
+ * Let @f$ P_{\ket{q_{n-1} \dots q_1 q_0}}(\ket{i}) @f$ denote the probability that the specified
+ * substate is in the computational basis substate @f$\ket{i}@f$. Explicitly, that
+ * qubit @f$q_j@f$ is in the outcome given by the @f$j@f$-th bit of @f$n@f$-digit integer 
+ * @f$i@f$ (simultaneously for all @f$j@f$).
+ * 
+ * Then, this function sets
+ * @f[
+      \text{outcomeProbs}[i] = P_{\ket{q_{n-1} \dots q_1 q_0}}(\ket{i})
+ * @f]
+ * for all @f$i \in \{0, 1, \dots 2^n-1\} @f$.
+ * 
+ * Explicitly, expressing substate @f$\ket{i}@f$ in terms of its individual qubits;
+ * @f[
+      \begin{gathered}
+      \text{outcomeProbs}[0] = P_{\ket{q_{n-1} \dots q_1 q_0}}( \ket{0\dots00} ) \\
+      \text{outcomeProbs}[1] = P_{\ket{q_{n-1} \dots q_1 q_0}}( \ket{0\dots01} ) \\
+      \text{outcomeProbs}[2] = P_{\ket{q_{n-1} \dots q_1 q_0}}( \ket{0\dots10} ) \\
+      \text{outcomeProbs}[3] = P_{\ket{q_{n-1} \dots q_1 q_0}}( \ket{0\dots11} ) \\
+      \vdots \\
+      \text{outcomeProbs}[2^n-1] = P_{\ket{q_{n-1} \dots q_1 q_0}}( \ket{1\dots11} )
+      \end{gathered}
+ * @f]
+ *
+ * Each probability is that which would be output by calcProbOfMultiQubitOutcome() when
+ * passed @p qubits and the bits of @f$ i @f$.
+ *
+ * When @p qureg is correctly normalised, all probabilities are within @f$[0, 1]@f$, and
+ * the sum of all elements written to @p outcomeProbs equals one.
+ * 
+ * @equivalences
+ * 
+ * - This function is significantly faster than, but otherwise equivalent to, populating
+ *   each element of @p outcomeProbs in-turn with the output of calcProbOfMultiQubitOutcome().
+ *   ```
+     qindex numOut = (1 << numQubits);
+
+     for (qindex i=0; i<numOut; i++) {
+
+         // set outcomes to the bits of i
+         int outcomes[numQubits];
+         for (int j=0; j<numQubits; j++)
+            outcomes[j] = (i >> j) & 1;
+
+         outcomeProbs[i] = calcProbOfMultiQubitOutcome(qureg, qubits, outcomes, numQubits);
+     }
+ *   ``` 
+ *
+ * @myexample
+ * ```
+    Qureg qureg = createQureg(5);
+    initRandomPureState(qureg);
+
+    int num = 3;
+    int qubits[] = {0, 3, 4};
+    
+    qreal probs[8];
+    calcProbsOfAllMultiQubitOutcomes(probs, qureg, qubits, num);
+ * ```
+ * @param[out] outcomeProbs the array to which the output is written.
+ * @param[in]  qureg        the reference state, which is unchanged.
+ * @param[in]  qubits       a list of target qubits to query.
+ * @param[in]  numQubits    the length of list @p qubits.
+ * @throws @validationerror
+ * - if @p qureg is uninitialised.
+ * - if @p qubits contains any duplicates.
+ * - if any element of @p qubits is less than zero or beyond the number of qubits in @p qureg.
+ * - if @p numQubits is less than one or exceeds the number of qubits in @p qureg.
+ * @throws @segfault
+ * - if @p outcomeProbs is not a pre-allocated list of length `2^` @p numQubits.
+ * - if @p qubits is not a list of length @p numQubits.
+ * @notyetvalidated
+ * @see
+ * - calcProbOfMultiQubitOutcome()
+ * - calcProbOfBasisState()
+ * @author Tyson Jones
+ */
 void calcProbsOfAllMultiQubitOutcomes(qreal* outcomeProbs, Qureg qureg, int* qubits, int numQubits);
 
 
@@ -217,13 +711,149 @@ void calcProbsOfAllMultiQubitOutcomes(qreal* outcomeProbs, Qureg qureg, int* qub
  */
 
 
-/// @notyetdoced
-/// @notyetvalidated
+/** Calculates the probability normalisation of the given @p qureg. This is the probability
+ * of the @p qureg being in _any_ outcome state, which is expected to equal `1`.
+ *
+ * @formulae
+ * 
+ * Let @f$N@f$ be the number of qubits in @p qureg.
+ * 
+ * - When @p qureg is a statevector @f$ \svpsi @f$ with @f$i@f$-th amplitude @f$\psi_i@f$,
+ *   this function returns
+ *   @f[
+         \sum\limits_{i=0}^{2^N-1} |\psi_i|^2.
+ *   @f]
+ * - When @p qureg is a density matrix @f$ \dmrho @f$ with @f$i@f$-th diagonal element
+ *   @f$ \dmrho_{ii} @f$, this function returns
+ *   @f[
+         \sum\limits_{i=0}^{2^N-1} \re{ \dmrho_{ii} }
+ *   @f]
+ * 
+ * @constraints
+ * 
+ * - As above, only the real components of the diagonal elements of a density matrix are consulted;
+ *   these are the only amplitudes consulted by functions which calculate probabilities in the
+ *   computational basis. As such, this function gives no indication of the general validity of density
+ *   matrices, such as whether they are Hermitian, whether the diagonals are real, and whether the
+ *   off-diagonal elements are valid.
+ *
+ * @equivalences
+ *
+ * - This function is faster than, but mathematically equivalent to, summing the outputs of other
+ *   functions which calculate probabilities across all possible outcomes.
+ *   ```
+     // choice is arbitrary
+     int qubit = 0;
+
+     qreal totalProb = (
+         calcProbOfQubitOutcome(qureg, qubit, 0) + 
+         calcProbOfQubitOutcome(qureg, qubit, 1));
+ *   ```
+ *
+ * @myexample
+ * ```
+    Qureg qureg = createDensityQureg(5);
+    initRandomMixedState(qureg, 1<<5);
+
+    // differs from 1 by numerical error
+    qreal totalProb = calcTotalProb(qureg);
+ * ```
+ *
+ * @param[in] qureg the reference state, which is unchanged.
+ * @returns The probability normalisation of @p qureg.
+ * @throws @validationerror
+ * - if @p qureg is uninitialised.
+ * @notyetvalidated
+ * @see
+ * - calcPurity()
+ * - calcProbsOfAllMultiQubitOutcomes()
+ * @author Tyson Jones
+ */
 qreal calcTotalProb(Qureg qureg);
 
 
-/// @notyetdoced
-/// @notyetvalidated
+/** Calculates the purity of @p qureg, which is a measure of its mixedness.
+ *
+ * @formulae
+ * 
+ * Let @f$N@f$ be the number of qubits in @p qureg.
+ * 
+ * - When @p qureg is a density matrix @f$ \dmrho @f$ (as expected), this function returns
+ *   @f[
+         \tr{ \dmrho^2 } = \sum\limits_{i,j} \left| \dmrho_{ij} \right|^2
+ *   @f]
+ *   where @f$ \dmrho_{ij} @f$ is the @f$(i,j)@f$-th element of @f$ \dmrho @f$.
+ *   
+ *   A purity of `1` indicates that the matrix is _pure_ and can be expressed as
+ *   @f[
+         \dmrho \equiv \ketbra{\phi}{\phi}
+ *   @f]
+ *   where @f$ \ket{\phi} @f$ is some pure state expressible as a statevector.
+ *   
+ *   In contrast, a purity less than `1` indicates the matrix is _mixed_ and can be
+ *   understood as a convex combination of multiple (at least _two_) pure states.
+ *   That is,
+ *   @f[
+         \dmrho \equiv \sum\limits_n p_n \ketbra{\phi_n}{\phi_n},
+ *   @f]
+ *   where @f$p_n \in [0,1]@f$ and sum to `1` whenever @f$\dmrho@f$ is a valid and correctly
+ *   normalised density matrix. Mixedness can result, for example, from @ref decoherence.
+ * 
+ *   The minimum purity of an @f$N@f$-qubit density matrix is @f$ 1/2^N @f$, which is
+ *   admitted only by the maximally-mixed state @f$ \dmrho = \hat{\id} / 2^N @f$.
+ * 
+ * - When @p qureg is a statevector @f$ \svpsi @f$, this function returns
+ *   @f[
+         \tr{ \ketbra{\psi}{\psi} \; \ketbra{\psi}{\psi} } 
+            = \left( \sum\limits_i |\psi_i|^2 \right)^2
+ *   @f]
+ *   where @f$\psi_i@f$ is the @f$i@f$-th amplitude of @f$\svpsi@f$. This is always `1` for
+ *   any valid statevector, and is otherwise equivalent to the output of calcTotalProb(), squared.
+ * 
+ * @constraints
+ *
+ * - The output of this function is only a reliable measure of purity when @p qureg is correctly 
+ *   normalised. For example, an invalid density matrix can return a purity of `1`, such as the
+ *   @f$N@f$-qubit maximally-mixed state scaled by factor @f$ 2^N @f$. Note that the function 
+ *   calcTotalProb() alone _cannot_ be used to verify validity since it only consults diagonal
+ *   elements, whereas the purity is informed by all elements.
+ *
+ * @equivalences
+ *
+ * - When @p qureg is a valid density matrix (specifically, Hermitian), this function is faster
+ *   than, but mathematically equivalent to, calling calcInnerProduct() and passing @p qureg twice.
+ *   ```
+     qcomp out = calcInnerProduct(qureg, qureg);
+     qreal pur = real(out); // im=0
+ *   ```
+ * - When @p qureg is a statevector, this function returns the output of calcTotalProb(), squared.
+ *
+ * @myexample
+ * ```
+    Qureg qureg = createDensityQureg(5);
+    initRandomPureState(qureg);
+
+    // = 1
+    qreal purity1 = calcPurity(qureg);
+    reportScalar("purity1", purity1);
+
+    mixTwoQubitDepolarising(qureg, 0, 1, 0.5);
+
+    // < 1
+    qreal purity2 = calcPurity(qureg);
+    reportScalar("purity2", purity2);
+ * ```
+ *
+ * @param[in] qureg the reference state, which is unchanged.
+ * @returns The purity of @p qureg.
+ * @throws @validationerror
+ * - if @p qureg is uninitialised.
+ * @notyetvalidated
+ * @see
+ * - calcFidelity()
+ * - calcTotalProb()
+ * @author Tyson Jones
+ */
 qreal calcPurity(Qureg qureg);
 
 
@@ -238,13 +868,207 @@ qreal calcPurity(Qureg qureg);
  */
 
 
-/// @notyetdoced
-/// @notyetvalidated
+/** Calculates the fidelity between @p qureg and @p other, where at least one is a
+ * statevector.
+ *
+ * @formulae
+ * 
+ * - When both @p qureg and @p other are statevectors (respectively @f$\ket{\psi}@f$ and 
+ *   @f$\ket{\phi}@f$), this function returns
+ *   @f[
+         \left| \braket{\phi}{\psi} \right|^2.
+ *   @f]
+ * - When @p qureg is a density matrix @f$\dmrho@f$ and @p other is a statevector @f$\svpsi@f$,
+ *   this function returns
+ *   @f[
+         \bra{\psi} \dmrho \ket{\psi},
+ *   @f]
+ *   and similarly when @p qureg is a statevector and @p other is a density matrix.
+ * 
+ * @constraints
+ *
+ * - The output of this function is always real, which validation will check after computing the
+ *   fidelity as a complex scalar. Specifically, validation will assert that the result has an
+ *   absolute imaginary component less than the validation epsilon, which can be adjusted with
+ *   setValidationEpsilon().
+ * 
+ * - This function does not yet support both @p qureg and @p other being density matrices, for
+ *   which the fidelity calculation is more substantial.
+ * 
+ * - When @p qureg and @p other are _both_ statevectors, or _both_ density matrices, then _both_ or
+ *   _neither_ must be GPU-accelerated. That is, their CPU vs GPU deployments must agree. They are
+ *   permitted to differ in distribution however. Such considerations are only relevant when
+ *   creating the registers using createCustomQureg(), since the automatic deployments of createQureg()
+ *   and createDensityQureg() will always agree.
+ * 
+ * - When @p qureg and @p other dimensionally _differ_ (i.e. one is a statevector while the other is a
+ *   density matrix), the statevector must not be distributed _unless_ the density matrix is distributed.
+ *   The CPU vs GPU deployments however are permitted to disagree. These requirements are again
+ *   consistent with the automatic deployments of the createQureg() and createDensityQureg() functions.
+ * 
+ * @equivalences
+ * 
+ * - When both @p qureg and @p other are statevectors, this function is equivalent to calling
+ *   calcInnerProduct() and squaring the absolute value of the result.
+ *   ```
+     qcomp prod = calcInnerProduct(qureg, other);
+     qreal fid = pow(abs(prod), 2);
+ *   ```
+ * - When one of @p qureg or @p other is a statevector in the computational basis state @f$\ket{i}@f$
+ *   (e.g. as can be produced via initClassicalState()), this function is slower but equivalent to 
+ *   finding directly the probability of the basis state.
+ *   ```
+     // initClassicalState(other, index);
+
+     qreal fid = calcProbOfBasisState(qureg, index);
+ *   ```
+ *
+ * @myexample
+ * ```
+   // rho = |psi><psi|
+   Qureg psi = createQureg(5);
+   Qureg rho = createDensityQureg(5);
+   initRandomPureState(psi);
+   initPureState(rho, psi);
+
+   qreal fid0 = calcFidelity(rho, psi); // = 1
+
+   mixDepolarising(rho, 0, 0.5);
+   qreal fid1 = calcFidelity(rho, psi); // < 1
+ * ```
+ *
+ * @param[in] qureg a state
+ * @param[in] other another state containing an equal number of qubits.
+ * @returns The fidelity between @p qureg and @p other.
+ * @throws @validationerror
+ * - if @p qureg or @p other is uninitialised.
+ * - if @p qureg and @p other contain a different number of qubits.
+ * - if @p qureg and @p other are incompatibly deployed.
+ * - if both @p qureg and @p other are density matrices (as is not yet supported).
+ * - if @p qureg or @p other is unnormalised such that the calculated fidelity is non-real.
+ * @notyetvalidated
+ * @see
+ * - calcInnerProduct()
+ * - calcDistance()
+ * @author Tyson Jones
+ */
 qreal calcFidelity(Qureg qureg, Qureg other);
 
-/// @notyetdoced
-/// @notyetvalidated
-qreal calcDistance(Qureg qureg1, Qureg qureg2);
+
+/** Calculates one of three distance measures between @p qureg and @p other, depending
+ * upon whether one or both are density matrices. These are the Hilbert-Schmidt distance,
+ * Bures distance and purified distance.
+ *
+ * @formulae
+ * 
+ * - When both @p qureg and @p other are statevectors (respectively @f$\ket{\psi}@f$ and 
+ *   @f$\ket{\phi}@f$), this function returns the **Bures distance** defined as
+ *   @f[
+         d_B\left(\ket{\psi},\ket{\phi}\right) = \sqrt{2 - 2 \left| \braket{\phi}{\psi} \right|}
+ *   @f]
+ *   where @f$\left| \braket{\phi}{\psi} \right|@f$ is the square-root of the fidelity
+ *   between @f$\ket{\psi}@f$ and @f$\ket{\phi}@f$ as would be computed by calcFidelity().
+ *
+ * - When both @p qureg and @p other are density matrices (respectively @f$\mathbf{\rho}@f$
+ *   and @f$\mathbf{\sigma}@f$), this function returns the **Hilbert-Schmidt distance** defined as
+ *   @f[
+         d_{HS}\left(\mathbf{\rho}, \mathbf{\sigma}\right) 
+            = 
+            \sqrt{ \tr{
+               \left| \mathbf{\rho} - \mathbf{\sigma} \right|^2
+            } }
+            =
+            \sqrt{
+               \sum\limits_{ij} \left| \rho_{ij} - \sigma_{ij} \right|^2
+            }.
+ *   @f]
+ *
+ * - When one of @p qureg or @p other is a statevector @f$\svpsi@f$, and the other is a density
+ *   matrix @f$\dmrho@f$, this function returns the **purified distance** defined as
+ *   @f[
+         d_p\left(\svpsi,\dmrho\right) = \sqrt{ 1 - \brapsi \dmrho \svpsi }
+ *   @f]
+ *   where @f$\brapsi \dmrho \svpsi@f$ is the fidelity as returned by calcFidelity().
+ * 
+ * @constraints
+ * 
+ * - The output of this function is always real, which is always mathematically satisfied by the
+ *   Hilbert-Schmidt distance, but may be violated by the Bures and purified distances when the
+ *   input Qureg are not normalised, or otherwise due to numerical imprecision. Postcondition
+ *   validation of the Bures distance will check that
+ *   @f[
+         \left| \braket{\phi}{\psi} \right| \le 1 + \valeps
+ *   @f]
+ *   while the purified distance validation will check that
+ *   @f[
+         \left| \, \im{ \brapsi \dmrho \svpsi } \, \right| \le \valeps, \\
+         \re{ \brapsi \dmrho \svpsi } \le 1 + \valeps,
+ *   @f]
+ *   where @f$\valeps@f$ is the validation epsilon, adjustable via setValidationEpsilon().
+ * 
+ * - Even when the above postcondition validation is disabled, the Bures and purified distance
+ *   calculations will respectively replace @f$\left| \braket{\phi}{\psi} \right|@f$ and 
+ *   @f$\re{ \brapsi \dmrho \svpsi }@f$ which exceed @f$1@f$ with value @f$1@f$, and the imaginary
+ *   component of @f$\brapsi \dmrho \svpsi@f$ is discarded.
+ * 
+ * - When @p qureg and @p other are _both_ statevectors, or _both_ density matrices, then _both_ or
+ *   _neither_ must be GPU-accelerated. That is, their CPU vs GPU deployments must agree. They are
+ *   permitted to differ in distribution however. Such considerations are only relevant when
+ *   creating the registers using createCustomQureg(), since the automatic deployments of createQureg()
+ *   and createDensityQureg() will always agree.
+ * 
+ * - When @p qureg and @p other dimensionally _differ_ (i.e. one is a statevector while the other is a
+ *   density matrix), the statevector must not be distributed _unless_ the density matrix is distributed.
+ *   The CPU vs GPU deployments however are permitted to disagree. These requirements are again
+ *   consistent with the automatic deployments of the createQureg() and createDensityQureg() functions.
+ * 
+ * @equivalences
+ * 
+ * - When both @p qureg and @p other are statevectors, this function wraps calcInnerProduct().
+ *   ```
+     qcomp prod = calcInnerProduct(qureg, other); // <qureg|other>
+     qreal mag = abs(prod);
+     mag = (mag > 1)? 1 : mag;
+     qreal dist = std::sqrt(2 - 2 * mag);
+ *   ```
+ *
+ * - When @p qureg is a density matrix and @p other is a statevector, this function wraps calcInnerProduct()
+ *   as a complex-valued proxy for calcFidelity().
+ *   ```
+     qcomp prod = calcInnerProduct(other, qureg); // <other|qureg|other>
+     qreal re = real(prod);
+     re = (re > 1)? 1 : re;
+     qreal dist = sqrt(1 - re);
+ *   ```
+ *
+ * @myexample
+ * ```
+   Qureg rho1 = createDensityQureg(5);
+   Qureg rho2 = createDensityQureg(5);
+
+   initRandomMixedState(rho1, 10);
+   setQuregToClone(rho2, rho1);
+   qreal distA = calcDistance(rho1, rho2); // = 0
+
+   initRandomMixedState(rho2, 10);
+   qreal distB = calcDistance(rho1, rho2); // > 0
+ * ```
+ *
+ * @param[in] qureg a state
+ * @param[in] other another state containing an equal number of qubits
+ * @returns The distance between @p qureg and @p other, according to the above measures.
+ * @throws @validationerror
+ * - if @p qureg or @p other is uninitialised.
+ * - if @p qureg and @p other contain a different number of qubits.
+ * - if @p qureg and @p other are incompatibly deployed.
+ * - if @p qureg or @p other is unnormalised such that the Bures or purified distances would be non-real.
+ * @notyetvalidated
+ * @see
+ * - calcInnerProduct()
+ * - calcFidelity()
+ * @author Tyson Jones
+ */
+qreal calcDistance(Qureg qureg, Qureg other);
 
 
 /** @} */
@@ -258,13 +1082,198 @@ qreal calcDistance(Qureg qureg1, Qureg qureg2);
  */
 
 
-/// @notyetdoced
-/// @notyetvalidated
+/** Creates and populates a new Qureg which is a reduced density matrix resulting from tracing out 
+ * the specified qubits of @p qureg. This should be later freed by the user like all Qureg.
+ * 
+ * Note that the deployments of the output Qureg (i.e. whether multithreaded, GPU-accelerated and
+ * distributed) will match those of @p qureg. It is ergo intended that this function is used to
+ * trace out few qubits, and may show worsening performance when tracing many qubits.
+ * 
+ * The ordering of @p traceOutQubits has no effect, and the ordering of the remaining qubits in
+ * the output Qureg match their original relative ordering in @p qureg.
+ * 
+ * @formulae
+ * 
+ * Let @f$\dmrho_{\text{in}} = @f$ @p qureg and let @f$\vec{t} = @f$ @p traceOutQubits which is a list of
+ * length @f$n = @f$ @p numTraceQubits.
+ * 
+ * This function returns a new Qureg @f$\dmrho_{\text{out}}@f$ which satisfies
+ * @f[
+        \dmrho_{\text{out}} = \text{Tr}_{\vec{t}} \left( \dmrho_{\text{in}} \right)
+        =
+        \sum\limits_i^{2^n} 
+        (\hat{\id} \otimes \bra{i}_{\vec{t}} ) \,
+         \dmrho_{\text{in}} \,
+        (\hat{\id} \otimes \ket{i}_{\vec{t}} )
+ * @f]
+ * where @f$\ket{i}_{\vec{t}}@f$ notates the @f$i@f$-th basis state (in any orthonormal basis) of the
+ * targeted qubits, and @f$(\hat{\id} \otimes \ket{i}_{\vec{t}})@f$ notates interleaved identity operators
+ * upon the non-targeted qubits.
+ * 
+ * Given an @f$N@f$-qubit Qureg @f$\dmrho_{\text{in}}@f$, the output @f$\dmrho_{\text{out}}@f$ contains
+ * @f$N-n@f$ qubits.
+ * 
+ * @constraints
+ * 
+ * - The given @p qureg must be a density matrix. It is however straightforward to prepare a density matrix
+ *   from a statevector.
+ *   ```
+     // let qureg be the intended initial statevector
+
+     Qureg temp = createDensityQureg(qureg.numQubits);
+     initPureState(temp, qureg);
+
+     Qureg reduced = calcPartialTrace(temp, traceOutQubits, numTraceQubits);
+     destroyQureg(temp);
+ *   ```
+ * 
+ * - When @p qureg is distributed, the returned Qureg will also be distributed, which imposes a minimum on
+ *   the number of qubits contained within; @f$\log_2(W)@f$ where @f$W@f$ is the number of distributed nodes
+ *   (or "world size"). This imposes a maximum upon @p numTraceQubits of
+ *   ```
+ *   numTraceQubits <= qureg.numQubits - qureg.logNumNodes
+ *   ```
+ * 
+ * @equivalences
+ * 
+ * - The function calcReducedDensityMatrix() is entirely equivalent, but conveniently permits specifying
+ *   a list of which qubits to _retain_ during partial tracing. 
+ * 
+ * - The functions setQuregToPartialTrace() and setQuregToReducedDensityMatrix() are also equivalent but
+ *   permit overwriting an existing Qureg.
+ *  
+ * @myexample
+ * 
+ * ```
+   Qureg state = createDensityQureg(5);
+   initRandomMixedState(state, 10);
+   reportQureg(state);
+
+   int qubits[] = {0,2,4};
+   Qureg reduced = calcPartialTrace(state, qubits, 3);
+   reportQureg(reduced);
+
+   // state's qubits {1,3} have become reduced's qubits {0,1}
+ * ```
+ * 
+ * @param[in] qureg          a density matrix which is not modified.
+ * @param[in] traceOutQubits a list of qubits to trace out, and ergo exclude from the output Qureg.
+ * @param[in] numTraceQubits the length of @p traceOutQubits.
+ * @returns A new, smaller Qureg initialised to the reduced density matrix of @p qureg.
+ * @throws @validationerror
+ * - if @p qureg is uninitialised.
+ * - if @p numTraceQubits is less than one.
+ * - if @p numTraceQubits is equal or greater than the number of qubits in @p qureg.
+ * - if @p qureg is distributed and @p numTraceQubits exceeds `qureg.numQubits - qureg.logNumNodes`.
+ * - if the system contains insufficient RAM (or VRAM) to store the new Qureg in any deployment.
+ * - if any memory allocation of the output Qureg unexpectedly fails.
+ * @throws seg-fault
+ * - if @p traceOutQubits is not a list of length @p numTraceQubits.
+ * @notyetvalidated
+ * @see
+ * - calcReducedDensityMatrix()
+ * - setQuregToPartialTrace()
+ * - setQuregToReducedDensityMatrix()
+ * @author Tyson Jones
+ */
 Qureg calcPartialTrace(Qureg qureg, int* traceOutQubits, int numTraceQubits);
 
 
-/// @notyetdoced
-/// @notyetvalidated
+/** Creates and populates a new Qureg which is a reduced density matrix of @p qureg,
+ * retaining only the specified qubits and tracing out all others.
+ * 
+ * Note that the deployments of the output Qureg (i.e. whether multithreaded, GPU-accelerated and
+ * distributed) will match those of @p qureg. It is ergo intended that this function is used to
+ * preserve most qubits of @p qureg, and may show worsening performance when retaining only few.
+ * 
+ * > [!CAUTION]
+ * > The ordering of @p retainQubits has no effect on the output state. The ordering of the
+ * > retained qubits will match their original, relative ordering in @p qureg.
+ *
+ * @formulae
+ * 
+ * This function is entirely equivalent to calcPartialTrace() except that here the _retained_ qubits
+ * are specified, whereas calcPartialTrace() accepts those to be traced out.
+ * 
+ * Let @f$\dmrho_{\text{in}} = @f$ @p qureg, @f$\vec{r} = @f$ @p retainQubits, and let @f$\vec{q}@f$
+ * be a list containing _all_ qubits of @p qureg. This function partially traces out all qubits in
+ * list @f$\vec{t} = \vec{q} \setminus \vec{r}@f$ (of length @f$n@f$), and returns a new Qureg @f$\dmrho_{\text{out}}@f$ 
+ * which satisfies
+ * @f[
+        \dmrho_{\text{out}} = \text{Tr}_{\vec{t}} \left( \dmrho_{\text{in}} \right)
+        =
+        \sum\limits_i^{2^n} 
+        (\hat{\id} \otimes \bra{i}_{\vec{t}} ) \,
+         \dmrho_{\text{in}} \,
+        (\hat{\id} \otimes \ket{i}_{\vec{t}} )
+ * @f]
+ * where @f$\ket{i}_{\vec{t}}@f$ notates the @f$i@f$-th basis state (in any orthonormal basis) of the
+ * qubits in @f$\vec{t}@f$, and @f$(\hat{\id} \otimes \ket{i}_{\vec{t}})@f$ notates interleaved identity
+ * operators upon the qubits in @f$\vec{r}@f$.
+ * 
+ * @constraints
+ * 
+ * - The given @p qureg must be a density matrix. It is however straightforward to prepare a density matrix
+ *   from a statevector.
+ *   ```
+     // let qureg be the intended initial statevector
+
+     Qureg temp = createDensityQureg(qureg.numQubits);
+     initPureState(temp, qureg);
+
+     Qureg reduced = calcReducedDensityMatrix(temp, retainQubits, numRetainQubits);
+     destroyQureg(temp);
+ *   ```
+ * 
+ * - When @p qureg is distributed, the returned Qureg will also be distributed, which imposes a minimum on
+ *   the number of qubits contained within; @f$\log_2(W)@f$ where @f$W@f$ is the number of distributed nodes
+ *   (or "world size"). This imposes bounds upon @p numRetainQubits of
+ *   ```
+ *   qureg.logNumNodes <= numRetainQubits <= qureg.numQubits - 1
+ *   ```
+ *
+ * @equivalences
+ * 
+ * - The function calcPartialTrace() is entirely equivalent, but permits directly specifying the qubits to
+ *   be traced out.
+ * 
+ * - The functions setQuregToPartialTrace() and setQuregToReducedDensityMatrix() are also equivalent but
+ *   permit overwriting an existing Qureg.
+ *  
+ * @myexample
+ * 
+ * ```
+   Qureg state = createDensityQureg(5);
+   initRandomMixedState(state, 10);
+   reportQureg(state);
+
+   int qubits[] = {1,3};
+   Qureg reduced = calcReducedDensityMatrix(state, qubits, 2);
+   reportQureg(reduced);
+
+   // state's qubits {1,3} have become reduced's qubits {0,1}
+ * ```
+ * 
+ * @param[in] qureg            a density matrix.
+ * @param[in] retainQubits    a list of qubits to retain in the reduced density matrix (at shifted, contiguous indices).
+ * @param[in] numRetainQubits the length of @p retainQubits.
+ * @returns A new Qureg containing @p numRetainQubits qubits, initialised to the reduced density matrix of @p qureg.
+ * @throws @validationerror
+ * - if @p qureg is uninitialised.
+ * - if @p numRetainQubits is less than one.
+ * - if @p numRetainQubits is equal or greater than the number of qubits in @p qureg.
+ * - if @p qureg is distributed and @p numRetainQubits is less than `qureg.logNumNodes`.
+ * - if the system contains insufficient RAM (or VRAM) to store the new Qureg in any deployment.
+ * - if any memory allocation of the output Qureg unexpectedly fails.
+ * @throws seg-fault
+ * - if @p retainQubits is not a list of length @p numRetainQubits.
+ * @notyetvalidated
+ * @see
+ * - calcPartialTrace()
+ * - setQuregToPartialTrace()
+ * - setQuregToReducedDensityMatrix()
+ * @author Tyson Jones
+ */
 Qureg calcReducedDensityMatrix(Qureg qureg, int* retainQubits, int numRetainQubits);
 
 
@@ -294,27 +1303,219 @@ Qureg calcReducedDensityMatrix(Qureg qureg, int* retainQubits, int numRetainQubi
  */
 
 
-/// @ingroup calc_comparisons
-/// @notyetdoced
-/// @notyetvalidated
-qcomp calcInnerProduct(Qureg qureg1, Qureg qureg2);
+/** @ingroup calc_comparisons
+ * 
+ * Calculates the inner product of state @p qureg with @p other. 
+ *
+ * @formulae
+ * 
+ * - When both @p qureg and @p other are statevectors (respectively @f$\ket{\psi}@f$ and 
+ *   @f$\ket{\phi}@f$), this function returns
+ *   @f[
+         \braket{\psi}{\phi} = \sum\limits_i \psi_i^* \phi_i
+ *   @f]
+ *   where @f$\psi_i@f$ and @f$\phi_i@f$ are the @f$i@f$-th amplitudes of @f$\ket{\psi}@f$ 
+ *   (@p qureg) and  @f$\ket{\phi}@f$ (@p other) respectively, and @f$\alpha^*@f$ notates
+ *   the complex conjugate of scalar @f$\alpha@f$.
+ * 
+ * - When both @p qureg and @p other are density matrices (respectively @f$\mathbf{\rho}@f$
+ *   and @f$\mathbf{\sigma}@f$), this function returns
+ *   @f[
+         \tr{ \rho^\dagger \sigma } = \sum\limits_{ij} {\rho_{ij}}^* \, \sigma_{ij}.
+ *   @f]
+ * 
+ * - When @p qureg is a density matrix @f$\dmrho@f$ and @p other is a statevector @f$\ket{\phi}@f$,
+ *   this function returns
+ *   @f[
+         \bra{\phi} \dmrho^\dagger \ket{\phi}.
+ *   @f]
+ *
+ * - When @p qureg is a statevector @f$\svpsi@f$ and @p other is a density matrix @f$\mathbf{\sigma}@f$,
+ *   this function returns
+ *   @f[
+         \brapsi \mathbf{\sigma} \svpsi.
+ *   @f]
+ *
+ * @constraints
+ * 
+ * - When @p qureg and @p other are _both_ statevectors, or _both_ density matrices, then _both_ or
+ *   _neither_ must be GPU-accelerated. That is, their CPU vs GPU deployments must agree. They are
+ *   permitted to differ in distribution however. Such considerations are only relevant when
+ *   creating the registers using createCustomQureg(), since the automatic deployments of createQureg()
+ *   and createDensityQureg() will always agree.
+ * 
+ * - When @p qureg and @p other dimensionally _differ_ (i.e. one is a statevector while the other is a
+ *   density matrix), the statevector must not be distributed _unless_ the density matrix is distributed.
+ *   The CPU vs GPU deployments however are permitted to disagree. These requirements are again
+ *   consistent with the automatic deployments of the createQureg() and createDensityQureg() functions.
+ *
+ * @myexample
+ * ```
+   Qureg rho1 = createDensityQureg(5);
+   Qureg rho2 = createDensityQureg(5);
+
+   // rho1 = rho2 = |psi><psi|
+   initRandomPureState(rho1);
+   setQuregToClone(rho2, rho1);
+   qcomp prodA = calcInnerProduct(rho1, rho2); // = 1
+
+   // rho1 = rho2 = sum_i prob_i |psi_i><psi_i|
+   initRandomMixedState(rho1, 10);
+   setQuregToClone(rho2, rho1);
+   qcomp prodB = calcInnerProduct(rho1, rho2); // < 1, real
+
+   // rho1 != rho2
+   initRandomMixedState(rho2, 10);
+   qcomp prodC = calcInnerProduct(rho1, rho2); // abs < 1, complex
+ * ```
+ *
+ * @param[in] qureg a state
+ * @param[in] other another state with an equal number of qubits
+ * @returns The inner product of @p qureg with @p other.
+ * @throws @validationerror
+ * - if @p qureg or @p other is uninitialised.
+ * - if @p qureg and @p other contain a different number of qubits.
+ * - if @p qureg and @p other are incompatibly deployed.
+ * @notyetvalidated
+ * @see
+ * - calcDistance()
+ * - calcFidelity()
+ * @author Tyson Jones
+ */
+qcomp calcInnerProduct(Qureg qureg, Qureg other);
 
 
-/// @ingroup calc_expec
-/// @notyetdoced
-/// @notyetvalidated
+/** @ingroup calc_expec
+ * 
+ * Calculates the expectation value of the given permittedly non-Hermitian operator @p sum 
+ * - a weighted sum of Pauli strings with complex weights - under the given state @p qureg, 
+ * which is not modified.
+ * 
+ * @formulae
+ * 
+ * This function is mathematically equivalent to calcExpecPauliStrSum(), _except_ that here a
+ * complex scalar is returned. This permits obtaining the full scalar when @p sum contains non-real
+ * weights, and/or when @p qureg is unnormalised.
+ *
+ * @myexample
+ * ```
+    Qureg qureg = createQureg(5);
+    PauliStrSum sum = createInlinePauliStrSum(R"(
+        0.123 + 3.5i  ZIZIZ
+        1.234 - 1E-5i XYZXZ
+        -1E-2         IIIII
+    )");
+
+    // prints "expec: 0.113+3.5i"
+    qcomp expec = calcExpecNonHermitianPauliStrSum(qureg, sum);
+    reportScalar("expec", expec);  
+ * ```
+ *
+ * @param[in] qureg the permittedly unnormalised reference state.
+ * @param[in] sum   the permittedly non-Hermitian operator.
+ * @returns The permittedly complex expectation value.
+ * @throws @validationerror
+ * - if @p qureg or @p sum are uninitialised.
+ * - if any PauliStr in @p sum targets a higher-index qubit than exists in @p qureg.
+ * @notyetvalidated
+ * @see
+ * - calcExpecPauliStrSum()
+ * @author Tyson Jones
+ */
 qcomp calcExpecNonHermitianPauliStrSum(Qureg qureg, PauliStrSum sum); 
 
 
-/// @ingroup calc_expec
-/// @notyetdoced
-/// @notyetvalidated
+/** @ingroup calc_expec
+ * 
+ * Calculates the expectation value of the given permittedly non-Hermitian operator @p matr,
+ * under the given state @p qureg, without modifying it. 
+ * 
+ * @formulae
+ * 
+ * This function is mathematically equivalent to calcExpecFullStateDiagMatr(), _except_ that here a
+ * complex scalar is returned. This permits obtaining the full scalar when @p matr contains non-real
+ * elements, and/or when @p qureg is unnormalised.
+ *
+ * @myexample
+ * ```
+    Qureg qureg = createQureg(5);
+    initPlusState(qureg);
+
+    FullStateDiagMatr matr = createFullStateDiagMatr(qureg.numQubits);
+
+    // profanely inefficient per-element initialisation
+    for (int n=0; n<matr.numElems; n++) {
+        qcomp elem = getQcomp(n, n+1);
+        setFullStateDiagMatr(matr, n, &elem, 1);
+    }
+
+    // prints "expec: 15.5+16.5i"
+    qcomp expec = calcExpecNonHermitianFullStateDiagMatr(qureg, matr);
+    reportScalar("expec", expec);
+ * ```
+ *
+ * @param[in] qureg the permittedly unnormalised reference state.
+ * @param[in] matr  the permittedly non-Hermitian operator.
+ * @returns The permittedly complex expectation value.
+ * @throws @validationerror
+ * - if @p qureg or @p matr are uninitialised.
+ * - if @p matr does not match the dimension of @p qureg.
+ * - if @p matr is distributed but @p qureg is not.
+ * @notyetvalidated
+ * @see
+ * - calcExpecFullStateDiagMatr()
+ * - calcExpecFullStateDiagMatrPower()
+ * - calcExpecNonHermitianFullStateDiagMatrPower()
+ * @author Tyson Jones
+ */
 qcomp calcExpecNonHermitianFullStateDiagMatr(Qureg qureg, FullStateDiagMatr matr);
 
 
-/// @ingroup calc_expec
-/// @notyetdoced
-/// @notyetvalidated
+/** @ingroup calc_expec
+ * 
+ * Calculates the expectation value of the given permittedly non-Hermitian operator @p matrix,
+ * raised to the arbitrary complex @p exponent, under the given state @p qureg, which is not modified.
+ * 
+ * @formulae
+ * 
+ * This function is mathematically equivalent to calcExpecFullStateDiagMatrPower(), _except_ that 
+ * here a complex scalar is returned, in addition to @p exponent being permittedly complex.
+ * This permits obtaining the full scalar when @p qureg is unnormalised or @p matrix (after being
+ * raised to @p exponent) is non-Hermitian.
+ *
+ * @myexample
+ * ```
+    Qureg qureg = createQureg(5);
+    initPlusState(qureg);
+
+    FullStateDiagMatr matrix = createFullStateDiagMatr(qureg.numQubits);
+
+    // profanely inefficient per-element initialisation
+    for (int n=0; n<matrix.numElems; n++) {
+        qcomp elem = getQcomp(n, n+1);
+        setFullStateDiagMatr(matrix, n, &elem, 1);
+    }
+
+    qcomp exponent = 3+4_i;
+
+    // prints "expec: -257.26-613.8i"
+    qcomp expec = calcExpecNonHermitianFullStateDiagMatrPower(qureg, matrix, exponent);
+    reportScalar("expec", expec);
+ * ```
+ *
+ * @param[in] qureg     the permittedly unnormalised reference state.
+ * @param[in] matrix    the permittedly non-Hermitian operator.
+ * @param[in] exponent  the permittedly complex exponent.
+ * @returns The permittedly complex expectation value.
+ * @throws @validationerror
+ * - if @p qureg or @p matrix are uninitialised.
+ * - if @p matrix does not match the dimension of @p qureg.
+ * - if @p matrix is distributed but @p qureg is not.
+ * @notyetvalidated
+ * @see
+ * - calcExpecFullStateDiagMatr()
+ * @author Tyson Jones
+ */
 qcomp calcExpecNonHermitianFullStateDiagMatrPower(Qureg qureg, FullStateDiagMatr matrix, qcomp exponent);
 
 
@@ -346,6 +1547,7 @@ qreal calcProbOfMultiQubitOutcome(Qureg qureg, std::vector<int> qubits, std::vec
 /// @notyetdoced
 /// @notyetvalidated
 /// @cpponly
+/// @cppvectoroverload
 /// @see calcProbsOfAllMultiQubitOutcomes()
 std::vector<qreal> calcProbsOfAllMultiQubitOutcomes(Qureg qureg, std::vector<int> qubits);
 
diff --git a/quest/include/decoherence.h b/quest/include/decoherence.h
index 689ec640f..cbe6b04c5 100644
--- a/quest/include/decoherence.h
+++ b/quest/include/decoherence.h
@@ -32,6 +32,7 @@ extern "C" {
 /** @notyetdoced
  * 
  * @formulae
+ * 
  * Let @f$ \dmrho = @f$ @p qureg, @f$ p = @f$ @p prob and @f$ t = @f$ @p target. 
  * 
  * This function effects
@@ -44,6 +45,7 @@ extern "C" {
  * @f]
  * 
  * @equivalences
+ * 
  * This function is equivalent to (but much faster than):
  * - mixPaulis() with a zero probability for the @f$\hat{X}@f$ and @f$\hat{Y}@f$ components.
  *   ```
@@ -76,6 +78,7 @@ void mixDephasing(Qureg qureg, int target, qreal prob);
 /** @notyetdoced
  * 
  * @formulae
+ * 
  * Let @f$ \dmrho = @f$ @p qureg, @f$ p = @f$ @p prob, @f$ t_1 = @f$ @p target1 and @f$ t_2 = @f$ @p target2.
  * 
  * This function effects
@@ -94,6 +97,7 @@ void mixDephasing(Qureg qureg, int target, qreal prob);
  * @f]
  * 
  * @equivalences
+ * 
  * This function is equivalent to (but much faster than):
  * - mixKrausMap() with (scaled) @f$\hat{\id}\otimes\hat{\id}@f$, @f$\hat{\id}\otimes\hat{Z}@f$, 
  *   @f$\hat{Z}\otimes\hat{\id}@f$ and @f$\hat{Z}\otimes\hat{Z}@f$ Kraus operators.
@@ -120,6 +124,7 @@ void mixTwoQubitDephasing(Qureg qureg, int target1, int target2, qreal prob);
 /** @notyetdoced
  * 
  * @formulae
+ * 
  * Let @f$ \dmrho = @f$ @p qureg, @f$ p = @f$ @p prob and @f$ t = @f$ @p target. 
  * 
  * This function effects
@@ -133,6 +138,7 @@ void mixTwoQubitDephasing(Qureg qureg, int target1, int target2, qreal prob);
  * @f]
  * 
  * @equivalences
+ * 
  * This function is equivalent to (but much faster than):
  * - mixPaulis() with a uniform probability.
  *   ```
@@ -161,6 +167,7 @@ void mixDepolarising(Qureg qureg, int target, qreal prob);
 /** @notyetdoced
  * 
  * @formulae
+ * 
  * Let @f$ \dmrho = @f$ @p qureg, @f$ p = @f$ @p prob, @f$ t_1 = @f$ @p target1 and @f$ t_2 = @f$ @p target2.
  * 
  * This function effects:
@@ -209,6 +216,7 @@ void mixDepolarising(Qureg qureg, int target, qreal prob);
  * @f]
  *
  * @equivalences
+ * 
  * This function is equivalent to (but much faster than):
  * - mixKrausMap() with Kraus operators containing every possible tensor product 
  *   of two Pauli matrices, all scaled by @f$ (p/15)^{1/2} @f$, _except_ for
@@ -222,6 +230,7 @@ void mixTwoQubitDepolarising(Qureg qureg, int target1, int target2, qreal prob);
 /** @notyetdoced
  * 
  * @formulae
+ * 
  * Let @f$ \dmrho = @f$ @p qureg, @f$ p = @f$ @p prob and @f$ t = @f$ @p target.
  * 
  * This function effects
@@ -239,6 +248,7 @@ void mixTwoQubitDepolarising(Qureg qureg, int target1, int target2, qreal prob);
  * @f]
  * 
  * @equivalences
+ * 
  * This function is equivalent to (but much faster than):
  * - mixKrausMap() with the above Kraus operators.
  *   ```
@@ -258,6 +268,7 @@ void mixDamping(Qureg qureg, int target, qreal prob);
 /** @notyetdoced
  * 
  * @formulae
+ * 
  * Let @f$ \dmrho = @f$ @p qureg, @f$ t = @f$ @p target, and
  * @f$ p_x = @f$ @p probX, @f$ p_y = @f$ @p probY, @f$ p_z = @f$ @p probZ.
  * 
@@ -274,6 +285,7 @@ void mixDamping(Qureg qureg, int target, qreal prob);
  * @f]
  * 
  * @equivalences
+ * 
  * This function is equivalent to (but much faster than):
  * - mixKrausMap() with (scaled) @f$\hat{\id}@f$, @f$\hat{X}@f$, @f$\hat{Y}@f$ and @f$\hat{Z}@f$ Kraus operators.
  *   ```
@@ -300,6 +312,7 @@ void mixPaulis(Qureg qureg, int target, qreal probX, qreal probY, qreal probZ);
 /** @notyetdoced
  * 
  * @formulae
+ * 
  * Let @f$ \dmrho_1 = @f$ @p qureg, @f$ \dmrho_2 = @f$ @p other and @f$ p = @f$ @p prob.
  * 
  * This function effects
@@ -318,6 +331,7 @@ void mixQureg(Qureg qureg, Qureg other, qreal prob);
 /** @notyetdoced
  * 
  * @formulae
+ * 
  * Let @f$ \dmrho = @f$ @p qureg, @f$ \vec{t} = @f$ @p targets and @f$ \hat{K}^{(i)} @f$
  * denote the @f$i@f$-th Kraus operator in @p map.
  * 
diff --git a/quest/include/operations.h b/quest/include/operations.h
index 0108af431..b138f2009 100644
--- a/quest/include/operations.h
+++ b/quest/include/operations.h
@@ -69,6 +69,7 @@ digraph {
  * @enddot
  * 
  * @formulae
+ * 
  * Let @f$ \hat{U} = @f$ @p matrix, @f$ t = @f$ @p target, and let @f$\hat{U}_t@f$
  * notate operating @f$\hat{U}@f$ upon the @f$ t @f$-th qubit among@f$ N @f$, i.e.
  * @f[ 
@@ -85,9 +86,10 @@ digraph {
  *   @f]
  *
  * @constraints
+ * 
  * - Unitarity of @f$ \hat{U} = @f$ @p matrix requires that 
  *   @f$ \hat{U} \hat{U}^\dagger = \id @f$. Validation will check that @p matrix is
- *   approximately unitarity via
+ *   approximately unitary via
  *   @f[ 
         \max\limits_{ij} \Big|\left(\hat{U} \hat{U}^\dagger - \id\right)_{ij}\Big|^2 \le \valeps
  *   @f]
@@ -125,9 +127,7 @@ digraph {
 void applyCompMatr1(Qureg qureg, int target, CompMatr1 matrix);
 
 
-/** @notyetdoced
- * 
- * Applies a singly-controlled one-qubit dense unitary @p matrix to the specified 
+/** Applies a singly-controlled one-qubit dense unitary @p matrix to the specified 
  * @p target qubit of @p qureg.
  * 
  * @diagram
@@ -156,8 +156,84 @@ digraph {
 }
  * @enddot
  *
+ * @formulae
+ * 
+ * Let @f$ \hat{U} = @f$ @p matrix, @f$ t = @f$ @p target, @f$ c = @f$ @p control,
+ * and let @f$\hat{O}_q@f$ denote an operator upon the @f$q@f$-th qubit.
+ * This function effects operator
+ * @f[
+    C_c[\hat{U}_t] = \ketbra{0}{0}_c \otimes \id_t + \ketbra{1}{1}_c \otimes \hat{U}_t,
+ * @f]
+ * where @f$\hat{U}@f$ is effected upon basis states for which qubit @f$c@f$ has value `1`.
+ * For illustration, when @p control=1 and @p target=0, this function would effect
+ * @f[
+    C_1[\hat{U}_0] \equiv 
+    \begin{pmatrix} 
+      1 \\ & 1 \\ & & u_{00} & u_{01} \\ & & u_{10} & u_{11}
+    \end{pmatrix}.
+ * @f]
+ *
+ * This operation can be performed upon statevectors and density matrices.
+ *
+ * - When @p qureg is a statevector @f$ \svpsi @f$, this function effects
+ *   @f[ 
+        \svpsi \rightarrow C_c[\hat{U}_t] \, \svpsi.
+ *   @f]
+ * - When @p qureg is a density matrix @f$\dmrho@f$, this function effects
+ *   @f[ 
+        \dmrho \rightarrow C_c[\hat{U}_t] \, \dmrho \, {C_c[\hat{U}_t]}^\dagger.
+ *   @f]
+ *
+ * @constraints
+ * 
+ * - Unitarity of @f$ \hat{U} = @f$ @p matrix requires that 
+ *   @f$ \hat{U} \hat{U}^\dagger = \id @f$. Validation will check that @p matrix is
+ *   approximately unitary via
+ *   @f[ 
+        \max\limits_{ij} \Big|\left(\hat{U} \hat{U}^\dagger - \id\right)_{ij}\Big|^2 \le \valeps
+ *   @f]
+ *   where the validation epsilon @f$ \valeps @f$ can be adjusted with setValidationEpsilon().
+ *
+ * @equivalences
+ * 
+ * - This function is faster than, but mathematically equivalent to, initialising a two-qubit
+ *   matrix (CompMatr2) to the @f$C_1[\hat{U}_0]@f$ matrix above, and calling applyCompMatr2():
+ *   ```
+     CompMatr2 m = getInlineCompMatr2({
+         {1,0,0,0}, 
+         {0,1,0,0}, 
+         {0,0,u00,u01}, 
+         {0,0,u10,u11}});
+     
+     applyCompMatr2(qureg, target, control, m);
+ *   ```
+ *
+ * @myexample
+ * ```
+    Qureg qureg = createQureg(5);
+
+    CompMatr1 matrix = getInlineCompMatr1({
+        {-1i/sqrt(2), 1i/sqrt(2)},
+        {(1i-1)/2,    (1i-1)/2}
+    });
+
+    // C_0[U_2]
+    applyControlledCompMatr1(qureg, 0, 2, matrix); 
+ * ```
+
+ * @param[in,out] qureg   the state to modify.
+ * @param[in]     control the index of the control qubit.
+ * @param[in]     target  the index of the target qubit.
+ * @param[in]     matrix  the Z-basis unitary matrix to effect.
+ * @throws @validationerror
+ * - if @p qureg or @p matrix are uninitialised.
+ * - if @p matrix is not approximately unitary.
+ * - if @p control or @p target are an invalid qubit index.
+ * - if @p control and @p target overlap.
  * @see
- * - applyCompMatr1()
+ * - applyMultiControlledCompMatr1()
+ * - applyMultiStateControlledCompMatr1()
+ * @author Tyson Jones
  */
 void applyControlledCompMatr1(Qureg qureg, int control, int target, CompMatr1 matrix);
 
@@ -201,6 +277,24 @@ digraph {
 }
  * @enddot
  *
+ * @formulae
+ * 
+ * Let @f$ \vec{c} = @f$ @p controls, @f$ t = @f$ @p target, and @f$ \hat{U} = @f$ @p matrix.
+ * This function effects operator
+ * 
+ * @f[
+    C_{\vec{c}}[\hat{U}_t]
+ * @f]
+ *
+ * which is equivalent to applying @f$ \hat{U}_t @f$ upon only the computational basis states for which 
+ * all control qubits are in the @f$ \ket{1} @f$ state.
+ *
+ * Precisely, let @f$n = 2^{|\vec{c}|}-1@f$. Then
+ * @f[
+    C_{\vec{c}}[\hat{U}_t] = \sum\limits_{i=0}^{n-1} \ketbra{i}{i}_{\vec{c}} \otimes \hat{\id}_t
+      + \ketbra{n}{n}_{\vec{c}} \otimes \hat{U}_t
+ * @f]
+ *
  * @see
  * - applyCompMatr1()
  */
@@ -506,11 +600,11 @@ extern "C" {
  * The qubits within @p targets are treated to be ordered least to most significant with respect
  * to @f$ M @f$. That is, if @f$ M @f$ was hypothetically separable single-qubit matrices
  * @f[
-      M \equiv A \otimes B \otimes C \otimes \dots 
+      M \equiv \dots \otimes C \otimes B \otimes A
  * @f]
  * then this function would effect
  * @f[
-      \hat{M}_{\text{targets}} \equiv A_{\text{targets}[0]} B_{\text{targets}[1]} C_{\text{targets}[2]} \dots
+      \hat{M}_{\text{targets}} \equiv A_{\text{targets}[0]} \cdot B_{\text{targets}[1]} \cdot C_{\text{targets}[2]} \cdot \dots
  * @f]
  *
  * @see
@@ -1345,6 +1439,7 @@ extern "C" {
  * upon the @p target qubit, where @f$ \hat{\sigma}_x @f$ is the Pauli X matrix.
  *
  * @equivalences
+ * 
  * - This function is entirely equivalent to calling applyPauliGadget() with a single-site PauliStr.
  *   ```
      applyPauliGadget(qureg, getInlinePauliStr("X", {target}), angle);
@@ -1383,6 +1478,7 @@ void applyRotateX(Qureg qureg, int target, qreal angle);
  * upon the @p target qubit, where @f$ \hat{\sigma}_y @f$ is the Pauli Y matrix.
  *
  * @equivalences
+ * 
  * - This function is entirely equivalent to calling applyPauliGadget() with a single-site PauliStr.
  *   ```
      applyPauliGadget(qureg, getInlinePauliStr("Y", {target}), angle);
@@ -1421,6 +1517,7 @@ void applyRotateY(Qureg qureg, int target, qreal angle);
  * upon the @p target qubit, where @f$ \hat{\sigma}_z @f$ is the Pauli Z matrix.
  *
  * @equivalences
+ * 
  * - This function is entirely equivalent to calling applyPauliGadget() with a single-site PauliStr.
  *   ```
      applyPauliGadget(qureg, getInlinePauliStr("Z", {target}), angle);
@@ -1528,6 +1625,7 @@ void applyMultiStateControlledRotateZ(Qureg qureg, int* controls, int* states, i
  * @f]
  *
  * @equivalences
+ * 
  * - Assuming @f$ \| \vec{n} \|_2 \ne 0 @f$, this function is agnostic to the normalisation
  *   of the axis vector.
  *   ```
@@ -1653,6 +1751,7 @@ extern "C" {
 /** @notyetdoced
  * 
  * @formulae
+ * 
  * Let @f$ \hat{\sigma} = @f$ @p str and @f$ \theta = @f$ @p angle. 
  * 
  * This function effects unitary
@@ -1663,6 +1762,7 @@ extern "C" {
  * Pauli. As such, this effects a multi-qubit rotation around an arbitrary Pauli string.
  * 
  * @equivalences
+ * 
  * - Because @f$ R_{\hat{\sigma}}(\theta) @f$ satisfies
  *   @f[
         R_{\hat{\sigma}}(\theta) \equiv 
@@ -1790,6 +1890,7 @@ extern "C" {
  * @f]
  *
  * @equivalences
+ * 
  * - This function is equivalent to calling applyPauliGadget() with a PauliStr containing only @f$ \hat{Z} @f$ and @f$ \id @f$.
  *   This latter function will actually automatically invoke applyPhaseGadget() which has an optimised implementation.
  * - This function is equivalent to, albeit much faster than, preparing a DiagMatr with @f$ \pm 1 @f$ elements (depending upon
@@ -1833,6 +1934,7 @@ void applyPhaseFlip(Qureg qureg, int target);
  * upon the @p target qubit.
  * 
  * @equivalences
+ * 
  * - This function is equivalent to, albeit much faster than, a Z-axis rotation with
  *   an adjustment to the global phase (which is redundant upon density matrices).
  *   @f[
@@ -1885,6 +1987,7 @@ digraph {
  * @enddot
  *
  * @equivalences
+ * 
  * - The target qubits are interchangeable, ergo
  *   ```
      applyTwoQubitPhaseFlip(qureg, target1, target2);
@@ -1945,6 +2048,7 @@ digraph {
  * @enddot
  *
  * @equivalences
+ * 
  * - The target qubits are interchangeable, ergo
  *   ```
      applyTwoQubitPhaseShift(qureg, target1, target2, angle);
@@ -1971,6 +2075,7 @@ void applyTwoQubitPhaseShift(Qureg qureg, int target1, int target2, qreal angle)
  * effected upon the target qubits.
  * 
  * @equivalences
+ * 
  * - The ordering of @p targets has no affect on the effected operation.
  * - This function is entirely equivalent to a multi-controlled Pauli-Z unitary (or a hypothetical
  *   many-controlled variant of applyPhaseFlip()) with all but one arbitrary target qubit becoming
@@ -2025,6 +2130,7 @@ digraph {
  * @enddot
  *
  * @equivalences
+ * 
  * - The ordering of @p targets has no affect on the effected operation.
  * - This function is equivalent to a multi-controlled variant of applyPhaseShift(), treating all
  *   but one arbitrary target qubit as control qubits.
diff --git a/quest/src/api/calculations.cpp b/quest/src/api/calculations.cpp
index 3c6213f20..25958c18d 100644
--- a/quest/src/api/calculations.cpp
+++ b/quest/src/api/calculations.cpp
@@ -338,14 +338,14 @@ qreal calcDistance(Qureg quregA, Qureg quregB) {
 
     // Hilbert-Schmidt = sqrt( Tr((A-B)(A-B)^dagger) = sqrt(sum_ij |A_ij - B_ij|^2)
     if (isDensA && isDensB) {
-        qreal dif = localiser_densmatr_calcHilbertSchmidtDistance(quregA, quregB);
+        qreal dif = localiser_densmatr_calcHilbertSchmidtDistance(quregA, quregB); // >= 0
         return std::sqrt(dif);
     }
 
     // Bures = sqrt(2 - 2 |<A|B>|) (even when unnormalised)
     if (!isDensA && !isDensB) {
         qcomp prod = localiser_statevec_calcInnerProduct(quregA, quregB);
-        qreal mag = std::abs(prod);
+        qreal mag = std::abs(prod); // >= 0
 
         validate_buresDistanceInnerProdIsNormalised(mag, __func__);
         mag = (mag > 1)? 1 : mag; // forgive eps error to avoid complex
diff --git a/utils/docs/Doxyfile b/utils/docs/Doxyfile
index 1d1086fe3..6efe89d70 100644
--- a/utils/docs/Doxyfile
+++ b/utils/docs/Doxyfile
@@ -310,6 +310,7 @@ ALIASES += "constraints=@par Constraints"
 ALIASES += "formulae=@par Formulae"
 ALIASES += "diagram=@par Diagram"
 ALIASES += "validationerror=error"
+ALIASES += "segfault=seg-fault"
 
 # We are temporarily hiding the @cppvectoroverload functions since they differ
 # trivially from the language-agnostic functions (ptr,len vs vector) yet clutter
diff --git a/utils/docs/latex/commands.tex b/utils/docs/latex/commands.tex
index dfd5da5cd..289cd2059 100644
--- a/utils/docs/latex/commands.tex
+++ b/utils/docs/latex/commands.tex
@@ -8,8 +8,10 @@
 \newcommand{\pauliz}{\hat{Z}}
 \newcommand{\ket}[1]{|#1\rangle}
 \newcommand{\bra}[1]{\langle#1|}
-\newcommand{\re}[1]{\text{Re}(#1)}
-\newcommand{\im}[1]{\text{Im}(#1)}
+\newcommand{\braket}[2]{\langle#1|#2\rangle}
+\newcommand{\ketbra}[2]{|#1\rangle\langle#2|}
+\newcommand{\re}[1]{\text{Re}\left(#1\right)}
+\newcommand{\im}[1]{\text{Im}\left(#1\right)}
 \newcommand{\tr}[1]{\text{Tr}\left(#1\right)}
 \newcommand{\svpsi}{\ket{\psi}}
 \newcommand{\brapsi}{\bra{\psi}}

From 1d7abf75e8aceec3ea733a3e08b17093d1aada96 Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Wed, 8 Oct 2025 21:47:39 -0400
Subject: [PATCH 30/32] refactored all macros to config options (#685)

All user-configurable macros utilised by the source code (e.g. `COMPILE_MPI`) are now CMake options, passed to the source only via preparation of the `config.h` header. This centralises them, reduces the myriad of arguments to the compiler command (which made verbose debugging cumbersome), makes erroneous overriding of macros more difficult (if not impossible), and logs the macro choices when installing QuEST.

We also took the chance to clean up the main CMakeLists.txt, defend against user-overriding of pre-set macros, and automate setting the QuEST version macros from the CMake build. Finally, we patched an issue when installing QuEST via FetchContent and/or inside a subdirectory (such as a git submodule).

Tyson refactored options and Oliver patched the install issues.

---------

Co-authored-by: Oliver Thomson Brown <otbrown@users.noreply.github.com>
---
 .github/workflows/compile.yml                 |   1 +
 .github/workflows/test_free.yml               |   1 +
 CMakeLists.txt                                | 249 ++++++++++--------
 docs/cmake.md                                 |   7 +-
 docs/compile.md                               |   3 +
 quest/include/CMakeLists.txt                  |   9 +-
 quest/include/config.h.in                     | 167 +++++++++++-
 quest/include/modes.h                         |  75 +-----
 quest/include/precision.h                     |  15 +-
 quest/include/quest.h                         |  14 +-
 quest/include/types.h                         |  28 +-
 quest/include/version.h                       |  18 --
 quest/src/api/matrices.cpp                    |   3 +-
 quest/src/api/qureg.cpp                       |   1 +
 quest/src/comm/comm_config.cpp                |   2 +-
 quest/src/comm/comm_routines.cpp              |   1 +
 quest/src/core/parser.cpp                     |   1 +
 quest/src/core/printer.cpp                    |   1 +
 quest/src/core/utilities.cpp                  |   1 +
 quest/src/core/utilities.hpp                  |   1 +
 quest/src/cpu/cpu_config.cpp                  |   2 +-
 quest/src/cpu/cpu_subroutines.cpp             |   1 -
 quest/src/gpu/gpu_config.cpp                  |   2 +-
 quest/src/gpu/gpu_config.hpp                  |   1 +
 quest/src/gpu/gpu_cuquantum.cuh               |   4 +-
 quest/src/gpu/gpu_kernels.cuh                 |   2 +-
 quest/src/gpu/gpu_subroutines.cpp             |   4 +-
 quest/src/gpu/gpu_thrust.cuh                  |   4 +-
 quest/src/gpu/gpu_types.cuh                   |   2 +-
 tests/deprecated/CMakeLists.txt               |   1 -
 tests/deprecated/test_calculations.cpp        |  10 +-
 tests/deprecated/test_data_structures.cpp     |   6 -
 tests/deprecated/test_decoherence.cpp         |   7 +-
 tests/deprecated/test_gates.cpp               |   6 -
 tests/deprecated/test_main.cpp                |   4 -
 tests/deprecated/test_operators.cpp           |   6 -
 .../deprecated/test_state_initialisations.cpp |   3 -
 tests/deprecated/test_unitaries.cpp           |   6 -
 tests/deprecated/test_utilities.cpp           |   3 -
 utils/scripts/compile.sh                      | 228 +++++++++++-----
 40 files changed, 518 insertions(+), 382 deletions(-)
 delete mode 100644 quest/include/version.h

diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml
index 64d59a59d..fa7ca4234 100644
--- a/.github/workflows/compile.yml
+++ b/.github/workflows/compile.yml
@@ -228,6 +228,7 @@ jobs:
           -DENABLE_TESTING=ON
           -DFLOAT_PRECISION=${{ matrix.precision }}
           -DENABLE_DEPRECATED_API=${{ matrix.deprecated }}
+          -DDISABLE_DEPRECATION_WARNINGS=${{ matrix.deprecated }}
           -DENABLE_MULTITHREADING=${{ matrix.omp }}
           -DENABLE_DISTRIBUTION=${{ matrix.mpi }}
           -DENABLE_CUDA=${{ matrix.cuda }}
diff --git a/.github/workflows/test_free.yml b/.github/workflows/test_free.yml
index 7efd540d3..e0837bfde 100644
--- a/.github/workflows/test_free.yml
+++ b/.github/workflows/test_free.yml
@@ -66,6 +66,7 @@ jobs:
           -DENABLE_TESTING=ON
           -DENABLE_MULTITHREADING=OFF
           -DENABLE_DEPRECATED_API=${{ matrix.version == 3 && 'ON' || 'OFF' }}
+          -DDISABLE_DEPRECATION_WARNINGS=${{ matrix.version == 3 && 'ON' || 'OFF' }}
           -DFLOAT_PRECISION=${{ matrix.precision }}
 
       # force 'Release' build (needed by MSVC to enable optimisations)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ee91a89d6..c97a84497 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 # @author Oliver Thomson Brown
 # @author Erich Essmann (patches including MSVC support)
-# @author Tyson Jones (tidying + patches including clang multithreading)
+# @author Tyson Jones (centralising macros, doc, patches including clang multithreading)
 # @author Luc Jaulmes (NUMA awareness, patching install)
 #
 # Contributions to previous builds from:
@@ -54,7 +54,7 @@ endif()
 
 
 # ============================
-# Declare options
+# Obtain options
 # ============================
 
 
@@ -90,21 +90,6 @@ message(STATUS "Library will be named lib${LIB_NAME}. Set LIB_NAME to modify.")
 option(VERBOSE_LIB_NAME "Modify library name based on compilation configuration. Turned OFF by default." OFF)
 message(STATUS "Verbose library naming is turned ${VERBOSE_LIB_NAME}. Set VERBOSE_LIB_NAME to modify.")
 
-if (VERBOSE_LIB_NAME)
-  # Same headers will be used for several verbosely-named libraries
-  set(MULTI_LIB_HEADERS 1)
-  function(compile_option VAR VALUE)
-    target_compile_definitions(QuEST PUBLIC ${VAR}=${VALUE})
-  endfunction()
-else()
-  # Headers will be used for a single library with a single valid configuration
-  set(MULTI_LIB_HEADERS 0)
-  function(compile_option VAR VALUE)
-    target_compile_definitions(QuEST PRIVATE ${VAR}=${VALUE})
-    set(${VAR} ${VALUE} PARENT_SCOPE)
-  endfunction()
-endif()
-
 
 # Precision
 set(FLOAT_PRECISION 2 
@@ -119,10 +104,6 @@ set_property(CACHE FLOAT_PRECISION PROPERTY STRINGS
 )
 message(STATUS "Precision set to ${FLOAT_PRECISION}. Set FLOAT_PRECISION to modify.")
 
-if (VERBOSE_LIB_NAME)
-  string(CONCAT LIB_NAME ${LIB_NAME} "-fp${FLOAT_PRECISION}")
-endif()
-
 
 # Examples
 option(
@@ -174,14 +155,12 @@ option(
 )
 message(STATUS "NVIDIA GPU acceleration is turned ${ENABLE_CUDA}. Set ENABLE_CUDA to modify.")
 
-if (ENABLE_CUDA)
-  option(
-    ENABLE_CUQUANTUM
-    "Whether QuEST will be built with support for NVIDIA CuQuantum. Turned OFF by default."
-    OFF
-  )
-  message(STATUS "CuQuantum support is turned ${ENABLE_CUQUANTUM}. Set ENABLE_CUQUANTUM to modify.")
-endif()
+option(
+  ENABLE_CUQUANTUM
+  "Whether QuEST will be built with support for NVIDIA cuQuantum. Turned OFF by default."
+  OFF
+)
+message(STATUS "CuQuantum support is turned ${ENABLE_CUQUANTUM}. Set ENABLE_CUQUANTUM to modify.")
 
 option(
   ENABLE_HIP
@@ -190,33 +169,95 @@ option(
 )
 message(STATUS "AMD GPU acceleration is turned ${ENABLE_HIP}. Set ENABLE_HIP to modify.")
 
-# Throw on disallowed GPU combinations
+
+# Deprecated API
+option(
+  ENABLE_DEPRECATED_API
+  "Whether QuEST will be built with deprecated API support. Turned OFF by default."
+  OFF
+)
+message(STATUS "Deprecated API support is turned ${ENABLE_DEPRECATED_API}. Set ENABLE_DEPRECATED_API to modify.")
+
+option(
+  DISABLE_DEPRECATION_WARNINGS
+  "Whether to disable compile-time warnings ordinarily triggered by use of the deprecated API. Turned OFF by default."
+  OFF
+)
+message(STATUS "Disabling of deprecated API warnings is turned ${DISABLE_DEPRECATION_WARNINGS}. Set DISABLE_DEPRECATION_WARNINGS to modify.")
+
+
+
+# ============================
+# Validate options
+# ============================
+
+
 if (ENABLE_CUDA AND ENABLE_HIP)
   message(FATAL_ERROR "QuEST cannot support CUDA and HIP simultaneously.")
 endif()
 
+
 if ((ENABLE_CUDA OR ENABLE_HIP) AND FLOAT_PRECISION STREQUAL 4)
   message(FATAL_ERROR "Quad precision is not supported on GPU. Please disable GPU acceleration or lower precision.")
 endif()
 
 
-# Deprecated API
-option(
-  ENABLE_DEPRECATED_API
-  "Whether QuEST will be built with deprecated API support. Turned OFF by default."
-  OFF
-)
-message(STATUS "Deprecated API support is turned ${ENABLE_DEPRECATED_API}. Set ENABLE_DEPRECATED_API to modify.")
+if (ENABLE_CUQUANTUM AND NOT ENABLE_CUDA)
+  message(FATAL_ERROR "Use of cuQuantum requires CUDA.")
+endif()
 
 
-# Windows Specific Options
 if(WIN32)
   
   # Force MSVC to export all symbols in a shared library, like GCC and clang
   set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
-  if (ENABLE_TESTING)
+
+  if (ENABLE_TESTING AND BUILD_SHARED_LIBS)
+    message(WARNING "Compiling the tests on Windows requires BUILD_SHARED_LIBS=OFF which we now force.")
     set(BUILD_SHARED_LIBS OFF)
   endif()
+
+  if (ENABLE_DEPRECATED_API)
+    message(FATAL_ERROR "The deprecated API is not compatible with MSVC.")
+  endif()
+
+endif()
+
+
+
+# ============================
+# Extend verbose library name
+# ============================
+
+
+if (VERBOSE_LIB_NAME)
+
+  string(CONCAT LIB_NAME ${LIB_NAME} "-fp${FLOAT_PRECISION}")
+
+  if (ENABLE_MULTITHREADING)
+    string(CONCAT LIB_NAME ${LIB_NAME} "+mt")
+  endif()
+
+  if (ENABLE_DISTRIBUTION)
+    string(CONCAT LIB_NAME ${LIB_NAME} "+mpi")
+  endif()
+
+  if (ENABLE_CUDA)
+    string(CONCAT LIB_NAME ${LIB_NAME} "+cuda")
+  endif()
+
+  if (ENABLE_HIP)
+    string(CONCAT LIB_NAME ${LIB_NAME} "+hip")
+  endif()
+
+  if (ENABLE_CUQUANTUM)
+    string(CONCAT LIB_NAME ${LIB_NAME} "+cuquantum")
+  endif()
+
+  if (ENABLE_DEPRECATED_API)
+    string(CONCAT LIB_NAME ${LIB_NAME} "+depr")
+  endif()
+
 endif()
 
 
@@ -234,15 +275,15 @@ add_library(QuEST::QuEST ALIAS QuEST)
 
 # Set include directories
 target_include_directories(QuEST
-        PUBLIC
-        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/quest/include>
-        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
-        $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>
-        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
+  PUBLIC
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/quest/include>
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}>
+  $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
 )
 set_target_properties(QuEST PROPERTIES
-        VERSION     ${PROJECT_VERSION}
-        SOVERSION   ${PROJECT_VERSION_MAJOR}
+  VERSION     ${PROJECT_VERSION}
+  SOVERSION   ${PROJECT_VERSION_MAJOR}
 )
 
 
@@ -277,13 +318,10 @@ target_compile_options(QuEST
 
 
 # ============================
-# Pass options to library
+# Link optional dependencies
 # ============================
 
 
-compile_option(FLOAT_PRECISION ${FLOAT_PRECISION})
-
-
 # OpenMP
 if (ENABLE_MULTITHREADING)
 
@@ -299,42 +337,44 @@ if (ENABLE_MULTITHREADING)
     message(FATAL_ERROR ${ErrorMsg})
   endif()
 
-  compile_option(COMPILE_OPENMP 1)
   target_link_libraries(QuEST
     PRIVATE
     OpenMP::OpenMP_CXX
     OpenMP::OpenMP_C
   )
 
+else()
+
+  # suppress GCC "unknown pragma" warning when OpenMP disabled
+  if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+    target_compile_options(QuEST PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-Wno-unknown-pragmas>)
+  endif()
+  
+endif()
+
+
+# NUMA (only relevant when multithreading)
+if (ENABLE_MULTITHREADING)
+
   # Find NUMA - location of NUMA headers
   if (WIN32)
-    compile_option(NUMA_AWARE 0)
+    set(NUMA_AWARE 0)
     message(WARNING "Building on Windows, QuEST will not be aware of numa locality")
   else()
     include(FindPkgConfig)
     pkg_search_module(NUMA numa IMPORTED_TARGET GLOBAL)
     if (${NUMA_FOUND})
-      compile_option(NUMA_AWARE ${NUMA_FOUND})
+      set(NUMA_AWARE ${NUMA_FOUND})
       target_link_libraries(QuEST PRIVATE PkgConfig::NUMA)
       message(STATUS "NUMA awareness is enabled.")
     else()
-      compile_option(NUMA_AWARE 0)
+      set(NUMA_AWARE 0)
       message(WARNING "libnuma not found, QuEST will not be aware of numa locality")
     endif()
   endif()
 
-  if (VERBOSE_LIB_NAME)
-    string(CONCAT LIB_NAME ${LIB_NAME} "+mt")
-  endif()
-
 else()
-
-  # suppress GCC "unknown pragma" warning when OpenMP disabled
-  if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-    target_compile_options(QuEST PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-Wno-unknown-pragmas>)
-  endif()
-  
-  compile_option(COMPILE_OPENMP 0)
+  set(NUMA_AWARE 0)
 endif()
 
 
@@ -343,16 +383,10 @@ if (ENABLE_DISTRIBUTION)
   find_package(MPI REQUIRED
     COMPONENTS CXX
   )
-  compile_option(COMPILE_MPI 1)
   target_link_libraries(QuEST
     PRIVATE
     MPI::MPI_CXX
   )
-  if (VERBOSE_LIB_NAME)
-    string(CONCAT LIB_NAME ${LIB_NAME} "+mpi")
-  endif()
-else()
-  compile_option(COMPILE_MPI 0)
 endif()
 
 
@@ -366,32 +400,9 @@ if (ENABLE_CUDA)
 
   enable_language(CUDA)
   set(CMAKE_CUDA_STANDARD_REQUIRED ON)
-  
-  set_property(TARGET QuEST PROPERTY CUDA_STANDARD 20)
-  
   set(CUDA_PROPAGATE_HOST_FLAGS OFF)
   
-  if (VERBOSE_LIB_NAME)
-    string(CONCAT LIB_NAME ${LIB_NAME} "+cuda")
-  endif()
-
-  # beware that compile_option(COMPILE_CUDA) is deferred to below because
-  # it is triggered by both/either ENABLE_CUDA and ENABLE_HIP
-
-endif()
-
-
-# cuQuantum
-if (ENABLE_CUQUANTUM)
-  find_package(CUQUANTUM REQUIRED)
-  compile_option(COMPILE_CUQUANTUM 1)
-  target_link_libraries(QuEST PRIVATE CUQUANTUM::cuStateVec)
-  set(CMAKE_INSTALL_RPATH_USE_LINK_PATH ON)
-  if (VERBOSE_LIB_NAME)
-    string(CONCAT LIB_NAME ${LIB_NAME} "+cuquantum")
-  endif()
-else()
-  compile_option(COMPILE_CUQUANTUM 0)
+  set_property(TARGET QuEST PROPERTY CUDA_STANDARD 20)
 endif()
 
 
@@ -413,36 +424,50 @@ if (ENABLE_HIP)
   find_package(HIP REQUIRED)
   message(STATUS "Found HIP: " ${HIP_VERSION})
 
-  compile_option(COMPILE_CUQUANTUM 0)
   target_link_libraries(QuEST PRIVATE hip::host)
 
-  if (VERBOSE_LIB_NAME)
-    string(CONCAT LIB_NAME ${LIB_NAME} "+hip")
-  endif()
 endif()
 
 
-# set COMPILE_CUDA
-if (ENABLE_CUDA OR ENABLE_HIP)
-  compile_option(COMPILE_CUDA 1)
-else()
-  compile_option(COMPILE_CUDA 0)
+# cuQuantum
+if (ENABLE_CUQUANTUM)
+  find_package(CUQUANTUM REQUIRED)
+  target_link_libraries(QuEST PRIVATE CUQUANTUM::cuStateVec)
+  set(CMAKE_INSTALL_RPATH_USE_LINK_PATH ON)
 endif()
 
 
-# v3 API
-if (ENABLE_DEPRECATED_API)
-  target_compile_definitions(QuEST PRIVATE INCLUDE_DEPRECATED_FUNCTIONS=1)
 
-  if (VERBOSE_LIB_NAME)
-    string(CONCAT LIB_NAME ${LIB_NAME} "+depr")
-  endif()
+# ===============================
+# Set options to save in config.h
+# ===============================
 
+
+# set vars which will be written to config.h.in (auto-converted to 0 or 1)
+set(COMPILE_OPENMP ${ENABLE_MULTITHREADING})
+set(COMPILE_MPI ${ENABLE_DISTRIBUTION})
+set(COMPILE_CUQUANTUM ${ENABLE_CUQUANTUM})
+set(INCLUDE_DEPRECATED_FUNCTIONS ${ENABLE_DEPRECATED_API})
+
+
+# (for the love of God cmake, create a concise syntax for this)
+if (ENABLE_CUDA OR ENABLE_HIP)
+  set(COMPILE_CUDA 1)
 else()
-  target_compile_definitions(QuEST PRIVATE INCLUDE_DEPRECATED_FUNCTIONS=0)
+  set(COMPILE_CUDA 0)
 endif()
 
 
+# these vars are already set, but repeated here for clarity
+set(FLOAT_PRECISION ${FLOAT_PRECISION})
+set(NUMA_AWARE ${NUMA_AWARE})
+set(DISABLE_DEPRECATION_WARNINGS ${DISABLE_DEPRECATION_WARNINGS})
+
+
+# these do not appear in src but are saved for record-keeping in config.h.in
+set(COMPILE_HIP ${ENABLE_HIP})
+
+
 
 # ============================
 # Patch CPU performance
@@ -690,7 +715,7 @@ install(FILES
 )
 
 install(FILES
-  "${CMAKE_CURRENT_BINARY_DIR}/include/quest/include/config.h"
+  "${CMAKE_CURRENT_BINARY_DIR}/quest/include/config.h"
   DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/quest/include"
 )
 
diff --git a/docs/cmake.md b/docs/cmake.md
index 336dcf28c..9240b6793 100644
--- a/docs/cmake.md
+++ b/docs/cmake.md
@@ -8,13 +8,13 @@
   @author Tyson Jones (test variables)
 -->
 
-Version 4 of QuEST includes reworked CMake to support library builds, CMake export, and installation. Here we detail useful variables to configure the compilation of QuEST. If using a Unix-like operating system any of these variables can be set using the `-D` flag when invoking CMake, for example:
+Version 4 of QuEST includes reworked CMake to support library builds, CMake export, and installation. Here we detail useful variables to configure the compilation of QuEST. If using a Unix-like operating system, any of these variables can be set using the `-D` flag when invoking CMake, for example:
 
 ```
 cmake -Bbuild -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/opt/QuEST -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DENABLE_MULTITHREADING=ON -DENABLE_DISTRIBUTION=OFF ./
 ```
 
-Then one need only move to the build directory, and invoke make:
+Then, as detailed in [`compile.md`](compile.md), one need only move to the build directory and compile by invoking make:
 
 ```
 cd build
@@ -42,10 +42,13 @@ make
 | `ENABLE_CUQUANTUM` | (`OFF`), `ON` | Determines whether QuEST will make use of the NVIDIA CuQuantum library. Cannot be turned on if `ENABLE_CUDA` is off. |
 | `ENABLE_HIP` | (`OFF`), `ON` | Determines whether QuEST will be built with support for AMD GPU acceleration. If turned on, `CMAKE_HIP_ARCHITECTURES` should probably also be set. |
 | `ENABLE_DEPRECATED_API` | (`OFF`), `ON` | Determines whether QuEST will be built with support for the deprecated (v3) API. ***Note**: this will generate compiler warnings and is not supported by MSVC.* |
+| `DISABLE_DEPRECATION_WARNINGS` | (`OFF`), `ON` | Whether to disable the compile-time deprecation warnings when using the deprecated (v3) API. |
 | `USER_SOURCE` | (Undefined), String | The source file for a user program which will be compiled alongside QuEST. `OUTPUT_EXE` *must* also be defined. |
 | `OUTPUT_EXE` | (Undefined), String | The name of the executable which will be created from the provided `USER_SOURCE`. `USER_SOURCE` *must* also be defined. |
 
 
+
+
 --------------------------
 
 ## Test variables
diff --git a/docs/compile.md b/docs/compile.md
index 37fc6ad23..f11677fbf 100644
--- a/docs/compile.md
+++ b/docs/compile.md
@@ -55,6 +55,9 @@ Compiling is configured with variables supplied by the [`-D` flag](https://cmake
 > [!TIP]
 > QuEST's [Github Actions](https://github.com/QuEST-Kit/QuEST/actions/workflows/compile.yml) regularly test QuEST compilation using a broad combination of deployment settings; presently `108` combinations! The [`compile.yml`](/.github/workflows/compile.yml) workflow can serve as a concrete example of how to compile QuEST in a sanitised, virtual setting.
 
+> [!NOTE]
+> Afraid of CMake? See [`compile.sh`](/utils/scripts/compile.sh) for manual compilation (which we discourage!)
+
 
 ------------------
 
diff --git a/quest/include/CMakeLists.txt b/quest/include/CMakeLists.txt
index bf0d1cd09..2ab3d569d 100644
--- a/quest/include/CMakeLists.txt
+++ b/quest/include/CMakeLists.txt
@@ -1,5 +1,12 @@
 # @author Oliver Thomson Brown
 # @author Erich Essmann
 # @author Luc Jaulmes (using config file)
+# @author Tyson Jones (doc)
 
-configure_file(config.h.in "${CMAKE_BINARY_DIR}/include/quest/include/config.h" @ONLY)
+# Generate a header file which defines all configurable preprocessor
+# macros needed by the QuEST source (e.g. COMPILE_MPI), as informed by
+# the user-set CMake options. This permits us to avoid passing any macros
+# through compiler flags and the associated conflicts arising when
+# installing QuEST. Note that config.h must be manually created when
+# not compiling via CMake, e.g. when using a custom build script.
+configure_file(config.h.in config.h @ONLY)
diff --git a/quest/include/config.h.in b/quest/include/config.h.in
index 8326259b6..2cb12fa90 100644
--- a/quest/include/config.h.in
+++ b/quest/include/config.h.in
@@ -1,22 +1,169 @@
+/** @file
+ * The input file to CMake's configure_file() which produces
+ * the config.h header, which in-turn defines macros needed by 
+ * the QuEST source, as informed by user-specified CMake options.
+ * 
+ * Use of this configured header file enables all macros to be 
+ * defined in one central place (right here) rather than being 
+ * passed to each source file as compiler flags. It further
+ * ensures that when QuEST is installed, critical user-facing
+ * macros such as FLOAT_PRECISION cannot ever be changed from
+ * their value during source compilation. Finally, it enables
+ * users to access macros such as COMPILE_OPENMP at pre-build
+ * time of their own source code, which could prove necessary 
+ * when interfacing with external libraries.
+ * 
+ * Use of this config file however means that manual compilation
+ * (through GNUMake or a manual script) requires generating the
+ * config.h file manually.
+ * 
+ * @author Luc Jaulmes
+ * @author Oliver Brown
+ * @author Tyson Jones (centralising macros, validation, doc)
+ */
+
 #ifndef CONFIG_H
 #define CONFIG_H
 
-// be warned, the below is sensitive to whitespace after the slash
-#if !defined(FLOAT_PRECISION)\
-    || !defined(COMPILE_MPI)\
-    || !defined(COMPILE_OPENMP)\
-    || !defined(COMPILE_CUDA)\
-    || !defined(COMPILE_CUQUANTUM)
 
-// bind compile settings to installed exec
-#if !@MULTI_LIB_HEADERS@
+
+/*
+ * check that the below CMake options are not being erroneously
+ * passed as macros (as was previously accepted), which would 
+ * anyway trigger a 'macro redefined' warning below.
+ */
+
+
+#if defined(FLOAT_PRECISION)              || \
+    defined(COMPILE_OPENMP)               || \
+    defined(COMPILE_MPI)                  || \
+    defined(COMPILE_CUDA)                 || \
+    defined(COMPILE_HIP)                  || \
+    defined(COMPILE_CUQUANTUM)            || \
+    defined(NUMA_AWARE)                   || \
+    defined(INCLUDE_DEPRECATED_FUNCTIONS) || \
+    defined(DISABLE_DEPRECATION_WARNINGS)
+
+    #error "Pre-config macros were erroneously passed directly to the source rather than through the CMake config file."
+
+#endif
+
+
+#if defined(QUEST_VERSION_MAJOR) || \
+    defined(QUEST_VERSION_MINOR) || \
+    defined(QUEST_VERSION_PATCH) || \
+    defined(QUEST_VERSION_STRING)
+
+    #error "QuEST version macros were erroneously passed directly to the source rather than through the CMake config file."
+
+#endif
+
+
+
+/*
+ * save the CMake option values as macros. 
+ *
+ * Note we use cmakedefine (rather than just define @VAL) so that 
+ * a macro is left entirely undefined if its corresponding CMake
+ * variable was not passed to this config file, which we can
+ * subsequently detect and report as an error below
+ */ 
+
+
+// crucial to user source (informs API)
 #cmakedefine FLOAT_PRECISION @FLOAT_PRECISION@
-#cmakedefine01 COMPILE_MPI
+#cmakedefine01 INCLUDE_DEPRECATED_FUNCTIONS
+#cmakedefine01 DISABLE_DEPRECATION_WARNINGS
+
+
+// crucial to QuEST source (informs external library usage)
 #cmakedefine01 COMPILE_OPENMP
+#cmakedefine01 COMPILE_MPI
 #cmakedefine01 COMPILE_CUDA
 #cmakedefine01 COMPILE_CUQUANTUM
+
+
+// not actually a CMake option (user cannot disable) but nonetheless crucial
+#cmakedefine01 NUMA_AWARE
+
+
+// not consulted by src (included for book-keeping)
+#cmakedefine01 COMPILE_HIP
+
+
+
+/*
+ * inherit the version information from CMake.
+ *
+ * Note we do not (and actually cannot) validate these using the 
+ * cmakedefine trick used above since they are always guaranteed 
+ * to be passed and substituted by CMake
+ */ 
+
+
+#define QUEST_VERSION_MAJOR @PROJECT_VERSION_MAJOR@
+#define QUEST_VERSION_MINOR @PROJECT_VERSION_MINOR@
+#define QUEST_VERSION_PATCH @PROJECT_VERSION_PATCH@
+#define QUEST_VERSION_STRING "@PROJECT_VERSION@"
+
+
+
+/*
+ * check that all above expected CMake options were
+ * actually passed (since use of 'cmakedefine' means
+ * they otherwise default to no macro definition)
+ */
+
+
+#if ! defined(FLOAT_PRECISION)              || \
+    ! defined(COMPILE_OPENMP)               || \
+    ! defined(COMPILE_MPI)                  || \
+    ! defined(COMPILE_CUDA)                 || \
+    ! defined(COMPILE_HIP)                  || \
+    ! defined(COMPILE_CUQUANTUM)            || \
+    ! defined(NUMA_AWARE)                   || \
+    ! defined(INCLUDE_DEPRECATED_FUNCTIONS) || \
+    ! defined(DISABLE_DEPRECATION_WARNINGS)
+
+    #error "Expected macros were not defined by the config.h header, possibly because their corresponding CMake variables were not substituted."
+
 #endif
 
+
+
+/*
+ * validate boolean macro definitions.
+ *
+ * Note that more specific checks (e.g. whether non-boolean
+ * macros have legal values, or whether combinations of
+ * multiple macro values are legal) are performed by the files
+ * that concern them. These checks merely validate the macro
+ * has the right 'type' so will not trip up other preprocessing
+ */
+
+
+#if ! (COMPILE_OPENMP               == 0 || COMPILE_OPENMP               == 1) || \
+    ! (COMPILE_MPI                  == 0 || COMPILE_MPI                  == 1) || \
+    ! (COMPILE_CUDA                 == 0 || COMPILE_CUDA                 == 1) || \
+    ! (COMPILE_HIP                  == 0 || COMPILE_HIP                  == 1) || \
+    ! (COMPILE_CUQUANTUM            == 0 || COMPILE_CUQUANTUM            == 1) || \
+    ! (NUMA_AWARE                   == 0 || NUMA_AWARE                   == 1) || \
+    ! (INCLUDE_DEPRECATED_FUNCTIONS == 0 || INCLUDE_DEPRECATED_FUNCTIONS == 1) || \
+    ! (DISABLE_DEPRECATION_WARNINGS == 0 || DISABLE_DEPRECATION_WARNINGS == 1)
+
+    #error "A macro defined by the config.h header (as inferred from a CMake variable) had an illegal value."
+
+#endif
+
+
+// ensure __cplusplus macro is valid (API headers use #ifdef, not #if)
+
+#ifdef __cplusplus
+#if !__cplusplus
+#error "Preprocessor __cplusplus was 0 and should instead be undefined"
 #endif
+#endif
+
+
 
-#endif
\ No newline at end of file
+#endif // CONFIG_H
\ No newline at end of file
diff --git a/quest/include/modes.h b/quest/include/modes.h
index 2bb608be9..f8fc52a1c 100644
--- a/quest/include/modes.h
+++ b/quest/include/modes.h
@@ -1,12 +1,12 @@
 /** @file
- * Compile-time checks that all expected
- * preprocessor macros are defined and valid 
+ * Constants related to configuring QuEST runtime modes,
+ * and documentation of environment variables
  * 
  * @author Tyson Jones
  * 
  * @defgroup modes Modes
  * @ingroup api
- * @brief Macros for controlling QuEST compilation.
+ * @brief Constants and environment variables for controlling QuEST execution.
  * @{
  */
 
@@ -15,75 +15,6 @@
 
 
 
-// ensure all mode flags are valid values
-// undefined allowed as undefined == 0 in C/C++ standards
-
-#if ! (COMPILE_MPI == 0 || COMPILE_MPI == 1)
-    #error "Macro COMPILE_MPI must have value 0 or 1"
-#endif
-
-#if ! (COMPILE_OPENMP == 0 || COMPILE_OPENMP == 1)
-    #error "Macro COMPILE_OPENMP must have value 0 or 1"
-#endif
-
-#if ! (COMPILE_CUDA == 0 || COMPILE_CUDA == 1)
-    #error "Macro COMPILE_CUDA must have value 0 or 1"
-#endif
-
-#if ! (COMPILE_CUQUANTUM == 0 || COMPILE_CUQUANTUM == 1)
-    #error "Macro COMPILE_CUQUANTUM must have value 0 or 1"
-#endif
-
-
-
-// ensure mode flags are compatible
-
-#if COMPILE_CUQUANTUM && ! COMPILE_CUDA
-    #error "Cannot enable cuQuantum without simultaneously enabling GPU-acceleration"
-#endif
-
-
-
-// ensure C++ macro is valid (API headers use #ifdef, not #if)
-
-#ifdef __cplusplus
-#if !__cplusplus
-#error "Preprocessor __cplusplus was 0 and should instead be undefined"
-#endif
-#endif
-
-
-
-// define optional-macro defaults (mostly to list them)
-
-#ifndef INCLUDE_DEPRECATED_FUNCTIONS
-#define INCLUDE_DEPRECATED_FUNCTIONS 0
-#endif
-
-#ifndef DISABLE_DEPRECATION_WARNINGS
-#define DISABLE_DEPRECATION_WARNINGS 0
-#endif
-
-// further macros are defined in precision.h
-
-// spoofing above macro as consts to doc
-#if 0
-
-
-    /// @notyetdoced
-    /// @macrodoc
-    const int INCLUDE_DEPRECATED_FUNCTIONS = 0;
-
-
-    /// @notyetdoced
-    /// @macrodoc
-    const int DISABLE_DEPRECATION_WARNINGS = 0;
-
-
-#endif
-
-
-
 // document environment variables
 
 // spoof env-vars as consts to doc (hackily and hopefully temporarily)
diff --git a/quest/include/precision.h b/quest/include/precision.h
index f7a18e416..d37b9a2d3 100644
--- a/quest/include/precision.h
+++ b/quest/include/precision.h
@@ -1,6 +1,6 @@
 /** @file
- * User-overridable numerical precision of
- * both the QuEST API and backends
+ * The precision of QuEST's numerical types, some of which 
+ * are overridable and others of which are intendedly fixed.
  * 
  * @author Tyson Jones
  * @author Milos Prokop (patched trig overloads in v3)
@@ -14,7 +14,7 @@
 #ifndef PRECISION_H
 #define PRECISION_H
 
-#include "quest/include/modes.h"
+#include "quest/include/config.h"
 
 
 
@@ -76,11 +76,6 @@
  * RE-CONFIGURABLE FLOATING-POINT PRECISION
  */
 
-// assume double precision as default
-#ifndef FLOAT_PRECISION
-    #define FLOAT_PRECISION 2
-#endif
-
 // validate precision is 1 (float), 2 (double) or 4 (long double)
 #if ! (FLOAT_PRECISION == 1 || FLOAT_PRECISION == 2 || FLOAT_PRECISION == 4)
     #error "FLOAT_PRECISION must be 1 (float), 2 (double) or 4 (long double)"
@@ -100,10 +95,14 @@
 
     /// @notyetdoced
     /// @macrodoc
+    ///
+    /// (note this macro is informed by the FLOAT_PRECISION CMake variable)
     const int FLOAT_PRECISION = 2;
 
     /// @notyetdoced
     /// @macrodoc
+    ///
+    /// (note this macro is informed by the FLOAT_PRECISION CMake variable)
     typedef double int FLOAT_TYPE;
 
 #endif
diff --git a/quest/include/quest.h b/quest/include/quest.h
index c0a30ed1f..409253ff8 100644
--- a/quest/include/quest.h
+++ b/quest/include/quest.h
@@ -2,9 +2,6 @@
  * The main QuEST header, exposing the entire API.
  * This header is intendedly included by user
  * source-code, and is both C11 and C++14 compatible.
- * Preprocessor 'INCLUDE_DEPRECATED_FUNCTIONS' can
- * be defined as 1 to additionally include QuEST's
- * deprecated v3 API, before including this header.
  * 
  * @author Tyson Jones
  * @author Luc Jaulmes (patching CMake install)
@@ -30,18 +27,11 @@
 #ifndef QUEST_H
 #define QUEST_H
 
-
-// include version first so it is accessible to 
-// debuggers in case a subsequent include fails
-#include "quest/include/version.h"
-
+// include config.h first to define macros
+// consulted by subsequent headers
 #include "quest/include/config.h"
 
-// include before API headers since it validates
-// preprocessor configuration, and affirms macro
-// preconditions assumed by subsequent header
 #include "quest/include/modes.h"
-
 #include "quest/include/precision.h"
 #include "quest/include/types.h"
 #include "quest/include/calculations.h"
diff --git a/quest/include/types.h b/quest/include/types.h
index c006b02cd..066d35e9d 100644
--- a/quest/include/types.h
+++ b/quest/include/types.h
@@ -19,7 +19,7 @@
 #ifndef TYPES_H
 #define TYPES_H
 
-#include "quest/include/modes.h"
+#include "quest/include/config.h"
 #include "quest/include/precision.h"
 
 
@@ -157,26 +157,10 @@ static inline qcomp getQcomp(qreal re, qreal im) {
     // Furthermore, the user might do arithmetic on complex literals which are
     // not the same precision as qcomp, so compilation will fail depending
     // on the setting of PRECISION. To avoid this, we'll define overloads
-    // between all type/precision permutations, always returning qcomp.
-    // Via the unholy macros below, we create 312 overloads; since this will
-    // no doubt break somebody's build/integration, users can disable this
-    // attempt at precision-agnostic arithmetic via DEFINE_ARITHMETIC_OVERLOADS=0
-
-    #ifndef DEFINE_ARITHMETIC_OVERLOADS
-    #define DEFINE_ARITHMETIC_OVERLOADS 1
-    #endif
-
-    // spoofing above macro as const to doc
-    #if 0
-
-        /// @notyetdoced
-        /// @macrodoc
-        const int DEFINE_ARITHMETIC_OVERLOADS = 1;
-
-    #endif
-
-
-    #if DEFINE_ARITHMETIC_OVERLOADS
+    // between all type/precision permutations, always returning qcomp. These
+    // overloads are also used by the QuEST source code. Via the unholy macros 
+    // below, we create 312 overloads; no doubt this is going to break something
+    // in the future, for which I am already sorry :'(
 
     /// @cond EXCLUDE_FROM_DOXYGEN
 
@@ -274,8 +258,6 @@ static inline qcomp getQcomp(qreal re, qreal im) {
 
     /// @endcond // EXCLUDE_FROM_DOXYGEN
 
-    #endif // DEFINE_ARITHMETIC_OVERLOADS
-
 #endif
 
 
diff --git a/quest/include/version.h b/quest/include/version.h
deleted file mode 100644
index bd3e99a50..000000000
--- a/quest/include/version.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/** @file
- * QuEST version information.
- * 
- * @author Tyson Jones
- * 
- * (no doxygen doc)
- */
-
-#ifndef VERSION_H
-#define VERSION_H
-
-// TODO: automate this from git somehow
-#define QUEST_VERSION_MAJOR 4
-#define QUEST_VERSION_MINOR 1
-#define QUEST_VERSION_PATCH 0
-#define QUEST_VERSION_STRING "4.1.0"
-
-#endif // VERSION_H
\ No newline at end of file
diff --git a/quest/src/api/matrices.cpp b/quest/src/api/matrices.cpp
index d1c118d17..b17987eb4 100644
--- a/quest/src/api/matrices.cpp
+++ b/quest/src/api/matrices.cpp
@@ -9,8 +9,9 @@
  */
 
 #include "quest/include/matrices.h"
-#include "quest/include/environment.h"
+#include "quest/include/modes.h"
 #include "quest/include/types.h"
+#include "quest/include/environment.h"
 
 #include "quest/src/core/validation.hpp"
 #include "quest/src/core/autodeployer.hpp"
diff --git a/quest/src/api/qureg.cpp b/quest/src/api/qureg.cpp
index 7d68528a1..fa7c73b05 100644
--- a/quest/src/api/qureg.cpp
+++ b/quest/src/api/qureg.cpp
@@ -6,6 +6,7 @@
  */
 
 #include "quest/include/qureg.h"
+#include "quest/include/modes.h"
 #include "quest/include/environment.h"
 #include "quest/include/initialisations.h"
 
diff --git a/quest/src/comm/comm_config.cpp b/quest/src/comm/comm_config.cpp
index ae8009ed2..854a12bd5 100644
--- a/quest/src/comm/comm_config.cpp
+++ b/quest/src/comm/comm_config.cpp
@@ -12,7 +12,7 @@
  * @author Tyson Jones
  */
 
-#include "quest/include/modes.h"
+#include "quest/include/config.h"
 #include "quest/include/types.h"
 
 #include "quest/src/comm/comm_config.hpp"
diff --git a/quest/src/comm/comm_routines.cpp b/quest/src/comm/comm_routines.cpp
index 6e161db18..19ebcb9f8 100644
--- a/quest/src/comm/comm_routines.cpp
+++ b/quest/src/comm/comm_routines.cpp
@@ -10,6 +10,7 @@
  * @author Ania (Anna) Brown (developed QuEST v1 logic)
  */
 
+#include "quest/include/config.h"
 #include "quest/include/types.h"
 #include "quest/include/qureg.h"
 #include "quest/include/matrices.h"
diff --git a/quest/src/core/parser.cpp b/quest/src/core/parser.cpp
index 8884acc4c..5448d3862 100644
--- a/quest/src/core/parser.cpp
+++ b/quest/src/core/parser.cpp
@@ -10,6 +10,7 @@
  * @author Tyson Jones
  */
 
+#include "quest/include/config.h"
 #include "quest/include/precision.h"
 #include "quest/include/types.h"
 #include "quest/include/paulis.h"
diff --git a/quest/src/core/printer.cpp b/quest/src/core/printer.cpp
index 3d2a4d9f0..e4d4cbc32 100644
--- a/quest/src/core/printer.cpp
+++ b/quest/src/core/printer.cpp
@@ -9,6 +9,7 @@
  * @author Erich Essmann (improved OS agnosticism, patched mem-leak)
  */
 
+#include "quest/include/config.h"
 #include "quest/include/qureg.h"
 #include "quest/include/types.h"
 #include "quest/include/matrices.h"
diff --git a/quest/src/core/utilities.cpp b/quest/src/core/utilities.cpp
index 4966d1916..999bd9a72 100644
--- a/quest/src/core/utilities.cpp
+++ b/quest/src/core/utilities.cpp
@@ -8,6 +8,7 @@
  * @author Luc Jaulmes (distributing ranges over blocks)
  */
 
+#include "quest/include/config.h"
 #include "quest/include/types.h"
 #include "quest/include/qureg.h"
 #include "quest/include/paulis.h"
diff --git a/quest/src/core/utilities.hpp b/quest/src/core/utilities.hpp
index cb2d8e713..4b7fb5db6 100644
--- a/quest/src/core/utilities.hpp
+++ b/quest/src/core/utilities.hpp
@@ -12,6 +12,7 @@
 #ifndef UTILITIES_HPP
 #define UTILITIES_HPP
 
+#include "quest/include/config.h"
 #include "quest/include/types.h"
 #include "quest/include/qureg.h"
 #include "quest/include/paulis.h"
diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp
index e488e6a9c..c11ec224d 100644
--- a/quest/src/cpu/cpu_config.cpp
+++ b/quest/src/cpu/cpu_config.cpp
@@ -6,7 +6,7 @@
  * @author Luc Jaulmes (NUMA awareness)
  */
 
-#include "quest/include/modes.h"
+#include "quest/include/config.h"
 #include "quest/include/types.h"
 #include "quest/include/paulis.h"
 
diff --git a/quest/src/cpu/cpu_subroutines.cpp b/quest/src/cpu/cpu_subroutines.cpp
index a853bc9be..13bf6eecf 100644
--- a/quest/src/cpu/cpu_subroutines.cpp
+++ b/quest/src/cpu/cpu_subroutines.cpp
@@ -21,7 +21,6 @@
  * @author Ania (Anna) Brown (developed QuEST v1 logic)
  */
 
-#include "quest/include/modes.h"
 #include "quest/include/types.h"
 #include "quest/include/qureg.h"
 #include "quest/include/paulis.h"
diff --git a/quest/src/gpu/gpu_config.cpp b/quest/src/gpu/gpu_config.cpp
index 87a1c5192..c7db834b7 100644
--- a/quest/src/gpu/gpu_config.cpp
+++ b/quest/src/gpu/gpu_config.cpp
@@ -5,7 +5,7 @@
  * @author Tyson Jones
  */
 
-#include "quest/include/modes.h"
+#include "quest/include/config.h"
 #include "quest/include/types.h"
 #include "quest/include/qureg.h"
 #include "quest/include/matrices.h"
diff --git a/quest/src/gpu/gpu_config.hpp b/quest/src/gpu/gpu_config.hpp
index 7517cb2a3..1b3be6295 100644
--- a/quest/src/gpu/gpu_config.hpp
+++ b/quest/src/gpu/gpu_config.hpp
@@ -12,6 +12,7 @@
 #ifndef GPU_CONFIG_HPP
 #define GPU_CONFIG_HPP
 
+#include "quest/include/config.h"
 #include "quest/include/types.h"
 #include "quest/include/qureg.h"
 #include "quest/include/matrices.h"
diff --git a/quest/src/gpu/gpu_cuquantum.cuh b/quest/src/gpu/gpu_cuquantum.cuh
index f627990fd..6ba321000 100644
--- a/quest/src/gpu/gpu_cuquantum.cuh
+++ b/quest/src/gpu/gpu_cuquantum.cuh
@@ -25,7 +25,9 @@
 #define GPU_CUQUANTUM_HPP
 
 
-// check preprocessors and compilers are valid before #includes to avoid compile errors
+// check preprocessors and compilers are valid before #includes to avoid 
+// compile errors (though we must still obtain the preprocessors from config.h)
+#include "quest/include/config.h"
 
 #if ! COMPILE_CUQUANTUM
     #error "A file being compiled somehow included gpu_cuquantum.hpp despite QuEST not being compiled in cuQuantum mode."
diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh
index 448073fe0..4f2a737e4 100644
--- a/quest/src/gpu/gpu_kernels.cuh
+++ b/quest/src/gpu/gpu_kernels.cuh
@@ -18,7 +18,7 @@
 #ifndef GPU_KERNELS_HPP
 #define GPU_KERNELS_HPP
 
-#include "quest/include/modes.h"
+#include "quest/include/config.h"
 #include "quest/include/types.h"
 
 #include "quest/src/core/bitwise.hpp"
diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp
index 034d87f48..5e18048f7 100644
--- a/quest/src/gpu/gpu_subroutines.cpp
+++ b/quest/src/gpu/gpu_subroutines.cpp
@@ -32,11 +32,13 @@
  * @author Tyson Jones
  */
 
+// obtain preprocessors from config.h prior to validation
+#include "quest/include/config.h"
+
 #if (COMPILE_CUQUANTUM && ! COMPILE_CUDA)
     #error "Cannot define COMPILE_CUQUANTUM=1 without simultaneously defining COMPILE_CUDA=1"
 #endif
 
-#include "quest/include/modes.h"
 #include "quest/include/types.h"
 #include "quest/include/qureg.h"
 #include "quest/include/paulis.h"
diff --git a/quest/src/gpu/gpu_thrust.cuh b/quest/src/gpu/gpu_thrust.cuh
index 44a37fadd..9f8d8f1ab 100644
--- a/quest/src/gpu/gpu_thrust.cuh
+++ b/quest/src/gpu/gpu_thrust.cuh
@@ -21,11 +21,13 @@
 #ifndef GPU_THRUST_HPP
 #define GPU_THRUST_HPP
 
+// obtain preprocessors from config.h prior to validation
+#include "quest/include/config.h"
+
 #if ! COMPILE_CUDA
     #error "A file being compiled somehow included gpu_thrust.hpp despite QuEST not being compiled in GPU-accelerated mode."
 #endif
 
-#include "quest/include/modes.h"
 #include "quest/include/types.h"
 #include "quest/include/qureg.h"
 #include "quest/include/paulis.h"
diff --git a/quest/src/gpu/gpu_types.cuh b/quest/src/gpu/gpu_types.cuh
index 8fe8e6930..a934ecef6 100644
--- a/quest/src/gpu/gpu_types.cuh
+++ b/quest/src/gpu/gpu_types.cuh
@@ -14,7 +14,7 @@
 #ifndef GPU_TYPES_HPP
 #define GPU_TYPES_HPP
 
-#include "quest/include/modes.h"
+#include "quest/include/config.h"
 #include "quest/include/types.h"
 #include "quest/include/precision.h"
 
diff --git a/tests/deprecated/CMakeLists.txt b/tests/deprecated/CMakeLists.txt
index 0887bce16..f9132c74a 100644
--- a/tests/deprecated/CMakeLists.txt
+++ b/tests/deprecated/CMakeLists.txt
@@ -13,7 +13,6 @@ add_executable(dep_tests
     test_utilities.cpp
 )
 target_link_libraries(dep_tests PUBLIC QuEST::QuEST Catch2::Catch2)
-target_compile_definitions(dep_tests PRIVATE INCLUDE_DEPRECATED_FUNCTIONS=1)
 
 if (ENABLE_DISTRIBUTION)
     target_link_libraries(dep_tests PRIVATE MPI::MPI_CXX)
diff --git a/tests/deprecated/test_calculations.cpp b/tests/deprecated/test_calculations.cpp
index 3ed8f560e..0f02a6dea 100644
--- a/tests/deprecated/test_calculations.cpp
+++ b/tests/deprecated/test_calculations.cpp
@@ -15,13 +15,7 @@
 #include <catch2/matchers/catch_matchers_string.hpp>
 #include <catch2/generators/catch_generators_range.hpp>
 
-// must define preprocessors to enable quest's
-// deprecated v3 API, and disable the numerous
-// warnings issued by its compilation
-#define INCLUDE_DEPRECATED_FUNCTIONS 1
-#define DISABLE_DEPRECATION_WARNINGS 1
 #include "quest.h"
-
 #include "test_utilities.hpp"
 
 /* allows concise use of ContainsSubstring in catch's REQUIRE_THROWS_WITH */
@@ -918,8 +912,8 @@ TEST_CASE( "calcInnerProduct", "[calculations]" ) {
                 toQureg(vec2, r2);
                 qcomp res = calcInnerProduct(vec1,vec2);
                 
-                REQUIRE( real(res) == Approx(real(prod)) );
-                REQUIRE( imag(res) == Approx(imag(prod)) );
+                REQUIRE( real(res) == Approx(real(prod)).margin(REAL_EPS) );
+                REQUIRE( imag(res) == Approx(imag(prod)).margin(REAL_EPS) );
             }
         }
     }
diff --git a/tests/deprecated/test_data_structures.cpp b/tests/deprecated/test_data_structures.cpp
index 258d83f75..bc336f7dd 100644
--- a/tests/deprecated/test_data_structures.cpp
+++ b/tests/deprecated/test_data_structures.cpp
@@ -15,13 +15,7 @@
 #include <catch2/matchers/catch_matchers_string.hpp>
 #include <catch2/generators/catch_generators_range.hpp>
 
-// must define preprocessors to enable quest's
-// deprecated v3 API, and disable the numerous
-// warnings issued by its compilation
-#define INCLUDE_DEPRECATED_FUNCTIONS 1
-#define DISABLE_DEPRECATION_WARNINGS 1
 #include "quest.h"
-
 #include "test_utilities.hpp"
 
 /* allows concise use of ContainsSubstring in catch's REQUIRE_THROWS_WITH */
diff --git a/tests/deprecated/test_decoherence.cpp b/tests/deprecated/test_decoherence.cpp
index 295263ef2..edf1d9f61 100644
--- a/tests/deprecated/test_decoherence.cpp
+++ b/tests/deprecated/test_decoherence.cpp
@@ -16,12 +16,7 @@
 #include <catch2/generators/catch_generators_range.hpp>
 #include <catch2/generators/catch_generators_random.hpp>
 
-// must define preprocessors to enable quest's
-// deprecated v3 API, and disable the numerous
-// warnings issued by its compilation
-#define INCLUDE_DEPRECATED_FUNCTIONS 1
-#define DISABLE_DEPRECATION_WARNINGS 1
-
+#include "quest.h"
 #include "test_utilities.hpp"
 
 #include <random>
diff --git a/tests/deprecated/test_gates.cpp b/tests/deprecated/test_gates.cpp
index f8c175666..5542318a4 100644
--- a/tests/deprecated/test_gates.cpp
+++ b/tests/deprecated/test_gates.cpp
@@ -18,13 +18,7 @@
 #include <catch2/matchers/catch_matchers_string.hpp>
 #include <catch2/generators/catch_generators_range.hpp>
 
-// must define preprocessors to enable quest's
-// deprecated v3 API, and disable the numerous
-// warnings issued by its compilation
-#define INCLUDE_DEPRECATED_FUNCTIONS 1
-#define DISABLE_DEPRECATION_WARNINGS 1
 #include "quest.h"
-
 #include "test_utilities.hpp"
 
 /* allows concise use of ContainsSubstring in catch's REQUIRE_THROWS_WITH */
diff --git a/tests/deprecated/test_main.cpp b/tests/deprecated/test_main.cpp
index fc75d8190..35ba37477 100644
--- a/tests/deprecated/test_main.cpp
+++ b/tests/deprecated/test_main.cpp
@@ -17,10 +17,6 @@
  */
 #include <catch2/catch_session.hpp>
 
-
-#define INCLUDE_DEPRECATED_FUNCTIONS 1
-#define DISABLE_DEPRECATION_WARNINGS 1
-
 #include "quest.h"
 #include "test_utilities.hpp"
 
diff --git a/tests/deprecated/test_operators.cpp b/tests/deprecated/test_operators.cpp
index 6130ad3b8..12f2cb0a7 100644
--- a/tests/deprecated/test_operators.cpp
+++ b/tests/deprecated/test_operators.cpp
@@ -15,13 +15,7 @@
 #include <catch2/generators/catch_generators_range.hpp>
 #include <catch2/matchers/catch_matchers_string.hpp>
 
-// must define preprocessors to enable quest's
-// deprecated v3 API, and disable the numerous
-// warnings issued by its compilation
-#define INCLUDE_DEPRECATED_FUNCTIONS 1
-#define DISABLE_DEPRECATION_WARNINGS 1
 #include "quest.h"
-
 #include "test_utilities.hpp"
 
 /** Prepares the needed data structures for unit testing some operators.
diff --git a/tests/deprecated/test_state_initialisations.cpp b/tests/deprecated/test_state_initialisations.cpp
index aca87f68f..995cf8a41 100644
--- a/tests/deprecated/test_state_initialisations.cpp
+++ b/tests/deprecated/test_state_initialisations.cpp
@@ -14,10 +14,7 @@
 #include <catch2/generators/catch_generators_range.hpp>
 #include <catch2/matchers/catch_matchers_string.hpp>
 
-#define INCLUDE_DEPRECATED_FUNCTIONS 1
-#define DISABLE_DEPRECATION_WARNINGS 1
 #include "quest.h"
-
 #include "test_utilities.hpp"
     
 /* allows concise use of ContainsSubstring in catch's REQUIRE_THROWS_WITH */
diff --git a/tests/deprecated/test_unitaries.cpp b/tests/deprecated/test_unitaries.cpp
index 6414880d1..f0bb2f5aa 100644
--- a/tests/deprecated/test_unitaries.cpp
+++ b/tests/deprecated/test_unitaries.cpp
@@ -15,13 +15,7 @@
 #include <catch2/generators/catch_generators_adapters.hpp>
 #include <catch2/matchers/catch_matchers_string.hpp>
 
-// must define preprocessors to enable quest's
-// deprecated v3 API, and disable the numerous
-// warnings issued by its compilation
-#define INCLUDE_DEPRECATED_FUNCTIONS 1
-#define DISABLE_DEPRECATION_WARNINGS 1
 #include "quest.h"
-
 #include "test_utilities.hpp"
 
 /** Prepares the needed data structures for unit testing unitaries. 
diff --git a/tests/deprecated/test_utilities.cpp b/tests/deprecated/test_utilities.cpp
index 27ef8af9f..81be43525 100644
--- a/tests/deprecated/test_utilities.cpp
+++ b/tests/deprecated/test_utilities.cpp
@@ -9,10 +9,7 @@
 #include <catch2/catch_test_macros.hpp>
 #include <catch2/generators/catch_generators.hpp>
 
-#define INCLUDE_DEPRECATED_FUNCTIONS 1
-#define DISABLE_DEPRECATION_WARNINGS 1
 #include "quest.h"
-
 #include "test_utilities.hpp"
 
 #include <random>
diff --git a/utils/scripts/compile.sh b/utils/scripts/compile.sh
index ac6eccf02..8eca0ae27 100755
--- a/utils/scripts/compile.sh
+++ b/utils/scripts/compile.sh
@@ -16,27 +16,35 @@
 FLOAT_PRECISION=2
 
 # deployments to compile (0, 1)
-COMPILE_MPI=0        # distribution
-COMPILE_OPENMP=0     # multithreading
-COMPILE_CUDA=0       # GPU acceleration
-COMPILE_CUQUANTUM=0  # GPU + cuQuantum
+ENABLE_DISTRIBUTION=0       # MPI
+ENABLE_MULTITHREADING=0     # OpenMP
+ENABLE_CUDA=0               # NVIDIA GPU
+ENABLE_HIP=0                # AMD GPU
+ENABLE_CUQUANTUM=0          # NVIDIA cuStateVec
+ENABLE_NUMA=0               # NUMA awareness
 
-# GPU compute capability
-GPU_CC=90
+# other options (0, 1)
+ENABLE_DEPRECATED_API=0
+DISABLE_DEPRECATION_WARNINGS=0
+
+# NVIDIA compute capability or AMD arch (e.g. 60 or gfx908)
+GPU_ARCH=90
 
 # backend compilers
 TESTS_COMPILER=g++
 BASE_COMPILER=g++
 OMP_COMPILER=g++
 MPI_COMPILER=mpic++
-GPU_COMPILER=nvcc
+CUDA_COMPILER=nvcc
+HIP_COMPILER=hipcc
 
 # linker
 LINKER=g++
 
-# whether to compile unit tests (1) or the below user files (0),
-# or the v3 deprecated unit tests (2). when either tests are
-# compiled, all user-source related settings are ignored. 
+# whether to compile the below user source files (0),
+# or the unit tests (1), which when paired with above
+# ENABLE_DEPRECATED_API=1, will use the v3 tests (which
+# you should pair with DISABLE_DEPRECATION_WARNINGS=1)
 COMPILE_TESTS=0
 
 # name of the compiled test executable
@@ -64,7 +72,7 @@ USER_CXX_COMP_FLAGS='-std=c++14'
 # user linker flags
 USER_LINK_FLAGS='-lstdc++'
 
-# whether to compile cuQuantum (consulted only when COMPILE_CUQUANTUM=1)
+# whether to compile cuQuantum (consulted only when ENABLE_CUQUANTUM=1)
 # in debug mode, which logs to below file with performance tips and errors
 CUQUANTUM_LOG=0
 CUQUANTUM_LOG_FN="./custatevec_log.txt"
@@ -72,14 +80,10 @@ CUQUANTUM_LOG_FN="./custatevec_log.txt"
 # external library locations (replace with "." to default)
 CUQUANTUM_LIB_DIR="${CUQUANTUM_ROOT}"
 CUDA_LIB_DIR="/usr/local/cuda"
+ROCM_LIB_DIR="/opt/rocm"
 OMP_LIB_DIR="/opt/homebrew/opt/libomp"
 MPI_LIB_DIR="/opt/homebrew/opt/openmpi"
-CATCH_LIB_DIR="tests/deprecated/catch"
-
-# TODO:
-# use of 'CATCH_LIB_DIR' above will change when v4 tests 
-# switch to using Catch2 as supplied by CMake, rather
-# than the hacky use of deprecated v3's single-header  
+CATCH_LIB_DIR="$(pwd)/catch"
 
 
 
@@ -91,6 +95,8 @@ TEST_OBJ_PREF='test_'
 
 INDENT='  '
 
+CATCH_VERSION="3.4.0"
+
 
 
 # QUEST FILE LAYOUT
@@ -112,6 +118,10 @@ TEST_UNIT_DIR="${TEST_MAIN_DIR}/unit"
 TEST_DEPR_DIR="${TEST_MAIN_DIR}/deprecated"
 TEST_DEPR_CATCH_DIR="${TEST_DEPR_DIR}/catch"
 
+# files that require modification by this script
+CONFIG_FILE_IN="${INCLUDE_DIR}/config.h.in"
+CONFIG_FILE_OUT="${INCLUDE_DIR}/config.h"
+
 # files in API_DIR
 API_FILES=(
     "calculations"
@@ -122,9 +132,11 @@ API_FILES=(
     "initialisations"
     "matrices"
     "modes"
+    "multiplication"
     "operations"
     "paulis"
     "qureg"
+    "trotterisation"
     "types"
 )
 
@@ -132,10 +144,12 @@ API_FILES=(
 CORE_FILES=(
     "accelerator"
     "autodeployer"
+    "envvars"
     "errors"
     "localiser"
     "memory"
     "parser"
+    "paulilogic"
     "printer"
     "randomiser"
     "utilities"
@@ -169,6 +183,7 @@ TEST_MAIN_FILES=(
 TEST_UTIL_FILES=(
     "cache"
     "compare"
+    "config"
     "convert"
     "evolve"
     "linalg"
@@ -188,9 +203,11 @@ TEST_UNIT_FILES=(
     "environment"
     "initialisations"
     "matrices"
+    "multiplication"
     "operations"
     "paulis"
     "qureg"
+    "trotterisation"
     "types"
 )
 
@@ -216,10 +233,11 @@ TEST_DEPR_MPI_FILES=(
 # COMPILER AND LINKER FLAG OPTIONS
 
 # compiler flags given to all (non-deprecated) files
-TEST_COMP_FLAGS="-std=c++20 -I${CATCH_LIB_DIR}"
+TEST_COMP_FLAGS="-std=c++20 -I${CATCH_LIB_DIR}/include"
+TEST_LINK_FLAGS="-L${CATCH_LIB_DIR}/lib -lCatch2"
 
 # compiler flags given to deprecated test files
-TEST_DEPR_COMP_FLAGS="-std=c++17 -I${TEST_DEPR_CATCH_DIR}"
+TEST_DEPR_COMP_FLAGS="-std=c++17 -I${CATCH_LIB_DIR}/include"
 
 # compiler flags given to all backend files
 BACKEND_COMP_FLAGS='-std=c++17 -O3'
@@ -227,23 +245,27 @@ BACKEND_COMP_FLAGS='-std=c++17 -O3'
 # warning flags which apply to all compiled and linked files including user's
 WARNING_FLAGS='-Wall'
 
-# GPU-specific flags
-GPU_COMP_FLAGS="-x cu -arch=sm_${GPU_CC} -I${CUDA_LIB_DIR}/include"
-GPU_LINK_FLAGS="-L${CUDA_LIB_DIR}/lib -L${CUDA_LIB_DIR}/lib64 -lcudart -lcuda"
+# CUDA-specific flags
+CUDA_COMP_FLAGS="-x cu -arch=sm_${GPU_ARCH} -I${CUDA_LIB_DIR}/include"
+CUDA_LINK_FLAGS="-L${CUDA_LIB_DIR}/lib -L${CUDA_LIB_DIR}/lib64 -lcudart -lcuda"
 
-if [ $COMPILE_CUQUANTUM == 1 ]
+if [ $ENABLE_CUQUANTUM == 1 ]
 then
     # extend GPU flags if cuQuantum enabled
-    GPU_COMP_FLAGS+=" -I${CUQUANTUM_LIB_DIR}/include"
-    GPU_LINK_FLAGS+=" -L${CUQUANTUM_LIB_DIR}/lib -L${CUQUANTUM_LIB_DIR}/lib64 -lcustatevec"
+    CUDA_COMP_FLAGS+=" -I${CUQUANTUM_LIB_DIR}/include"
+    CUDA_LINK_FLAGS+=" -L${CUQUANTUM_LIB_DIR}/lib -L${CUQUANTUM_LIB_DIR}/lib64 -lcustatevec"
 
     # optional debug logging - will slow down code
     if [ $CUQUANTUM_LOG == 1 ]
     then
-        GPU_COMP_FLAGS+=" -DCUSTATEVEC_LOG_LEVEL=5 -DCUSTATEVEC_LOG_FILE=${CUQUANTUM_LOG_FN}"
+        CUDA_COMP_FLAGS+=" -DCUSTATEVEC_LOG_LEVEL=5 -DCUSTATEVEC_LOG_FILE=${CUQUANTUM_LOG_FN}"
     fi
 fi
 
+# HIP-specific flags
+HIP_COMP_FLAGS="-x hip --offload-arch=${GPU_ARCH} -I${ROCM_LIB_DIR}/include"
+HIP_LINK_FLAGS="-L${ROCM_LIB_DIR}/lib -lamdhip64"
+
 # MPI-specific flags
 MPI_COMP_FLAGS="-I${MPI_LIB_DIR}/include"
 MPI_LINK_FLAGS="-L${MPI_LIB_DIR}/lib -lmpi"
@@ -271,14 +293,10 @@ else
     OMP_LINK_FLAGS+=' -fopenmp'
 fi
 
-# define pre-processor macros to indicate deployment mode
-MODE_FLAGS="-DCOMPILE_MPI=${COMPILE_MPI} "
-MODE_FLAGS+="-DCOMPILE_OPENMP=${COMPILE_OPENMP} "
-MODE_FLAGS+="-DCOMPILE_CUDA=${COMPILE_CUDA} "
-MODE_FLAGS+="-DCOMPILE_CUQUANTUM=${COMPILE_CUQUANTUM}"
-
-# define pre-processor macros to set qcomp precision
-PREC_FLAG="-DFLOAT_PRECISION=${FLOAT_PRECISION}"
+if [ $ENABLE_NUMA == 1 ]
+then
+    OMP_LINK_FLAGS+=' -lnuma'
+fi
 
 # point compilers to QuEST src
 HEADER_FLAGS="-I. -I${INCLUDE_DIR}"
@@ -290,34 +308,44 @@ HEADER_FLAGS="-I. -I${INCLUDE_DIR}"
 echo ""
 echo "deployment modes:"
 
-# flags given to every compilation unit
-GLOBAL_COMP_FLAGS="${HEADER_FLAGS} ${MODE_FLAGS} ${PREC_FLAG}"
-
 # choose linker flags (extended below)
 ALL_LINK_FLAGS="${USER_LINK_FLAGS}"
 
 # choose compiler and flags for CPU/OMP files
-if [ $COMPILE_OPENMP == 1 ]
+CPU_FILES_FLAGS='-Ofast -DCOMPLEX_OVERLOADS_PATCHED=1'
+
+if [ $ENABLE_MULTITHREADING == 1 ]
 then
     echo "${INDENT}(multithreading enabled)"
     echo "${INDENT}${INDENT}[compiling OpenMP]"
+    if [ $ENABLE_NUMA == 1 ]
+    then
+        echo "${INDENT}${INDENT}[compiling NUMA]"
+    fi
     CPU_FILES_COMPILER=$OMP_COMPILER
-    CPU_FILES_FLAGS=$OMP_COMP_FLAGS
+    CPU_FILES_FLAGS+=" ${OMP_COMP_FLAGS}"
     ALL_LINK_FLAGS+=" ${OMP_LINK_FLAGS}"
 else
     echo "${INDENT}(multithreading disabled)"
     CPU_FILES_COMPILER=$BASE_COMPILER
-    CPU_FILES_FLAGS=''
 fi
 
 # choose compiler and flags for GPU files
-if [ $COMPILE_CUDA == 1 ]
+if [ $ENABLE_CUDA == 1 ]
 then
     echo "${INDENT}(GPU-acceleration enabled)"
     echo "${INDENT}${INDENT}[compiling CUDA]"
-    GPU_FILES_COMPILER=$GPU_COMPILER
-    GPU_FILES_FLAGS=$GPU_COMP_FLAGS
-    ALL_LINK_FLAGS+=" ${GPU_LINK_FLAGS}"
+    GPU_FILES_COMPILER=$CUDA_COMPILER
+    GPU_FILES_FLAGS=$CUDA_COMP_FLAGS
+    ALL_LINK_FLAGS+=" ${CUDA_LINK_FLAGS}"
+    GPU_WARNING_FLAGS="-Xcompiler ${WARNING_FLAGS}"
+elif [ $ENABLE_HIP == 1 ]
+then
+    echo "${INDENT}(GPU-acceleration enabled)"
+    echo "${INDENT}${INDENT}[compiling HIP]"
+    GPU_FILES_COMPILER=$HIP_COMPILER
+    GPU_FILES_FLAGS=$HIP_COMP_FLAGS
+    ALL_LINK_FLAGS+=" ${HIP_LINK_FLAGS}"
     GPU_WARNING_FLAGS="-Xcompiler ${WARNING_FLAGS}"
 else
     echo "${INDENT}(GPU-acceleration disabled)"
@@ -327,7 +355,7 @@ else
 fi
 
 # merely report cuQuantum status
-if [ $COMPILE_CUQUANTUM == 1 ]
+if [ $ENABLE_CUQUANTUM == 1 ]
 then
     echo "${INDENT}(cuQuantum enabled)"
     echo "${INDENT}${INDENT}[compiling cuStateVec]"
@@ -336,7 +364,7 @@ else
 fi
 
 # choose compiler and flags for communication files
-if [ $COMPILE_MPI == 1 ]
+if [ $ENABLE_DISTRIBUTION == 1 ]
 then
     echo "${INDENT}(distribution enabled)"
     echo "${INDENT}${INDENT}[compiling MPI]"
@@ -349,7 +377,7 @@ else
     MPI_FILES_FLAGS=''
 fi
 
-# choose linker warning flag (to avoid pass them to nvcc)
+# choose linker warning flags (wrapped in -Xcompiler to avoid passing them directly to nvcc)
 if [ "${LINKER}" = "nvcc" ]
 then
     ALL_LINK_FLAGS+="-Xcompiler ${WARNING_FLAGS}"
@@ -357,6 +385,12 @@ else
     ALL_LINK_FLAGS+=" ${WARNING_FLAGS}"
 fi
 
+# test link flags
+if [ "${COMPILE_TESTS}" -eq 1 ] 
+then
+    ALL_LINK_FLAGS+=" ${TEST_LINK_FLAGS}"
+fi
+
 # display precision
 if [ $FLOAT_PRECISION == 1 ]; then
     echo "${INDENT}(single precision)"
@@ -377,7 +411,6 @@ echo ""
 
 # REPORTING COMILERS FLAGS
 
-
 echo "chosen compilers and flags..."
 
 # user compilers
@@ -389,14 +422,14 @@ then
 fi
 
 # test compiler
-if (( $COMPILE_TESTS == 1 ))
+if (( $COMPILE_TESTS == 1 && ENABLE_DEPRECATED_API == 0 ))
 then
     echo "${INDENT}tests compiler and flags:"
     echo "${INDENT}${INDENT}${TESTS_COMPILER} ${TEST_COMP_FLAGS} ${WARNING_FLAGS}"
 fi
 
 # deprecated compiler
-if (( $COMPILE_TESTS == 2 ))
+if (( $COMPILE_TESTS == 1 && ENABLE_DEPRECATED_API == 1 ))
 then
     echo "${INDENT}deprecated tests compiler and flags:"
     echo "${INDENT}${INDENT}${TESTS_COMPILER} ${TEST_DEPR_COMP_FLAGS} ${WARNING_FLAGS}"
@@ -425,7 +458,67 @@ echo "${INDENT}${INDENT}${LINKER} ${ALL_LINK_FLAGS}"
 # globals
 echo "${INDENT}header flags:"
 echo "${INDENT}${INDENT}${HEADER_FLAGS}"
+echo ""
 
+
+
+# OPTIONALLY PREPARING CATCH2
+
+if [ "${COMPILE_TESTS}" -eq 1 ] 
+then
+    echo "preparing Catch2:"
+
+    if [ -d "${CATCH_LIB_DIR}" ]
+    then
+        echo "${INDENT}found at ${CATCH_LIB_DIR}"
+    else
+        echo "${INDENT}downloading to ${CATCH_LIB_DIR}..."
+        git clone --quiet https://github.com/catchorg/Catch2.git "${CATCH_LIB_DIR}"
+
+        ORIGINAL_DIR=$(pwd)
+
+        echo "${INDENT}configuring..."
+        cd "${CATCH_LIB_DIR}"
+        git fetch --quiet --tags
+        git checkout --quiet "v${CATCH_VERSION}"
+        git submodule update --quiet --init --recursive
+
+        echo "${INDENT}building..."
+        mkdir build
+        cd build
+        cmake .. -DCMAKE_INSTALL_PREFIX="${CATCH_LIB_DIR}"
+        cmake --build . --target install --parallel
+
+        cd "${ORIGINAL_DIR}"
+    fi
+
+    echo ""
+fi
+
+
+
+# GENERATING CONFIG HEADER
+
+echo "generating headers:"
+
+# write user options as macros to config.h (version string becomes "unknown"; major/minor/patch become -1)
+sed \
+  -e "s|#cmakedefine FLOAT_PRECISION @FLOAT_PRECISION@|#define FLOAT_PRECISION ${FLOAT_PRECISION}|" \
+  -e "s|#cmakedefine01 INCLUDE_DEPRECATED_FUNCTIONS|#define INCLUDE_DEPRECATED_FUNCTIONS ${ENABLE_DEPRECATED_API}|" \
+  -e "s|#cmakedefine01 DISABLE_DEPRECATION_WARNINGS|#define DISABLE_DEPRECATION_WARNINGS ${DISABLE_DEPRECATION_WARNINGS}|" \
+  -e "s|#cmakedefine01 COMPILE_OPENMP|#define COMPILE_OPENMP ${ENABLE_MULTITHREADING}|" \
+  -e "s|#cmakedefine01 COMPILE_MPI|#define COMPILE_MPI ${ENABLE_DISTRIBUTION}|" \
+  -e "s|#cmakedefine01 COMPILE_CUDA|#define COMPILE_CUDA $(( ENABLE_CUDA || ENABLE_HIP ))|" \
+  -e "s|#cmakedefine01 COMPILE_CUQUANTUM|#define COMPILE_CUQUANTUM ${ENABLE_CUQUANTUM}|" \
+  -e "s|#cmakedefine01 COMPILE_HIP|#define COMPILE_HIP ${ENABLE_HIP}|" \
+  -e "s|#cmakedefine01 NUMA_AWARE|#define NUMA_AWARE ${ENABLE_NUMA}|" \
+  -e "s|@PROJECT_VERSION@|unknown (not populated by manual compilation)|" \
+  -e "s|@PROJECT_VERSION_MAJOR@|-1|" \
+  -e "s|@PROJECT_VERSION_MINOR@|-1|" \
+  -e "s|@PROJECT_VERSION_PATCH@|-1|" \
+  "${CONFIG_FILE_IN}" > "${CONFIG_FILE_OUT}"
+
+echo "${INDENT}${CONFIG_FILE_OUT}"
 echo ""
 
 
@@ -454,7 +547,7 @@ then
         fi
 
         # compile
-        $COMP -c $USER_DIR/$fn -o ${USER_OBJ_PREF}${fn}.o $FLAG $GLOBAL_COMP_FLAGS $WARNING_FLAGS
+        $COMP -c $USER_DIR/$fn -o ${USER_OBJ_PREF}${fn}.o $FLAG $HEADER_FLAGS $WARNING_FLAGS
     done
 
     echo ""
@@ -464,7 +557,7 @@ fi
 
 # COMPILING TESTS
 
-if (( $COMPILE_TESTS == 1 ))
+if (( $COMPILE_TESTS == 1 && $ENABLE_DEPRECATED_API == 0 ))
 then
 
     echo "compiling unit test files:"
@@ -474,7 +567,7 @@ then
     for fn in ${TEST_MAIN_FILES[@]}
     do
         echo "${INDENT}${INDENT}${fn}.cpp ..."
-        $TESTS_COMPILER -c $TEST_MAIN_DIR/$fn.cpp -o ${TEST_OBJ_PREF}${fn}.o $TEST_COMP_FLAGS $GLOBAL_COMP_FLAGS $WARNING_FLAGS
+        $TESTS_COMPILER -c $TEST_MAIN_DIR/$fn.cpp -o ${TEST_OBJ_PREF}${fn}.o $TEST_COMP_FLAGS $HEADER_FLAGS $WARNING_FLAGS
     done
 
     echo "${INDENT}utils:"
@@ -482,7 +575,7 @@ then
     for fn in ${TEST_UTIL_FILES[@]}
     do
         echo "${INDENT}${INDENT}${fn}.cpp ..."
-        $TESTS_COMPILER -c $TEST_UTIL_DIR/$fn.cpp -o ${TEST_OBJ_PREF}${fn}.o $TEST_COMP_FLAGS $GLOBAL_COMP_FLAGS $WARNING_FLAGS
+        $TESTS_COMPILER -c $TEST_UTIL_DIR/$fn.cpp -o ${TEST_OBJ_PREF}${fn}.o $TEST_COMP_FLAGS $HEADER_FLAGS $WARNING_FLAGS
     done
 
     echo "${INDENT}unit:"
@@ -490,7 +583,7 @@ then
     for fn in ${TEST_UNIT_FILES[@]}
     do
         echo "${INDENT}${INDENT}${fn}.cpp ..."
-        $TESTS_COMPILER -c $TEST_UNIT_DIR/$fn.cpp -o ${TEST_OBJ_PREF}${fn}.o $TEST_COMP_FLAGS $GLOBAL_COMP_FLAGS $WARNING_FLAGS
+        $TESTS_COMPILER -c $TEST_UNIT_DIR/$fn.cpp -o ${TEST_OBJ_PREF}${fn}.o $TEST_COMP_FLAGS $HEADER_FLAGS $WARNING_FLAGS
     done
 
     echo ""
@@ -500,20 +593,25 @@ fi
 
 # COMPILING DEPRECATED TESTS
 
-if (( $COMPILE_TESTS == 2 ))
+if (( $COMPILE_TESTS == 1 && $ENABLE_DEPRECATED_API == 1 ))
 then
     echo "compiling deprecated test files:"
 
+    if (( $DISABLE_DEPRECATION_WARNINGS == 0 ))
+    then
+        echo "${INDENT}(beware deprecation warnings were not disabled)"
+    fi
+
     for fn in ${TEST_DEPR_FILES[@]}
     do
         echo "${INDENT}${fn}.cpp ..."
-        $TESTS_COMPILER -c $TEST_DEPR_DIR/$fn.cpp -o ${TEST_OBJ_PREF}${fn}.o $TEST_DEPR_COMP_FLAGS $GLOBAL_COMP_FLAGS $WARNING_FLAGS
+        $TESTS_COMPILER -c $TEST_DEPR_DIR/$fn.cpp -o ${TEST_OBJ_PREF}${fn}.o $TEST_DEPR_COMP_FLAGS $HEADER_FLAGS $WARNING_FLAGS
     done
 
     for fn in ${TEST_DEPR_MPI_FILES[@]}
     do
         echo "${INDENT}${fn}.cpp ..."
-        $MPI_FILES_COMPILER -c $TEST_DEPR_DIR/$fn.cpp -o ${TEST_OBJ_PREF}${fn}.o $TEST_DEPR_COMP_FLAGS $GLOBAL_COMP_FLAGS $WARNING_FLAGS
+        $MPI_FILES_COMPILER -c $TEST_DEPR_DIR/$fn.cpp -o ${TEST_OBJ_PREF}${fn}.o $TEST_DEPR_COMP_FLAGS $HEADER_FLAGS $WARNING_FLAGS
     done
 
     echo ""
@@ -528,7 +626,7 @@ echo "compiling core files in C++"
 for fn in ${CORE_FILES[@]}
 do
     echo "${INDENT}${fn}.cpp ..."
-    $BASE_COMPILER -c $CORE_DIR/$fn.cpp -o ${QUEST_OBJ_PREF}${fn}.o $BACKEND_COMP_FLAGS $GLOBAL_COMP_FLAGS $WARNING_FLAGS
+    $BASE_COMPILER -c $CORE_DIR/$fn.cpp -o ${QUEST_OBJ_PREF}${fn}.o $BACKEND_COMP_FLAGS $HEADER_FLAGS $WARNING_FLAGS
 done
 
 echo ""
@@ -542,7 +640,7 @@ echo "compiling API files in C++:"
 for fn in ${API_FILES[@]}
 do
     echo "${INDENT}${fn}.cpp ..."
-    $BASE_COMPILER -c $API_DIR/$fn.cpp -o ${QUEST_OBJ_PREF}${fn}.o $BACKEND_COMP_FLAGS $GLOBAL_COMP_FLAGS $WARNING_FLAGS
+    $BASE_COMPILER -c $API_DIR/$fn.cpp -o ${QUEST_OBJ_PREF}${fn}.o $BACKEND_COMP_FLAGS $HEADER_FLAGS $WARNING_FLAGS
 done
 
 echo ""
@@ -556,7 +654,7 @@ echo "compiling CPU/OMP files..."
 for fn in ${OMP_FILES[@]}
 do
     echo "${INDENT}${fn}.cpp ..."
-    $CPU_FILES_COMPILER -c $OMP_DIR/$fn.cpp -o ${QUEST_OBJ_PREF}${fn}.o $CPU_FILES_FLAGS $BACKEND_COMP_FLAGS $GLOBAL_COMP_FLAGS $WARNING_FLAGS
+    $CPU_FILES_COMPILER -c $OMP_DIR/$fn.cpp -o ${QUEST_OBJ_PREF}${fn}.o $CPU_FILES_FLAGS $BACKEND_COMP_FLAGS $HEADER_FLAGS $WARNING_FLAGS
 done
 
 echo ""
@@ -570,7 +668,7 @@ echo "compiling GPU files..."
 for fn in ${GPU_FILES[@]}
 do
     echo "${INDENT}${fn}.cpp ..."
-    $GPU_FILES_COMPILER -c $GPU_DIR/$fn.cpp -o ${QUEST_OBJ_PREF}${fn}.o $GPU_FILES_FLAGS $BACKEND_COMP_FLAGS $GLOBAL_COMP_FLAGS $GPU_WARNING_FLAGS
+    $GPU_FILES_COMPILER -c $GPU_DIR/$fn.cpp -o ${QUEST_OBJ_PREF}${fn}.o $GPU_FILES_FLAGS $BACKEND_COMP_FLAGS $HEADER_FLAGS $GPU_WARNING_FLAGS
 done
 
 echo ""
@@ -584,7 +682,7 @@ echo "compiling communication/MPI files..."
 for fn in ${MPI_FILES[@]}
 do
     echo "${INDENT}${fn}.cpp ..."
-    $MPI_FILES_COMPILER -c $MPI_DIR/$fn.cpp -o ${QUEST_OBJ_PREF}${fn}.o $MPI_FILES_FLAGS $BACKEND_COMP_FLAGS $GLOBAL_COMP_FLAGS $WARNING_FLAGS
+    $MPI_FILES_COMPILER -c $MPI_DIR/$fn.cpp -o ${QUEST_OBJ_PREF}${fn}.o $MPI_FILES_FLAGS $BACKEND_COMP_FLAGS $HEADER_FLAGS $WARNING_FLAGS
 done
 
 echo ""
@@ -606,12 +704,12 @@ OBJECTS+=" $(printf " ${QUEST_OBJ_PREF}%s.o" "${MPI_FILES[@]}")"
 if (( $COMPILE_TESTS == 0 ))
 then
     OBJECTS+=" $(printf " ${USER_OBJ_PREF}%s.o" "${USER_FILES[@]}")"
-elif (( $COMPILE_TESTS == 1 ))
+elif (( $COMPILE_TESTS == 1 && $ENABLE_DEPRECATED_API == 0 ))
 then
     OBJECTS+=" $(printf " ${TEST_OBJ_PREF}%s.o" "${TEST_MAIN_FILES[@]}")"
     OBJECTS+=" $(printf " ${TEST_OBJ_PREF}%s.o" "${TEST_UTIL_FILES[@]}")"
     OBJECTS+=" $(printf " ${TEST_OBJ_PREF}%s.o" "${TEST_UNIT_FILES[@]}")"
-elif (( $COMPILE_TESTS == 2 ))
+elif (( $COMPILE_TESTS == 1 && $ENABLE_DEPRECATED_API == 1 ))
 then
     OBJECTS+=" $(printf " ${TEST_OBJ_PREF}%s.o" "${TEST_DEPR_FILES[@]}")"
     OBJECTS+=" $(printf " ${TEST_OBJ_PREF}%s.o" "${TEST_DEPR_MPI_FILES[@]}")"

From f0d4bcd1cc487d7f26bc6a5b101764ac8848062e Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Wed, 8 Oct 2025 21:52:36 -0400
Subject: [PATCH 31/32] bumped version to v4.2.0

---
 CMakeLists.txt      | 2 +-
 utils/docs/Doxyfile | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c97a84497..eb826e18e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,7 +24,7 @@ cmake_minimum_required(VERSION 3.21)
 
 
 project(QuEST
-  VERSION 4.1.0
+  VERSION 4.2.0
   DESCRIPTION "Quantum Exact Simulation Toolkit"
   LANGUAGES CXX C
 )
diff --git a/utils/docs/Doxyfile b/utils/docs/Doxyfile
index 6efe89d70..9432ff058 100644
--- a/utils/docs/Doxyfile
+++ b/utils/docs/Doxyfile
@@ -51,7 +51,7 @@ PROJECT_NAME           = "The Quantum Exact Simulation Toolkit"
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER         = "v4.1.0"
+PROJECT_NUMBER         = "v4.2.0"
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a

From b0c66d2b088724cc79f614a58aab20b64e9cccca Mon Sep 17 00:00:00 2001
From: Tyson Jones <tyson.jones.input@gmail.com>
Date: Sun, 12 Oct 2025 21:00:07 -0400
Subject: [PATCH 32/32] updated README with EPCC lead

---
 README.md | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index c0c510ba0..9eb85bfdf 100644
--- a/README.md
+++ b/README.md
@@ -29,9 +29,6 @@
 
 <!-- intro -->
 
-> [!NOTE]
-> QuEST `v4` has been released which re-designed QuEST from the ground up. Read about the exciting new features [here](docs/v4.md).
-
 The **Quantum Exact Simulation Toolkit** (QuEST) is a high-performance simulator of quantum statevectors and density matrices.
 It hybridises **multithreading**, **GPU acceleration** and **distribution** to run lightning fast on laptops, desktops and 
 supercomputers, parallelising over multiple cores, CPUs and GPUs. Behind the scenes, QuEST leverages [OpenMP](https://www.openmp.org/),
@@ -58,9 +55,8 @@ by both `C` and `C++` and all the major compilers (detailed [here](docs/compiler
 
 </div>
 
-
-QuEST development is led by the [QTechTheory](http://qtechtheory.org/) group at the University of Oxford, with active contributions from the [EPCC](https://www.epcc.ed.ac.uk/) team at the University of Edinburgh, and support from the below organisations.
-In particular, QuEST `v4` was made possible through the support of the UK National Quantum Computing centre (_NQCC200921_) and the [UKRI SEEQA](https://gtr.ukri.org/projects?ref=EP%2FY004655%2F1#/tabOverview) project.
+As of `v4.2`, QuEST development is led by the [EPCC](https://www.epcc.ed.ac.uk/) team at the University of Edinburgh, with support from, and former development by, the [QTechTheory](http://qtechtheory.org/) group at the University of Oxford. QuEST has also received contributions and support from the organisations below.
+In particular, QuEST `v4.0` was made possible through the support of the UK National Quantum Computing centre (_NQCC200921_) and the [UKRI SEEQA](https://gtr.ukri.org/projects?ref=EP%2FY004655%2F1#/tabOverview) project.
 
 <div align="center">