Skip to content

Conversation

TysonRayJones
Copy link
Member

as per #638

Before merging, the performance impact of these changes must be measured in all settings.

A sincere thanks to Pavel Khudov at Quantum Motion for helping discover the performance regression!

as per #638

Before merging, the performance impact of these changes must be measured in all settings.
@TysonRayJones
Copy link
Member Author

Tested performance of the devel branch (not with this PR's changes) via:

#include "quest.h"

#include <chrono>
#include <vector>
#include <random>
#include <iostream>
#include <algorithm>

using std::vector;

static std::mt19937 RNG;


/*
 * Builds a list of control-qubit indices for a benchmark call.
 *
 * Every index in [0, maxCtrlIndExcl) except excludeTarget is a candidate.
 * The candidates are shuffled with the file-level RNG unless the QuEST
 * environment reports that it is distributed, and the first numCtrls of
 * them are returned.
 *
 * The caller must ensure numCtrls does not exceed the number of
 * candidates; no bounds check is performed here.
 */
vector<int> getRandomCtrls(int numCtrls, int maxCtrlIndExcl, int excludeTarget) {

	// gather all permissible control indices, in ascending order
	vector<int> candidates;
	for (int qubit = 0; qubit < maxCtrlIndExcl; ++qubit) {
		if (qubit == excludeTarget)
			continue;
		candidates.push_back(qubit);
	}

	// distributed runs keep the ascending order (the original author calls
	// this "lazy"); NOTE(review): presumably this avoids ranks disagreeing
	// on the control list, since RNG is seeded per process -- confirm
	if (!getQuESTEnv().isDistributed)
		std::shuffle(candidates.begin(), candidates.end(), RNG);

	// keep only the leading numCtrls entries
	auto first = candidates.begin();
	auto last = first + numCtrls;
	return vector<int>(first, last);
}


/*
 * Measures the accumulated wall-clock duration, in nanoseconds, of calling
 * func once per target qubit t in [0, nQb).
 *
 * func is invoked as func(int* ctrls, int target), where ctrls points to
 * nQb-1 control indices obtained from getRandomCtrls() (all qubits other
 * than the target). For each target, func is first called 5 times as an
 * untimed warm-up, and then once more inside the timed region.
 *
 * Returns the sum of the per-target timed durations, as a double count of
 * nanoseconds.
 */
template <typename T>
double getFuncDuration(int nQb, T func) {

	// steady_clock is monotonic, so it cannot jump backwards mid-measurement
	// (high_resolution_clock may alias the adjustable system clock)
	using Clock = std::chrono::steady_clock;

	// explicit unit, rather than the clock's implementation-defined tick
	using Nanosecs = std::chrono::duration<double, std::nano>;

	double out = 0;

	for (int t=0; t<nQb; t++) {

		auto ctrls = getRandomCtrls(nQb-1, nQb, t);

		// warmup
		for (int r=0; r<5; r++)
			func(ctrls.data(), t);

		// ensure the warm-up calls have completed before timing begins, so
		// that any still-pending asynchronous work is not attributed to the
		// timed call (this relies on syncQuESTEnv() blocking until prior
		// work is done, which the timed region below already assumes)
		syncQuESTEnv();

		auto start = Clock::now();

		func(ctrls.data(), t);
		syncQuESTEnv();

		auto end = Clock::now();
		out += Nanosecs(end - start).count();
	}

	return out;
}


template <typename T1, typename T2>
void compareFuncs(std::string label, int nQb, T1 func1, T2 func2) {

	std::cout << label << "\t";

	double oldDur = getFuncDuration(nQb, func1);
	double newDur = getFuncDuration(nQb, func2);

	qreal speedup = oldDur/newDur;
	std::cout 
		<< oldDur << " \t v " << newDur << " \t = " << speedup << "x"
		<< ((speedup < 1)? "\t!!! " : "") << std::endl;
}


int main() {
	initQuESTEnv();
	setValidationEpsilon(0);

	// suppress non-root output
	if (getQuESTEnv().rank != 0)
		std::cout.setstate(std::ios_base::failbit);

	// prepare matrix alternatives to Pauli functions
	CompMatr1 x = getCompMatr1({{0,1}, {1,0}});
	CompMatr1 y = getCompMatr1({{0,-1i}, {1i,0}});
	DiagMatr1 z = getDiagMatr1({1, -1});

	qreal a = 0.123;
    qreal c = std::cos(-a/2);
    qreal s = std::sin(-a/2);
    qcomp v = std::exp(qcomp(0, -a/2));

    CompMatr1 rx = getCompMatr1({
        {qcomp(c,0), qcomp(0,s)},
        {qcomp(0,s), qcomp(c,0)}
    });
    CompMatr1 ry = getCompMatr1({
        { c, s},
        {-s, c}
    });
    DiagMatr1 rz = getDiagMatr1(
    	{v, std::conj(v)}
    );

    // prepare random-ctrl-list RNG
    std::random_device cspnrg;
    unsigned seed = cspnrg();
    RNG.seed(seed);



    double oldDur, newDur;

	for (int nQb : {5, 10, 15, 20, 25, 30}) {

		std::cout << "\n[" << nQb << " qubits]" << std::endl;

		// beware; will use all available accelerations
		Qureg q = createForcedQureg(nQb);

		int st[] = {0,1,0,1};
		qreal a = 0.123;


		compareFuncs("X", nQb, 
			[&](int* c, int t) { applyPauliX(q, t); },
			[&](int* c, int t) { applyCompMatr1(q, t, x); });

		compareFuncs("cX", nQb, 
			[&](int* c, int t) { applyControlledPauliX   (q, c[0], t); },
			[&](int* c, int t) { applyControlledCompMatr1(q, c[0], t, x); });

		compareFuncs("ccccX", nQb, 
			[&](int* c, int t) { applyMultiControlledPauliX   (q, c, 4, t); },
			[&](int* c, int t) { applyMultiControlledCompMatr1(q, c, 4, t, x); });

		compareFuncs("csX", nQb, 
			[&](int* c, int t) { applyMultiStateControlledPauliX   (q, c, st, 4, t); },
			[&](int* c, int t) { applyMultiStateControlledCompMatr1(q, c, st, 4, t, x); });


		compareFuncs("Y", nQb, 
			[&](int* c, int t) { applyPauliY(q, t); },
			[&](int* c, int t) { applyCompMatr1(q, t, y); });

		compareFuncs("cY", nQb, 
			[&](int* c, int t) { applyControlledPauliY   (q, c[0], t); },
			[&](int* c, int t) { applyControlledCompMatr1(q, c[0], t, y); });

		compareFuncs("ccccY", nQb, 
			[&](int* c, int t) { applyMultiControlledPauliY   (q, c, 4, t); },
			[&](int* c, int t) { applyMultiControlledCompMatr1(q, c, 4, t, y); });

		compareFuncs("csY", nQb, 
			[&](int* c, int t) { applyMultiStateControlledPauliY   (q, c, st, 4, t); },
			[&](int* c, int t) { applyMultiStateControlledCompMatr1(q, c, st, 4, t, y); });


		compareFuncs("Z", nQb, 
			[&](int* c, int t) { applyPauliZ(q, t); },
			[&](int* c, int t) { applyDiagMatr1(q, t, z); });

		compareFuncs("cZ", nQb, 
			[&](int* c, int t) { applyControlledPauliZ   (q, c[0], t); },
			[&](int* c, int t) { applyControlledDiagMatr1(q, c[0], t, z); });

		compareFuncs("ccccZ", nQb, 
			[&](int* c, int t) { applyMultiControlledPauliZ   (q, c, 4, t); },
			[&](int* c, int t) { applyMultiControlledDiagMatr1(q, c, 4, t, z); });

		compareFuncs("csZ", nQb, 
			[&](int* c, int t) { applyMultiStateControlledPauliZ   (q, c, st, 4, t); },
			[&](int* c, int t) { applyMultiStateControlledDiagMatr1(q, c, st, 4, t, z); });


		compareFuncs("Rx", nQb, 
			[&](int* c, int t) { applyRotateX(q, t, a); },
			[&](int* c, int t) { applyCompMatr1(q, t, rx); });

		compareFuncs("cRx", nQb, 
			[&](int* c, int t) { applyControlledRotateX(q, c[0], t, a); },
			[&](int* c, int t) { applyControlledCompMatr1(q, c[0], t, rx); });

		compareFuncs("ccccRx", nQb, 
			[&](int* c, int t) { applyMultiControlledRotateX(q, c, 4, t, a); },
			[&](int* c, int t) { applyMultiControlledCompMatr1(q, c, 4, t, rx); });

		compareFuncs("csRx", nQb, 
			[&](int* c, int t) { applyMultiStateControlledRotateX(q, c, st, 4, t, a); },
			[&](int* c, int t) { applyMultiStateControlledCompMatr1(q, c, st, 4, t, rx); });


		compareFuncs("Ry", nQb, 
			[&](int* c, int t) { applyRotateY(q, t, a); },
			[&](int* c, int t) { applyCompMatr1(q, t, ry); });

		compareFuncs("cRy", nQb, 
			[&](int* c, int t) { applyControlledRotateY(q, c[0], t, a); },
			[&](int* c, int t) { applyControlledCompMatr1(q, c[0], t, ry); });

		compareFuncs("ccccRy", nQb, 
			[&](int* c, int t) { applyMultiControlledRotateY(q, c, 4, t, a); },
			[&](int* c, int t) { applyMultiControlledCompMatr1(q, c, 4, t, ry); });

		compareFuncs("csRy", nQb, 
			[&](int* c, int t) { applyMultiStateControlledRotateY(q, c, st, 4, t, a); },
			[&](int* c, int t) { applyMultiStateControlledCompMatr1(q, c, st, 4, t, ry); });


		compareFuncs("Rz", nQb, 
			[&](int* c, int t) { applyRotateZ(q, t, a); },
			[&](int* c, int t) { applyDiagMatr1(q, t, rz); });

		compareFuncs("cRz", nQb, 
			[&](int* c, int t) { applyControlledRotateZ(q, c[0], t, a); },
			[&](int* c, int t) { applyControlledDiagMatr1(q, c[0], t, rz); });

		compareFuncs("ccccRz", nQb, 
			[&](int* c, int t) { applyMultiControlledRotateZ(q, c, 4, t, a); },
			[&](int* c, int t) { applyMultiControlledDiagMatr1(q, c, 4, t, rz); });

		compareFuncs("csRz", nQb, 
			[&](int* c, int t) { applyMultiStateControlledRotateZ(q, c, st, 4, t, a); },
			[&](int* c, int t) { applyMultiStateControlledDiagMatr1(q, c, st, 4, t, rz); });


		destroyQureg(q);
	}
	
	finalizeQuESTEnv();
	return 0;
}

This compares the existing single-qubit Pauli methods (which call the general multi-qubit cases) with this PR's revised methods, which instead use the matrix methods. A broad speedup is obtained, though with concerning variability for large GPU-accelerated registers.

single-CPU

[5 qubits]
X	82217 	 v 32243 	 = 2.54992x
cX	71296 	 v 37541 	 = 1.89915x
ccccX	88574 	 v 41423 	 = 2.13828x
csX	76711 	 v 52796 	 = 1.45297x
Y	55548 	 v 31895 	 = 1.74159x
cY	72492 	 v 37339 	 = 1.94146x
ccccY	88870 	 v 41602 	 = 2.1362x
csY	77445 	 v 46485 	 = 1.66602x
Z	48534 	 v 27174 	 = 1.78605x
cZ	65593 	 v 31974 	 = 2.05145x
ccccZ	83993 	 v 37601 	 = 2.2338x
csZ	71983 	 v 42407 	 = 1.69743x
Rx	56036 	 v 32051 	 = 1.74834x
cRx	72412 	 v 37855 	 = 1.91288x
ccccRx	89034 	 v 41415 	 = 2.1498x
csRx	56008 	 v 30378 	 = 1.8437x
Ry	36632 	 v 21083 	 = 1.73751x
cRy	47740 	 v 24315 	 = 1.9634x
ccccRy	58407 	 v 27107 	 = 2.15468x
csRy	50782 	 v 30289 	 = 1.67658x
Rz	30757 	 v 17759 	 = 1.73191x
cRz	39793 	 v 20744 	 = 1.91829x
ccccRz	52228 	 v 24321 	 = 2.14744x
csRz	45105 	 v 27656 	 = 1.63093x

[10 qubits]
X	306000 	 v 177304 	 = 1.72585x
cX	210699 	 v 127347 	 = 1.65453x
ccccX	133033 	 v 65518 	 = 2.03048x
csX	125323 	 v 48048 	 = 2.60829x
Y	201711 	 v 117080 	 = 1.72285x
cY	140014 	 v 84027 	 = 1.6663x
ccccY	90144 	 v 44912 	 = 2.00713x
csY	79987 	 v 48964 	 = 1.63359x
Z	102385 	 v 81684 	 = 1.25343x
cZ	88758 	 v 59630 	 = 1.48848x
ccccZ	78165 	 v 38060 	 = 2.05373x
csZ	69426 	 v 43235 	 = 1.60578x
Rx	204748 	 v 117896 	 = 1.73668x
cRx	143227 	 v 95307 	 = 1.5028x
ccccRx	87661 	 v 44807 	 = 1.95641x
csRx	77882 	 v 47277 	 = 1.64735x
Ry	202793 	 v 119267 	 = 1.70033x
cRy	144521 	 v 87854 	 = 1.64501x
ccccRy	90560 	 v 44273 	 = 2.04549x
csRy	80527 	 v 48468 	 = 1.66145x
Rz	98402 	 v 82999 	 = 1.18558x
cRz	89493 	 v 58817 	 = 1.52155x
ccccRz	256505 	 v 142097 	 = 1.80514x
csRz	241125 	 v 155814 	 = 1.54752x

[15 qubits]
X	9.93121e+06 	 v 7.88877e+06 	 = 1.25891x
cX	3.95484e+06 	 v 2.64698e+06 	 = 1.49409x
ccccX	653440 	 v 423853 	 = 1.54167x
csX	634701 	 v 425575 	 = 1.4914x
Y	7.67223e+06 	 v 4.5104e+06 	 = 1.70101x
cY	3.97778e+06 	 v 2.64686e+06 	 = 1.50283x
ccccY	666739 	 v 427513 	 = 1.55958x
csY	647943 	 v 435720 	 = 1.48706x
Z	2.95292e+06 	 v 2.94957e+06 	 = 1.00114x
cZ	1.67082e+06 	 v 1.61732e+06 	 = 1.03308x
ccccZ	402899 	 v 342247 	 = 1.17722x
csZ	386510 	 v 348003 	 = 1.11065x
Rx	7.71913e+06 	 v 4.50004e+06 	 = 1.71535x
cRx	3.99806e+06 	 v 2.67269e+06 	 = 1.49589x
ccccRx	656655 	 v 426341 	 = 1.54021x
csRx	630688 	 v 419537 	 = 1.5033x
Ry	7.70095e+06 	 v 4.54251e+06 	 = 1.69531x
cRy	4.00174e+06 	 v 2.63681e+06 	 = 1.51764x
ccccRy	662552 	 v 444994 	 = 1.4889x
csRy	644195 	 v 438820 	 = 1.46802x
Rz	2.97619e+06 	 v 2.92832e+06 	 = 1.01635x
cRz	1.80873e+06 	 v 1.61767e+06 	 = 1.11811x
ccccRz	416281 	 v 341905 	 = 1.21753x
csRz	406896 	 v 362148 	 = 1.12356x

[20 qubits]
X	3.34532e+08 	 v 1.94047e+08 	 = 1.72397x
cX	1.71359e+08 	 v 1.16569e+08 	 = 1.47002x
ccccX	2.45472e+07 	 v 1.68619e+07 	 = 1.45578x
csX	2.40385e+07 	 v 1.6221e+07 	 = 1.48194x
Y	3.34775e+08 	 v 1.96318e+08 	 = 1.70527x
cY	1.70611e+08 	 v 1.15871e+08 	 = 1.47242x
ccccY	2.42514e+07 	 v 1.76931e+07 	 = 1.37067x
csY	2.4223e+07 	 v 1.61117e+07 	 = 1.50344x
Z	1.25616e+08 	 v 1.26318e+08 	 = 0.994443x	!!!
cZ	7.2086e+07 	 v 6.94097e+07 	 = 1.03856x
ccccZ	1.32982e+07 	 v 1.30699e+07 	 = 1.01747x
csZ	1.2875e+07 	 v 1.30011e+07 	 = 0.990305x	!!!
Rx	3.33266e+08 	 v 1.93654e+08 	 = 1.72094x
cRx	1.72855e+08 	 v 1.15024e+08 	 = 1.50277x
ccccRx	2.42583e+07 	 v 1.83716e+07 	 = 1.32042x
csRx	2.40683e+07 	 v 1.65037e+07 	 = 1.45836x
Ry	3.33624e+08 	 v 1.94156e+08 	 = 1.71833x
cRy	1.72933e+08 	 v 1.15091e+08 	 = 1.50257x
ccccRy	2.5468e+07 	 v 1.70615e+07 	 = 1.49272x
csRy	2.41612e+07 	 v 1.67793e+07 	 = 1.43994x
Rz	1.27567e+08 	 v 1.26232e+08 	 = 1.01058x
cRz	7.5643e+07 	 v 7.02014e+07 	 = 1.07751x
ccccRz	1.41532e+07 	 v 1.30695e+07 	 = 1.08291x
csRz	1.39407e+07 	 v 1.34337e+07 	 = 1.03774x

32 CPUs

[5 qubits]
X	778817 	 v 82471 	 = 9.44353x
cX	477839 	 v 92210 	 = 5.18207x
ccccX	173946 	 v 67690 	 = 2.56974x
csX	137931 	 v 73222 	 = 1.88374x
Y	564386 	 v 59407 	 = 9.50033x
cY	341998 	 v 66315 	 = 5.15717x
ccccY	149159 	 v 64935 	 = 2.29705x
csY	109629 	 v 57258 	 = 1.91465x
Z	62076 	 v 44349 	 = 1.39972x
cZ	76457 	 v 49949 	 = 1.5307x
ccccZ	88643 	 v 55662 	 = 1.59252x
csZ	79974 	 v 57809 	 = 1.38342x
Rx	461836 	 v 50576 	 = 9.13152x
cRx	276887 	 v 54416 	 = 5.08834x
ccccRx	115385 	 v 55803 	 = 2.06772x
csRx	106051 	 v 58435 	 = 1.81485x
Ry	436331 	 v 48476 	 = 9.00097x
cRy	268860 	 v 52196 	 = 5.15097x
ccccRy	114438 	 v 53477 	 = 2.13995x
csRy	107694 	 v 59587 	 = 1.80734x
Rz	53770 	 v 44524 	 = 1.20766x
cRz	69796 	 v 50174 	 = 1.39108x
ccccRz	83084 	 v 52829 	 = 1.5727x
csRz	75180 	 v 55792 	 = 1.34751x

[10 qubits]
X	190708 	 v 102918 	 = 1.85301x
cX	215623 	 v 108157 	 = 1.99361x
ccccX	238249 	 v 114876 	 = 2.07397x
csX	225622 	 v 127240 	 = 1.7732x
Y	194373 	 v 106710 	 = 1.82151x
cY	217402 	 v 110642 	 = 1.96491x
ccccY	240399 	 v 119135 	 = 2.01787x
csY	222305 	 v 126785 	 = 1.7534x
Z	118964 	 v 87528 	 = 1.35915x
cZ	149393 	 v 99928 	 = 1.49501x
ccccZ	176403 	 v 105926 	 = 1.66534x
csZ	161321 	 v 115808 	 = 1.393x
Rx	190537 	 v 99077 	 = 1.92312x
cRx	219404 	 v 110183 	 = 1.99127x
ccccRx	242459 	 v 113631 	 = 2.13374x
csRx	221166 	 v 125160 	 = 1.76707x
Ry	191021 	 v 100408 	 = 1.90245x
cRy	217604 	 v 109520 	 = 1.98689x
ccccRy	246910 	 v 115316 	 = 2.14116x
csRy	221627 	 v 119368 	 = 1.85667x
Rz	112088 	 v 86937 	 = 1.2893x
cRz	141537 	 v 98349 	 = 1.43913x
ccccRz	171233 	 v 111165 	 = 1.54035x
csRz	154584 	 v 113389 	 = 1.36331x

[15 qubits]
X	828288 	 v 273873 	 = 3.02435x
cX	612609 	 v 229038 	 = 2.6747x
ccccX	399489 	 v 189980 	 = 2.1028x
csX	373949 	 v 199038 	 = 1.87878x
Y	850689 	 v 271537 	 = 3.13287x
cY	611638 	 v 233049 	 = 2.6245x
ccccY	408067 	 v 184690 	 = 2.20947x
csY	375860 	 v 199042 	 = 1.88835x
Z	255270 	 v 211521 	 = 1.20683x
cZ	281443 	 v 204170 	 = 1.37847x
ccccZ	281763 	 v 674446 	 = 0.41777x	!!!
csZ	255118 	 v 188324 	 = 1.35468x
Rx	831794 	 v 273005 	 = 3.04681x
cRx	612865 	 v 231717 	 = 2.64489x
ccccRx	402504 	 v 185505 	 = 2.16977x
csRx	378158 	 v 201205 	 = 1.87947x
Ry	833425 	 v 272841 	 = 3.05462x
cRy	611231 	 v 226307 	 = 2.70089x
ccccRy	404433 	 v 183172 	 = 2.20794x
csRy	380827 	 v 195267 	 = 1.95029x
Rz	248499 	 v 208619 	 = 1.19116x
cRz	270403 	 v 204627 	 = 1.32144x
ccccRz	272341 	 v 180624 	 = 1.50778x
csRz	245153 	 v 184427 	 = 1.32927x

[20 qubits]
X	2.50185e+07 	 v 6.06671e+06 	 = 4.1239x
cX	1.3099e+07 	 v 3.1873e+06 	 = 4.10975x
ccccX	2.18024e+06 	 v 777583 	 = 2.80386x
csX	2.12523e+06 	 v 778656 	 = 2.72935x
Y	2.52903e+07 	 v 6.05188e+06 	 = 4.17892x
cY	1.31147e+07 	 v 3.2649e+06 	 = 4.01689x
ccccY	2.26223e+06 	 v 773032 	 = 2.92644x
csY	2.16547e+06 	 v 781117 	 = 2.77228x
Z	3.92266e+06 	 v 3.85295e+06 	 = 1.01809x
cZ	2.82263e+06 	 v 2.70185e+06 	 = 1.0447x
ccccZ	985449 	 v 841348 	 = 1.17127x
csZ	959226 	 v 859455 	 = 1.11609x
Rx	2.55459e+07 	 v 6.17982e+06 	 = 4.13376x
cRx	1.38171e+07 	 v 3.22987e+06 	 = 4.27791x
ccccRx	2.24468e+06 	 v 780949 	 = 2.8743x
csRx	2.1924e+06 	 v 801501 	 = 2.73537x
Ry	2.58387e+07 	 v 6.24091e+06 	 = 4.14021x
cRy	1.33959e+07 	 v 3.31161e+06 	 = 4.04512x
ccccRy	2.20554e+06 	 v 782629 	 = 2.81812x
csRy	2.24645e+06 	 v 804539 	 = 2.79222x
Rz	4.29274e+06 	 v 3.9928e+06 	 = 1.07512x
cRz	3.03124e+06 	 v 2.80248e+06 	 = 1.08163x
ccccRz	968904 	 v 840534 	 = 1.15272x
csRz	933105 	 v 872948 	 = 1.06891x

[25 qubits]
X	1.07423e+09 	 v 4.19309e+08 	 = 2.56191x
cX	5.43031e+08 	 v 2.77425e+08 	 = 1.9574x
ccccX	8.92313e+07 	 v 4.37506e+07 	 = 2.03955x
csX	7.96252e+07 	 v 5.53844e+07 	 = 1.43768x
Y	1.09878e+09 	 v 4.16923e+08 	 = 2.63546x
cY	5.87509e+08 	 v 2.54655e+08 	 = 2.30708x
ccccY	9.6926e+07 	 v 4.59236e+07 	 = 2.11059x
csY	8.61867e+07 	 v 4.70605e+07 	 = 1.8314x
Z	4.19731e+08 	 v 4.23077e+08 	 = 0.99209x	!!!
cZ	2.66697e+08 	 v 2.64383e+08 	 = 1.00875x
ccccZ	4.70686e+07 	 v 4.54908e+07 	 = 1.03468x
csZ	5.37393e+07 	 v 5.69023e+07 	 = 0.944414x	!!!
Rx	1.12932e+09 	 v 4.25497e+08 	 = 2.65413x
cRx	5.82822e+08 	 v 2.46578e+08 	 = 2.36364x
ccccRx	8.67564e+07 	 v 5.06659e+07 	 = 1.71232x
csRx	8.02669e+07 	 v 5.6126e+07 	 = 1.43012x

GPU (quadro P6000)

[5 qubits]
X	127262 	 v 78330 	 = 1.62469x
cX	134250 	 v 80673 	 = 1.66413x
ccccX	143551 	 v 84631 	 = 1.6962x
csX	137808 	 v 86904 	 = 1.58575x
Y	136863 	 v 104548 	 = 1.30909x
cY	180093 	 v 99744 	 = 1.80555x
ccccY	144098 	 v 83433 	 = 1.72711x
csY	138435 	 v 86616 	 = 1.59826x
Z	45635 	 v 35086 	 = 1.30066x
cZ	101433 	 v 74955 	 = 1.35325x
ccccZ	99704 	 v 78681 	 = 1.26719x
csZ	96012 	 v 81216 	 = 1.18218x
Rx	124444 	 v 76276 	 = 1.6315x
cRx	133684 	 v 80315 	 = 1.6645x
ccccRx	142598 	 v 80764 	 = 1.76561x
csRx	136471 	 v 84254 	 = 1.61976x
Ry	126092 	 v 80674 	 = 1.56298x
cRy	133144 	 v 80496 	 = 1.65404x
ccccRy	142414 	 v 81419 	 = 1.74915x
csRy	136412 	 v 84653 	 = 1.61143x
Rz	43282 	 v 35175 	 = 1.23048x
cRz	90564 	 v 98433 	 = 0.920057x	!!!
ccccRz	108743 	 v 88563 	 = 1.22786x
csRz	95174 	 v 80114 	 = 1.18798x

[10 qubits]
X	257701 	 v 156296 	 = 1.6488x
cX	279553 	 v 162434 	 = 1.72103x
ccccX	306398 	 v 168179 	 = 1.82186x
csX	279780 	 v 172006 	 = 1.62657x
Y	258881 	 v 158543 	 = 1.63288x
cY	278238 	 v 164016 	 = 1.69641x
ccccY	322552 	 v 176166 	 = 1.83095x
csY	279393 	 v 172696 	 = 1.61783x
Z	92387 	 v 71354 	 = 1.29477x
cZ	186623 	 v 154165 	 = 1.21054x
ccccZ	202821 	 v 160658 	 = 1.26244x
csZ	206571 	 v 162915 	 = 1.26797x
Rx	260376 	 v 158138 	 = 1.64651x
cRx	275326 	 v 163491 	 = 1.68404x
ccccRx	289164 	 v 164227 	 = 1.76076x
csRx	290217 	 v 188704 	 = 1.53795x
Ry	256763 	 v 158608 	 = 1.61885x
cRy	276156 	 v 166420 	 = 1.65939x
ccccRy	293149 	 v 174146 	 = 1.68335x
csRy	277966 	 v 172113 	 = 1.61502x
Rz	89436 	 v 72203 	 = 1.23867x
cRz	186556 	 v 158600 	 = 1.17627x
ccccRz	204410 	 v 161050 	 = 1.26923x
csRz	194491 	 v 162061 	 = 1.20011x

[15 qubits]
X	485163 	 v 279406 	 = 1.73641x
cX	434817 	 v 265368 	 = 1.63854x
ccccX	440588 	 v 259850 	 = 1.69555x
csX	428459 	 v 259852 	 = 1.64886x
Y	451818 	 v 284347 	 = 1.58897x
cY	457870 	 v 265654 	 = 1.72356x
ccccY	435341 	 v 252816 	 = 1.72197x
csY	418964 	 v 261687 	 = 1.60101x
Z	174149 	 v 145153 	 = 1.19976x
cZ	299227 	 v 248504 	 = 1.20411x
ccccZ	305775 	 v 250666 	 = 1.21985x
csZ	316080 	 v 244664 	 = 1.29189x
Rx	451580 	 v 279527 	 = 1.61551x
cRx	444562 	 v 264267 	 = 1.68225x
ccccRx	450206 	 v 254846 	 = 1.76658x
csRx	422523 	 v 274364 	 = 1.54001x
Ry	462622 	 v 277522 	 = 1.66697x
cRy	444297 	 v 267123 	 = 1.66327x
ccccRy	439902 	 v 263775 	 = 1.66772x
csRy	424580 	 v 263644 	 = 1.61043x
Rz	190410 	 v 143969 	 = 1.32258x
cRz	305225 	 v 244760 	 = 1.24704x
ccccRz	302121 	 v 240209 	 = 1.25774x
csRz	291973 	 v 244381 	 = 1.19475x

[20 qubits]
X	5.33184e+06 	 v 4.96344e+06 	 = 1.07422x
cX	4.06991e+06 	 v 3.79922e+06 	 = 1.07125x
ccccX	2.83841e+06 	 v 2.61281e+06 	 = 1.08634x
csX	2.81285e+06 	 v 2.60715e+06 	 = 1.0789x
Y	5.30536e+06 	 v 4.93426e+06 	 = 1.07521x
cY	4.01158e+06 	 v 3.67336e+06 	 = 1.09207x
ccccY	2.83273e+06 	 v 2.52823e+06 	 = 1.12044x
csY	2.80288e+06 	 v 2.5194e+06 	 = 1.11252x
Z	1.49366e+07 	 v 1.51361e+07 	 = 0.986825x	!!!
cZ	3.76625e+06 	 v 3.66229e+06 	 = 1.02839x
ccccZ	2.59809e+06 	 v 2.50872e+06 	 = 1.03562x
csZ	2.51872e+06 	 v 2.50648e+06 	 = 1.00488x
Rx	5.11532e+06 	 v 4.91058e+06 	 = 1.04169x
cRx	4.00272e+06 	 v 3.69052e+06 	 = 1.08459x
ccccRx	2.83963e+06 	 v 2.55193e+06 	 = 1.11274x
csRx	2.77143e+06 	 v 2.57119e+06 	 = 1.07788x
Ry	5.10099e+06 	 v 4.8484e+06 	 = 1.0521x
cRy	4.09316e+06 	 v 4.02845e+06 	 = 1.01606x
ccccRy	2.86225e+06 	 v 2.51322e+06 	 = 1.13888x
csRy	2.78158e+06 	 v 2.52479e+06 	 = 1.10171x
Rz	2.90888e+07 	 v 1.51443e+07 	 = 1.92077x
cRz	4.8668e+06 	 v 3.6004e+06 	 = 1.35174x
ccccRz	2.76661e+06 	 v 2.48557e+06 	 = 1.11307x
csRz	2.70893e+06 	 v 2.51405e+06 	 = 1.07752x

[25 qubits]
X	1.06511e+08 	 v 1.05997e+08 	 = 1.00485x
cX	5.6053e+07 	 v 5.48065e+07 	 = 1.02274x
ccccX	1.08632e+07 	 v 1.19012e+07 	 = 0.912776x	!!!
csX	1.23385e+07 	 v 1.15795e+07 	 = 1.06555x
Y	1.0649e+08 	 v 1.06148e+08 	 = 1.00323x
cY	5.59062e+07 	 v 5.84214e+07 	 = 0.956948x	!!!
ccccY	1.31134e+07 	 v 1.1982e+07 	 = 1.09443x
csY	1.18938e+07 	 v 1.20573e+07 	 = 0.986446x	!!!
Z	6.155e+08 	 v 6.15768e+08 	 = 0.999565x	!!!
cZ	5.70896e+07 	 v 5.4174e+07 	 = 1.05382x
ccccZ	1.05192e+07 	 v 1.21523e+07 	 = 0.865612x	!!!
csZ	1.13703e+07 	 v 1.03756e+07 	 = 1.09586x
Rx	1.07163e+08 	 v 1.06049e+08 	 = 1.0105x
cRx	5.97407e+07 	 v 6.94496e+07 	 = 0.860202x	!!!
ccccRx	1.19799e+07 	 v 1.2063e+07 	 = 0.993109x	!!!
csRx	1.24817e+07 	 v 1.21249e+07 	 = 1.02943x
Ry	1.0709e+08 	 v 1.05893e+08 	 = 1.0113x
cRy	5.87204e+07 	 v 6.19812e+07 	 = 0.94739x	!!!
ccccRy	1.26257e+07 	 v 1.09413e+07 	 = 1.15394x
csRy	1.24306e+07 	 v 1.08591e+07 	 = 1.14471x
Rz	1.16706e+09 	 v 6.17107e+08 	 = 1.89118x
cRz	9.99975e+07 	 v 5.64961e+07 	 = 1.76999x
ccccRz	1.84772e+07 	 v 1.11707e+07 	 = 1.65407x
csRz	1.66738e+07 	 v 1.17161e+07 	 = 1.42316x

32 GPUs (virtually, between 2 Quadro P6000s)

[10 qubits]
X	2.04061e+07 	 v 1.64577e+07 	 = 1.23991x
cX	3.09922e+07 	 v 2.82664e+07 	 = 1.09643x
ccccX	3.19429e+07 	 v 2.82189e+07 	 = 1.13197x
csX	3.25057e+07 	 v 2.80653e+07 	 = 1.15822x
Y	2.23115e+07 	 v 1.80718e+07 	 = 1.2346x
cY	3.24795e+07 	 v 2.82072e+07 	 = 1.15146x
ccccY	3.2263e+07 	 v 2.83922e+07 	 = 1.13633x
csY	3.2898e+07 	 v 2.83253e+07 	 = 1.16144x
Z	7.55335e+06 	 v 7.17929e+06 	 = 1.0521x
cZ	2.0635e+07 	 v 2.05451e+07 	 = 1.00437x
ccccZ	2.04836e+07 	 v 2.06033e+07 	 = 0.994189x	!!!
csZ	2.04547e+07 	 v 2.06782e+07 	 = 0.989193x	!!!
Rx	2.22415e+07 	 v 1.78034e+07 	 = 1.24928x
cRx	3.27874e+07 	 v 2.82495e+07 	 = 1.16063x
ccccRx	3.24114e+07 	 v 2.82985e+07 	 = 1.14534x
csRx	3.23017e+07 	 v 2.81188e+07 	 = 1.14876x
Ry	2.20273e+07 	 v 1.77788e+07 	 = 1.23896x
cRy	3.24615e+07 	 v 2.81624e+07 	 = 1.15265x
ccccRy	3.23339e+07 	 v 2.81416e+07 	 = 1.14897x
csRy	3.18251e+07 	 v 2.81125e+07 	 = 1.13206x
Rz	7.11217e+06 	 v 7.20293e+06 	 = 0.9874x	!!!
cRz	2.04878e+07 	 v 2.06331e+07 	 = 0.992958x	!!!
ccccRz	2.05997e+07 	 v 2.06231e+07 	 = 0.998864x	!!!
csRz	2.0685e+07 	 v 2.06509e+07 	 = 1.00165x

[15 qubits]
X	3.57294e+07 	 v 2.79642e+07 	 = 1.27768x
cX	4.66645e+07 	 v 3.69891e+07 	 = 1.26158x
ccccX	4.53401e+07 	 v 3.80349e+07 	 = 1.19206x
csX	4.52307e+07 	 v 3.79481e+07 	 = 1.19191x
Y	3.66851e+07 	 v 2.75796e+07 	 = 1.33015x
cY	4.6457e+07 	 v 3.68357e+07 	 = 1.26119x
ccccY	4.5422e+07 	 v 3.75023e+07 	 = 1.21118x
csY	4.54961e+07 	 v 3.86128e+07 	 = 1.17827x
Z	1.11986e+07 	 v 1.14325e+07 	 = 0.979534x	!!!
cZ	2.9482e+07 	 v 2.90256e+07 	 = 1.01573x
ccccZ	2.91512e+07 	 v 2.87033e+07 	 = 1.01561x
csZ	2.9072e+07 	 v 2.89217e+07 	 = 1.0052x
Rx	3.61364e+07 	 v 2.85598e+07 	 = 1.26529x
cRx	4.61002e+07 	 v 3.77028e+07 	 = 1.22273x
ccccRx	4.52664e+07 	 v 3.76641e+07 	 = 1.20185x
csRx	4.52998e+07 	 v 3.66626e+07 	 = 1.23559x
Ry	3.57088e+07 	 v 2.80061e+07 	 = 1.27503x
cRy	4.64397e+07 	 v 3.7358e+07 	 = 1.2431x
ccccRy	4.66959e+07 	 v 3.76348e+07 	 = 1.24077x
csRy	4.52413e+07 	 v 3.72691e+07 	 = 1.21391x
Rz	1.13422e+07 	 v 1.14345e+07 	 = 0.991929x	!!!
cRz	2.9419e+07 	 v 2.91742e+07 	 = 1.00839x
ccccRz	2.91423e+07 	 v 2.89993e+07 	 = 1.00493x
csRz	2.909e+07 	 v 2.89722e+07 	 = 1.00407x

[20 qubits]
X	5.57519e+07 	 v 4.3445e+07 	 = 1.28328x
cX	6.21739e+07 	 v 4.88623e+07 	 = 1.27243x
ccccX	6.09173e+07 	 v 4.91375e+07 	 = 1.23973x
csX	6.01073e+07 	 v 4.71605e+07 	 = 1.27453x
Y	5.46344e+07 	 v 3.91482e+07 	 = 1.39558x
cY	6.21197e+07 	 v 5.13948e+07 	 = 1.20868x
ccccY	6.1823e+07 	 v 4.91906e+07 	 = 1.25681x
csY	6.11636e+07 	 v 4.87216e+07 	 = 1.25537x
Z	1.85578e+07 	 v 1.91361e+07 	 = 0.96978x	!!!
cZ	4.18737e+07 	 v 4.15744e+07 	 = 1.0072x
ccccZ	4.05578e+07 	 v 4.06543e+07 	 = 0.997627x	!!!
csZ	4.08741e+07 	 v 4.06185e+07 	 = 1.00629x
Rx	5.71617e+07 	 v 4.29406e+07 	 = 1.33118x
cRx	6.24139e+07 	 v 5.10187e+07 	 = 1.22335x
ccccRx	6.22371e+07 	 v 4.97247e+07 	 = 1.25163x
csRx	6.16718e+07 	 v 5.00981e+07 	 = 1.23102x
Ry	5.58076e+07 	 v 4.26105e+07 	 = 1.30972x
cRy	6.27553e+07 	 v 5.12966e+07 	 = 1.22338x
ccccRy	6.27906e+07 	 v 5.01989e+07 	 = 1.25084x
csRy	6.24652e+07 	 v 5.03169e+07 	 = 1.24144x
Rz	2.36678e+07 	 v 1.90869e+07 	 = 1.24x
cRz	4.3754e+07 	 v 4.30007e+07 	 = 1.01752x
ccccRz	4.17512e+07 	 v 4.0828e+07 	 = 1.02261x
csRz	4.06592e+07 	 v 4.00449e+07 	 = 1.01534x

[25 qubits]
X	4.89434e+08 	 v 5.29224e+08 	 = 0.924815x	!!!
cX	3.19571e+08 	 v 3.28867e+08 	 = 0.971732x	!!!
ccccX	2.56628e+08 	 v 2.53567e+08 	 = 1.01207x
csX	2.6845e+08 	 v 2.32314e+08 	 = 1.15555x
Y	5.85416e+08 	 v 5.17617e+08 	 = 1.13098x
cY	3.60071e+08 	 v 3.05764e+08 	 = 1.17761x
ccccY	2.91327e+08 	 v 2.70247e+08 	 = 1.078x
csY	2.42112e+08 	 v 3.13327e+08 	 = 0.772713x	!!!
Z	3.23016e+08 	 v 3.24144e+08 	 = 0.99652x	!!!
cZ	2.19808e+08 	 v 2.25109e+08 	 = 0.976452x	!!!
ccccZ	2.41425e+08 	 v 2.17185e+08 	 = 1.11161x
csZ	2.09947e+08 	 v 2.33613e+08 	 = 0.898697x	!!!
Rx	4.91484e+08 	 v 7.23104e+08 	 = 0.679686x	!!!
cRx	3.56097e+08 	 v 3.26883e+08 	 = 1.08937x
ccccRx	3.08156e+08 	 v 2.76049e+08 	 = 1.11631x
csRx	2.68361e+08 	 v 2.72283e+08 	 = 0.985597x	!!!
Ry	5.48748e+08 	 v 5.67967e+08 	 = 0.966161x	!!!
cRy	3.49174e+08 	 v 3.33523e+08 	 = 1.04693x
ccccRy	2.47762e+08 	 v 2.90074e+08 	 = 0.854133x	!!!
csRy	2.35375e+08 	 v 2.72557e+08 	 = 0.863581x	!!!
Rz	6.10789e+08 	 v 3.24777e+08 	 = 1.88064x
cRz	1.81605e+08 	 v 1.77483e+08 	 = 1.02323x
ccccRz	2.16914e+08 	 v 1.91077e+08 	 = 1.13522x
csRz	2.26712e+08 	 v 2.55971e+08 	 = 0.885693x	!!!

[28 qubits]
X	4.0393e+09 	 v 3.37877e+09 	 = 1.19549x
cX	2.23921e+09 	 v 2.16187e+09 	 = 1.03577x
ccccX	4.9235e+08 	 v 6.27852e+08 	 = 0.784181x	!!!
csX	5.71199e+08 	 v 6.13484e+08 	 = 0.931073x	!!!
Y	3.70274e+09 	 v 3.74534e+09 	 = 0.988626x	!!!
cY	2.26904e+09 	 v 1.94532e+09 	 = 1.16641x
ccccY	6.04228e+08 	 v 6.11656e+08 	 = 0.987856x	!!!
csY	6.37266e+08 	 v 6.15024e+08 	 = 1.03616x
Z	2.89953e+09 	 v 2.90083e+09 	 = 0.999551x	!!!
cZ	7.29146e+08 	 v 7.5237e+08 	 = 0.969132x	!!!
ccccZ	3.86579e+08 	 v 3.89703e+08 	 = 0.991982x	!!!
csZ	3.99327e+08 	 v 3.79792e+08 	 = 1.05144x
Rx	3.65741e+09 	 v 3.54928e+09 	 = 1.03046x
cRx	2.31143e+09 	 v 2.09251e+09 	 = 1.10462x
ccccRx	5.73667e+08 	 v 6.11618e+08 	 = 0.937949x	!!!
csRx	5.40743e+08 	 v 5.59269e+08 	 = 0.966874x	!!!
Ry	4.64743e+09 	 v 4.29431e+09 	 = 1.08223x
cRy	2.19407e+09 	 v 2.62839e+09 	 = 0.834758x	!!!
ccccRy	6.07458e+08 	 v 5.41164e+08 	 = 1.1225x
csRy	5.42774e+08 	 v 5.83859e+08 	 = 0.929632x	!!!
Rz	5.615e+09 	 v 2.90069e+09 	 = 1.93575x
cRz	1.20457e+09 	 v 7.35107e+08 	 = 1.63864x
ccccRz	5.20607e+08 	 v 3.93477e+08 	 = 1.32309x
csRz	5.0443e+08 	 v 3.54026e+08 	 = 1.42484x

@TysonRayJones
Copy link
Member Author

The often-significant speedups outweigh the relatively few and insignificant slowdowns, so this PR is accepted. Note however it may not necessarily fully resolve the single-core v4 performance regression, and further optimisations (including forcing compile-time loop unrolling by templating the bitwise functions) may be necessary.

@TysonRayJones TysonRayJones merged commit 851691d into devel Sep 4, 2025
130 checks passed
@TysonRayJones TysonRayJones deleted the patch-pauli-performance-ppppregression branch September 4, 2025 19:08
@TysonRayJones TysonRayJones mentioned this pull request Oct 13, 2025
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant