-
Notifications
You must be signed in to change notification settings - Fork 163
redirect one-qubit Pauli to CompMatr1 #682
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
As per #638. Before merging, the performance impact of these changes must be measured in all settings.
Tested performance of the devel branch (not with this PR's changes) via: #include "quest.h"
#include <chrono>
#include <vector>
#include <random>
#include <iostream>
#include <algorithm>
using std::vector;
static std::mt19937 RNG;
/**
 * Builds a list of control-qubit indices for a single benchmark call.
 *
 * The candidates are every index in [0, maxCtrlIndExcl) except excludeTarget,
 * in ascending order. They are shuffled with the file-level RNG only when the
 * QuEST environment reports that it is not distributed.
 *
 * @param numCtrls        number of controls to return; clamped to
 *                        [0, number of candidates] so that an out-of-range
 *                        request can never index past the candidate list
 * @param maxCtrlIndExcl  exclusive upper bound of the candidate indices
 * @param excludeTarget   index omitted from the candidates (the target qubit)
 * @return the first numCtrls candidates (after the optional shuffle)
 */
vector<int> getRandomCtrls(int numCtrls, int maxCtrlIndExcl, int excludeTarget) {

    vector<int> ctrls;
    ctrls.reserve(static_cast<vector<int>::size_type>(std::max(maxCtrlIndExcl, 0)));

    for (int i=0; i<maxCtrlIndExcl; i++)
        if (i != excludeTarget)
            ctrls.push_back(i);

    // shuffle controls only when non-distributed (lazy)
    // NOTE(review): presumably skipped when distributed so that every process
    // passes the same control list - confirm
    if (!getQuESTEnv().isDistributed)
        std::shuffle(ctrls.begin(), ctrls.end(), RNG);

    // never keep more controls than exist (nor a negative number); the
    // unclamped 'begin() + numCtrls' was undefined behaviour in those cases
    const int numAvailable = static_cast<int>(ctrls.size());
    const int numKept = std::max(0, std::min(numCtrls, numAvailable));

    ctrls.resize(static_cast<vector<int>::size_type>(numKept));
    return ctrls;
}
/**
 * Measures how long a gate-applying callable takes, summed over every target.
 *
 * For each target t in [0, nQb), a control list of nQb-1 qubits (all qubits
 * other than t) is generated, func is invoked five times as a warmup, and
 * then one further invocation is timed.
 *
 * @tparam T    callable invoked as func(int* ctrls, int target)
 * @param nQb   number of qubits; also the number of timed invocations
 * @param func  the operation to benchmark
 * @return the sum of the timed durations, in nanoseconds
 */
template <typename T>
double getFuncDuration(int nQb, T func) {

    // a monotonic clock, so wall-clock adjustments cannot distort a measurement
    using Clock = std::chrono::steady_clock;

    double out = 0;

    for (int t=0; t<nQb; t++) {
        auto ctrls = getRandomCtrls(nQb-1, nQb, t);

        // warmup
        for (int r=0; r<5; r++)
            func(ctrls.data(), t);

        // make sure the warmup has finished before starting the timer, so that
        // none of its outstanding work is attributed to the timed call
        // NOTE(review): assumes syncQuESTEnv() blocks until all pending work
        // is complete - confirm against the QuEST documentation
        syncQuESTEnv();

        auto start = Clock::now();
        func(ctrls.data(), t);
        syncQuESTEnv();
        auto end = Clock::now();

        // convert explicitly to nanoseconds instead of relying on the clock's
        // implementation-defined tick period
        out += std::chrono::duration<double, std::nano>(end - start).count();
    }

    return out;
}
template <typename T1, typename T2>
void compareFuncs(std::string label, int nQb, T1 func1, T2 func2) {
std::cout << label << "\t";
double oldDur = getFuncDuration(nQb, func1);
double newDur = getFuncDuration(nQb, func2);
qreal speedup = oldDur/newDur;
std::cout
<< oldDur << " \t v " << newDur << " \t = " << speedup << "x"
<< ((speedup < 1)? "\t!!! " : "") << std::endl;
}
/**
 * Benchmark driver.
 *
 * For a series of register sizes, compares each dedicated Pauli / rotation
 * function (uncontrolled, singly-controlled, four-fold controlled and
 * four-fold state-controlled) against the equivalent call which passes an
 * explicit one-qubit matrix, printing one row per comparison.
 *
 * Every controlled benchmark reads up to four entries of the control list,
 * which holds nQb-1 qubits, so each register size must be at least 5.
 */
int main() {
    initQuESTEnv();
    setValidationEpsilon(0);

    // suppress non-root output
    if (getQuESTEnv().rank != 0)
        std::cout.setstate(std::ios_base::failbit);

    // prepare matrix alternatives to Pauli functions
    CompMatr1 x = getCompMatr1({{0,1}, {1,0}});
    CompMatr1 y = getCompMatr1({{0,-1i}, {1i,0}});
    DiagMatr1 z = getDiagMatr1({1, -1});

    // prepare matrix alternatives to the rotation functions, all by angle a;
    // c and s are cos(-a/2) and sin(-a/2), and v is exp(-i a/2)
    qreal a = 0.123;
    qreal c = std::cos(-a/2);
    qreal s = std::sin(-a/2);
    qcomp v = std::exp(qcomp(0, -a/2));
    CompMatr1 rx = getCompMatr1({
        {qcomp(c,0), qcomp(0,s)},
        {qcomp(0,s), qcomp(c,0)}
    });
    CompMatr1 ry = getCompMatr1({
        { c, s},
        {-s, c}
    });
    DiagMatr1 rz = getDiagMatr1(
        {v, std::conj(v)}
    );

    // prepare random-ctrl-list RNG
    std::random_device entropy;
    RNG.seed(entropy());

    for (int nQb : {5, 10, 15, 20, 25, 30}) {
        std::cout << "\n[" << nQb << " qubits]" << std::endl;

        // beware; will use all available accelerations
        Qureg q = createForcedQureg(nQb);

        // control states passed to the state-controlled functions
        int st[] = {0,1,0,1};

        compareFuncs("X", nQb,
            [&](int* c, int t) { applyPauliX(q, t); },
            [&](int* c, int t) { applyCompMatr1(q, t, x); });
        compareFuncs("cX", nQb,
            [&](int* c, int t) { applyControlledPauliX (q, c[0], t); },
            [&](int* c, int t) { applyControlledCompMatr1(q, c[0], t, x); });
        compareFuncs("ccccX", nQb,
            [&](int* c, int t) { applyMultiControlledPauliX (q, c, 4, t); },
            [&](int* c, int t) { applyMultiControlledCompMatr1(q, c, 4, t, x); });
        compareFuncs("csX", nQb,
            [&](int* c, int t) { applyMultiStateControlledPauliX (q, c, st, 4, t); },
            [&](int* c, int t) { applyMultiStateControlledCompMatr1(q, c, st, 4, t, x); });

        compareFuncs("Y", nQb,
            [&](int* c, int t) { applyPauliY(q, t); },
            [&](int* c, int t) { applyCompMatr1(q, t, y); });
        compareFuncs("cY", nQb,
            [&](int* c, int t) { applyControlledPauliY (q, c[0], t); },
            [&](int* c, int t) { applyControlledCompMatr1(q, c[0], t, y); });
        compareFuncs("ccccY", nQb,
            [&](int* c, int t) { applyMultiControlledPauliY (q, c, 4, t); },
            [&](int* c, int t) { applyMultiControlledCompMatr1(q, c, 4, t, y); });
        compareFuncs("csY", nQb,
            [&](int* c, int t) { applyMultiStateControlledPauliY (q, c, st, 4, t); },
            [&](int* c, int t) { applyMultiStateControlledCompMatr1(q, c, st, 4, t, y); });

        compareFuncs("Z", nQb,
            [&](int* c, int t) { applyPauliZ(q, t); },
            [&](int* c, int t) { applyDiagMatr1(q, t, z); });
        compareFuncs("cZ", nQb,
            [&](int* c, int t) { applyControlledPauliZ (q, c[0], t); },
            [&](int* c, int t) { applyControlledDiagMatr1(q, c[0], t, z); });
        compareFuncs("ccccZ", nQb,
            [&](int* c, int t) { applyMultiControlledPauliZ (q, c, 4, t); },
            [&](int* c, int t) { applyMultiControlledDiagMatr1(q, c, 4, t, z); });
        compareFuncs("csZ", nQb,
            [&](int* c, int t) { applyMultiStateControlledPauliZ (q, c, st, 4, t); },
            [&](int* c, int t) { applyMultiStateControlledDiagMatr1(q, c, st, 4, t, z); });

        compareFuncs("Rx", nQb,
            [&](int* c, int t) { applyRotateX(q, t, a); },
            [&](int* c, int t) { applyCompMatr1(q, t, rx); });
        compareFuncs("cRx", nQb,
            [&](int* c, int t) { applyControlledRotateX(q, c[0], t, a); },
            [&](int* c, int t) { applyControlledCompMatr1(q, c[0], t, rx); });
        compareFuncs("ccccRx", nQb,
            [&](int* c, int t) { applyMultiControlledRotateX(q, c, 4, t, a); },
            [&](int* c, int t) { applyMultiControlledCompMatr1(q, c, 4, t, rx); });
        compareFuncs("csRx", nQb,
            [&](int* c, int t) { applyMultiStateControlledRotateX(q, c, st, 4, t, a); },
            [&](int* c, int t) { applyMultiStateControlledCompMatr1(q, c, st, 4, t, rx); });

        compareFuncs("Ry", nQb,
            [&](int* c, int t) { applyRotateY(q, t, a); },
            [&](int* c, int t) { applyCompMatr1(q, t, ry); });
        compareFuncs("cRy", nQb,
            [&](int* c, int t) { applyControlledRotateY(q, c[0], t, a); },
            [&](int* c, int t) { applyControlledCompMatr1(q, c[0], t, ry); });
        compareFuncs("ccccRy", nQb,
            [&](int* c, int t) { applyMultiControlledRotateY(q, c, 4, t, a); },
            [&](int* c, int t) { applyMultiControlledCompMatr1(q, c, 4, t, ry); });
        compareFuncs("csRy", nQb,
            [&](int* c, int t) { applyMultiStateControlledRotateY(q, c, st, 4, t, a); },
            [&](int* c, int t) { applyMultiStateControlledCompMatr1(q, c, st, 4, t, ry); });

        compareFuncs("Rz", nQb,
            [&](int* c, int t) { applyRotateZ(q, t, a); },
            [&](int* c, int t) { applyDiagMatr1(q, t, rz); });
        compareFuncs("cRz", nQb,
            [&](int* c, int t) { applyControlledRotateZ(q, c[0], t, a); },
            [&](int* c, int t) { applyControlledDiagMatr1(q, c[0], t, rz); });
        compareFuncs("ccccRz", nQb,
            [&](int* c, int t) { applyMultiControlledRotateZ(q, c, 4, t, a); },
            [&](int* c, int t) { applyMultiControlledDiagMatr1(q, c, 4, t, rz); });
        compareFuncs("csRz", nQb,
            [&](int* c, int t) { applyMultiStateControlledRotateZ(q, c, st, 4, t, a); },
            [&](int* c, int t) { applyMultiStateControlledDiagMatr1(q, c, st, 4, t, rz); });

        destroyQureg(q);
    }

    finalizeQuESTEnv();
    return 0;
} This compares the existing single-qubit Pauli methods (which call the general multi-qubit cases) with this PR's revised methods, which instead use the matrix methods. Broad speedup is obtained, though with concerning variability for large GPU-accelerated register. single-CPU[5 qubits]
X 82217 v 32243 = 2.54992x
cX 71296 v 37541 = 1.89915x
ccccX 88574 v 41423 = 2.13828x
csX 76711 v 52796 = 1.45297x
Y 55548 v 31895 = 1.74159x
cY 72492 v 37339 = 1.94146x
ccccY 88870 v 41602 = 2.1362x
csY 77445 v 46485 = 1.66602x
Z 48534 v 27174 = 1.78605x
cZ 65593 v 31974 = 2.05145x
ccccZ 83993 v 37601 = 2.2338x
csZ 71983 v 42407 = 1.69743x
Rx 56036 v 32051 = 1.74834x
cRx 72412 v 37855 = 1.91288x
ccccRx 89034 v 41415 = 2.1498x
csRx 56008 v 30378 = 1.8437x
Ry 36632 v 21083 = 1.73751x
cRy 47740 v 24315 = 1.9634x
ccccRy 58407 v 27107 = 2.15468x
csRy 50782 v 30289 = 1.67658x
Rz 30757 v 17759 = 1.73191x
cRz 39793 v 20744 = 1.91829x
ccccRz 52228 v 24321 = 2.14744x
csRz 45105 v 27656 = 1.63093x
[10 qubits]
X 306000 v 177304 = 1.72585x
cX 210699 v 127347 = 1.65453x
ccccX 133033 v 65518 = 2.03048x
csX 125323 v 48048 = 2.60829x
Y 201711 v 117080 = 1.72285x
cY 140014 v 84027 = 1.6663x
ccccY 90144 v 44912 = 2.00713x
csY 79987 v 48964 = 1.63359x
Z 102385 v 81684 = 1.25343x
cZ 88758 v 59630 = 1.48848x
ccccZ 78165 v 38060 = 2.05373x
csZ 69426 v 43235 = 1.60578x
Rx 204748 v 117896 = 1.73668x
cRx 143227 v 95307 = 1.5028x
ccccRx 87661 v 44807 = 1.95641x
csRx 77882 v 47277 = 1.64735x
Ry 202793 v 119267 = 1.70033x
cRy 144521 v 87854 = 1.64501x
ccccRy 90560 v 44273 = 2.04549x
csRy 80527 v 48468 = 1.66145x
Rz 98402 v 82999 = 1.18558x
cRz 89493 v 58817 = 1.52155x
ccccRz 256505 v 142097 = 1.80514x
csRz 241125 v 155814 = 1.54752x
[15 qubits]
X 9.93121e+06 v 7.88877e+06 = 1.25891x
cX 3.95484e+06 v 2.64698e+06 = 1.49409x
ccccX 653440 v 423853 = 1.54167x
csX 634701 v 425575 = 1.4914x
Y 7.67223e+06 v 4.5104e+06 = 1.70101x
cY 3.97778e+06 v 2.64686e+06 = 1.50283x
ccccY 666739 v 427513 = 1.55958x
csY 647943 v 435720 = 1.48706x
Z 2.95292e+06 v 2.94957e+06 = 1.00114x
cZ 1.67082e+06 v 1.61732e+06 = 1.03308x
ccccZ 402899 v 342247 = 1.17722x
csZ 386510 v 348003 = 1.11065x
Rx 7.71913e+06 v 4.50004e+06 = 1.71535x
cRx 3.99806e+06 v 2.67269e+06 = 1.49589x
ccccRx 656655 v 426341 = 1.54021x
csRx 630688 v 419537 = 1.5033x
Ry 7.70095e+06 v 4.54251e+06 = 1.69531x
cRy 4.00174e+06 v 2.63681e+06 = 1.51764x
ccccRy 662552 v 444994 = 1.4889x
csRy 644195 v 438820 = 1.46802x
Rz 2.97619e+06 v 2.92832e+06 = 1.01635x
cRz 1.80873e+06 v 1.61767e+06 = 1.11811x
ccccRz 416281 v 341905 = 1.21753x
csRz 406896 v 362148 = 1.12356x
[20 qubits]
X 3.34532e+08 v 1.94047e+08 = 1.72397x
cX 1.71359e+08 v 1.16569e+08 = 1.47002x
ccccX 2.45472e+07 v 1.68619e+07 = 1.45578x
csX 2.40385e+07 v 1.6221e+07 = 1.48194x
Y 3.34775e+08 v 1.96318e+08 = 1.70527x
cY 1.70611e+08 v 1.15871e+08 = 1.47242x
ccccY 2.42514e+07 v 1.76931e+07 = 1.37067x
csY 2.4223e+07 v 1.61117e+07 = 1.50344x
Z 1.25616e+08 v 1.26318e+08 = 0.994443x !!!
cZ 7.2086e+07 v 6.94097e+07 = 1.03856x
ccccZ 1.32982e+07 v 1.30699e+07 = 1.01747x
csZ 1.2875e+07 v 1.30011e+07 = 0.990305x !!!
Rx 3.33266e+08 v 1.93654e+08 = 1.72094x
cRx 1.72855e+08 v 1.15024e+08 = 1.50277x
ccccRx 2.42583e+07 v 1.83716e+07 = 1.32042x
csRx 2.40683e+07 v 1.65037e+07 = 1.45836x
Ry 3.33624e+08 v 1.94156e+08 = 1.71833x
cRy 1.72933e+08 v 1.15091e+08 = 1.50257x
ccccRy 2.5468e+07 v 1.70615e+07 = 1.49272x
csRy 2.41612e+07 v 1.67793e+07 = 1.43994x
Rz 1.27567e+08 v 1.26232e+08 = 1.01058x
cRz 7.5643e+07 v 7.02014e+07 = 1.07751x
ccccRz 1.41532e+07 v 1.30695e+07 = 1.08291x
csRz 1.39407e+07 v 1.34337e+07 = 1.03774x 32 CPUs[5 qubits]
X 778817 v 82471 = 9.44353x
cX 477839 v 92210 = 5.18207x
ccccX 173946 v 67690 = 2.56974x
csX 137931 v 73222 = 1.88374x
Y 564386 v 59407 = 9.50033x
cY 341998 v 66315 = 5.15717x
ccccY 149159 v 64935 = 2.29705x
csY 109629 v 57258 = 1.91465x
Z 62076 v 44349 = 1.39972x
cZ 76457 v 49949 = 1.5307x
ccccZ 88643 v 55662 = 1.59252x
csZ 79974 v 57809 = 1.38342x
Rx 461836 v 50576 = 9.13152x
cRx 276887 v 54416 = 5.08834x
ccccRx 115385 v 55803 = 2.06772x
csRx 106051 v 58435 = 1.81485x
Ry 436331 v 48476 = 9.00097x
cRy 268860 v 52196 = 5.15097x
ccccRy 114438 v 53477 = 2.13995x
csRy 107694 v 59587 = 1.80734x
Rz 53770 v 44524 = 1.20766x
cRz 69796 v 50174 = 1.39108x
ccccRz 83084 v 52829 = 1.5727x
csRz 75180 v 55792 = 1.34751x
[10 qubits]
X 190708 v 102918 = 1.85301x
cX 215623 v 108157 = 1.99361x
ccccX 238249 v 114876 = 2.07397x
csX 225622 v 127240 = 1.7732x
Y 194373 v 106710 = 1.82151x
cY 217402 v 110642 = 1.96491x
ccccY 240399 v 119135 = 2.01787x
csY 222305 v 126785 = 1.7534x
Z 118964 v 87528 = 1.35915x
cZ 149393 v 99928 = 1.49501x
ccccZ 176403 v 105926 = 1.66534x
csZ 161321 v 115808 = 1.393x
Rx 190537 v 99077 = 1.92312x
cRx 219404 v 110183 = 1.99127x
ccccRx 242459 v 113631 = 2.13374x
csRx 221166 v 125160 = 1.76707x
Ry 191021 v 100408 = 1.90245x
cRy 217604 v 109520 = 1.98689x
ccccRy 246910 v 115316 = 2.14116x
csRy 221627 v 119368 = 1.85667x
Rz 112088 v 86937 = 1.2893x
cRz 141537 v 98349 = 1.43913x
ccccRz 171233 v 111165 = 1.54035x
csRz 154584 v 113389 = 1.36331x
[15 qubits]
X 828288 v 273873 = 3.02435x
cX 612609 v 229038 = 2.6747x
ccccX 399489 v 189980 = 2.1028x
csX 373949 v 199038 = 1.87878x
Y 850689 v 271537 = 3.13287x
cY 611638 v 233049 = 2.6245x
ccccY 408067 v 184690 = 2.20947x
csY 375860 v 199042 = 1.88835x
Z 255270 v 211521 = 1.20683x
cZ 281443 v 204170 = 1.37847x
ccccZ 281763 v 674446 = 0.41777x !!!
csZ 255118 v 188324 = 1.35468x
Rx 831794 v 273005 = 3.04681x
cRx 612865 v 231717 = 2.64489x
ccccRx 402504 v 185505 = 2.16977x
csRx 378158 v 201205 = 1.87947x
Ry 833425 v 272841 = 3.05462x
cRy 611231 v 226307 = 2.70089x
ccccRy 404433 v 183172 = 2.20794x
csRy 380827 v 195267 = 1.95029x
Rz 248499 v 208619 = 1.19116x
cRz 270403 v 204627 = 1.32144x
ccccRz 272341 v 180624 = 1.50778x
csRz 245153 v 184427 = 1.32927x
[20 qubits]
X 2.50185e+07 v 6.06671e+06 = 4.1239x
cX 1.3099e+07 v 3.1873e+06 = 4.10975x
ccccX 2.18024e+06 v 777583 = 2.80386x
csX 2.12523e+06 v 778656 = 2.72935x
Y 2.52903e+07 v 6.05188e+06 = 4.17892x
cY 1.31147e+07 v 3.2649e+06 = 4.01689x
ccccY 2.26223e+06 v 773032 = 2.92644x
csY 2.16547e+06 v 781117 = 2.77228x
Z 3.92266e+06 v 3.85295e+06 = 1.01809x
cZ 2.82263e+06 v 2.70185e+06 = 1.0447x
ccccZ 985449 v 841348 = 1.17127x
csZ 959226 v 859455 = 1.11609x
Rx 2.55459e+07 v 6.17982e+06 = 4.13376x
cRx 1.38171e+07 v 3.22987e+06 = 4.27791x
ccccRx 2.24468e+06 v 780949 = 2.8743x
csRx 2.1924e+06 v 801501 = 2.73537x
Ry 2.58387e+07 v 6.24091e+06 = 4.14021x
cRy 1.33959e+07 v 3.31161e+06 = 4.04512x
ccccRy 2.20554e+06 v 782629 = 2.81812x
csRy 2.24645e+06 v 804539 = 2.79222x
Rz 4.29274e+06 v 3.9928e+06 = 1.07512x
cRz 3.03124e+06 v 2.80248e+06 = 1.08163x
ccccRz 968904 v 840534 = 1.15272x
csRz 933105 v 872948 = 1.06891x
[25 qubits]
X 1.07423e+09 v 4.19309e+08 = 2.56191x
cX 5.43031e+08 v 2.77425e+08 = 1.9574x
ccccX 8.92313e+07 v 4.37506e+07 = 2.03955x
csX 7.96252e+07 v 5.53844e+07 = 1.43768x
Y 1.09878e+09 v 4.16923e+08 = 2.63546x
cY 5.87509e+08 v 2.54655e+08 = 2.30708x
ccccY 9.6926e+07 v 4.59236e+07 = 2.11059x
csY 8.61867e+07 v 4.70605e+07 = 1.8314x
Z 4.19731e+08 v 4.23077e+08 = 0.99209x !!!
cZ 2.66697e+08 v 2.64383e+08 = 1.00875x
ccccZ 4.70686e+07 v 4.54908e+07 = 1.03468x
csZ 5.37393e+07 v 5.69023e+07 = 0.944414x !!!
Rx 1.12932e+09 v 4.25497e+08 = 2.65413x
cRx 5.82822e+08 v 2.46578e+08 = 2.36364x
ccccRx 8.67564e+07 v 5.06659e+07 = 1.71232x
csRx 8.02669e+07 v 5.6126e+07 = 1.43012x GPU (quadro P6000)[5 qubits]
X 127262 v 78330 = 1.62469x
cX 134250 v 80673 = 1.66413x
ccccX 143551 v 84631 = 1.6962x
csX 137808 v 86904 = 1.58575x
Y 136863 v 104548 = 1.30909x
cY 180093 v 99744 = 1.80555x
ccccY 144098 v 83433 = 1.72711x
csY 138435 v 86616 = 1.59826x
Z 45635 v 35086 = 1.30066x
cZ 101433 v 74955 = 1.35325x
ccccZ 99704 v 78681 = 1.26719x
csZ 96012 v 81216 = 1.18218x
Rx 124444 v 76276 = 1.6315x
cRx 133684 v 80315 = 1.6645x
ccccRx 142598 v 80764 = 1.76561x
csRx 136471 v 84254 = 1.61976x
Ry 126092 v 80674 = 1.56298x
cRy 133144 v 80496 = 1.65404x
ccccRy 142414 v 81419 = 1.74915x
csRy 136412 v 84653 = 1.61143x
Rz 43282 v 35175 = 1.23048x
cRz 90564 v 98433 = 0.920057x !!!
ccccRz 108743 v 88563 = 1.22786x
csRz 95174 v 80114 = 1.18798x
[10 qubits]
X 257701 v 156296 = 1.6488x
cX 279553 v 162434 = 1.72103x
ccccX 306398 v 168179 = 1.82186x
csX 279780 v 172006 = 1.62657x
Y 258881 v 158543 = 1.63288x
cY 278238 v 164016 = 1.69641x
ccccY 322552 v 176166 = 1.83095x
csY 279393 v 172696 = 1.61783x
Z 92387 v 71354 = 1.29477x
cZ 186623 v 154165 = 1.21054x
ccccZ 202821 v 160658 = 1.26244x
csZ 206571 v 162915 = 1.26797x
Rx 260376 v 158138 = 1.64651x
cRx 275326 v 163491 = 1.68404x
ccccRx 289164 v 164227 = 1.76076x
csRx 290217 v 188704 = 1.53795x
Ry 256763 v 158608 = 1.61885x
cRy 276156 v 166420 = 1.65939x
ccccRy 293149 v 174146 = 1.68335x
csRy 277966 v 172113 = 1.61502x
Rz 89436 v 72203 = 1.23867x
cRz 186556 v 158600 = 1.17627x
ccccRz 204410 v 161050 = 1.26923x
csRz 194491 v 162061 = 1.20011x
[15 qubits]
X 485163 v 279406 = 1.73641x
cX 434817 v 265368 = 1.63854x
ccccX 440588 v 259850 = 1.69555x
csX 428459 v 259852 = 1.64886x
Y 451818 v 284347 = 1.58897x
cY 457870 v 265654 = 1.72356x
ccccY 435341 v 252816 = 1.72197x
csY 418964 v 261687 = 1.60101x
Z 174149 v 145153 = 1.19976x
cZ 299227 v 248504 = 1.20411x
ccccZ 305775 v 250666 = 1.21985x
csZ 316080 v 244664 = 1.29189x
Rx 451580 v 279527 = 1.61551x
cRx 444562 v 264267 = 1.68225x
ccccRx 450206 v 254846 = 1.76658x
csRx 422523 v 274364 = 1.54001x
Ry 462622 v 277522 = 1.66697x
cRy 444297 v 267123 = 1.66327x
ccccRy 439902 v 263775 = 1.66772x
csRy 424580 v 263644 = 1.61043x
Rz 190410 v 143969 = 1.32258x
cRz 305225 v 244760 = 1.24704x
ccccRz 302121 v 240209 = 1.25774x
csRz 291973 v 244381 = 1.19475x
[20 qubits]
X 5.33184e+06 v 4.96344e+06 = 1.07422x
cX 4.06991e+06 v 3.79922e+06 = 1.07125x
ccccX 2.83841e+06 v 2.61281e+06 = 1.08634x
csX 2.81285e+06 v 2.60715e+06 = 1.0789x
Y 5.30536e+06 v 4.93426e+06 = 1.07521x
cY 4.01158e+06 v 3.67336e+06 = 1.09207x
ccccY 2.83273e+06 v 2.52823e+06 = 1.12044x
csY 2.80288e+06 v 2.5194e+06 = 1.11252x
Z 1.49366e+07 v 1.51361e+07 = 0.986825x !!!
cZ 3.76625e+06 v 3.66229e+06 = 1.02839x
ccccZ 2.59809e+06 v 2.50872e+06 = 1.03562x
csZ 2.51872e+06 v 2.50648e+06 = 1.00488x
Rx 5.11532e+06 v 4.91058e+06 = 1.04169x
cRx 4.00272e+06 v 3.69052e+06 = 1.08459x
ccccRx 2.83963e+06 v 2.55193e+06 = 1.11274x
csRx 2.77143e+06 v 2.57119e+06 = 1.07788x
Ry 5.10099e+06 v 4.8484e+06 = 1.0521x
cRy 4.09316e+06 v 4.02845e+06 = 1.01606x
ccccRy 2.86225e+06 v 2.51322e+06 = 1.13888x
csRy 2.78158e+06 v 2.52479e+06 = 1.10171x
Rz 2.90888e+07 v 1.51443e+07 = 1.92077x
cRz 4.8668e+06 v 3.6004e+06 = 1.35174x
ccccRz 2.76661e+06 v 2.48557e+06 = 1.11307x
csRz 2.70893e+06 v 2.51405e+06 = 1.07752x
[25 qubits]
X 1.06511e+08 v 1.05997e+08 = 1.00485x
cX 5.6053e+07 v 5.48065e+07 = 1.02274x
ccccX 1.08632e+07 v 1.19012e+07 = 0.912776x !!!
csX 1.23385e+07 v 1.15795e+07 = 1.06555x
Y 1.0649e+08 v 1.06148e+08 = 1.00323x
cY 5.59062e+07 v 5.84214e+07 = 0.956948x !!!
ccccY 1.31134e+07 v 1.1982e+07 = 1.09443x
csY 1.18938e+07 v 1.20573e+07 = 0.986446x !!!
Z 6.155e+08 v 6.15768e+08 = 0.999565x !!!
cZ 5.70896e+07 v 5.4174e+07 = 1.05382x
ccccZ 1.05192e+07 v 1.21523e+07 = 0.865612x !!!
csZ 1.13703e+07 v 1.03756e+07 = 1.09586x
Rx 1.07163e+08 v 1.06049e+08 = 1.0105x
cRx 5.97407e+07 v 6.94496e+07 = 0.860202x !!!
ccccRx 1.19799e+07 v 1.2063e+07 = 0.993109x !!!
csRx 1.24817e+07 v 1.21249e+07 = 1.02943x
Ry 1.0709e+08 v 1.05893e+08 = 1.0113x
cRy 5.87204e+07 v 6.19812e+07 = 0.94739x !!!
ccccRy 1.26257e+07 v 1.09413e+07 = 1.15394x
csRy 1.24306e+07 v 1.08591e+07 = 1.14471x
Rz 1.16706e+09 v 6.17107e+08 = 1.89118x
cRz 9.99975e+07 v 5.64961e+07 = 1.76999x
ccccRz 1.84772e+07 v 1.11707e+07 = 1.65407x
csRz 1.66738e+07 v 1.17161e+07 = 1.42316x 32 GPUs (virtually, between 2 Quadro P6000s)[10 qubits]
X 2.04061e+07 v 1.64577e+07 = 1.23991x
cX 3.09922e+07 v 2.82664e+07 = 1.09643x
ccccX 3.19429e+07 v 2.82189e+07 = 1.13197x
csX 3.25057e+07 v 2.80653e+07 = 1.15822x
Y 2.23115e+07 v 1.80718e+07 = 1.2346x
cY 3.24795e+07 v 2.82072e+07 = 1.15146x
ccccY 3.2263e+07 v 2.83922e+07 = 1.13633x
csY 3.2898e+07 v 2.83253e+07 = 1.16144x
Z 7.55335e+06 v 7.17929e+06 = 1.0521x
cZ 2.0635e+07 v 2.05451e+07 = 1.00437x
ccccZ 2.04836e+07 v 2.06033e+07 = 0.994189x !!!
csZ 2.04547e+07 v 2.06782e+07 = 0.989193x !!!
Rx 2.22415e+07 v 1.78034e+07 = 1.24928x
cRx 3.27874e+07 v 2.82495e+07 = 1.16063x
ccccRx 3.24114e+07 v 2.82985e+07 = 1.14534x
csRx 3.23017e+07 v 2.81188e+07 = 1.14876x
Ry 2.20273e+07 v 1.77788e+07 = 1.23896x
cRy 3.24615e+07 v 2.81624e+07 = 1.15265x
ccccRy 3.23339e+07 v 2.81416e+07 = 1.14897x
csRy 3.18251e+07 v 2.81125e+07 = 1.13206x
Rz 7.11217e+06 v 7.20293e+06 = 0.9874x !!!
cRz 2.04878e+07 v 2.06331e+07 = 0.992958x !!!
ccccRz 2.05997e+07 v 2.06231e+07 = 0.998864x !!!
csRz 2.0685e+07 v 2.06509e+07 = 1.00165x
[15 qubits]
X 3.57294e+07 v 2.79642e+07 = 1.27768x
cX 4.66645e+07 v 3.69891e+07 = 1.26158x
ccccX 4.53401e+07 v 3.80349e+07 = 1.19206x
csX 4.52307e+07 v 3.79481e+07 = 1.19191x
Y 3.66851e+07 v 2.75796e+07 = 1.33015x
cY 4.6457e+07 v 3.68357e+07 = 1.26119x
ccccY 4.5422e+07 v 3.75023e+07 = 1.21118x
csY 4.54961e+07 v 3.86128e+07 = 1.17827x
Z 1.11986e+07 v 1.14325e+07 = 0.979534x !!!
cZ 2.9482e+07 v 2.90256e+07 = 1.01573x
ccccZ 2.91512e+07 v 2.87033e+07 = 1.01561x
csZ 2.9072e+07 v 2.89217e+07 = 1.0052x
Rx 3.61364e+07 v 2.85598e+07 = 1.26529x
cRx 4.61002e+07 v 3.77028e+07 = 1.22273x
ccccRx 4.52664e+07 v 3.76641e+07 = 1.20185x
csRx 4.52998e+07 v 3.66626e+07 = 1.23559x
Ry 3.57088e+07 v 2.80061e+07 = 1.27503x
cRy 4.64397e+07 v 3.7358e+07 = 1.2431x
ccccRy 4.66959e+07 v 3.76348e+07 = 1.24077x
csRy 4.52413e+07 v 3.72691e+07 = 1.21391x
Rz 1.13422e+07 v 1.14345e+07 = 0.991929x !!!
cRz 2.9419e+07 v 2.91742e+07 = 1.00839x
ccccRz 2.91423e+07 v 2.89993e+07 = 1.00493x
csRz 2.909e+07 v 2.89722e+07 = 1.00407x
[20 qubits]
X 5.57519e+07 v 4.3445e+07 = 1.28328x
cX 6.21739e+07 v 4.88623e+07 = 1.27243x
ccccX 6.09173e+07 v 4.91375e+07 = 1.23973x
csX 6.01073e+07 v 4.71605e+07 = 1.27453x
Y 5.46344e+07 v 3.91482e+07 = 1.39558x
cY 6.21197e+07 v 5.13948e+07 = 1.20868x
ccccY 6.1823e+07 v 4.91906e+07 = 1.25681x
csY 6.11636e+07 v 4.87216e+07 = 1.25537x
Z 1.85578e+07 v 1.91361e+07 = 0.96978x !!!
cZ 4.18737e+07 v 4.15744e+07 = 1.0072x
ccccZ 4.05578e+07 v 4.06543e+07 = 0.997627x !!!
csZ 4.08741e+07 v 4.06185e+07 = 1.00629x
Rx 5.71617e+07 v 4.29406e+07 = 1.33118x
cRx 6.24139e+07 v 5.10187e+07 = 1.22335x
ccccRx 6.22371e+07 v 4.97247e+07 = 1.25163x
csRx 6.16718e+07 v 5.00981e+07 = 1.23102x
Ry 5.58076e+07 v 4.26105e+07 = 1.30972x
cRy 6.27553e+07 v 5.12966e+07 = 1.22338x
ccccRy 6.27906e+07 v 5.01989e+07 = 1.25084x
csRy 6.24652e+07 v 5.03169e+07 = 1.24144x
Rz 2.36678e+07 v 1.90869e+07 = 1.24x
cRz 4.3754e+07 v 4.30007e+07 = 1.01752x
ccccRz 4.17512e+07 v 4.0828e+07 = 1.02261x
csRz 4.06592e+07 v 4.00449e+07 = 1.01534x
[25 qubits]
X 4.89434e+08 v 5.29224e+08 = 0.924815x !!!
cX 3.19571e+08 v 3.28867e+08 = 0.971732x !!!
ccccX 2.56628e+08 v 2.53567e+08 = 1.01207x
csX 2.6845e+08 v 2.32314e+08 = 1.15555x
Y 5.85416e+08 v 5.17617e+08 = 1.13098x
cY 3.60071e+08 v 3.05764e+08 = 1.17761x
ccccY 2.91327e+08 v 2.70247e+08 = 1.078x
csY 2.42112e+08 v 3.13327e+08 = 0.772713x !!!
Z 3.23016e+08 v 3.24144e+08 = 0.99652x !!!
cZ 2.19808e+08 v 2.25109e+08 = 0.976452x !!!
ccccZ 2.41425e+08 v 2.17185e+08 = 1.11161x
csZ 2.09947e+08 v 2.33613e+08 = 0.898697x !!!
Rx 4.91484e+08 v 7.23104e+08 = 0.679686x !!!
cRx 3.56097e+08 v 3.26883e+08 = 1.08937x
ccccRx 3.08156e+08 v 2.76049e+08 = 1.11631x
csRx 2.68361e+08 v 2.72283e+08 = 0.985597x !!!
Ry 5.48748e+08 v 5.67967e+08 = 0.966161x !!!
cRy 3.49174e+08 v 3.33523e+08 = 1.04693x
ccccRy 2.47762e+08 v 2.90074e+08 = 0.854133x !!!
csRy 2.35375e+08 v 2.72557e+08 = 0.863581x !!!
Rz 6.10789e+08 v 3.24777e+08 = 1.88064x
cRz 1.81605e+08 v 1.77483e+08 = 1.02323x
ccccRz 2.16914e+08 v 1.91077e+08 = 1.13522x
csRz 2.26712e+08 v 2.55971e+08 = 0.885693x !!!
[28 qubits]
X 4.0393e+09 v 3.37877e+09 = 1.19549x
cX 2.23921e+09 v 2.16187e+09 = 1.03577x
ccccX 4.9235e+08 v 6.27852e+08 = 0.784181x !!!
csX 5.71199e+08 v 6.13484e+08 = 0.931073x !!!
Y 3.70274e+09 v 3.74534e+09 = 0.988626x !!!
cY 2.26904e+09 v 1.94532e+09 = 1.16641x
ccccY 6.04228e+08 v 6.11656e+08 = 0.987856x !!!
csY 6.37266e+08 v 6.15024e+08 = 1.03616x
Z 2.89953e+09 v 2.90083e+09 = 0.999551x !!!
cZ 7.29146e+08 v 7.5237e+08 = 0.969132x !!!
ccccZ 3.86579e+08 v 3.89703e+08 = 0.991982x !!!
csZ 3.99327e+08 v 3.79792e+08 = 1.05144x
Rx 3.65741e+09 v 3.54928e+09 = 1.03046x
cRx 2.31143e+09 v 2.09251e+09 = 1.10462x
ccccRx 5.73667e+08 v 6.11618e+08 = 0.937949x !!!
csRx 5.40743e+08 v 5.59269e+08 = 0.966874x !!!
Ry 4.64743e+09 v 4.29431e+09 = 1.08223x
cRy 2.19407e+09 v 2.62839e+09 = 0.834758x !!!
ccccRy 6.07458e+08 v 5.41164e+08 = 1.1225x
csRy 5.42774e+08 v 5.83859e+08 = 0.929632x !!!
Rz 5.615e+09 v 2.90069e+09 = 1.93575x
cRz 1.20457e+09 v 7.35107e+08 = 1.63864x
ccccRz 5.20607e+08 v 3.93477e+08 = 1.32309x
csRz 5.0443e+08 v 3.54026e+08 = 1.42484x |
The often significant speedups outweigh the relatively few and insignificant slowdowns, so this PR is accepted. Note, however, that it may not fully resolve the single-core v4 performance regression, and further optimisations (including forcing compile-time loop unrolling by templating the bitwise functions) may be necessary.
as per #638
Before merging, the performance impact of these changes must be measured in all settings.