Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 12 additions & 9 deletions quest/src/gpu/gpu_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,10 @@
*/


const int NUM_THREADS_PER_BLOCK = 128;
const int NUM_THREADS_PER_BLOCK =128;

__device__ __constant__ int ctrl_device[30];
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TODO: use a MAX_NUM_QUBITS = 64 or something constant in constants.hpp




__forceinline__ __device__ qindex getThreadInd() {
Expand Down Expand Up @@ -198,7 +201,7 @@ __global__ void kernel_statevec_anyCtrlSwap_subC(
template <int NumCtrls>
__global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subA(
cu_qcomp* amps, qindex numThreads,
int* ctrlsAndTarg, int numCtrls, qindex ctrlStateMask, int targ,
int numCtrls, qindex ctrlStateMask, int targ,
cu_qcomp m00, cu_qcomp m01, cu_qcomp m10, cu_qcomp m11
) {
GET_THREAD_IND(n, numThreads);
Expand All @@ -207,7 +210,7 @@ __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subA(
SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls);

// i0 = nth local index where ctrls are active and targ is 0
qindex i0 = insertBitsWithMaskedValues(n, ctrlsAndTarg, numCtrlBits + 1, ctrlStateMask);
qindex i0 = insertBitsWithMaskedValues(n, ctrl_device, numCtrlBits + 1, ctrlStateMask);
qindex i1 = flipBit(i0, targ);

// note amps are strided by 2^targ
Expand Down Expand Up @@ -436,7 +439,7 @@ __global__ void kernel_statevec_anyCtrlManyTargDenseMatr(
template <int NumCtrls>
__global__ void kernel_statevec_anyCtrlOneTargDiagMatr_sub(
cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode,
int* ctrls, int numCtrls, qindex ctrlStateMask, int targ,
int numCtrls, qindex ctrlStateMask, int targ,
cu_qcomp m1, cu_qcomp m2
) {
GET_THREAD_IND(n, numThreads);
Expand All @@ -456,7 +459,7 @@ __global__ void kernel_statevec_anyCtrlOneTargDiagMatr_sub(
SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls);

// j = nth local index where ctrls are active (in the specified states)
qindex j = insertBitsWithMaskedValues(n, ctrls, numCtrlBits, ctrlStateMask);
qindex j = insertBitsWithMaskedValues(n, ctrl_device, numCtrlBits, ctrlStateMask);

// i = global index corresponding to j
qindex i = concatenateBits(rank, j, logNumAmpsPerNode);
Expand All @@ -475,7 +478,7 @@ __global__ void kernel_statevec_anyCtrlOneTargDiagMatr_sub(
template <int NumCtrls>
__global__ void kernel_statevec_anyCtrlTwoTargDiagMatr_sub(
cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode,
int* ctrls, int numCtrls, qindex ctrlStateMask, int targ1, int targ2,
int numCtrls, qindex ctrlStateMask, int targ1, int targ2,
cu_qcomp m1, cu_qcomp m2, cu_qcomp m3, cu_qcomp m4
) {
GET_THREAD_IND(n, numThreads);
Expand All @@ -495,7 +498,7 @@ __global__ void kernel_statevec_anyCtrlTwoTargDiagMatr_sub(
SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls);

// j = nth local index where ctrls are active (in the specified states)
qindex j = insertBitsWithMaskedValues(n, ctrls, numCtrlBits, ctrlStateMask);
qindex j = insertBitsWithMaskedValues(n, ctrl_device, numCtrlBits, ctrlStateMask);

// i = global index corresponding to j
qindex i = concatenateBits(rank, j, logNumAmpsPerNode);
Expand All @@ -516,7 +519,7 @@ __global__ void kernel_statevec_anyCtrlTwoTargDiagMatr_sub(
template <int NumCtrls, int NumTargs, bool ApplyConj, bool HasPower>
__global__ void kernel_statevec_anyCtrlAnyTargDiagMatr_sub(
cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode,
int* ctrls, int numCtrls, qindex ctrlStateMask, int* targs, int numTargs,
int numCtrls, qindex ctrlStateMask, int* targs, int numTargs,
cu_qcomp* elems, cu_qcomp exponent
) {
GET_THREAD_IND(n, numThreads);
Expand All @@ -537,7 +540,7 @@ __global__ void kernel_statevec_anyCtrlAnyTargDiagMatr_sub(
SET_VAR_AT_COMPILE_TIME(int, numTargBits, NumTargs, numTargs);

// j = nth local index where ctrls are active (in the specified states)
qindex j = insertBitsWithMaskedValues(n, ctrls, numCtrlBits, ctrlStateMask);
qindex j = insertBitsWithMaskedValues(n, ctrl_device, numCtrlBits, ctrlStateMask);

// i = global index corresponding to j
qindex i = concatenateBits(rank, j, logNumAmpsPerNode);
Expand Down
62 changes: 54 additions & 8 deletions quest/src/gpu/gpu_subroutines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -301,14 +301,22 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector<int> ctrls, v
qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 1);
qindex numBlocks = getNumBlocks(numThreads);

devints sortedQubits = util_getSorted(ctrls, {targ});
//devints sortedQubits = util_getSorted(ctrls, {targ});

vector<int> sortedQubits = util_getSorted(ctrls, {targ});

qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0});

auto [m00, m01, m10, m11] = unpackMatrixToCuQcomps(matr);


//int ctrl_device[sortedQubits.size()];

cudaMemcpyToSymbol(ctrl_device, sortedQubits.data(), sortedQubits.size()*sizeof(int));

kernel_statevec_anyCtrlOneTargDenseMatr_subA <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
toCuQcomps(qureg.gpuAmps), numThreads,
getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ,
ctrls.size(), qubitStateMask, targ,
m00, m01, m10, m11
);

Expand Down Expand Up @@ -568,13 +576,29 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vec
qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
qindex numBlocks = getNumBlocks(numThreads);

devints deviceCtrls = util_getSorted(ctrls);

// removed implicit thrust mem copy
vector<int> sortedCtrls = util_getSorted(ctrls);


// Assume size of ctls is at most one per qubit so small enough for device contant memory
//int ctrl_device[ctrls.size()];

cudaMemcpyToSymbol(ctrl_device, sortedCtrls.data(), ctrls.size()*sizeof(int));

// cudaMemcpyToSymbol (const char * symbol,
// const void * src,
// size_t count,
// size_t offset = 0,
// enum cudaMemcpyKind kind = cudaMemcpyHostToDevice
// )

qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
auto elems = unpackMatrixToCuQcomps(matr);

kernel_statevec_anyCtrlOneTargDiagMatr_sub <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode,
getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, targ, elems[0], elems[1]
ctrls.size(), ctrlStateMask, targ, elems[0], elems[1]
);

// explicitly return to avoid runtime error below
Expand Down Expand Up @@ -636,13 +660,24 @@ void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vec
qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
qindex numBlocks = getNumBlocks(numThreads);

devints deviceCtrls = util_getSorted(ctrls);
// devints deviceCtrls = util_getSorted(ctrls);

// removed implicit thrust mem copy
vector<int> sortedCtrls = util_getSorted(ctrls);


// Assume size of ctls is at most one per qubit so small enough for device contant memory
// int ctrl_device[ctrls.size()];

cudaMemcpyToSymbol(ctrl_device, sortedCtrls.data(), ctrls.size()*sizeof(int));


qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
auto elems = unpackMatrixToCuQcomps(matr);

kernel_statevec_anyCtrlTwoTargDiagMatr_sub <NumCtrls> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode,
getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, targ1, targ2,
ctrls.size(), ctrlStateMask, targ1, targ2,
elems[0], elems[1], elems[2], elems[3]
);

Expand Down Expand Up @@ -705,12 +740,23 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector<int> ctrls, vec
qindex numBlocks = getNumBlocks(numThreads);

devints deviceTargs = targs;
devints deviceCtrls = util_getSorted(ctrls);
// devints deviceCtrls = util_getSorted(ctrls);

// removed implicit thrust mem copy
vector<int> sortedCtrls = util_getSorted(ctrls);


// Assume size of ctls is at most one per qubit so small enough for device contant memory
//int ctrl_device[ctrls.size()];

cudaMemcpyToSymbol(ctrl_device, sortedCtrls.data(), ctrls.size()*sizeof(int));


qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);

kernel_statevec_anyCtrlAnyTargDiagMatr_sub <NumCtrls, NumTargs, ApplyConj, HasPower> <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode,
getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, getPtr(deviceTargs), targs.size(),
ctrls.size(), ctrlStateMask, getPtr(deviceTargs), targs.size(),
toCuQcomps(util_getGpuMemPtr(matr)), toCuQcomp(exponent)
);

Expand Down
6 changes: 3 additions & 3 deletions quest/src/gpu/gpu_thrust.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -791,7 +791,7 @@ qreal thrust_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector<int> q
auto indFunctor = functor_insertBits<NumQubits>(getPtr(sortedQubits), valueMask, qubits.size());
auto probFunctor = functor_getAmpNorm();

auto rawIter = thrust::make_counting_iterator(0);
auto rawIter = thrust::make_counting_iterator(0LL);
auto indIter = thrust::make_transform_iterator(rawIter, indFunctor);
auto ampIter = thrust::make_permutation_iterator(getStartPtr(qureg), indIter);
auto probIter = thrust::make_transform_iterator(ampIter, probFunctor);
Expand Down Expand Up @@ -1016,7 +1016,7 @@ void thrust_statevec_multiQubitProjector_sub(Qureg qureg, vector<int> qubits, ve
auto projFunctor = functor_projectStateVec<NumQubits>(
getPtr(devQubits), qubits.size(), retainValue, renorm);

auto indIter = thrust::make_counting_iterator(0);
auto indIter = thrust::make_counting_iterator(0LL);
auto ampIter = getStartPtr(qureg);

qindex numIts = qureg.numAmpsPerNode;
Expand Down Expand Up @@ -1082,4 +1082,4 @@ void thrust_statevec_initUnnormalisedUniformlyRandomPureStateAmps_sub(Qureg qure



#endif // GPU_THRUST_HPP
#endif // GPU_THRUST_HPP