diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index 4f2a737e..fe2ea80c 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -47,7 +47,10 @@ */ -const int NUM_THREADS_PER_BLOCK = 128; +const int NUM_THREADS_PER_BLOCK = 128; + +__device__ __constant__ int ctrl_device[30]; + __forceinline__ __device__ qindex getThreadInd() { @@ -198,7 +201,7 @@ __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subA( template __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subA( cu_qcomp* amps, qindex numThreads, - int* ctrlsAndTarg, int numCtrls, qindex ctrlStateMask, int targ, + int numCtrls, qindex ctrlStateMask, int targ, cu_qcomp m00, cu_qcomp m01, cu_qcomp m10, cu_qcomp m11 ) { GET_THREAD_IND(n, numThreads); @@ -207,7 +210,7 @@ __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subA( SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); // i0 = nth local index where ctrls are active and targ is 0 - qindex i0 = insertBitsWithMaskedValues(n, ctrlsAndTarg, numCtrlBits + 1, ctrlStateMask); + qindex i0 = insertBitsWithMaskedValues(n, ctrl_device, numCtrlBits + 1, ctrlStateMask); qindex i1 = flipBit(i0, targ); // note amps are strided by 2^targ @@ -436,7 +439,7 @@ __global__ void kernel_statevec_anyCtrlManyTargDenseMatr( template __global__ void kernel_statevec_anyCtrlOneTargDiagMatr_sub( cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, - int* ctrls, int numCtrls, qindex ctrlStateMask, int targ, + int numCtrls, qindex ctrlStateMask, int targ, cu_qcomp m1, cu_qcomp m2 ) { GET_THREAD_IND(n, numThreads); @@ -456,7 +459,7 @@ __global__ void kernel_statevec_anyCtrlOneTargDiagMatr_sub( SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); // j = nth local index where ctrls are active (in the specified states) - qindex j = insertBitsWithMaskedValues(n, ctrls, numCtrlBits, ctrlStateMask); + qindex j = insertBitsWithMaskedValues(n, ctrl_device, numCtrlBits, ctrlStateMask); // i = global index 
corresponding to j qindex i = concatenateBits(rank, j, logNumAmpsPerNode); @@ -475,7 +478,7 @@ __global__ void kernel_statevec_anyCtrlOneTargDiagMatr_sub( template __global__ void kernel_statevec_anyCtrlTwoTargDiagMatr_sub( cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, - int* ctrls, int numCtrls, qindex ctrlStateMask, int targ1, int targ2, + int numCtrls, qindex ctrlStateMask, int targ1, int targ2, cu_qcomp m1, cu_qcomp m2, cu_qcomp m3, cu_qcomp m4 ) { GET_THREAD_IND(n, numThreads); @@ -495,7 +498,7 @@ __global__ void kernel_statevec_anyCtrlTwoTargDiagMatr_sub( SET_VAR_AT_COMPILE_TIME(int, numCtrlBits, NumCtrls, numCtrls); // j = nth local index where ctrls are active (in the specified states) - qindex j = insertBitsWithMaskedValues(n, ctrls, numCtrlBits, ctrlStateMask); + qindex j = insertBitsWithMaskedValues(n, ctrl_device, numCtrlBits, ctrlStateMask); // i = global index corresponding to j qindex i = concatenateBits(rank, j, logNumAmpsPerNode); @@ -516,7 +519,7 @@ __global__ void kernel_statevec_anyCtrlTwoTargDiagMatr_sub( template __global__ void kernel_statevec_anyCtrlAnyTargDiagMatr_sub( cu_qcomp* amps, qindex numThreads, int rank, qindex logNumAmpsPerNode, - int* ctrls, int numCtrls, qindex ctrlStateMask, int* targs, int numTargs, + int numCtrls, qindex ctrlStateMask, int* targs, int numTargs, cu_qcomp* elems, cu_qcomp exponent ) { GET_THREAD_IND(n, numThreads); @@ -537,7 +540,7 @@ __global__ void kernel_statevec_anyCtrlAnyTargDiagMatr_sub( SET_VAR_AT_COMPILE_TIME(int, numTargBits, NumTargs, numTargs); // j = nth local index where ctrls are active (in the specified states) - qindex j = insertBitsWithMaskedValues(n, ctrls, numCtrlBits, ctrlStateMask); + qindex j = insertBitsWithMaskedValues(n, ctrl_device, numCtrlBits, ctrlStateMask); // i = global index corresponding to j qindex i = concatenateBits(rank, j, logNumAmpsPerNode); diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp index 5e18048f..a92cf021 
100644 --- a/quest/src/gpu/gpu_subroutines.cpp +++ b/quest/src/gpu/gpu_subroutines.cpp @@ -301,14 +301,22 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 1); qindex numBlocks = getNumBlocks(numThreads); - devints sortedQubits = util_getSorted(ctrls, {targ}); + //devints sortedQubits = util_getSorted(ctrls, {targ}); + + vector sortedQubits = util_getSorted(ctrls, {targ}); + qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0}); auto [m00, m01, m10, m11] = unpackMatrixToCuQcomps(matr); + + //int ctrl_device[sortedQubits.size()]; + + cudaMemcpyToSymbol(ctrl_device, sortedQubits.data(), sortedQubits.size()*sizeof(int)); + kernel_statevec_anyCtrlOneTargDenseMatr_subA <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, - getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ, + ctrls.size(), qubitStateMask, targ, m00, m01, m10, m11 ); @@ -568,13 +576,29 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector ctrls, vec qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); qindex numBlocks = getNumBlocks(numThreads); - devints deviceCtrls = util_getSorted(ctrls); + + // removed implicit thrust mem copy + vector sortedCtrls = util_getSorted(ctrls); + + + // Assume size of ctrls is at most one per qubit so small enough for device constant memory + //int ctrl_device[ctrls.size()]; + + cudaMemcpyToSymbol(ctrl_device, sortedCtrls.data(), ctrls.size()*sizeof(int)); + +// cudaMemcpyToSymbol (const char * symbol, +// const void * src, +// size_t count, +// size_t offset = 0, +// enum cudaMemcpyKind kind = cudaMemcpyHostToDevice +// ) + qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); auto elems = unpackMatrixToCuQcomps(matr); kernel_statevec_anyCtrlOneTargDiagMatr_sub <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, - getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, targ, elems[0], elems[1] + 
ctrls.size(), ctrlStateMask, targ, elems[0], elems[1] ); // explicitly return to avoid runtime error below @@ -636,13 +660,24 @@ void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector ctrls, vec qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); qindex numBlocks = getNumBlocks(numThreads); - devints deviceCtrls = util_getSorted(ctrls); + // devints deviceCtrls = util_getSorted(ctrls); + + // removed implicit thrust mem copy + vector sortedCtrls = util_getSorted(ctrls); + + + // Assume size of ctrls is at most one per qubit so small enough for device constant memory + // int ctrl_device[ctrls.size()]; + + cudaMemcpyToSymbol(ctrl_device, sortedCtrls.data(), ctrls.size()*sizeof(int)); + + qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); auto elems = unpackMatrixToCuQcomps(matr); kernel_statevec_anyCtrlTwoTargDiagMatr_sub <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, - getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, targ1, targ2, + ctrls.size(), ctrlStateMask, targ1, targ2, elems[0], elems[1], elems[2], elems[3] ); @@ -705,12 +740,23 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vec qindex numBlocks = getNumBlocks(numThreads); devints deviceTargs = targs; - devints deviceCtrls = util_getSorted(ctrls); + // devints deviceCtrls = util_getSorted(ctrls); + + // removed implicit thrust mem copy + vector sortedCtrls = util_getSorted(ctrls); + + + // Assume size of ctrls is at most one per qubit so small enough for device constant memory + //int ctrl_device[ctrls.size()]; + + cudaMemcpyToSymbol(ctrl_device, sortedCtrls.data(), ctrls.size()*sizeof(int)); + + qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); kernel_statevec_anyCtrlAnyTargDiagMatr_sub <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, - getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, getPtr(deviceTargs), 
targs.size(), toCuQcomps(util_getGpuMemPtr(matr)), toCuQcomp(exponent) ); diff --git a/quest/src/gpu/gpu_thrust.cuh b/quest/src/gpu/gpu_thrust.cuh index 8c11188e..ce708694 100644 --- a/quest/src/gpu/gpu_thrust.cuh +++ b/quest/src/gpu/gpu_thrust.cuh @@ -791,7 +791,7 @@ qreal thrust_statevec_calcProbOfMultiQubitOutcome_sub(Qureg qureg, vector q auto indFunctor = functor_insertBits(getPtr(sortedQubits), valueMask, qubits.size()); auto probFunctor = functor_getAmpNorm(); - auto rawIter = thrust::make_counting_iterator(0); + auto rawIter = thrust::make_counting_iterator(0LL); auto indIter = thrust::make_transform_iterator(rawIter, indFunctor); auto ampIter = thrust::make_permutation_iterator(getStartPtr(qureg), indIter); auto probIter = thrust::make_transform_iterator(ampIter, probFunctor); @@ -1016,7 +1016,7 @@ void thrust_statevec_multiQubitProjector_sub(Qureg qureg, vector qubits, ve auto projFunctor = functor_projectStateVec( getPtr(devQubits), qubits.size(), retainValue, renorm); - auto indIter = thrust::make_counting_iterator(0); + auto indIter = thrust::make_counting_iterator(0LL); auto ampIter = getStartPtr(qureg); qindex numIts = qureg.numAmpsPerNode; @@ -1082,4 +1082,4 @@ void thrust_statevec_initUnnormalisedUniformlyRandomPureStateAmps_sub(Qureg qure -#endif // GPU_THRUST_HPP \ No newline at end of file +#endif // GPU_THRUST_HPP