From db91e7dad3a9466db35b441abfdea701bdfcec74 Mon Sep 17 00:00:00 2001
From: Jun Doi
Date: Tue, 1 Mar 2022 10:14:47 +0900
Subject: [PATCH] Add experimental support of cuQuantum (#1400)

* Add cuStateVec support

Added support for cuQuantum, NVIDIA's APIs for quantum computing, to
accelerate the statevector, density matrix and unitary simulators by
using GPUs. To include cuQuantum, a custom build of Aer is necessary,
setting the path of the cuQuantum library in CUSTATEVEC_ROOT. (Binary
distribution will follow the official release of cuQuantum, which is
currently Beta 2 (0.1.0).)

cuStateVec of cuQuantum is enabled by setting device='GPU' and the
cuStateVec_enable=True option.

Since cuQuantum is a beta version, there are some limitations:

- cuStateVec is not thread safe; multi-chunk parallelization (cache
  blocking) is done by a single thread (slow)
- Multi-shots parallelization is disabled (single thread, slow)
- Multi-shots batched optimization is not supported for cuStateVec

Co-authored-by: Christopher J. Wood
Co-authored-by: Hiroshi Horii
---
 CMakeLists.txt                                |    9 +
 CONTRIBUTING.md                               |   28 +
 .../providers/aer/backends/aer_simulator.py   |   11 +
 .../providers/aer/backends/qasm_simulator.py  |   19 +-
 .../cuQuantum-support-d33abe5b1cb778a8.yaml   |   13 +
 src/controllers/aer_controller.hpp            |   72 +-
 .../density_matrix/densitymatrix_state.hpp    |  425 ++-
 .../density_matrix/densitymatrix_thrust.hpp   |   36 +-
 src/simulators/state.hpp                      |    5 +-
 src/simulators/state_chunk.hpp                |  277 +-
 src/simulators/statevector/chunk/chunk.hpp    |   73 +-
 .../statevector/chunk/chunk_container.hpp     |  649 ++--
 .../statevector/chunk/chunk_manager.hpp       |   37 +-
 .../chunk/cuStateVec_chunk_container.hpp      |  860 ++++++
 .../statevector/chunk/cuda_kernels.hpp        |    2 +
 .../chunk/device_chunk_container.hpp          |   71 +-
 .../chunk/host_chunk_container.hpp            |   53 +-
 .../statevector/chunk/thrust_kernels.hpp      | 2699 +++++++++++++++++
 src/simulators/statevector/qubitvector.hpp    |   47 +
 .../statevector/qubitvector_thrust.hpp        | 2541 +---------------
 .../statevector/statevector_state.hpp         |  378 ++-
 src/simulators/unitary/unitary_state.hpp      |  138 +-
 .../unitary/unitarymatrix_thrust.hpp          |   43 +-
 .../backends/aer_simulator/test_options.py    |    4 +-
 .../test_wrapper_qasm_simulator.py            |    3 +
 test/terra/backends/simulator_test_case.py    |   37 +-
 26 files changed, 5214 insertions(+), 3316 deletions(-)
 create mode 100644 releasenotes/notes/cuQuantum-support-d33abe5b1cb778a8.yaml
 create mode 100644 src/simulators/statevector/chunk/cuStateVec_chunk_container.hpp
 create mode 100644 src/simulators/statevector/chunk/thrust_kernels.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4eeb296f69..79ad5f80db 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -257,6 +257,15 @@ if(AER_THRUST_SUPPORTED)
     set(AER_COMPILER_DEFINITIONS ${AER_COMPILER_DEFINITIONS} THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CUDA)
     set(THRUST_DEPENDENT_LIBS "")
+    if(CUSTATEVEC_ROOT)
+      set(AER_COMPILER_DEFINITIONS ${AER_COMPILER_DEFINITIONS} AER_CUSTATEVEC)
+      set(AER_COMPILER_FLAGS "${AER_COMPILER_FLAGS} -I${CUSTATEVEC_ROOT}/include")
+      if(CUSTATEVEC_STATIC)
+        set(THRUST_DEPENDENT_LIBS "-L${CUSTATEVEC_ROOT}/lib -L${CUSTATEVEC_ROOT}/lib64 -lcustatevec_static -L${CUDA_TOOLKIT_ROOT_DIR}/lib64 -lcublas")
+      else()
+        set(THRUST_DEPENDENT_LIBS "-L${CUSTATEVEC_ROOT}/lib -L${CUSTATEVEC_ROOT}/lib64 -lcustatevec")
+      endif()
+    endif()
   elseif(AER_THRUST_BACKEND STREQUAL "TBB")
     message(STATUS "TBB Support found!")
     set(THRUST_DEPENDENT_LIBS AER_DEPENDENCY_PKG::tbb)
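For orientation before the per-file diffs, here is a minimal usage sketch of the feature this patch adds, assuming Aer was built with `-DCUSTATEVEC_ROOT` as configured in the CMake change above. The `cuStateVec_enable` option and the `cuStateVec_enable`/`sample_measure_time` metadata keys are the ones this diff introduces; the exact metadata access path shown is an assumption, not part of the patch.

```python
# Minimal sketch (assumes a custom Aer build with
# -DAER_THRUST_BACKEND=CUDA -DCUSTATEVEC_ROOT=<path>); not part of the patch.
from qiskit import QuantumCircuit, transpile
from qiskit.providers.aer import AerSimulator

circ = QuantumCircuit(20)
circ.h(0)
for q in range(19):
    circ.cx(q, q + 1)
circ.measure_all()

# device='GPU' selects the Thrust/CUDA backend; cuStateVec_enable=True
# (default False, added by this patch) routes gates to cuStateVec kernels.
sim = AerSimulator(method='statevector', device='GPU', cuStateVec_enable=True)
result = sim.run(transpile(circ, sim), shots=100).result()

# The patch records whether cuStateVec was actually used in the experiment
# metadata; the access path below is assumed.
print(result.results[0].metadata.get('cuStateVec_enable'))
```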
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 976c93f7a0..8ae8bc9ac1 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -643,6 +643,34 @@ Few notes on GPU builds:
 3. We don't need NVIDIA® drivers for building, but we need them for running simulations
 4. Only Linux platforms are supported
 
+Qiskit Aer now supports cuQuantum, NVIDIA®'s optimized quantum computing APIs.
+The cuStateVec APIs can be exploited to accelerate the statevector, density_matrix and unitary methods.
+Because cuQuantum is currently a beta version, some operations are not accelerated by cuStateVec.
+
+To build Qiskit Aer with cuStateVec support, set the path of the cuQuantum root directory in CUSTATEVEC_ROOT as follows.
+
+For example,
+
+    qiskit-aer$ python ./setup.py bdist_wheel -- -DAER_THRUST_BACKEND=CUDA -DCUSTATEVEC_ROOT=path_to_cuQuantum
+
+If you want to link the cuQuantum library statically, set `CUSTATEVEC_STATIC` for setup.py.
+Otherwise, you also have to set the environment variable LD_LIBRARY_PATH to point to the cuQuantum libraries.
+
+To run with cuStateVec, set `device='GPU'` in the AerSimulator options and pass `cuStateVec_enable=True` as an option to the execute method.
+
+```
+sim = AerSimulator(method='statevector', device='GPU')
+results = execute(circuit, sim, cuStateVec_enable=True).result()
+```
+
+Density matrix and unitary matrix simulations can be accelerated in the same way:
+```
+sim = AerSimulator(method='density_matrix', device='GPU')
+results = execute(circuit, sim, cuStateVec_enable=True).result()
+```
+
+
 ### Building with MPI support
 
 Qiskit Aer can parallelize its simulation on the cluster systems by using MPI.
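The CONTRIBUTING examples above cover statevector and density_matrix; per the commit message, the unitary method is accelerated the same way. A hedged sketch, assuming the standard Aer `save_unitary`/`get_unitary` helpers behave as usual under this option:

```python
# Sketch: unitary simulation on GPU with cuStateVec (assumes a
# cuStateVec-enabled build; save_unitary/get_unitary are standard Aer APIs).
from qiskit import QuantumCircuit, execute
from qiskit.providers.aer import AerSimulator

circ = QuantumCircuit(4)
circ.h(0)
circ.cx(0, 1)
circ.save_unitary()  # store the final unitary in the result data

sim = AerSimulator(method='unitary', device='GPU')
result = execute(circ, sim, cuStateVec_enable=True).result()
print(result.get_unitary(circ))
```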
diff --git a/qiskit/providers/aer/backends/aer_simulator.py b/qiskit/providers/aer/backends/aer_simulator.py
index 4846a9e0f6..c9836b68b5 100644
--- a/qiskit/providers/aer/backends/aer_simulator.py
+++ b/qiskit/providers/aer/backends/aer_simulator.py
@@ -148,6 +148,10 @@ class AerSimulator(AerBackend):
     initialization or with :meth:`set_options`. The list of supported devices
     for the current system can be returned using :meth:`available_devices`.
 
+    If AerSimulator is built with cuStateVec support, the cuStateVec APIs are
+    enabled by setting ``cuStateVec_enable=True``. This is an experimental
+    implementation based on cuQuantum Beta 2.
+
     **Additional Backend Options**
 
     The following simulator specific backend options are supported
@@ -216,6 +220,11 @@ class AerSimulator(AerBackend):
       values (16 Bytes). If set to 0, the maximum will be automatically
       set to the system memory size (Default: 0).
 
+    * ``cuStateVec_enable`` (bool): This option enables acceleration by the
+      cuStateVec library of cuQuantum from NVIDIA, which has highly optimized
+      kernels for GPUs (Default: False). This option will be ignored if
+      AerSimulator is not built with cuStateVec support.
+
     * ``blocking_enable`` (bool): This option enables parallelization with
       multiple GPUs or multiple processes with MPI (CPU/GPU).
       This option is only available for ``"statevector"``, ``"density_matrix"`` and
@@ -514,6 +523,8 @@ def _default_options(cls):
             memory=None,
             noise_model=None,
             seed_simulator=None,
+            # cuStateVec (cuQuantum) option
+            cuStateVec_enable=False,
             # cache blocking for multi-GPUs/MPI options
             blocking_qubits=None,
             blocking_enable=False,
diff --git a/qiskit/providers/aer/backends/qasm_simulator.py b/qiskit/providers/aer/backends/qasm_simulator.py
index 9abbce9056..23ad8a4927 100644
--- a/qiskit/providers/aer/backends/qasm_simulator.py
+++ b/qiskit/providers/aer/backends/qasm_simulator.py
@@ -339,9 +339,9 @@ class QasmSimulator(AerBackend):
     }
 
     _SIMULATION_METHODS = [
-        'automatic', 'statevector', 'statevector_gpu',
+        'automatic', 'statevector', 'statevector_gpu', 'statevector_custatevec',
         'statevector_thrust', 'density_matrix',
-        'density_matrix_gpu', 'density_matrix_thrust',
+        'density_matrix_gpu', 'density_matrix_custatevec', 'density_matrix_thrust',
         'stabilizer', 'matrix_product_state', 'extended_stabilizer'
     ]
@@ -595,7 +595,8 @@ def _basis_gates(self):
     def _method_basis_gates(self):
         """Return method basis gates and custom instructions"""
         method = self._options.get('method', None)
-        if method in ['density_matrix', 'density_matrix_gpu', 'density_matrix_thrust']:
+        if method in ['density_matrix', 'density_matrix_gpu',
+                      'density_matrix_custatevec', 'density_matrix_thrust']:
             return sorted([
                 'u1', 'u2', 'u3', 'u', 'p', 'r', 'rx', 'ry', 'rz', 'id', 'x',
                 'y', 'z', 'h', 's', 'sdg', 'sx', 'sxdg', 't', 'tdg', 'swap', 'cx',
@@ -628,7 +629,8 @@ def _custom_instructions(self):
             return self._options_configuration['custom_instructions']
 
         method = self._options.get('method', None)
-        if method in ['statevector', 'statevector_gpu', 'statevector_thrust']:
+        if method in ['statevector', 'statevector_gpu',
+                      'statevector_custatevec', 'statevector_thrust']:
             return sorted([
                 'quantum_channel', 'qerror_loc', 'roerror', 'kraus', 'snapshot',
                 'save_expval', 'save_expval_var', 'save_probabilities', 'save_probabilities_dict',
                 'save_amplitudes', 'save_amplitudes_sq', 'save_state',
                 'save_density_matrix', 'save_statevector', 'save_statevector_dict',
                 'set_statevector'
             ])
-        if method in ['density_matrix', 'density_matrix_gpu', 'density_matrix_thrust']:
+        if method in ['density_matrix', 'density_matrix_gpu',
+                      'density_matrix_custatevec', 'density_matrix_thrust']:
             return sorted([
                 'quantum_channel', 'qerror_loc', 'roerror', 'kraus', 'superop', 'snapshot',
                 'save_expval', 'save_expval_var', 'save_probabilities', 'save_probabilities_dict',
@@ -666,10 +669,12 @@ def _custom_instructions(self):
     def _set_method_config(self, method=None):
         """Set non-basis gate options when setting method"""
         # Update configuration description and number of qubits
-        if method in ['statevector', 'statevector_gpu', 'statevector_thrust']:
+        if method in ['statevector', 'statevector_gpu',
+                      'statevector_custatevec', 'statevector_thrust']:
             description = 'A C++ statevector simulator with noise'
             n_qubits = MAX_QUBITS_STATEVECTOR
-        elif method in ['density_matrix', 'density_matrix_gpu', 'density_matrix_thrust']:
+        elif method in ['density_matrix', 'density_matrix_gpu',
+                        'density_matrix_custatevec', 'density_matrix_thrust']:
             description = 'A C++ density matrix simulator with noise'
             n_qubits = MAX_QUBITS_STATEVECTOR // 2
         elif method == 'matrix_product_state':
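For the legacy `QasmSimulator` interface changed above, the new `*_custatevec` names select the cuStateVec path through the method option, mirroring the existing `*_gpu` and `*_thrust` aliases. A minimal sketch, assuming the new names behave like those aliases:

```python
# Sketch: selecting cuStateVec via the legacy QasmSimulator method names
# added in this diff (assumes a cuStateVec-enabled build).
from qiskit.providers.aer import QasmSimulator

backend = QasmSimulator(method='statevector_custatevec')
```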
diff --git a/releasenotes/notes/cuQuantum-support-d33abe5b1cb778a8.yaml b/releasenotes/notes/cuQuantum-support-d33abe5b1cb778a8.yaml
new file mode 100644
index 0000000000..a302cda5fb
--- /dev/null
+++ b/releasenotes/notes/cuQuantum-support-d33abe5b1cb778a8.yaml
@@ -0,0 +1,13 @@
+---
+features:
+  - |
+    Added support for cuQuantum, NVIDIA's APIs for quantum computing,
+    to accelerate the statevector, density matrix and unitary simulators
+    by using GPUs.
+    This is an experimental implementation based on cuQuantum Beta 2 (0.1.0).
+    The cuStateVec APIs are used instead of Aer's own implementations
+    when Aer is built with the path of cuQuantum set in ``CUSTATEVEC_ROOT``.
+    (A binary distribution is not currently available.)
+    cuStateVec is enabled by setting ``device='GPU'`` and
+    ``cuStateVec_enable=True`` in the simulator options; it is ignored
+    if Aer is not built with cuStateVec support.
diff --git a/src/controllers/aer_controller.hpp b/src/controllers/aer_controller.hpp
index 2f1f35639f..c3f4f9aac9 100755
--- a/src/controllers/aer_controller.hpp
+++ b/src/controllers/aer_controller.hpp
@@ -377,6 +377,8 @@ class Controller {
   int_t batched_shots_gpu_max_qubits_ = 16;   //multi-shot parallelization is applied if qubits is less than max qubits
   bool enable_batch_multi_shots_ = false;     //multi-shot parallelization can be applied
 
+  //settings for cuStateVec
+  bool cuStateVec_enable_ = false;
 };
 
 //=========================================================================
@@ -466,6 +468,12 @@ void Controller::set_config(const json_t &config) {
     JSON::get_value(batched_shots_gpu_max_qubits_, "batched_shots_gpu_max_qubits", config);
   }
 
+  //cuStateVec configs
+  cuStateVec_enable_ = false;
+  if(JSON::check_key("cuStateVec_enable", config)) {
+    JSON::get_value(cuStateVec_enable_, "cuStateVec_enable", config);
+  }
+
   // Override automatic simulation method with a fixed method
   std::string method;
   if (JSON::get_value(method, "method", config)) {
@@ -489,6 +497,9 @@ void Controller::set_config(const json_t &config) {
     }
   }
 
+  if(method_ == Method::density_matrix || method_ == Method::unitary)
+    batched_shots_gpu_max_qubits_ /= 2;
+
   // Override automatic simulation method with a fixed method
   if (JSON::get_value(sim_device_name_, "device", config)) {
     if (sim_device_name_ == "CPU") {
@@ -502,18 +513,37 @@ void Controller::set_config(const json_t &config) {
 #endif
     } else if (sim_device_name_ == "GPU") {
 #ifndef AER_THRUST_CUDA
-      throw std::runtime_error(
-          "Simulation device \"GPU\" is not supported on this system");
+        throw std::runtime_error(
+            "Simulation device \"GPU\" is not supported on this system");
 #else
-      int nDev;
-      if (cudaGetDeviceCount(&nDev) != cudaSuccess) {
-        cudaGetLastError();
-        throw std::runtime_error("No CUDA device available!");
-      }
-      sim_device_ = Device::GPU;
+#ifndef AER_CUSTATEVEC
+      if(cuStateVec_enable_){
+        //Aer is not built for cuStateVec
+        throw std::runtime_error(
+            "Simulation device \"GPU\" does not support cuStateVec on this system");
+      }
 #endif
+      int nDev;
+      if (cudaGetDeviceCount(&nDev) != cudaSuccess) {
+        cudaGetLastError();
+        throw std::runtime_error("No CUDA device available!");
+      }
+      sim_device_ = Device::GPU;
+
+#ifdef AER_CUSTATEVEC
+      if(cuStateVec_enable_){
+        //initialize the cuStateVec handle once before the actual calculation
+        //(the first call takes a long time)
+        custatevecStatus_t err;
+        custatevecHandle_t stHandle;
+        err = custatevecCreate(&stHandle);
+        if(err == CUSTATEVEC_STATUS_SUCCESS){
+          custatevecDestroy(stHandle);
+        }
+      }
+#endif
+#endif
     } else {
       throw std::runtime_error(std::string("Invalid simulation device (\"") +
                                sim_device_name_ + std::string("\")."));
@@ -636,9 +666,16 @@ void Controller::set_parallelization_circuit(const Circuit
&circ, const Method method) { enable_batch_multi_shots_ = false; - if(batched_shots_gpu_ && sim_device_ == Device::GPU && circ.shots > 1 && max_batched_states_ >= num_gpus_ && - batched_shots_gpu_max_qubits_ >= circ.num_qubits ){ - enable_batch_multi_shots_ = true; + if(batched_shots_gpu_ && sim_device_ == Device::GPU && + circ.shots > 1 && max_batched_states_ >= num_gpus_ && + batched_shots_gpu_max_qubits_ >= circ.num_qubits ){ + enable_batch_multi_shots_ = true; + } + + if(sim_device_ == Device::GPU && cuStateVec_enable_){ + enable_batch_multi_shots_ = false; //cuStateVec does not support batch execution of multi-shots + parallel_shots_ = 1; //cuStateVec is currently not thread safe + return; } if(explicit_parallelization_) @@ -785,6 +822,7 @@ size_t Controller::get_gpu_memory_mb() { } num_gpus_ = nDev; #endif + #ifdef AER_MPI // get minimum memory size per process uint64_t locMem, minMem; @@ -866,7 +904,6 @@ Result Controller::execute(const inputdata_t &input_qobj) { auto time_taken = std::chrono::duration(myclock_t::now() - timer_start).count(); result.metadata.add(time_taken, "time_taken"); - return result; } catch (std::exception &e) { // qobj was invalid, return valid output containing error message @@ -959,7 +996,7 @@ Result Controller::execute(std::vector &circuits, const int NUM_RESULTS = result.results.size(); //following looks very similar but we have to separate them to avoid omp nested loops that causes performance degradation //(DO NOT use if statement in #pragma omp) - if (parallel_experiments_ == 1) { + if (parallel_experiments_ == 1 || sim_device_ == Device::ThrustCPU) { for (int j = 0; j < NUM_RESULTS; ++j) { set_parallelization_circuit(circuits[j], noise_model, methods[j]); run_circuit(circuits[j], noise_model,methods[j], @@ -1439,7 +1476,7 @@ void Controller::run_circuit_without_sampled_noise(Circuit &circ, // Check if measure sampler and optimization are valid if (can_sample) { // Implement measure sampler - if (parallel_shots_ <= 1) { + if (parallel_shots_ <= 1 || sim_device_ == Device::GPU || sim_device_ == Device::ThrustCPU) { state.set_max_matrix_qubits(max_bits); RngEngine rng; rng.set_seed(circ.seed); @@ -1460,7 +1497,7 @@ void Controller::run_circuit_without_sampled_noise(Circuit &circ, shot_state.set_parallelization(parallel_state_update_); shot_state.set_global_phase(circ.global_phase_angle); - state.set_max_matrix_qubits(max_bits); + shot_state.set_max_matrix_qubits(max_bits); RngEngine rng; rng.set_seed(circ.seed + i); @@ -1736,7 +1773,12 @@ void Controller::measure_sampler( shots_or_index = shots; else shots_or_index = shot_index; + + auto timer_start = myclock_t::now(); auto all_samples = state.sample_measure(meas_qubits, shots_or_index, rng); + auto time_taken = + std::chrono::duration(myclock_t::now() - timer_start).count(); + result.metadata.add(time_taken, "sample_measure_time"); // Make qubit map of position in vector of measured qubits std::unordered_map qubit_map; diff --git a/src/simulators/density_matrix/densitymatrix_state.hpp b/src/simulators/density_matrix/densitymatrix_state.hpp index 5804321769..dcce3e8e09 100644 --- a/src/simulators/density_matrix/densitymatrix_state.hpp +++ b/src/simulators/density_matrix/densitymatrix_state.hpp @@ -443,20 +443,38 @@ void State::initialize_qreg(uint_t num_qubits, if(BaseState::multi_chunk_distribution_){ auto input = state.copy_to_matrix(); -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) - for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << 
(BaseState::chunk_bits_); - uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); - - //copy part of state for this chunk - uint_t i,row,col; - cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); - for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ - uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); - uint_t irow = i >> (BaseState::chunk_bits_); - tmp[i] = input[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(iChunk) + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); + for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ + uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); + uint_t irow = i >> (BaseState::chunk_bits_); + tmp[i] = input[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); + } + } + else{ + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); + for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ + uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); + uint_t irow = i >> (BaseState::chunk_bits_); + tmp[i] = input[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); } - BaseState::qregs_[iChunk].initialize_from_vector(tmp); } } else{ @@ -485,20 +503,38 @@ void State::initialize_qreg(uint_t num_qubits, } if(BaseState::multi_chunk_distribution_){ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) - for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); - uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); - - //copy part of state for this chunk - uint_t i,row,col; - cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); - for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ - uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); - uint_t irow = i >> (BaseState::chunk_bits_); - tmp[i] = state[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(iChunk) + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); + for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ + uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); + uint_t irow = i >> (BaseState::chunk_bits_); + 
tmp[i] = state[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); + } + } + else{ + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); + for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ + uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); + uint_t irow = i >> (BaseState::chunk_bits_); + tmp[i] = state[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); } - BaseState::qregs_[iChunk].initialize_from_vector(tmp); } } else{ @@ -526,20 +562,38 @@ void State::initialize_qreg(uint_t num_qubits, } if(BaseState::multi_chunk_distribution_){ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) - for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); - uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); - - //copy part of state for this chunk - uint_t i,row,col; - cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); - for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ - uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); - uint_t irow = i >> (BaseState::chunk_bits_); - tmp[i] = state[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(iChunk) + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); + for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ + uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); + uint_t irow = i >> (BaseState::chunk_bits_); + tmp[i] = state[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); + } + } + else{ + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << (BaseState::chunk_bits_*2)); + for(i=0;i<(1ull << (BaseState::chunk_bits_*2));i++){ + uint_t icol = i & ((1ull << (BaseState::chunk_bits_))-1); + uint_t irow = i >> (BaseState::chunk_bits_); + tmp[i] = state[icol_chunk + icol + ((irow_chunk + irow) << (BaseState::num_qubits_))]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); } - BaseState::qregs_[iChunk].initialize_from_vector(tmp); } } else{ @@ -569,21 +623,40 @@ void State::initialize_from_vector(const int_t iChunkIn, const list_t else if((1ull << (BaseState::num_qubits_*2)) == vec.size() * vec.size()) { int_t iChunk; if(BaseState::multi_chunk_distribution_){ -#pragma omp parallel for 
if(BaseState::chunk_omp_parallel_) private(iChunk) - for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); - uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); - - //copy part of state for this chunk - uint_t i,row,col; - list_t vec1(1ull << BaseState::chunk_bits_); - list_t vec2(1ull << BaseState::chunk_bits_); - - for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ - vec1[i] = vec[(irow_chunk << BaseState::chunk_bits_) + i]; - vec2[i] = std::conj(vec[(icol_chunk << BaseState::chunk_bits_) + i]); + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(iChunk) + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + list_t vec1(1ull << BaseState::chunk_bits_); + list_t vec2(1ull << BaseState::chunk_bits_); + + for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ + vec1[i] = vec[(irow_chunk << BaseState::chunk_bits_) + i]; + vec2[i] = std::conj(vec[(icol_chunk << BaseState::chunk_bits_) + i]); + } + BaseState::qregs_[iChunk].initialize_from_vector(AER::Utils::tensor_product(vec1, vec2)); + } + } + else{ + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))) << (BaseState::chunk_bits_); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)) << (BaseState::chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + list_t vec1(1ull << BaseState::chunk_bits_); + list_t vec2(1ull << BaseState::chunk_bits_); + + for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ + vec1[i] = vec[(irow_chunk << BaseState::chunk_bits_) + i]; + vec2[i] = std::conj(vec[(icol_chunk << BaseState::chunk_bits_) + i]); + } + BaseState::qregs_[iChunk].initialize_from_vector(AER::Utils::tensor_product(vec1, vec2)); } - BaseState::qregs_[iChunk].initialize_from_vector(AER::Utils::tensor_product(vec1, vec2)); } } else{ @@ -876,38 +949,76 @@ double State::expval_pauli(const int_t iChunk, const reg_t &qubits, const uint_t mask_u = ~((1ull << (x_max + 1)) - 1); const uint_t mask_l = (1ull << x_max) - 1; -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) reduction(+:expval) - for(i=0;i iChunk){ //on this process - double sign = 2.0; - if (z_mask && (AER::Utils::popcount(irow & z_mask) & 1)) - sign = -2.0; - expval += sign * BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli_non_diagonal_chunk(qubits_in_chunk, pauli_in_chunk,phase); + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(i) reduction(+:expval) + for(i=0;i iChunk){ //on this process + double sign = 2.0; + if (z_mask && (AER::Utils::popcount(irow & z_mask) & 1)) + sign = -2.0; + expval += sign * BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli_non_diagonal_chunk(qubits_in_chunk, pauli_in_chunk,phase); + } + } + } + else{ + for(i=0;i iChunk){ //on this process + double sign = 2.0; + if (z_mask && (AER::Utils::popcount(irow & z_mask) & 1)) + sign = -2.0; + expval += sign * BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli_non_diagonal_chunk(qubits_in_chunk, pauli_in_chunk,phase); + } } } } else{ -#pragma omp parallel for 
if(BaseState::chunk_omp_parallel_) private(i) reduction(+:expval) + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(i) reduction(+:expval) + for(i=0;i iChunk){ //on this process + double sign = 1.0; + if (z_mask && (AER::Utils::popcount(i & z_mask) & 1)) + sign = -1.0; + expval += sign * BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits_in_chunk, pauli_in_chunk,1.0); + } + } + } + else{ + for(i=0;i iChunk){ //on this process + double sign = 1.0; + if (z_mask && (AER::Utils::popcount(i & z_mask) & 1)) + sign = -1.0; + expval += sign * BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits_in_chunk, pauli_in_chunk,1.0); + } + } + } + } + } + else{ //all bits are inside chunk + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(i) reduction(+:expval) for(i=0;i iChunk){ //on this process - double sign = 1.0; - if (z_mask && (AER::Utils::popcount(i & z_mask) & 1)) - sign = -1.0; - expval += sign * BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits_in_chunk, pauli_in_chunk,1.0); + expval += BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits, pauli,1.0); } } } - } - else{ //all bits are inside chunk -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) reduction(+:expval) - for(i=0;i iChunk){ //on this process - expval += BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits, pauli,1.0); + else{ + for(i=0;i iChunk){ //on this process + expval += BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits, pauli,1.0); + } } } } @@ -1344,7 +1455,7 @@ void State::apply_gate_u3(const int_t iChunk, uint_t qubit, double th template void State::apply_diagonal_unitary_matrix(const int_t iChunk, const reg_t &qubits, const cvector_t & diag) { - if(BaseState::thrust_optimization_){ + if(BaseState::thrust_optimization_ || !BaseState::multi_chunk_distribution_){ //GPU computes all chunks in one kernel, so pass qubits and diagonal matrix as is BaseState::qregs_[iChunk].apply_diagonal_unitary_matrix(qubits,diag); } @@ -1441,51 +1552,99 @@ rvector_t State::measure_probs(const int_t iChunk, const reg_t &qubit } } -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i,j,k) - for(i=0;i> ((BaseState::num_qubits_ - BaseState::chunk_bits_)); - icol = (BaseState::global_chunk_index_ + i) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); - - if(irow == icol){ //diagonal chunk - if(qubits_in_chunk.size() > 0){ - auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); - if(qubits_in_chunk.size() == qubits.size()){ - for(j=0;j> ((BaseState::num_qubits_ - BaseState::chunk_bits_)); + icol = (BaseState::global_chunk_index_ + i) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + + if(irow == icol){ //diagonal chunk + if(qubits_in_chunk.size() > 0){ + auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); + if(qubits_in_chunk.size() == qubits.size()){ + for(j=0;j> i_in) & 1) << k); - i_in++; - } - else{ - if((((i + BaseState::global_chunk_index_) << (BaseState::chunk_bits_)) >> qubits[k]) & 1){ - idx += 1ull << k; + else{ + for(j=0;j> i_in) & 1) << k); + i_in++; + } + else{ + if((((i + BaseState::global_chunk_index_) << (BaseState::chunk_bits_)) >> qubits[k]) & 1){ + idx += 1ull << k; + } } } - } #pragma omp atomic - sum[idx] += chunkSum[j]; + sum[idx] += chunkSum[j]; + } } } + else{ //there is no bit in chunk + auto tr = 
std::real(BaseState::qregs_[i].trace()); + int idx = 0; + for(k=0;k> qubits_out_chunk[k]) & 1){ + idx += 1ull << k; + } + } +#pragma omp atomic + sum[idx] += tr; + } } - else{ //there is no bit in chunk - auto tr = std::real(BaseState::qregs_[i].trace()); - int idx = 0; - for(k=0;k> qubits_out_chunk[k]) & 1){ - idx += 1ull << k; + } + } + else{ + for(i=0;i> ((BaseState::num_qubits_ - BaseState::chunk_bits_)); + icol = (BaseState::global_chunk_index_ + i) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + + if(irow == icol){ //diagonal chunk + if(qubits_in_chunk.size() > 0){ + auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); + if(qubits_in_chunk.size() == qubits.size()){ + for(j=0;j> i_in) & 1) << k); + i_in++; + } + else{ + if((((i + BaseState::global_chunk_index_) << (BaseState::chunk_bits_)) >> qubits[k]) & 1){ + idx += 1ull << k; + } + } + } + sum[idx] += chunkSum[j]; + } } } -#pragma omp atomic - sum[idx] += tr; + else{ //there is no bit in chunk + auto tr = std::real(BaseState::qregs_[i].trace()); + int idx = 0; + for(k=0;k> qubits_out_chunk[k]) & 1){ + idx += 1ull << k; + } + } + sum[idx] += tr; + } } } } @@ -1531,9 +1690,14 @@ void State::measure_reset_update(const int_t iChunk, const reg_t &qub if(!BaseState::multi_chunk_distribution_) apply_diagonal_unitary_matrix(iChunk, qubits, mdiag); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i::measure_reset_update(const int_t iChunk, const reg_t &qub BaseState::qregs_[iChunk].apply_x(qubits[0]); else{ if(qubits[0] < BaseState::chunk_bits_){ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i::measure_reset_update(const int_t iChunk, const reg_t &qub if(!BaseState::multi_chunk_distribution_) apply_diagonal_unitary_matrix(iChunk, qubits, mdiag); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i::measure_reset_update(const int_t iChunk, const reg_t &qub } } if(qubits_in_chunk.size() > 0){ //in chunk exchange -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i 0){ //out of chunk exchange diff --git a/src/simulators/density_matrix/densitymatrix_thrust.hpp b/src/simulators/density_matrix/densitymatrix_thrust.hpp index 810ef2056b..7eaddb75c1 100755 --- a/src/simulators/density_matrix/densitymatrix_thrust.hpp +++ b/src/simulators/density_matrix/densitymatrix_thrust.hpp @@ -262,7 +262,7 @@ void DensityMatrixThrust::apply_diagonal_superop_matrix(const reg_t &qub template -class DensityMatrixUnitary2x2 : public GateFuncBase +class DensityMatrixUnitary2x2 : public Chunk::GateFuncBase { protected: thrust::complex m0,m1,m2,m3; @@ -364,7 +364,7 @@ void DensityMatrixThrust::apply_unitary_matrix(const reg_t &qubits, } template -class DensityDiagMatMult2x2 : public GateFuncBase +class DensityDiagMatMult2x2 : public Chunk::GateFuncBase { protected: uint_t offset; @@ -429,7 +429,7 @@ class DensityDiagMatMult2x2 : public GateFuncBase }; template -class DensityDiagMatMultNxN : public GateFuncBase +class DensityDiagMatMultNxN : public Chunk::GateFuncBase { protected: int nqubits_; @@ -512,7 +512,7 @@ void DensityMatrixThrust::apply_diagonal_unitary_matrix(const reg_t &qub // Apply Specialized Gates //----------------------------------------------------------------------- template -class DensityCX : public GateFuncBase +class DensityCX : public Chunk::GateFuncBase { protected: uint_t offset; @@ -599,7 +599,7 @@ void DensityMatrixThrust::apply_cnot(const uint_t qctrl, const uint_t qt } template -class 
DensityPhase : public GateFuncBase +class DensityPhase : public Chunk::GateFuncBase { protected: thrust::complex phase_; @@ -665,7 +665,7 @@ void DensityMatrixThrust::apply_phase(const uint_t q,const complex_t &ph } template -class DensityCPhase : public GateFuncBase +class DensityCPhase : public Chunk::GateFuncBase { protected: uint_t offset; @@ -753,7 +753,7 @@ void DensityMatrixThrust::apply_swap(const uint_t q0, const uint_t q1) { } template -class DensityX : public GateFuncBase +class DensityX : public Chunk::GateFuncBase { protected: uint_t mask0; @@ -829,7 +829,7 @@ void DensityMatrixThrust::apply_x(const uint_t qubit) } template -class DensityY : public GateFuncBase +class DensityY : public Chunk::GateFuncBase { protected: uint_t mask0; @@ -929,7 +929,7 @@ void DensityMatrixThrust::apply_toffoli(const uint_t qctrl0, //special case Z only template -class expval_pauli_Z_func_dm : public GateFuncBase +class expval_pauli_Z_func_dm : public Chunk::GateFuncBase { protected: uint_t z_mask_; @@ -966,7 +966,7 @@ class expval_pauli_Z_func_dm : public GateFuncBase ret = q0.real(); if(z_mask_ != 0){ - if(pop_count_kernel(i & z_mask_) & 1) + if(Chunk::pop_count_kernel(i & z_mask_) & 1) ret = -ret; } @@ -979,7 +979,7 @@ class expval_pauli_Z_func_dm : public GateFuncBase }; template -class expval_pauli_XYZ_func_dm : public GateFuncBase +class expval_pauli_XYZ_func_dm : public Chunk::GateFuncBase { protected: uint_t x_mask_; @@ -1026,7 +1026,7 @@ class expval_pauli_XYZ_func_dm : public GateFuncBase q0 = 2 * phase_ * q0; ret = q0.real(); if(z_mask_ != 0){ - if(pop_count_kernel(idx_vec & z_mask_) & 1) + if(Chunk::pop_count_kernel(idx_vec & z_mask_) & 1) ret = -ret; } return ret; @@ -1067,7 +1067,7 @@ double DensityMatrixThrust::expval_pauli(const reg_t &qubits, } template -class expval_pauli_XYZ_func_dm_non_diagonal : public GateFuncBase +class expval_pauli_XYZ_func_dm_non_diagonal : public Chunk::GateFuncBase { protected: uint_t x_mask_; @@ -1108,7 +1108,7 @@ class expval_pauli_XYZ_func_dm_non_diagonal : public GateFuncBase q0 = phase_ * q0; ret = q0.real(); if(z_mask_ != 0){ - if(pop_count_kernel(i & z_mask_) & 1) + if(Chunk::pop_count_kernel(i & z_mask_) & 1) ret = -ret; } return ret; @@ -1151,7 +1151,7 @@ double DensityMatrixThrust::probability(const uint_t outcome) const template -class density_probability_func : public GateFuncBase +class density_probability_func : public Chunk::GateFuncBase { protected: uint_t qubit_sp_; @@ -1257,7 +1257,7 @@ reg_t DensityMatrixThrust::sample_measure(const std::vector &rnd } template -class density_reset_after_measure_func : public GateFuncBase +class density_reset_after_measure_func : public Chunk::GateFuncBase { protected: uint_t num_qubits_; @@ -1325,7 +1325,7 @@ void DensityMatrixThrust::apply_batched_measure(const reg_t& qubits,std: count = BaseVector::chunk_.container()->num_chunks(); //total probability - BaseVector::apply_function_sum(nullptr,trace_func(BaseMatrix::rows_),true); + BaseVector::apply_function_sum(nullptr,Chunk::trace_func(BaseMatrix::rows_),true); BaseVector::apply_function(set_probability_buffer_for_reset_func(BaseVector::chunk_.probability_buffer(),BaseVector::chunk_.container()->num_chunks(), BaseVector::chunk_.reduce_buffer(),BaseVector::chunk_.reduce_buffer_size()) ); @@ -1374,7 +1374,7 @@ void DensityMatrixThrust::apply_batched_measure(const reg_t& qubits,std: } template -class density_reset_func : public GateFuncBase +class density_reset_func : public Chunk::GateFuncBase { protected: uint_t num_qubits_; diff --git 
a/src/simulators/state.hpp b/src/simulators/state.hpp index 4547ec02fa..c07b5e99df 100644 --- a/src/simulators/state.hpp +++ b/src/simulators/state.hpp @@ -355,8 +355,9 @@ State::~State(void) } template -void State::set_config(const json_t &config) { - (ignore_argument)config; +void State::set_config(const json_t &config) +{ + } template diff --git a/src/simulators/state_chunk.hpp b/src/simulators/state_chunk.hpp index 20ac3b8657..85571f98e0 100644 --- a/src/simulators/state_chunk.hpp +++ b/src/simulators/state_chunk.hpp @@ -391,6 +391,9 @@ class StateChunk : public State { reg_t top_chunk_of_group_; reg_t num_chunks_in_group_; + //cuStateVec settings + bool cuStateVec_enable_ = false; + //----------------------------------------------------------------------- // Apply circuits and ops //----------------------------------------------------------------------- @@ -508,6 +511,12 @@ class StateChunk : public State { uint_t mapped_index(const uint_t idx); + //apply OpenMP parallelization if enabled + template + void apply_omp_parallel(bool enabled, int_t i_begin, int_t i_end, Lambda& func); + + template + double apply_omp_parallel_reduction(bool enabled, int_t i_begin, int_t i_end, Lambda& func); }; @@ -526,8 +535,16 @@ StateChunk::~StateChunk(void) } template -void StateChunk::set_config(const json_t &config) { - (ignore_argument)config; +void StateChunk::set_config(const json_t &config) +{ + BaseState::set_config(config); + +#ifdef AER_CUSTATEVEC + //cuStateVec configs + if(JSON::check_key("cuStateVec_enable", config)) { + JSON::get_value(cuStateVec_enable_, "cuStateVec_enable", config); + } +#endif } template @@ -557,6 +574,38 @@ void StateChunk::set_distribution(uint_t nprocs) #endif } +template +template +void StateChunk::apply_omp_parallel(bool enabled, int_t i_begin, int_t i_end, Lambda& func) +{ + if(enabled){ +#pragma omp parallel for + for(int_t i=i_begin;i +template +double StateChunk::apply_omp_parallel_reduction(bool enabled, int_t i_begin, int_t i_end, Lambda& func) +{ + double val = 0.0; + if(enabled){ +#pragma omp parallel for reduction(+:val) + for(int_t i=i_begin;i bool StateChunk::allocate(uint_t num_qubits,uint_t block_bits,uint_t num_parallel_shots) { @@ -617,15 +666,26 @@ bool StateChunk::allocate(uint_t num_qubits,uint_t block_bits,uint_t nu chunk_omp_parallel_ = false; if(qregs_[0].name().find("gpu") != std::string::npos){ #ifdef _OPENMP - if(multi_chunk_distribution_){ - if(omp_get_num_threads() == 1) - chunk_omp_parallel_ = true; + if(omp_get_num_threads() == 1) + chunk_omp_parallel_ = true; +#endif + +#ifdef AER_CUSTATEVEC + //set cuStateVec_enable_ + if(cuStateVec_enable_){ + if(multi_shots_parallelization_) + cuStateVec_enable_ = false; //multi-shots parallelization is not supported for cuStateVec } + + if(cuStateVec_enable_) + chunk_omp_parallel_ = false; //because cuStateVec is not thread safe + else + thrust_optimization_ = true; //cuStateVec does not handle global chunk index for diagonal matrix #endif - thrust_optimization_ = true; } else if(qregs_[0].name().find("thrust") != std::string::npos){ thrust_optimization_ = true; + chunk_omp_parallel_ = false; } @@ -656,7 +716,8 @@ bool StateChunk::allocate_qregs(uint_t num_chunks) uint_t chunk_id = multi_chunk_distribution_ ? 
global_chunk_index_ : 0; bool ret = true; qregs_[0].set_max_matrix_bits(BaseState::max_matrix_qubits_); - ret &= qregs_[0].chunk_setup(chunk_bits_*qubit_scale(),num_qubits_*qubit_scale(),chunk_id,num_chunks); + qregs_[0].cuStateVec_enable(cuStateVec_enable_); + ret &= qregs_[0].chunk_setup(chunk_bits_*qubit_scale(), num_qubits_*qubit_scale(), chunk_id, num_chunks); for(i=1;i::apply_ops(InputIterator first, InputIterator last, } } } + + qregs_[0].synchronize(); + +#ifdef AER_CUSTATEVEC + result.metadata.add(cuStateVec_enable_, "cuStateVec_enable"); +#endif } template @@ -801,6 +868,11 @@ void StateChunk::apply_ops_chunks(InputIterator first, InputIterator la } iOp++; } + + qregs_[0].synchronize(); +#ifdef AER_CUSTATEVEC + result.metadata.add(cuStateVec_enable_, "cuStateVec_enable"); +#endif } template @@ -868,42 +940,23 @@ void StateChunk::apply_ops_multi_shots(InputIterator first, InputIterat //resize qregs allocate_qregs(n_shots); } - //initialization (equivalent to initialize_qreg + initialize_creg) - if(num_groups_ > 1 && chunk_omp_parallel_){ -#pragma omp parallel for - for(i=0;i 1 && chunk_omp_parallel_),0,num_groups_,init_group); - for(uint_t j=top_chunk_of_group_[i];j::initialize_from_vector(const int_t iChunkIn, const lis int_t iChunk; if(multi_chunk_distribution_){ -#pragma omp parallel for if(chunk_omp_parallel_) private(iChunk) - for(iChunk=0;iChunk::initialize_from_matrix(const int_t iChunkIn, const lis { int_t iChunk; if(multi_chunk_distribution_){ -#pragma omp parallel for if(chunk_omp_parallel_) private(iChunk) - for(iChunk=0;iChunk> ((num_qubits_ - chunk_bits_))) << (chunk_bits_); - uint_t icol_chunk = ((iChunk + global_chunk_index_) & ((1ull << ((num_qubits_ - chunk_bits_)))-1)) << (chunk_bits_); - - //copy part of state for this chunk - uint_t i,row,col; - for(i=0;i<(1ull << (chunk_bits_*qubit_scale()));i++){ - uint_t icol = i & ((1ull << chunk_bits_)-1); - uint_t irow = i >> chunk_bits_; - tmp[i] = mat[icol_chunk + icol + ((irow_chunk + irow) << num_qubits_)]; + if(chunk_omp_parallel_){ +#pragma omp parallel for private(iChunk) + for(iChunk=0;iChunk> ((num_qubits_ - chunk_bits_))) << (chunk_bits_); + uint_t icol_chunk = ((iChunk + global_chunk_index_) & ((1ull << ((num_qubits_ - chunk_bits_)))-1)) << (chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + for(i=0;i<(1ull << (chunk_bits_*qubit_scale()));i++){ + uint_t icol = i & ((1ull << chunk_bits_)-1); + uint_t irow = i >> chunk_bits_; + tmp[i] = mat[icol_chunk + icol + ((irow_chunk + irow) << num_qubits_)]; + } + qregs_[iChunk].initialize_from_matrix(tmp); + } + } + else{ + for(iChunk=0;iChunk> ((num_qubits_ - chunk_bits_))) << (chunk_bits_); + uint_t icol_chunk = ((iChunk + global_chunk_index_) & ((1ull << ((num_qubits_ - chunk_bits_)))-1)) << (chunk_bits_); + + //copy part of state for this chunk + uint_t i,row,col; + for(i=0;i<(1ull << (chunk_bits_*qubit_scale()));i++){ + uint_t icol = i & ((1ull << chunk_bits_)-1); + uint_t irow = i >> chunk_bits_; + tmp[i] = mat[icol_chunk + icol + ((irow_chunk + irow) << num_qubits_)]; + } + qregs_[iChunk].initialize_from_matrix(tmp); } - qregs_[iChunk].initialize_from_matrix(tmp); } } else{ @@ -1611,49 +1693,26 @@ void StateChunk::apply_chunk_swap(const reg_t &qubits) nPair = num_local_chunks_ >> 2; } - if(chunk_omp_parallel_){ -#pragma omp parallel for private(iPair,baseChunk,iChunk1,iChunk2) - for(iPair=0;iPair::apply_chunk_x(const uint_t qubit) if(qubit < chunk_bits_*qubit_scale()){ - reg_t qubits(1,qubit); -#pragma omp parallel for 
if(chunk_omp_parallel_ && num_groups_ > 1) - for(int_t ig=0;ig 1),0,num_groups_,apply_mcx); } else{ //exchange over chunks int_t iPair; @@ -1792,16 +1852,17 @@ void StateChunk::apply_chunk_x(const uint_t qubit) if(distributed_procs_ == 1 || (proc_bits >= 0 && qubit < (num_qubits_*qubit_scale() - proc_bits))){ //no data transfer between processes is needed nPair = num_local_chunks_ >> 1; -#pragma omp parallel for if(chunk_omp_parallel_) private(iPair,baseChunk,iChunk1,iChunk2) - for(iPair=0;iPair> cc,uint_t pos) @@ -54,6 +61,7 @@ class Chunk num_qubits_ = 0; chunk_index_ = 0; mapped_ = false; + cache_ = nullptr; } Chunk(Chunk& chunk) //map chunk from exisiting chunk (used fo cache chunk) { @@ -63,9 +71,12 @@ class Chunk num_qubits_ = chunk.num_qubits_; chunk_index_ = chunk.chunk_index_; mapped_ = true; + cache_ = nullptr; } ~Chunk() { + if(cache_) + cache_.reset(); } void set_device(void) const @@ -256,9 +267,13 @@ class Chunk return chunk_container_.lock()->sample_measure(chunk_pos_,rnds,stride,dot,count); } - thrust::complex norm(uint_t count=1,uint_t stride = 1,bool dot = true) const + double norm(uint_t count) const { - return chunk_container_.lock()->norm(chunk_pos_,count,stride,dot); + return chunk_container_.lock()->norm(chunk_pos_,count); + } + double trace(uint_t row, uint_t count) const + { + return chunk_container_.lock()->trace(chunk_pos_,row,count); } #ifdef AER_THRUST_CUDA @@ -349,10 +364,64 @@ class Chunk chunk_container_.lock()->keep_conditional(keep); } + //apply matrix + void apply_matrix(const reg_t& qubits,const int_t control_bits,const cvector_t &mat,const uint_t count) + { + chunk_container_.lock()->apply_matrix(chunk_pos_,qubits,control_bits,mat,count); + } + //apply diagonal matrix + void apply_diagonal_matrix(const reg_t& qubits,const int_t control_bits,const cvector_t &diag,const uint_t count) + { + chunk_container_.lock()->apply_diagonal_matrix(chunk_pos_,qubits,control_bits,diag,count); + } + //apply (controlled) X + void apply_X(const reg_t& qubits,const uint_t count) + { + chunk_container_.lock()->apply_X(chunk_pos_,qubits,count); + } + //apply (controlled) Y + void apply_Y(const reg_t& qubits,const uint_t count) + { + chunk_container_.lock()->apply_Y(chunk_pos_,qubits,count); + } + //apply (controlled) phase + void apply_phase(const reg_t& qubits,const int_t control_bits,const std::complex phase,const uint_t count) + { + chunk_container_.lock()->apply_phase(chunk_pos_,qubits,control_bits,phase,count); + } + //apply (controlled) swap gate + void apply_swap(const reg_t& qubits,const int_t control_bits,const uint_t count) + { + chunk_container_.lock()->apply_swap(chunk_pos_,qubits,control_bits,count); + } + //apply permutation + void apply_permutation(const reg_t& qubits,const std::vector> &pairs, const uint_t count) + { + chunk_container_.lock()->apply_permutation(chunk_pos_,qubits,pairs,count); + } + + //apply rotation around axis + void apply_rotation(const reg_t &qubits, const Rotation r, const double theta, const uint_t count) + { + chunk_container_.lock()->apply_rotation(chunk_pos_,qubits,r,theta,count); + } + + //get probabilities of chunk + void probabilities(std::vector& probs, const reg_t& qubits) const + { + chunk_container_.lock()->probabilities(probs, chunk_pos_,qubits); + } + //Pauli expectation values + double expval_pauli(const reg_t& qubits,const std::string &pauli,const complex_t initial_phase) const + { + return chunk_container_.lock()->expval_pauli(chunk_pos_,qubits,pauli,initial_phase); + } + }; 
//------------------------------------------------------------------------------ +} // end namespace Chunk } // end namespace QV } // end namespace AER //------------------------------------------------------------------------------ diff --git a/src/simulators/statevector/chunk/chunk_container.hpp b/src/simulators/statevector/chunk/chunk_container.hpp index 5fd68798e4..69604d6e55 100644 --- a/src/simulators/statevector/chunk/chunk_container.hpp +++ b/src/simulators/statevector/chunk/chunk_container.hpp @@ -63,8 +63,11 @@ DISABLE_WARNING_POP #include "simulators/statevector/chunk/cuda_kernels.hpp" #endif +#include "simulators/statevector/chunk/thrust_kernels.hpp" + namespace AER { namespace QV { +namespace Chunk { template class Chunk; template class DeviceChunkContainer; @@ -77,391 +80,6 @@ struct BlockedGateParams unsigned char qubit_; }; -//======================================== -// base class of gate functions -//======================================== -template -class GateFuncBase -{ -protected: - thrust::complex* data_; //pointer to state vector buffer - thrust::complex* matrix_; //storage for matrix on device - uint_t* params_; //storage for additional parameters on device - uint_t base_index_; //start index of state vector - uint_t chunk_bits_; - uint_t* cregs_; - uint_t num_creg_bits_; - int_t conditional_bit_; -#ifndef AER_THRUST_CUDA - uint_t index_offset_; -#endif -public: - GateFuncBase() - { - data_ = NULL; - base_index_ = 0; - cregs_ = NULL; - num_creg_bits_ = 0; - conditional_bit_ = -1; -#ifndef AER_THRUST_CUDA - index_offset_ = 0; -#endif - } - virtual void set_data(thrust::complex* p) - { - data_ = p; - } - void set_matrix(thrust::complex* mat) - { - matrix_ = mat; - } - void set_params(uint_t* p) - { - params_ = p; - } - void set_chunk_bits(uint_t bits) - { - chunk_bits_ = bits; - } - - void set_base_index(uint_t i) - { - base_index_ = i; - } - void set_cregs_(uint_t* cbits,uint_t nreg) - { - cregs_ = cbits; - num_creg_bits_ = nreg; - } - void set_conditional(int_t bit) - { - conditional_bit_ = bit; - } - -#ifndef AER_THRUST_CUDA - void set_index_offset(uint_t i) - { - index_offset_ = i; - } -#endif - - __host__ __device__ thrust::complex* data(void) - { - return data_; - } - - virtual bool is_diagonal(void) - { - return false; - } - virtual int qubits_count(void) - { - return 1; - } - virtual int num_control_bits(void) - { - return 0; - } - virtual int control_mask(void) - { - return 1; - } - virtual bool use_cache(void) - { - return false; - } - virtual bool batch_enable(void) - { - return true; - } - - virtual const char* name(void) - { - return "base function"; - } - virtual uint_t size(int num_qubits) - { - if(is_diagonal()){ - chunk_bits_ = num_qubits; - return (1ull << num_qubits); - } - else{ - chunk_bits_ = num_qubits - (qubits_count() - num_control_bits()); - return (1ull << (num_qubits - (qubits_count() - num_control_bits()))); - } - } - - virtual __host__ __device__ uint_t thread_to_index(uint_t _tid) const - { - return _tid; - } - virtual __host__ __device__ void run_with_cache(uint_t _tid,uint_t _idx,thrust::complex* _cache) const - { - //implemente this in the kernel class - } - virtual __host__ __device__ double run_with_cache_sum(uint_t _tid,uint_t _idx,thrust::complex* _cache) const - { - //implemente this in the kernel class - return 0.0; - } - - virtual __host__ __device__ bool check_conditional(uint_t i) const - { - if(conditional_bit_ < 0) - return true; - - uint_t iChunk = i >> chunk_bits_; - uint_t n64,i64,ibit; - n64 = (num_creg_bits_ + 63) >> 
6; - i64 = conditional_bit_ >> 6; - ibit = conditional_bit_ & 63; - return (((cregs_[iChunk*n64 + i64] >> ibit) & 1) != 0); - } -}; - -//======================================== - // gate functions with cache -//======================================== -template -class GateFuncWithCache : public GateFuncBase -{ -protected: - int nqubits_; -public: - GateFuncWithCache(uint_t nq) - { - nqubits_ = nq; - } - - bool use_cache(void) - { - return true; - } - - __host__ __device__ virtual uint_t thread_to_index(uint_t _tid) const - { - uint_t idx,ii,t,j; - uint_t* qubits; - uint_t* qubits_sorted; - - qubits_sorted = this->params_; - qubits = qubits_sorted + nqubits_; - - idx = 0; - ii = _tid >> nqubits_; - for(j=0;j> j) & 1) != 0){ - idx += (1ull << qubits[j]); - } - } - idx += ii; - return idx; - } - - __host__ __device__ void sync_threads() const - { -#ifdef CUDA_ARCH - __syncthreads(); -#endif - } - - __host__ __device__ void operator()(const uint_t &i) const - { - if(!this->check_conditional(i)) - return; - - thrust::complex cache[1024]; - uint_t j,idx; - uint_t matSize = 1ull << nqubits_; - - //load data to cache - for(j=0;jdata_[idx]; - } - - //execute using cache - for(j=0;jrun_with_cache(j,idx,cache); - } - } - - virtual int qubits_count(void) - { - return nqubits_; - } -}; - -template -class GateFuncSumWithCache : public GateFuncBase -{ -protected: - int nqubits_; -public: - GateFuncSumWithCache(uint_t nq) - { - nqubits_ = nq; - } - - bool use_cache(void) - { - return true; - } - - - __host__ __device__ virtual uint_t thread_to_index(uint_t _tid) const - { - uint_t idx,ii,t,j; - uint_t* qubits; - uint_t* qubits_sorted; - - qubits_sorted = this->params_; - qubits = qubits_sorted + nqubits_; - - idx = 0; - ii = _tid >> nqubits_; - for(j=0;j> j) & 1) != 0){ - idx += (1ull << qubits[j]); - } - } - idx += ii; - return idx; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - if(!this->check_conditional(i)) - return 0.0; - - thrust::complex cache[1024]; - uint_t j,idx; - uint_t matSize = 1ull << nqubits_; - double sum = 0.0; - - //load data to cache - for(j=0;jdata_[idx]; - } - - //execute using cache - for(j=0;jrun_with_cache_sum(j,idx,cache); - } - return sum; - } - - virtual int qubits_count(void) - { - return nqubits_; - } - -}; - -//stridded iterator to access diagonal probabilities -template -class strided_range -{ - public: - - typedef typename thrust::iterator_difference::type difference_type; - - struct stride_functor : public thrust::unary_function - { - difference_type stride; - - stride_functor(difference_type stride) - : stride(stride) {} - - __host__ __device__ - difference_type operator()(const difference_type& i) const - { - if(stride == 1) //statevector - return i; - - //density matrix - difference_type i_chunk; - i_chunk = i / (stride - 1); - difference_type ret = stride * i - i_chunk*(stride-1); - return ret; - } - }; - - typedef typename thrust::counting_iterator CountingIterator; - typedef typename thrust::transform_iterator TransformIterator; - typedef typename thrust::permutation_iterator PermutationIterator; - - // type of the strided_range iterator - typedef PermutationIterator iterator; - - // construct strided_range for the range [first,last) - strided_range(Iterator first, Iterator last, difference_type stride) - : first(first), last(last), stride(stride) {} - - iterator begin(void) const - { - return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride))); - } - - iterator end(void) const - { - if(stride == 1) 
//statevector - return begin() + (last - first); - - //density matrix - return begin() + (last - first) / (stride-1); - } - - protected: - Iterator first; - Iterator last; - difference_type stride; -}; - -template -struct complex_dot_scan : public thrust::unary_function,thrust::complex> -{ - __host__ __device__ - thrust::complex operator()(thrust::complex x) { return thrust::complex(x.real()*x.real()+x.imag()*x.imag(),0); } -}; - -template -struct complex_norm : public thrust::unary_function,thrust::complex> -{ - __host__ __device__ - thrust::complex operator()(thrust::complex x) { return thrust::complex((double)x.real()*(double)x.real(),(double)x.imag()*(double)x.imag()); } -}; - -template -struct complex_less -{ - typedef thrust::complex first_argument_type; - typedef thrust::complex second_argument_type; - typedef bool result_type; - __thrust_exec_check_disable__ - __host__ __device__ bool operator()(const thrust::complex &lhs, const thrust::complex &rhs) const {return lhs.real() < rhs.real();} -}; // end less - - -class HostFuncBase -{ -protected: -public: - HostFuncBase(){} - - virtual void execute(){} -}; //============================================================================ // chunk container base class @@ -474,6 +92,7 @@ class ChunkContainer : public std::enable_shared_from_this& operator[](uint_t i) = 0; virtual uint_t Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers = AER_MAX_BUFFERS,bool multi_shots = false,int matrix_bit = AER_DEFAULT_MATRIX_BITS) = 0; @@ -600,7 +226,8 @@ class ChunkContainer : public std::enable_shared_from_this &rnds, uint_t stride = 1, bool dot = true,uint_t count = 1) const = 0; - virtual thrust::complex norm(uint_t iChunk,uint_t count,uint_t stride = 1,bool dot = true) const = 0; + virtual double norm(uint_t iChunk,uint_t count) const; + virtual double trace(uint_t iChunk,uint_t row,uint_t count) const; size_t size_of_complex(void) @@ -683,6 +310,35 @@ class ChunkContainer : public std::enable_shared_from_this &mat,const uint_t count); + + //apply diagonal matrix + virtual void apply_diagonal_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &diag,const uint_t count); + + //apply (controlled) X + virtual void apply_X(const uint_t iChunk,const reg_t& qubits,const uint_t count); + + //apply (controlled) Y + virtual void apply_Y(const uint_t iChunk,const reg_t& qubits,const uint_t count); + + //apply (controlled) phase + virtual void apply_phase(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const std::complex phase,const uint_t count); + + //apply (controlled) swap gate + virtual void apply_swap(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const uint_t count); + + //apply permutation + virtual void apply_permutation(const uint_t iChunk,const reg_t& qubits,const std::vector> &pairs, const uint_t count); + + //apply rotation around axis + virtual void apply_rotation(const uint_t iChunk,const reg_t &qubits, const Rotation r, const double theta, const uint_t count); + + //get probabilities of chunk + virtual void probabilities(std::vector& probs, const uint_t iChunk, const reg_t& qubits) const; + + //Pauli expectation values + virtual double expval_pauli(const uint_t iChunk,const reg_t& qubits,const std::string &pauli,const complex_t initial_phase) const; protected: int convert_blocked_qubit(int qubit) @@ -765,6 +421,7 @@ void ChunkContainer::Execute(Function func,uint_t iChunk,uint_t count) { set_device(); + func.set_base_index((chunk_index_ + iChunk) 
<< chunk_bits_); func.set_data( chunk_pointer(iChunk) ); func.set_matrix( matrix_pointer(iChunk) ); func.set_params( param_pointer(iChunk) ); @@ -818,7 +475,11 @@ void ChunkContainer::Execute(Function func,uint_t iChunk,uint_t count) thrust::for_each_n(thrust::seq, ci , size, func); } #else - uint_t size = count * func.size(chunk_bits_); + uint_t size; + if(func.use_cache()) + size = count << (chunk_bits_ - func.qubits_count()); + else + size = count * func.size(chunk_bits_); auto ci = thrust::counting_iterator(0); thrust::for_each_n(thrust::device, ci , size, func); #endif @@ -835,6 +496,7 @@ void ChunkContainer::ExecuteSum(double* pSum,Function func,uint_t iChunk set_device(); + func.set_base_index((chunk_index_ + iChunk) << chunk_bits_); func.set_data( chunk_pointer(iChunk) ); func.set_matrix( matrix_pointer(iChunk) ); func.set_params( param_pointer(iChunk) ); @@ -959,6 +621,7 @@ void ChunkContainer::ExecuteSum(double* pSum,Function func,uint_t iChunk #else uint_t size = func.size(chunk_bits_); + func.set_base_index((chunk_index_ + iChunk) << chunk_bits_); func.set_matrix( matrix_pointer(iChunk) ); func.set_params( param_pointer(iChunk) ); @@ -1000,6 +663,7 @@ void ChunkContainer::ExecuteSum2(double* pSum,Function func,uint_t iChun set_device(); + func.set_base_index((chunk_index_ + iChunk) << chunk_bits_); func.set_data( chunk_pointer(iChunk) ); func.set_matrix( matrix_pointer(iChunk) ); func.set_params( param_pointer(iChunk) ); @@ -1097,6 +761,7 @@ void ChunkContainer::ExecuteSum2(double* pSum,Function func,uint_t iChun #else uint_t size = func.size(chunk_bits_); + func.set_base_index((chunk_index_ + iChunk) << chunk_bits_); func.set_matrix( matrix_pointer(iChunk) ); func.set_params( param_pointer(iChunk) ); @@ -1148,7 +813,217 @@ void ChunkContainer::deallocate_chunks(void) reduced_queue_end_.clear(); } +template +void ChunkContainer::apply_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &mat,const uint_t count) +{ + const size_t N = qubits.size() - control_bits; + + if(N == 1){ + if(control_bits == 0) + Execute(MatrixMult2x2(mat,qubits[0]), iChunk, count); + else //2x2 matrix with control bits + Execute(MatrixMult2x2Controlled(mat,qubits), iChunk, count); + } + else if(N == 2){ + Execute(MatrixMult4x4(mat,qubits[0],qubits[1]), iChunk, count); + } + else{ + auto qubits_sorted = qubits; + std::sort(qubits_sorted.begin(), qubits_sorted.end()); +#ifndef AER_THRUST_CUDA + if(N == 3){ + StoreMatrix(mat, iChunk); + Execute(MatrixMult8x8(qubits,qubits_sorted), iChunk, count); + } + else if(N == 4){ + StoreMatrix(mat, iChunk); + Execute(MatrixMult16x16(qubits,qubits_sorted), iChunk, count); + } + else if(N <= 10){ +#else + if(N <= 10){ +#endif + int i; + for(i=0;i(N), iChunk, count); + } + else{ + cvector_t matLU; + reg_t params; + MatrixMultNxN_LU f(mat,qubits_sorted,matLU,params); + + StoreMatrix(matLU, iChunk); + StoreUintParams(params, iChunk); + + Execute(f, iChunk, count); + } + } +} + +template +void ChunkContainer::apply_diagonal_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &diag,const uint_t count) +{ + const size_t N = qubits.size() - control_bits; + + if(N == 1){ + if(control_bits == 0) + Execute(DiagonalMult2x2(diag,qubits[0]), iChunk, count); + else + Execute(DiagonalMult2x2Controlled(diag,qubits), iChunk, count); + } + else if(N == 2){ + Execute(DiagonalMult4x4(diag,qubits[0],qubits[1]), iChunk, count); + } + else{ + StoreMatrix(diag, iChunk); + StoreUintParams(qubits, iChunk); + + 
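//------------------------------------------------------------------------------
// A minimal standalone sketch of the work-item arithmetic in Execute() above:
// a kernel that caches an N-qubit group runs one item per 2^N amplitudes,
// i.e. count << (chunk_bits - N), while a plain kernel runs
// count * func.size(chunk_bits) items. All names are local to the sketch.
//------------------------------------------------------------------------------
#include <cstdint>
#include <cstdio>

int main() {
  const int chunk_bits = 10;     // 2^10 amplitudes per chunk
  const uint64_t count = 4;      // chunks covered by one Execute() call
  const int nq = 3;              // qubits touched by the cached kernel

  const uint64_t cached_items = count << (chunk_bits - nq); // one per 8-amp group
  const uint64_t amplitudes   = cached_items << nq;         // total touched
  std::printf("work items = %llu, amplitudes = %llu\n",
              (unsigned long long)cached_items,             // 512
              (unsigned long long)amplitudes);              // 4096
}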
Execute(DiagonalMultNxN(qubits), iChunk, count); + } +} + +template +void ChunkContainer::apply_X(const uint_t iChunk,const reg_t& qubits,const uint_t count) +{ + Execute(CX_func(qubits), iChunk, count); +} + +template +void ChunkContainer::apply_Y(const uint_t iChunk,const reg_t& qubits,const uint_t count) +{ + Execute(CY_func(qubits), iChunk, count); +} + +template +void ChunkContainer::apply_phase(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const std::complex phase,const uint_t count) +{ + Execute(phase_func(qubits,*(thrust::complex*)&phase), iChunk, count ); +} + +template +void ChunkContainer::apply_swap(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const uint_t count) +{ + Execute(CSwap_func(qubits), iChunk, count); +} + +template +void ChunkContainer::apply_permutation(const uint_t iChunk,const reg_t& qubits,const std::vector> &pairs, const uint_t count) +{ + const size_t N = qubits.size(); + auto qubits_sorted = qubits; + std::sort(qubits_sorted.begin(), qubits_sorted.end()); + + reg_t params; + Permutation f(qubits_sorted,qubits,pairs,params); + + StoreUintParams(params, iChunk); + + Execute(f, iChunk, count); +} + +template +void ChunkContainer::apply_rotation(const uint_t iChunk,const reg_t &qubits, const Rotation r, const double theta, const uint_t count) +{ + int control_bits = qubits.size() - 1; + switch(r){ + case Rotation::x: + apply_matrix(iChunk, qubits, control_bits, Linalg::VMatrix::rx(theta), count); + break; + case Rotation::y: + apply_matrix(iChunk, qubits, control_bits, Linalg::VMatrix::ry(theta), count); + break; + case Rotation::z: + apply_diagonal_matrix(iChunk, qubits, control_bits, Linalg::VMatrix::rz_diag(theta), count); + break; + case Rotation::xx: + apply_matrix(iChunk, qubits, control_bits-1, Linalg::VMatrix::rxx(theta), count); + break; + case Rotation::yy: + apply_matrix(iChunk, qubits, control_bits-1, Linalg::VMatrix::ryy(theta), count); + break; + case Rotation::zz: + apply_diagonal_matrix(iChunk, qubits, control_bits-1, Linalg::VMatrix::rzz_diag(theta), count); + break; + case Rotation::zx: + apply_matrix(iChunk, qubits, control_bits-1, Linalg::VMatrix::rzx(theta), count); + break; + default: + throw std::invalid_argument( + "QubitVectorThrust::invalid rotation axis."); + } +} + +template +void ChunkContainer::probabilities(std::vector& probs, const uint_t iChunk, const reg_t& qubits) const +{ + const size_t N = qubits.size(); + const int_t DIM = 1 << N; + probs.resize(DIM); + + if(N == 1){ //special case for 1 qubit (optimized for measure) + ExecuteSum2(&probs[0],probability_1qubit_func(qubits[0]), iChunk, 1); + } + else{ + for(int_t i=0;i(qubits,i), iChunk, 1); + } + } +} + +template +double ChunkContainer::norm(uint_t iChunk,uint_t count) const +{ + double ret; + ExecuteSum(&ret,norm_func(), iChunk, count); + + return ret; +} + +template +double ChunkContainer::trace(uint_t iChunk,uint_t row,uint_t count) const +{ + double ret; + ExecuteSum(&ret,trace_func(row), iChunk, count); + + return ret; +} + +template +double ChunkContainer::expval_pauli(const uint_t iChunk,const reg_t& qubits,const std::string &pauli,const complex_t initial_phase) const +{ + uint_t x_mask, z_mask, num_y, x_max; + std::tie(x_mask, z_mask, num_y, x_max) = pauli_masks_and_phase(qubits, pauli); + + // Special case for only I Paulis + if (x_mask + z_mask == 0) { + thrust::complex ret = norm(iChunk, 1); + return ret.real() + ret.imag(); + } + double ret; + // specialize x_max == 0 + if(x_mask == 0) { + ExecuteSum(&ret, 
expval_pauli_Z_func(z_mask), iChunk, 1 ); + return ret; + } + + // Compute the overall phase of the operator. + // This is (-1j) ** number of Y terms modulo 4 + auto phase = std::complex(initial_phase); + add_y_phase(num_y, phase); + ExecuteSum(&ret, expval_pauli_XYZ_func(x_mask, z_mask, x_max, phase), iChunk, 1 ); + return ret; +} + + + + //------------------------------------------------------------------------------ +} // end namespace Chunk } // end namespace QV } // end namespace AER //------------------------------------------------------------------------------ diff --git a/src/simulators/statevector/chunk/chunk_manager.hpp b/src/simulators/statevector/chunk/chunk_manager.hpp index be3abf65c2..d8a8a4fbfa 100644 --- a/src/simulators/statevector/chunk/chunk_manager.hpp +++ b/src/simulators/statevector/chunk/chunk_manager.hpp @@ -1,7 +1,7 @@ /** * This code is part of Qiskit. * - * (C) Copyright IBM 2018, 2019, 2020. + * (C) Copyright IBM 2018, 2019, 2020, 2021, 2022. * * This code is licensed under the Apache License, Version 2.0. You may * obtain a copy of this license in the LICENSE.txt file in the root directory @@ -25,6 +25,8 @@ namespace AER { namespace QV { +namespace Chunk { + //============================================================================ // chunk manager class @@ -43,12 +45,15 @@ class ChunkManager int num_qubits_; //number of global qubits uint_t num_chunks_; //number of chunks on this process + uint_t chunk_index_; //global chunk index for the first chunk int i_dev_map_; //device index chunk to be mapped int idev_buffer_map_; //device index buffer to be mapped int iplace_host_; //chunk container for host memory bool multi_shots_; + + bool enable_cuStatevec_; public: ChunkManager(); @@ -65,7 +70,7 @@ class ChunkManager return chunks_.size(); } - uint_t Allocate(int chunk_bits,int nqubits,uint_t nchunks,int matrix_bit); + uint_t Allocate(int chunk_bits,int nqubits,uint_t nchunks,uint_t chunk_index,int matrix_bit,bool enable_cuStatevec); void Free(void); int num_devices(void) @@ -113,6 +118,7 @@ ChunkManager::ChunkManager() num_places_ = 1; chunk_bits_ = 0; num_chunks_ = 0; + chunk_index_ = 0; num_qubits_ = 0; multi_shots_ = false; @@ -161,7 +167,7 @@ ChunkManager::~ChunkManager() } template -uint_t ChunkManager::Allocate(int chunk_bits,int nqubits,uint_t nchunks,int matrix_bit) +uint_t ChunkManager::Allocate(int chunk_bits,int nqubits,uint_t nchunks,uint_t chunk_index,int matrix_bit, bool enable_cuStatevec) { uint_t num_buffers; int iDev; @@ -182,6 +188,9 @@ uint_t ChunkManager::Allocate(int chunk_bits,int nqubits,uint_t nchunks, hybrid = true; } //--- + enable_cuStatevec_ = enable_cuStatevec; + + chunk_index_ = chunk_index; if(num_qubits_ != nqubits || chunk_bits_ != chunk_bits || nchunks > num_chunks_){ //free previous allocation @@ -246,40 +255,45 @@ uint_t ChunkManager::Allocate(int chunk_bits,int nqubits,uint_t nchunks, num_places_ = num_chunks_; } - nchunks = num_chunks_; - //allocate chunk container before parallel loop using push_back to store shared pointer for(i=0;i>()); + continue; + } +#endif chunks_.push_back(std::make_shared>()); } uint_t chunks_allocated = 0; #pragma omp parallel for if(num_places_ > 1) private(is,ie,nc) reduction(+:chunks_allocated) for(iDev=0;iDevset_chunk_index(chunk_index_ + chunks_allocated); //set first chunk index for the container if(num_devices_ > 0) chunks_allocated += chunks_[iDev]->Allocate((iDev + idev_start)%num_devices_,chunk_bits,nqubits,nc,num_buffers,multi_shots_,matrix_bit); else chunks_allocated += 
chunks_[iDev]->Allocate(iDev,chunk_bits,nqubits,nc,num_buffers,multi_shots_,matrix_bit); } - if(chunks_allocated < nchunks){ + if(chunks_allocated < num_chunks_){ //rest of chunks are stored on host for(iDev=0;iDev 0){ + chunks_[num_places_]->set_chunk_index(chunk_index_ + chunks_allocated + is); //set first chunk index for the container chunks_.push_back(std::make_shared>()); chunks_[num_places_]->Allocate(-1,chunk_bits,nqubits,nc,num_buffers,multi_shots_,matrix_bit); num_places_ += 1; } } - num_chunks_ = chunks_allocated; } #ifdef AER_DISABLE_GDR @@ -388,6 +402,7 @@ void ChunkManager::execute_on_device(Function func,const std::vector +class cuStateVecChunkContainer : public DeviceChunkContainer +{ +protected: + custatevecHandle_t custatevec_handle_; //cuStatevec handle for this chunk container + AERDeviceVector custatevec_work_; //work buffer for cuStatevec + uint_t custatevec_work_size_; //buffer size + uint_t custatevec_chunk_total_qubits_; //total qubits of statevector passed to ApplyMatrix + uint_t custatevec_chunk_count_; //number of counts for all chunks + +public: + using BaseContainer = DeviceChunkContainer; + + cuStateVecChunkContainer() + { + } + ~cuStateVecChunkContainer(); + + uint_t Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit) override; + void Deallocate(void) override; + + unsigned char* custatevec_work_pointer(uint_t iChunk) const + { + if(custatevec_work_size_ == 0) + return nullptr; + if(iChunk >= this->num_chunks_){ //for buffer chunks + return ((unsigned char*)thrust::raw_pointer_cast(custatevec_work_.data())) + ((BaseContainer::num_matrices_ + iChunk - this->num_chunks_) * custatevec_work_size_); + } + else{ + return ((unsigned char*)thrust::raw_pointer_cast(custatevec_work_.data())) + ((iChunk % BaseContainer::num_matrices_) * custatevec_work_size_); + } + } + + reg_t sample_measure(uint_t iChunk,const std::vector &rnds, uint_t stride = 1, bool dot = true,uint_t count = 1) const override; + double norm(uint_t iChunk,uint_t count) const override; + + //apply matrix + void apply_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &mat,const uint_t count) override; + + //apply diagonal matrix + void apply_diagonal_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &diag,const uint_t count) override; + + //apply (controlled) X + void apply_X(const uint_t iChunk,const reg_t& qubits,const uint_t count) override; + + //apply (controlled) Y + void apply_Y(const uint_t iChunk,const reg_t& qubits,const uint_t count) override; + + //apply (controlled) phase + virtual void apply_phase(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const std::complex phase,const uint_t count) override; + + //apply (controlled) swap gate + void apply_swap(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const uint_t count) override; + + //apply permutation + void apply_permutation(const uint_t iChunk,const reg_t& qubits,const std::vector> &pairs, const uint_t count) override; + + //apply rotation around axis + void apply_rotation(const uint_t iChunk,const reg_t &qubits, const Rotation r, const double theta, const uint_t count) override; + + //get probabilities of chunk + void probabilities(std::vector& probs, const uint_t iChunk, const reg_t& qubits) const override; + + //Pauli expectation values + double expval_pauli(const uint_t iChunk,const reg_t& qubits,const std::string &pauli,const complex_t initial_phase) const override; 
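//------------------------------------------------------------------------------
// A minimal sketch of the error-handling pattern this container repeats around
// every cuStateVec call, factored into a helper for illustration only.
// custatevecCreate/custatevecDestroy/custatevecGetErrorString are the calls
// used here; building this requires the cuQuantum SDK headers and library.
//------------------------------------------------------------------------------
#include <custatevec.h>
#include <sstream>
#include <stdexcept>

static void check(custatevecStatus_t err, const char* where) {
  if (err != CUSTATEVEC_STATUS_SUCCESS) {
    std::stringstream str;
    str << where << " : " << custatevecGetErrorString(err);
    throw std::runtime_error(str.str());  // same message format as the code here
  }
}

int main() {
  custatevecHandle_t handle;
  check(custatevecCreate(&handle), "custatevecCreate");
  // ... issue library calls through the handle ...
  check(custatevecDestroy(handle), "custatevecDestroy");
}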
+}; + +template +cuStateVecChunkContainer::~cuStateVecChunkContainer(void) +{ + Deallocate(); +} + +template +uint_t cuStateVecChunkContainer::Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit) +{ + uint_t nc; + nc = BaseContainer::Allocate(idev,chunk_bits,num_qubits,chunks,buffers,multi_shots,matrix_bit); + + //initialize custatevevtor handle + custatevecStatus_t err; + + err = custatevecCreate(&custatevec_handle_); + if(err != CUSTATEVEC_STATUS_SUCCESS){ + std::stringstream str; + str << "cuStateVecChunkContainer::allocate::custatevecCreate : " << custatevecGetErrorString(err); + throw std::runtime_error(str.str()); + } + + //allocate extra workspace for custatevec + std::vector> mat(1ull << (matrix_bit*2)); + + //count bits for multi-chunks + custatevec_chunk_total_qubits_ = this->num_pow2_qubits_; + custatevec_chunk_count_ = this->num_chunks_ >> (this->num_pow2_qubits_ - this->chunk_bits_); + + //matrix + err = custatevecApplyMatrix_bufferSize( + custatevec_handle_, CUDA_C_64F, custatevec_chunk_total_qubits_ , &mat[0], CUDA_C_64F, CUSTATEVEC_MATRIX_LAYOUT_COL, + 0, matrix_bit, 0, CUSTATEVEC_COMPUTE_64F, &custatevec_work_size_); + if(err != CUSTATEVEC_STATUS_SUCCESS){ + std::stringstream str; + str << "cuStateVecChunkContainer::ResizeMatrixBuffers : " << custatevecGetErrorString(err); + throw std::runtime_error(str.str()); + } + + //diagonal matrix + size_t diag_size; + std::vector perm(matrix_bit); + std::vector basis(matrix_bit); + for(int_t i=0;i 0) + custatevec_work_.resize(custatevec_work_size_*BaseContainer::num_matrices_); + + return nc; +} + +template +void cuStateVecChunkContainer::Deallocate(void) +{ + BaseContainer::Deallocate(); + + custatevec_work_.clear(); + custatevec_work_.shrink_to_fit(); + custatevecDestroy(custatevec_handle_); +} + +template +reg_t cuStateVecChunkContainer::sample_measure(uint_t iChunk,const std::vector &rnds, uint_t stride, bool dot,uint_t count) const +{ + if(count == (1ull << (this->num_qubits_ - this->chunk_bits_))){ + //custatevecSampler_sample only can be applied to whole statevector + const int_t SHOTS = rnds.size(); + reg_t samples(SHOTS,0); + + BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); + + custatevecStatus_t err; + custatevecSamplerDescriptor_t sampler; + size_t extSize; + + cudaStreamSynchronize(BaseContainer::stream_[iChunk]); + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + err = custatevecSampler_create(custatevec_handle_, BaseContainer::chunk_pointer(iChunk), state_type, this->num_qubits_, &sampler, SHOTS, &extSize); + if(err != CUSTATEVEC_STATUS_SUCCESS){ + std::stringstream str; + str << "cuStateVecChunkContainer::sample_measure : custatevecSampler_create " << custatevecGetErrorString(err); + throw std::runtime_error(str.str()); + } + + AERDeviceVector extBuf; + void* pExtBuf = nullptr; + if(extSize > 0){ + extBuf.resize(extSize); + pExtBuf = thrust::raw_pointer_cast(extBuf.data()); + } + + err = custatevecSampler_preprocess(custatevec_handle_,&sampler,pExtBuf,extSize); + if(err != CUSTATEVEC_STATUS_SUCCESS){ + std::stringstream str; + str << "cuStateVecChunkContainer::sample_measure : custatevecSampler_preprocess " << custatevecGetErrorString(err); + throw std::runtime_error(str.str()); + } + + std::vector bitStr(SHOTS); + std::vector bitOrdering(this->num_qubits_); + for(int_t i=0;inum_qubits_;i++){ + bitOrdering[i] = i; + } + + 
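//------------------------------------------------------------------------------
// A host-side reference (standalone sketch) of what the sampler computes: an
// inclusive scan of |amplitude|^2 and one binary search per random number.
// The Thrust fallback taken when count does not cover the whole statevector
// performs the same scan with transform_inclusive_scan.
//------------------------------------------------------------------------------
#include <algorithm>
#include <complex>
#include <cstdio>
#include <vector>

int main() {
  const double s = 0.5;
  std::vector<std::complex<double>> sv = {s, s, s, s};  // uniform 2-qubit state
  std::vector<double> cdf(sv.size());
  double acc = 0.0;
  for (size_t i = 0; i < sv.size(); ++i) { acc += std::norm(sv[i]); cdf[i] = acc; }
  for (double r : {0.1, 0.6, 0.99}) {   // pre-drawn uniforms, like rnds here
    auto it = std::lower_bound(cdf.begin(), cdf.end(), r);
    std::printf("r=%.2f -> outcome %zu\n", r, (size_t)(it - cdf.begin()));
  }
}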
err = custatevecSampler_sample(custatevec_handle_, &sampler, &bitStr[0], &bitOrdering[0], this->num_qubits_, &rnds[0], SHOTS, + CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER ) ; + if(err != CUSTATEVEC_STATUS_SUCCESS){ + std::stringstream str; + str << "cuStateVecChunkContainer::sample_measure : custatevecSampler_sample " << custatevecGetErrorString(err); + throw std::runtime_error(str.str()); + } + + for(int_t i=0;i 0){ + extBuf.clear(); + extBuf.shrink_to_fit(); + } + return samples; + } + else{ + return BaseContainer::sample_measure(iChunk, rnds, stride, dot, count); + } +} + +template +void cuStateVecChunkContainer::apply_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &mat,const uint_t count) +{ + thrust::complex* pMat; + int_t num_qubits = qubits.size()-control_bits; + + pMat = (thrust::complex*)&mat[0]; + BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); + + std::vector qubits32(qubits.size()); + for(int_t i=0;i 0) + pControl = &qubits32[0]; + + uint_t bits; + uint_t nc; + if(count == this->num_chunks_){ + bits = custatevec_chunk_total_qubits_; + nc = custatevec_chunk_count_; + } + else{ + nc = count; + bits = this->chunk_bits_; + if(nc > 0){ + while((nc & 1) == 0){ + nc >>= 1; + bits++; + } + } + } + cudaDataType_t state_type; + custatevecComputeType_t comp_type; + if(sizeof(data_t) == sizeof(double)){ + state_type = CUDA_C_64F; + comp_type = CUSTATEVEC_COMPUTE_64F; + } + else{ + state_type = CUDA_C_32F; + comp_type = CUSTATEVEC_COMPUTE_32F; + } + + custatevecStatus_t err; + for(int_t i=0;i +void cuStateVecChunkContainer::apply_diagonal_matrix(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const cvector_t &diag,const uint_t count) +{ + thrust::complex* pMat; + int_t num_qubits = qubits.size(); + + if(control_bits > 0){ + uint_t size = 1ull << num_qubits; + cvector_t diag_ctrl(size); //make diagonal matrix with controls + + for(int_t i=0;i*)&diag[0]; + BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); + + std::vector qubits32(qubits.size()); + for(int_t i=0;i 0) + pControl = &qubits32[0]; + + uint_t bits; + uint_t nc; + if(count == this->num_chunks_){ + bits = custatevec_chunk_total_qubits_; + nc = custatevec_chunk_count_; + } + else{ + nc = count; + bits = this->chunk_bits_; + if(nc > 0){ + while((nc & 1) == 0){ + nc >>= 1; + bits++; + } + } + } + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + custatevecStatus_t err; + for(int_t i=0;i +void cuStateVecChunkContainer::apply_X(const uint_t iChunk,const reg_t& qubits,const uint_t count) +{ + int_t num_qubits = qubits.size(); + + BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); + + uint_t perm_size = 1ull << num_qubits; + std::vector perm(perm_size); + for(int_t i=0;i qubits32(qubits.size()); + for(int_t i=0;inum_chunks_){ + bits = custatevec_chunk_total_qubits_; + nc = custatevec_chunk_count_; + } + else{ + nc = count; + bits = this->chunk_bits_; + if(nc > 0){ + while((nc & 1) == 0){ + nc >>= 1; + bits++; + } + } + } + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + custatevecStatus_t err; + for(int_t i=0;i +void cuStateVecChunkContainer::apply_Y(const uint_t iChunk,const reg_t& qubits,const uint_t count) +{ + int_t num_qubits = qubits.size(); + + 
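//------------------------------------------------------------------------------
// A standalone sketch of the count-folding idiom repeated in every method of
// this container: trailing powers of two in the chunk count are folded into
// the qubit count so cuStateVec is called on fewer, larger statevectors.
// All names are local to the sketch.
//------------------------------------------------------------------------------
#include <cstdint>
#include <cstdio>

int main() {
  uint64_t nc = 8;      // chunks covered by this call (count)
  uint64_t bits = 10;   // qubits per chunk (chunk_bits_)
  while (nc > 0 && (nc & 1) == 0) {  // fold factors of two into the vector size
    nc >>= 1;
    ++bits;
  }
  std::printf("%llu call(s) on %llu-qubit vectors\n",
              (unsigned long long)nc, (unsigned long long)bits);  // 1, 13
}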
BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); + + uint_t perm_size = 1ull << num_qubits; + cvector_t diag(perm_size); + std::vector perm(perm_size); + for(int_t i=0;i qubits32(qubits.size()); + for(int_t i=0;inum_chunks_){ + bits = custatevec_chunk_total_qubits_; + nc = custatevec_chunk_count_; + } + else{ + nc = count; + bits = this->chunk_bits_; + if(nc > 0){ + while((nc & 1) == 0){ + nc >>= 1; + bits++; + } + } + } + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + custatevecStatus_t err; + for(int_t i=0;i +void cuStateVecChunkContainer::apply_phase(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const std::complex phase,const uint_t count) +{ + uint_t size = 1ull << qubits.size(); + cvector_t diag(size); + for(int_t i=0;i +void cuStateVecChunkContainer::apply_swap(const uint_t iChunk,const reg_t& qubits,const int_t control_bits,const uint_t count) +{ + int_t num_qubits = qubits.size(); + + BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); + + uint_t perm_size = 1ull << num_qubits; + std::vector swap(perm_size); + for(int_t i=0;i qubits32(qubits.size()); + for(int_t i=0;inum_chunks_){ + bits = custatevec_chunk_total_qubits_; + nc = custatevec_chunk_count_; + } + else{ + nc = count; + bits = this->chunk_bits_; + if(nc > 0){ + while((nc & 1) == 0){ + nc >>= 1; + bits++; + } + } + } + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + custatevecStatus_t err; + for(int_t i=0;i +void cuStateVecChunkContainer::apply_permutation(const uint_t iChunk,const reg_t& qubits,const std::vector> &pairs, const uint_t count) +{ + BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); + + int_t size = 1ull << qubits.size(); + custatevecIndex_t perm[size]; + for(int_t i=0;i qubits32(qubits.size()); + for(int_t i=0;inum_chunks_){ + bits = custatevec_chunk_total_qubits_; + nc = custatevec_chunk_count_; + } + else{ + nc = count; + bits = this->chunk_bits_; + if(nc > 0){ + while((nc & 1) == 0){ + nc >>= 1; + bits++; + } + } + } + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + custatevecStatus_t err; + for(int_t i=0;i +void cuStateVecChunkContainer::apply_rotation(const uint_t iChunk,const reg_t &qubits, const Rotation r, const double theta, const uint_t count) +{ + custatevecPauli_t pauli[2]; + int nPauli = 1; + + BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); + + int control_bits = qubits.size() - 1; + + switch(r){ + case Rotation::x: + pauli[0] = CUSTATEVEC_PAULI_X; + break; + case Rotation::y: + pauli[0] = CUSTATEVEC_PAULI_Y; + break; + case Rotation::z: + pauli[0] = CUSTATEVEC_PAULI_Z; + break; + case Rotation::xx: + pauli[0] = CUSTATEVEC_PAULI_X; + pauli[1] = CUSTATEVEC_PAULI_X; + nPauli = 2; + control_bits--; + break; + case Rotation::yy: + pauli[0] = CUSTATEVEC_PAULI_Y; + pauli[1] = CUSTATEVEC_PAULI_Y; + nPauli = 2; + control_bits--; + break; + case Rotation::zz: + pauli[0] = CUSTATEVEC_PAULI_Z; + pauli[1] = CUSTATEVEC_PAULI_Z; + nPauli = 2; + control_bits--; + break; + case Rotation::zx: + pauli[0] = CUSTATEVEC_PAULI_Z; + pauli[1] = CUSTATEVEC_PAULI_X; + nPauli = 2; + control_bits--; + break; + default: + throw 
std::invalid_argument( + "QubitVectorThrust::invalid rotation axis."); + } + + std::vector qubits32(qubits.size()); + for(int_t i=0;i 0) + pControl = &qubits32[0]; + + uint_t bits; + uint_t nc; + if(count == this->num_chunks_){ + bits = custatevec_chunk_total_qubits_; + nc = custatevec_chunk_count_; + } + else{ + nc = count; + bits = this->chunk_bits_; + if(nc > 0){ + while((nc & 1) == 0){ + nc >>= 1; + bits++; + } + } + } + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + custatevecStatus_t err; + for(int_t i=0;i +double cuStateVecChunkContainer::norm(uint_t iChunk,uint_t count) const +{ + BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); + + double ret = 0.0; + uint_t bits; + uint_t nc; + if(count == this->num_chunks_){ + bits = custatevec_chunk_total_qubits_; + nc = custatevec_chunk_count_; + } + else{ + nc = count; + bits = this->chunk_bits_; + if(nc > 0){ + while((nc & 1) == 0){ + nc >>= 1; + bits++; + } + } + } + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + custatevecStatus_t err; + for(int_t i=0;i +void cuStateVecChunkContainer::probabilities(std::vector& probs, const uint_t iChunk, const reg_t& qubits) const +{ + BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + std::vector qubits32(qubits.size()); + for(int_t i=0;ichunk_bits_, + &p0, &p1, &qubits32[0], 1); + probs.resize(2); + probs[0] = p0; + probs[1] = p1; + } + else{ + probs.resize(1ull << qubits.size()); + err = custatevecAbs2SumArray(custatevec_handle_, BaseContainer::chunk_pointer(iChunk), state_type, this->chunk_bits_, + &probs[0], &qubits32[0], qubits.size(), nullptr,nullptr,0); + } + + if(err != CUSTATEVEC_STATUS_SUCCESS){ + std::stringstream str; + str << "cuStateVecChunkContainer::probabilities : " << custatevecGetErrorString(err); + throw std::runtime_error(str.str()); + } +} + +template +double cuStateVecChunkContainer::expval_pauli(const uint_t iChunk,const reg_t& qubits,const std::string &pauli,const complex_t initial_phase) const +{ + if(initial_phase != 1.0){ + return BaseContainer::expval_pauli(iChunk, qubits, pauli, initial_phase); + } + BaseContainer::set_device(); + custatevecSetStream(custatevec_handle_,BaseContainer::stream_[iChunk]); + + cudaDataType_t state_type; + if(sizeof(data_t) == sizeof(double)) + state_type = CUDA_C_64F; + else + state_type = CUDA_C_32F; + + custatevecPauli_t pauliOps[pauli.size()]; + int32_t qubits32[qubits.size()]; + for(int_t i=0;ichunk_bits_, + ret, pauliOperatorsArray, basisBitsArray, nBasisBitsArray, 1); + + if(err != CUSTATEVEC_STATUS_SUCCESS){ + std::stringstream str; + str << "cuStateVecChunkContainer::expval_pauli : " << custatevecGetErrorString(err); + throw std::runtime_error(str.str()); + } + + return ret[0]; +} + + + +//------------------------------------------------------------------------------ +} // end namespace Chunk +} // end namespace QV +} // end namespace AER +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +#endif // end module diff --git a/src/simulators/statevector/chunk/cuda_kernels.hpp 
b/src/simulators/statevector/chunk/cuda_kernels.hpp index 9322a69279..4380578813 100644 --- a/src/simulators/statevector/chunk/cuda_kernels.hpp +++ b/src/simulators/statevector/chunk/cuda_kernels.hpp @@ -18,6 +18,7 @@ namespace AER { namespace QV { +namespace Chunk { template __global__ @@ -339,6 +340,7 @@ __global__ void dev_reduce_sum_uint(uint_t *pReduceBuffer,uint_t n,uint_t buf_si //------------------------------------------------------------------------------ +} // end namespace Chunk } // end namespace QV } // end namespace AER //------------------------------------------------------------------------------ diff --git a/src/simulators/statevector/chunk/device_chunk_container.hpp b/src/simulators/statevector/chunk/device_chunk_container.hpp index 34e92ab1c8..68126695c6 100644 --- a/src/simulators/statevector/chunk/device_chunk_container.hpp +++ b/src/simulators/statevector/chunk/device_chunk_container.hpp @@ -1,7 +1,7 @@ /** * This code is part of Qiskit. * - * (C) Copyright IBM 2018, 2019, 2020. + * (C) Copyright IBM 2018, 2019, 2020, 2021, 2022. * * This code is licensed under the Apache License, Version 2.0. You may * obtain a copy of this license in the LICENSE.txt file in the root directory @@ -18,10 +18,9 @@ #include "simulators/statevector/chunk/chunk_container.hpp" - - namespace AER { namespace QV { +namespace Chunk { //============================================================================ @@ -59,6 +58,7 @@ class DeviceChunkContainer : public ChunkContainer #ifdef AER_THRUST_CUDA std::vector stream_; //asynchronous execution #endif + public: DeviceChunkContainer() { @@ -103,13 +103,13 @@ class DeviceChunkContainer : public ChunkContainer return raw_reference_cast(data_[i]); } - uint_t Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit); - void Deallocate(void); + uint_t Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit) override; + void Deallocate(void) override; - void StoreMatrix(const std::vector>& mat,uint_t iChunk); - void StoreMatrix(const std::complex* mat,uint_t iChunk,uint_t size); - void StoreUintParams(const std::vector& prm,uint_t iChunk); - void ResizeMatrixBuffers(int bits); + void StoreMatrix(const std::vector>& mat,uint_t iChunk) override; + void StoreMatrix(const std::complex* mat,uint_t iChunk,uint_t size) override; + void StoreUintParams(const std::vector& prm,uint_t iChunk) override; + void ResizeMatrixBuffers(int bits) override; void set_device(void) const { @@ -134,16 +134,15 @@ class DeviceChunkContainer : public ChunkContainer return data_[i]; } - void CopyIn(Chunk& src,uint_t iChunk); - void CopyOut(Chunk& src,uint_t iChunk); - void CopyIn(thrust::complex* src,uint_t iChunk, uint_t size); - void CopyOut(thrust::complex* dest,uint_t iChunk, uint_t size); - void Swap(Chunk& src,uint_t iChunk); + void CopyIn(Chunk& src,uint_t iChunk) override; + void CopyOut(Chunk& src,uint_t iChunk) override; + void CopyIn(thrust::complex* src,uint_t iChunk, uint_t size) override; + void CopyOut(thrust::complex* dest,uint_t iChunk, uint_t size) override; + void Swap(Chunk& src,uint_t iChunk) override; - void Zero(uint_t iChunk,uint_t count); + void Zero(uint_t iChunk,uint_t count) override; - reg_t sample_measure(uint_t iChunk,const std::vector &rnds, uint_t stride = 1, bool dot = true,uint_t count = 1) const; - thrust::complex norm(uint_t iChunk,uint_t count,uint_t stride = 1,bool dot = true) const; + reg_t sample_measure(uint_t iChunk,const 
std::vector &rnds, uint_t stride = 1, bool dot = true,uint_t count = 1) const override; thrust::complex* chunk_pointer(uint_t iChunk) const { @@ -322,6 +321,14 @@ uint_t DeviceChunkContainer::Allocate(int idev,int chunk_bits,int num_qu this->num_chunks_ = nc; data_.resize((nc+buffers) << chunk_bits); + //init number of bits for chunk count + uint_t nc_tmp = this->num_chunks_; + this->num_pow2_qubits_ = this->chunk_bits_; + while((nc_tmp & 1) == 0){ + this->num_pow2_qubits_++; + nc_tmp >>= 1; + } + #ifdef AER_THRUST_CUDA stream_.resize(nc + buffers); for(i=0;i::Deallocate(void) blocked_qubits_holder_.clear(); #ifdef AER_THRUST_CUDA - uint_t i; - for(i=0;i::sample_measure(uint_t iChunk,const std::vect #ifdef AER_THRUST_CUDA -// cudaGetLastError(); if(dot) thrust::transform_inclusive_scan(thrust::cuda::par.on(stream_[iChunk]),iter.begin(),iter.end(),iter.begin(),complex_dot_scan(),thrust::plus>()); else @@ -676,30 +681,6 @@ reg_t DeviceChunkContainer::sample_measure(uint_t iChunk,const std::vect return samples; } -template -thrust::complex DeviceChunkContainer::norm(uint_t iChunk, uint_t count, uint_t stride, bool dot) const -{ - thrust::complex sum,zero(0.0,0.0); - set_device(); - - strided_range*> iter(chunk_pointer(iChunk), chunk_pointer(iChunk+count), stride); - -#ifdef AER_THRUST_CUDA - cudaStreamSynchronize(stream_[iChunk]); - if(dot) - sum = thrust::transform_reduce(thrust::device, iter.begin(),iter.end(),complex_norm() ,zero,thrust::plus>()); - else - sum = thrust::reduce(thrust::device, iter.begin(),iter.end(),zero,thrust::plus>()); -#else - if(dot) - sum = thrust::transform_reduce(thrust::device, iter.begin(),iter.end(),complex_norm() ,zero,thrust::plus>()); - else - sum = thrust::reduce(thrust::device, iter.begin(),iter.end(),zero,thrust::plus>()); -#endif - - return sum; -} - //set qubits to be blocked template @@ -1177,7 +1158,9 @@ void DeviceChunkContainer::copy_to_probability_buffer(std::vector return data_[i]; } - uint_t Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit); - void Deallocate(void); + uint_t Allocate(int idev,int chunk_bits,int num_qubits,uint_t chunks,uint_t buffers,bool multi_shots,int matrix_bit) override; + void Deallocate(void) override; - void StoreMatrix(const std::vector>& mat,uint_t iChunk) + void StoreMatrix(const std::vector>& mat,uint_t iChunk) override { matrix_[iChunk] = (thrust::complex*)&mat[0]; } - void StoreMatrix(const std::complex* mat,uint_t iChunk,uint_t size) + void StoreMatrix(const std::complex* mat,uint_t iChunk,uint_t size) override { matrix_[iChunk] = (thrust::complex*)mat; } - void StoreUintParams(const std::vector& prm,uint_t iChunk) + void StoreUintParams(const std::vector& prm,uint_t iChunk) override { params_[iChunk] = (uint_t*)&prm[0]; } void ResizeMatrixBuffers(int bits){} - void Set(uint_t i,const thrust::complex& t) + void Set(uint_t i,const thrust::complex& t) override { data_[i] = t; } - thrust::complex Get(uint_t i) const + thrust::complex Get(uint_t i) const override { return data_[i]; } - thrust::complex* chunk_pointer(uint_t iChunk) const + thrust::complex* chunk_pointer(uint_t iChunk) const override { return (thrust::complex*)thrust::raw_pointer_cast(data_.data()) + (iChunk << this->chunk_bits_); } - thrust::complex* matrix_pointer(uint_t iChunk) const + thrust::complex* matrix_pointer(uint_t iChunk) const override { return matrix_[iChunk]; } - uint_t* param_pointer(uint_t iChunk) const + uint_t* param_pointer(uint_t iChunk) const override { return 
params_[iChunk]; } @@ -104,16 +105,15 @@ class HostChunkContainer : public ChunkContainer #endif } - void CopyIn(Chunk& src,uint_t iChunk); - void CopyOut(Chunk& src,uint_t iChunk); - void CopyIn(thrust::complex* src,uint_t iChunk, uint_t size); - void CopyOut(thrust::complex* dest,uint_t iChunk, uint_t size); - void Swap(Chunk& src,uint_t iChunk); + void CopyIn(Chunk& src,uint_t iChunk) override; + void CopyOut(Chunk& src,uint_t iChunk) override; + void CopyIn(thrust::complex* src,uint_t iChunk, uint_t size) override; + void CopyOut(thrust::complex* dest,uint_t iChunk, uint_t size) override; + void Swap(Chunk& src,uint_t iChunk) override; - void Zero(uint_t iChunk,uint_t count); + void Zero(uint_t iChunk,uint_t count) override; - reg_t sample_measure(uint_t iChunk,const std::vector &rnds, uint_t stride = 1, bool dot = true,uint_t count = 1) const; - thrust::complex norm(uint_t iChunk,uint_t count,uint_t stride = 1,bool dot = true) const; + reg_t sample_measure(uint_t iChunk,const std::vector &rnds, uint_t stride = 1, bool dot = true,uint_t count = 1) const override; }; @@ -267,22 +267,9 @@ reg_t HostChunkContainer::sample_measure(uint_t iChunk,const std::vector return samples; } -template -thrust::complex HostChunkContainer::norm(uint_t iChunk, uint_t count, uint_t stride, bool dot) const -{ - thrust::complex sum,zero(0.0,0.0); - - strided_range*> iter(chunk_pointer(iChunk), chunk_pointer(iChunk+count), stride); - - if(dot) - sum = thrust::transform_reduce(thrust::omp::par, iter.begin(),iter.end(),complex_norm() ,zero,thrust::plus>()); - else - sum = thrust::reduce(thrust::omp::par, iter.begin(),iter.end(),zero,thrust::plus>()); - - return sum; -} //------------------------------------------------------------------------------ +} // end namespace Chunk } // end namespace QV } // end namespace AER //------------------------------------------------------------------------------ diff --git a/src/simulators/statevector/chunk/thrust_kernels.hpp b/src/simulators/statevector/chunk/thrust_kernels.hpp new file mode 100644 index 0000000000..a882f5c8fc --- /dev/null +++ b/src/simulators/statevector/chunk/thrust_kernels.hpp @@ -0,0 +1,2699 @@ +/** + * This code is part of Qiskit. + * + * (C) Copyright IBM 2018, 2019, 2020. + * + * This code is licensed under the Apache License, Version 2.0. You may + * obtain a copy of this license in the LICENSE.txt file in the root directory + * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0. + * + * Any modifications or derivative works of this code must retain this + * copyright notice, and modified files need to carry a notice indicating + * that they have been altered from the originals. 
+ */ + + +#ifndef _qv_thrust_kernels_hpp_ +#define _qv_thrust_kernels_hpp_ + +#include "misc/warnings.hpp" +DISABLE_WARNING_PUSH +#ifdef AER_THRUST_CUDA +#include +#include +#endif +DISABLE_WARNING_POP + +#include "misc/wrap_thrust.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "framework/utils.hpp" + +#ifdef AER_THRUST_CUDA +#include "simulators/statevector/chunk/cuda_kernels.hpp" +#endif + +namespace AER { +namespace QV { +namespace Chunk { + +//======================================== +// base class of gate functions +//======================================== +template +class GateFuncBase +{ +protected: + thrust::complex* data_; //pointer to state vector buffer + thrust::complex* matrix_; //storage for matrix on device + uint_t* params_; //storage for additional parameters on device + uint_t base_index_; //start index of state vector + uint_t chunk_bits_; + uint_t* cregs_; + uint_t num_creg_bits_; + int_t conditional_bit_; +#ifndef AER_THRUST_CUDA + uint_t index_offset_; +#endif +public: + GateFuncBase() + { + data_ = NULL; + base_index_ = 0; + cregs_ = NULL; + num_creg_bits_ = 0; + conditional_bit_ = -1; +#ifndef AER_THRUST_CUDA + index_offset_ = 0; +#endif + } + virtual void set_data(thrust::complex* p) + { + data_ = p; + } + void set_matrix(thrust::complex* mat) + { + matrix_ = mat; + } + void set_params(uint_t* p) + { + params_ = p; + } + void set_chunk_bits(uint_t bits) + { + chunk_bits_ = bits; + } + + void set_base_index(uint_t i) + { + base_index_ = i; + } + void set_cregs_(uint_t* cbits,uint_t nreg) + { + cregs_ = cbits; + num_creg_bits_ = nreg; + } + void set_conditional(int_t bit) + { + conditional_bit_ = bit; + } + +#ifndef AER_THRUST_CUDA + void set_index_offset(uint_t i) + { + index_offset_ = i; + } +#endif + + __host__ __device__ thrust::complex* data(void) + { + return data_; + } + + virtual bool is_diagonal(void) + { + return false; + } + virtual int qubits_count(void) + { + return 1; + } + virtual int num_control_bits(void) + { + return 0; + } + virtual int control_mask(void) + { + return 1; + } + virtual bool use_cache(void) + { + return false; + } + virtual bool batch_enable(void) + { + return true; + } + + virtual const char* name(void) + { + return "base function"; + } + virtual uint_t size(int num_qubits) + { + if(is_diagonal()){ + chunk_bits_ = num_qubits; + return (1ull << num_qubits); + } + else{ + chunk_bits_ = num_qubits - (qubits_count() - num_control_bits()); + return (1ull << (num_qubits - (qubits_count() - num_control_bits()))); + } + } + + virtual __host__ __device__ uint_t thread_to_index(uint_t _tid) const + { + return _tid; + } + virtual __host__ __device__ void run_with_cache(uint_t _tid,uint_t _idx,thrust::complex* _cache) const + { + //implemente this in the kernel class + } + virtual __host__ __device__ double run_with_cache_sum(uint_t _tid,uint_t _idx,thrust::complex* _cache) const + { + //implemente this in the kernel class + return 0.0; + } + + virtual __host__ __device__ bool check_conditional(uint_t i) const + { + if(conditional_bit_ < 0) + return true; + + uint_t iChunk = i >> chunk_bits_; + uint_t n64,i64,ibit; + n64 = (num_creg_bits_ + 63) >> 6; + i64 = conditional_bit_ >> 6; + ibit = conditional_bit_ & 63; + return (((cregs_[iChunk*n64 + i64] >> ibit) & 1) != 0); + } +}; + +//======================================== + // gate functions with cache +//======================================== +template +class GateFuncWithCache : public GateFuncBase +{ +protected: + int 
nqubits_; +public: + GateFuncWithCache(uint_t nq) + { + nqubits_ = nq; + } + + bool use_cache(void) + { + return true; + } + + __host__ __device__ virtual uint_t thread_to_index(uint_t _tid) const + { + uint_t idx,ii,t,j; + uint_t* qubits; + uint_t* qubits_sorted; + + qubits_sorted = this->params_; + qubits = qubits_sorted + nqubits_; + + idx = 0; + ii = _tid >> nqubits_; + for(j=0;j> j) & 1) != 0){ + idx += (1ull << qubits[j]); + } + } + idx += ii; + return idx; + } + + __host__ __device__ void sync_threads() const + { +#ifdef CUDA_ARCH + __syncthreads(); +#endif + } + + __host__ __device__ void operator()(const uint_t &i) const + { + if(!this->check_conditional(i)) + return; + + thrust::complex cache[1024]; + uint_t j,idx; + uint_t matSize = 1ull << nqubits_; + + //load data to cache + for(j=0;jdata_[idx]; + } + + //execute using cache + for(j=0;jrun_with_cache(j,idx,cache); + } + } + + virtual int qubits_count(void) + { + return nqubits_; + } +}; + +template +class GateFuncSumWithCache : public GateFuncBase +{ +protected: + int nqubits_; +public: + GateFuncSumWithCache(uint_t nq) + { + nqubits_ = nq; + } + + bool use_cache(void) + { + return true; + } + + + __host__ __device__ virtual uint_t thread_to_index(uint_t _tid) const + { + uint_t idx,ii,t,j; + uint_t* qubits; + uint_t* qubits_sorted; + + qubits_sorted = this->params_; + qubits = qubits_sorted + nqubits_; + + idx = 0; + ii = _tid >> nqubits_; + for(j=0;j> j) & 1) != 0){ + idx += (1ull << qubits[j]); + } + } + idx += ii; + return idx; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + if(!this->check_conditional(i)) + return 0.0; + + thrust::complex cache[1024]; + uint_t j,idx; + uint_t matSize = 1ull << nqubits_; + double sum = 0.0; + + //load data to cache + for(j=0;jdata_[idx]; + } + + //execute using cache + for(j=0;jrun_with_cache_sum(j,idx,cache); + } + return sum; + } + + virtual int qubits_count(void) + { + return nqubits_; + } + +}; + +//stridded iterator to access diagonal probabilities +template +class strided_range +{ + public: + + typedef typename thrust::iterator_difference::type difference_type; + + struct stride_functor : public thrust::unary_function + { + difference_type stride; + + stride_functor(difference_type stride) + : stride(stride) {} + + __host__ __device__ + difference_type operator()(const difference_type& i) const + { + if(stride == 1) //statevector + return i; + + //density matrix + difference_type i_chunk; + i_chunk = i / (stride - 1); + difference_type ret = stride * i - i_chunk*(stride-1); + return ret; + } + }; + + typedef typename thrust::counting_iterator CountingIterator; + typedef typename thrust::transform_iterator TransformIterator; + typedef typename thrust::permutation_iterator PermutationIterator; + + // type of the strided_range iterator + typedef PermutationIterator iterator; + + // construct strided_range for the range [first,last) + strided_range(Iterator first, Iterator last, difference_type stride) + : first(first), last(last), stride(stride) {} + + iterator begin(void) const + { + return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride))); + } + + iterator end(void) const + { + if(stride == 1) //statevector + return begin() + (last - first); + + //density matrix + return begin() + (last - first) / (stride-1); + } + + protected: + Iterator first; + Iterator last; + difference_type stride; +}; + +template +struct complex_dot_scan : public thrust::unary_function,thrust::complex> +{ + __host__ __device__ + thrust::complex 
operator()(thrust::complex x) { return thrust::complex(x.real()*x.real()+x.imag()*x.imag(),0); } +}; + +template +struct complex_norm : public thrust::unary_function,thrust::complex> +{ + __host__ __device__ + thrust::complex operator()(thrust::complex x) { return thrust::complex((double)x.real()*(double)x.real(),(double)x.imag()*(double)x.imag()); } +}; + +template +struct complex_less +{ + typedef thrust::complex first_argument_type; + typedef thrust::complex second_argument_type; + typedef bool result_type; + __thrust_exec_check_disable__ + __host__ __device__ bool operator()(const thrust::complex &lhs, const thrust::complex &rhs) const {return lhs.real() < rhs.real();} +}; // end less + + +class HostFuncBase +{ +protected: +public: + HostFuncBase(){} + + virtual void execute(){} +}; + + +//------------------------------------------------------------------------------ +// State initialize component +//------------------------------------------------------------------------------ +template +class initialize_component_1qubit_func : public GateFuncBase +{ +protected: + thrust::complex s0,s1; + uint_t mask; + uint_t offset; +public: + initialize_component_1qubit_func(int qubit,thrust::complex state0,thrust::complex state1) + { + s0 = state0; + s1 = state1; + + mask = (1ull << qubit) - 1; + offset = 1ull << qubit; + } + + virtual __host__ __device__ void operator()(const uint_t &i) const + { + uint_t i0,i1; + thrust::complex q0; + thrust::complex* vec0; + thrust::complex* vec1; + + vec0 = this->data_; + vec1 = vec0 + offset; + + i1 = i & mask; + i0 = (i - i1) << 1; + i0 += i1; + + q0 = vec0[i0]; + + vec0[i0] = s0*q0; + vec1[i0] = s1*q0; + } + + const char* name(void) + { + return "initialize_component 1 qubit"; + } +}; + +template +class initialize_component_func : public GateFuncBase +{ +protected: + int nqubits; + uint_t matSize; +public: + initialize_component_func(const cvector_t& mat,const reg_t &qb) + { + nqubits = qb.size(); + matSize = 1ull << nqubits; + } + + int qubits_count(void) + { + return nqubits; + } + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex* vec; + thrust::complex q0; + thrust::complex q; + thrust::complex* state; + uint_t* qubits; + uint_t* qubits_sorted; + uint_t j,k; + uint_t ii,idx,t; + uint_t mask; + + //get parameters from iterator + vec = this->data_; + state = this->matrix_; + qubits = this->params_; + qubits_sorted = qubits + nqubits; + + idx = 0; + ii = i; + for(j=0;j> j) & 1) != 0) + ii += (1ull << qubits[j]); + } + q = q0 * state[k]; + vec[ii] = q; + } + } + + const char* name(void) + { + return "initialize_component"; + } +}; + +template +class initialize_large_component_func : public GateFuncBase +{ +protected: + int num_qubits_; + uint_t mask_; + uint_t cmask_; + thrust::complex init_; +public: + initialize_large_component_func(thrust::complex m,const reg_t& qubits,int i) + { + num_qubits_ = qubits.size(); + init_ = m; + + mask_ = 0; + cmask_ = 0; + for(int k=0;k> k) & 1) != 0){ + cmask_ |= (1ull << qubits[k]); + } + } + } + bool is_diagonal(void) + { + return true; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex* vec; + thrust::complex q; + vec = this->data_; + if((i & mask_) == cmask_){ + q = vec[i]; + vec[i] = init_*q; + } + } + const char* name(void) + { + return "initialize_large_component"; + } +}; + +//------------------------------------------------------------------------------ +// Zero clear 
+//------------------------------------------------------------------------------ +template +class ZeroClear : public GateFuncBase +{ +protected: +public: + ZeroClear() {} + bool is_diagonal(void) + { + return true; + } + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex* vec; + vec = this->data_; + vec[i] = 0.0; + } + const char* name(void) + { + return "zero"; + } +}; + + +//------------------------------------------------------------------------------ +// Initialize state +//------------------------------------------------------------------------------ +template +class initialize_kernel : public GateFuncBase +{ +protected: + int num_qubits_state_; + uint_t offset_; + thrust::complex init_val_; +public: + initialize_kernel(thrust::complex v,int nqs,uint_t offset) + { + num_qubits_state_ = nqs; + offset_ = offset; + init_val_ = v; + } + + bool is_diagonal(void) + { + return true; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex* vec; + uint_t iChunk = (i >> num_qubits_state_); + + vec = this->data_; + + if(i == iChunk * offset_){ + vec[i] = init_val_; + } + else{ + vec[i] = 0.0; + } + } + const char* name(void) + { + return "initialize"; + } +}; + +//------------------------------------------------------------------------------ +// Matrix multiplication +//------------------------------------------------------------------------------ +template +class MatrixMult2x2 : public GateFuncBase +{ +protected: + thrust::complex m0,m1,m2,m3; + int qubit; + uint_t mask; + uint_t offset0; + +public: + MatrixMult2x2(const cvector_t& mat,int q) + { + qubit = q; + m0 = mat[0]; + m1 = mat[1]; + m2 = mat[2]; + m3 = mat[3]; + + mask = (1ull << qubit) - 1; + + offset0 = 1ull << qubit; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + uint_t i0,i1; + thrust::complex q0,q1; + thrust::complex* vec0; + thrust::complex* vec1; + + vec0 = this->data_; + vec1 = vec0 + offset0; + + i1 = i & mask; + i0 = (i - i1) << 1; + i0 += i1; + + q0 = vec0[i0]; + q1 = vec1[i0]; + + vec0[i0] = m0 * q0 + m2 * q1; + vec1[i0] = m1 * q0 + m3 * q1; + } + const char* name(void) + { + return "mult2x2"; + } +}; + + +template +class MatrixMult4x4 : public GateFuncBase +{ +protected: + thrust::complex m00,m10,m20,m30; + thrust::complex m01,m11,m21,m31; + thrust::complex m02,m12,m22,m32; + thrust::complex m03,m13,m23,m33; + uint_t mask0; + uint_t mask1; + uint_t offset0; + uint_t offset1; + +public: + MatrixMult4x4(const cvector_t& mat,int qubit0,int qubit1) + { + m00 = mat[0]; + m01 = mat[1]; + m02 = mat[2]; + m03 = mat[3]; + + m10 = mat[4]; + m11 = mat[5]; + m12 = mat[6]; + m13 = mat[7]; + + m20 = mat[8]; + m21 = mat[9]; + m22 = mat[10]; + m23 = mat[11]; + + m30 = mat[12]; + m31 = mat[13]; + m32 = mat[14]; + m33 = mat[15]; + + offset0 = 1ull << qubit0; + offset1 = 1ull << qubit1; + if(qubit0 < qubit1){ + mask0 = offset0 - 1; + mask1 = offset1 - 1; + } + else{ + mask0 = offset1 - 1; + mask1 = offset0 - 1; + } + } + + int qubits_count(void) + { + return 2; + } + __host__ __device__ void operator()(const uint_t &i) const + { + uint_t i0,i1,i2; + thrust::complex* vec0; + thrust::complex* vec1; + thrust::complex* vec2; + thrust::complex* vec3; + thrust::complex q0,q1,q2,q3; + + vec0 = this->data_; + + i0 = i & mask0; + i2 = (i - i0) << 1; + i1 = i2 & mask1; + i2 = (i2 - i1) << 1; + + i0 = i0 + i1 + i2; + + vec1 = vec0 + offset0; + vec2 = vec0 + offset1; + vec3 = vec2 + offset0; + + q0 = vec0[i0]; + q1 = vec1[i0]; + q2 = vec2[i0]; + q3 = vec3[i0]; + + 
vec0[i0] = m00 * q0 + m10 * q1 + m20 * q2 + m30 * q3; + vec1[i0] = m01 * q0 + m11 * q1 + m21 * q2 + m31 * q3; + vec2[i0] = m02 * q0 + m12 * q1 + m22 * q2 + m32 * q3; + vec3[i0] = m03 * q0 + m13 * q1 + m23 * q2 + m33 * q3; + } + const char* name(void) + { + return "mult4x4"; + } +}; + +template +class MatrixMult8x8 : public GateFuncBase +{ +protected: + uint_t offset0; + uint_t offset1; + uint_t offset2; + uint_t mask0; + uint_t mask1; + uint_t mask2; + +public: + MatrixMult8x8(const reg_t &qubit,const reg_t &qubit_ordered) + { + offset0 = (1ull << qubit[0]); + offset1 = (1ull << qubit[1]); + offset2 = (1ull << qubit[2]); + + mask0 = (1ull << qubit_ordered[0]) - 1; + mask1 = (1ull << qubit_ordered[1]) - 1; + mask2 = (1ull << qubit_ordered[2]) - 1; + } + + int qubits_count(void) + { + return 3; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + uint_t i0,i1,i2,i3; + thrust::complex* vec; + thrust::complex q0,q1,q2,q3,q4,q5,q6,q7; + thrust::complex m0,m1,m2,m3,m4,m5,m6,m7; + thrust::complex* pMat; + + vec = this->data_; + pMat = this->matrix_; + + i0 = i & mask0; + i3 = (i - i0) << 1; + i1 = i3 & mask1; + i3 = (i3 - i1) << 1; + i2 = i3 & mask2; + i3 = (i3 - i2) << 1; + + i0 = i0 + i1 + i2 + i3; + + q0 = vec[i0]; + q1 = vec[i0 + offset0]; + q2 = vec[i0 + offset1]; + q3 = vec[i0 + offset1 + offset0]; + q4 = vec[i0 + offset2]; + q5 = vec[i0 + offset2 + offset0]; + q6 = vec[i0 + offset2 + offset1]; + q7 = vec[i0 + offset2 + offset1 + offset0]; + + m0 = pMat[0]; + m1 = pMat[8]; + m2 = pMat[16]; + m3 = pMat[24]; + m4 = pMat[32]; + m5 = pMat[40]; + m6 = pMat[48]; + m7 = pMat[56]; + + vec[i0] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; + + m0 = pMat[1]; + m1 = pMat[9]; + m2 = pMat[17]; + m3 = pMat[25]; + m4 = pMat[33]; + m5 = pMat[41]; + m6 = pMat[49]; + m7 = pMat[57]; + + vec[i0 + offset0] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; + + m0 = pMat[2]; + m1 = pMat[10]; + m2 = pMat[18]; + m3 = pMat[26]; + m4 = pMat[34]; + m5 = pMat[42]; + m6 = pMat[50]; + m7 = pMat[58]; + + vec[i0 + offset1] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; + + m0 = pMat[3]; + m1 = pMat[11]; + m2 = pMat[19]; + m3 = pMat[27]; + m4 = pMat[35]; + m5 = pMat[43]; + m6 = pMat[51]; + m7 = pMat[59]; + + vec[i0 + offset1 + offset0] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; + + m0 = pMat[4]; + m1 = pMat[12]; + m2 = pMat[20]; + m3 = pMat[28]; + m4 = pMat[36]; + m5 = pMat[44]; + m6 = pMat[52]; + m7 = pMat[60]; + + vec[i0 + offset2] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; + + m0 = pMat[5]; + m1 = pMat[13]; + m2 = pMat[21]; + m3 = pMat[29]; + m4 = pMat[37]; + m5 = pMat[45]; + m6 = pMat[53]; + m7 = pMat[61]; + + vec[i0 + offset2 + offset0] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; + + m0 = pMat[6]; + m1 = pMat[14]; + m2 = pMat[22]; + m3 = pMat[30]; + m4 = pMat[38]; + m5 = pMat[46]; + m6 = pMat[54]; + m7 = pMat[62]; + + vec[i0 + offset2 + offset1] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; + + m0 = pMat[7]; + m1 = pMat[15]; + m2 = pMat[23]; + m3 = pMat[31]; + m4 = pMat[39]; + m5 = pMat[47]; + m6 = pMat[55]; + m7 = pMat[63]; + + vec[i0 + offset2 + offset1 + offset0] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; + } + const char* name(void) + { + return "mult8x8"; + } +}; + +template +class MatrixMult16x16 : public 
GateFuncBase +{ +protected: + uint_t offset0; + uint_t offset1; + uint_t offset2; + uint_t offset3; + uint_t mask0; + uint_t mask1; + uint_t mask2; + uint_t mask3; +public: + MatrixMult16x16(const reg_t &qubit,const reg_t &qubit_ordered) + { + offset0 = (1ull << qubit[0]); + offset1 = (1ull << qubit[1]); + offset2 = (1ull << qubit[2]); + offset3 = (1ull << qubit[3]); + + mask0 = (1ull << qubit_ordered[0]) - 1; + mask1 = (1ull << qubit_ordered[1]) - 1; + mask2 = (1ull << qubit_ordered[2]) - 1; + mask3 = (1ull << qubit_ordered[3]) - 1; + } + + int qubits_count(void) + { + return 4; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + uint_t i0,i1,i2,i3,i4,offset,f0,f1,f2; + thrust::complex* vec; + thrust::complex q0,q1,q2,q3,q4,q5,q6,q7; + thrust::complex q8,q9,q10,q11,q12,q13,q14,q15; + thrust::complex r; + thrust::complex* pMat; + int j; + + vec = this->data_; + pMat = this->matrix_; + + i0 = i & mask0; + i4 = (i - i0) << 1; + i1 = i4 & mask1; + i4 = (i4 - i1) << 1; + i2 = i4 & mask2; + i4 = (i4 - i2) << 1; + i3 = i4 & mask3; + i4 = (i4 - i3) << 1; + + i0 = i0 + i1 + i2 + i3 + i4; + + q0 = vec[i0]; + q1 = vec[i0 + offset0]; + q2 = vec[i0 + offset1]; + q3 = vec[i0 + offset1 + offset0]; + q4 = vec[i0 + offset2]; + q5 = vec[i0 + offset2 + offset0]; + q6 = vec[i0 + offset2 + offset1]; + q7 = vec[i0 + offset2 + offset1 + offset0]; + q8 = vec[i0 + offset3]; + q9 = vec[i0 + offset3 + offset0]; + q10 = vec[i0 + offset3 + offset1]; + q11 = vec[i0 + offset3 + offset1 + offset0]; + q12 = vec[i0 + offset3 + offset2]; + q13 = vec[i0 + offset3 + offset2 + offset0]; + q14 = vec[i0 + offset3 + offset2 + offset1]; + q15 = vec[i0 + offset3 + offset2 + offset1 + offset0]; + + offset = 0; + f0 = 0; + f1 = 0; + f2 = 0; + for(j=0;j<16;j++){ + r = pMat[0+j]*q0; + r += pMat[16+j]*q1; + r += pMat[32+j]*q2; + r += pMat[48+j]*q3; + r += pMat[64+j]*q4; + r += pMat[80+j]*q5; + r += pMat[96+j]*q6; + r += pMat[112+j]*q7; + r += pMat[128+j]*q8; + r += pMat[144+j]*q9; + r += pMat[160+j]*q10; + r += pMat[176+j]*q11; + r += pMat[192+j]*q12; + r += pMat[208+j]*q13; + r += pMat[224+j]*q14; + r += pMat[240+j]*q15; + + offset = offset3 * (((uint_t)j >> 3) & 1) + + offset2 * (((uint_t)j >> 2) & 1) + + offset1 * (((uint_t)j >> 1) & 1) + + offset0 * ((uint_t)j & 1); + + vec[i0 + offset] = r; + } + } + const char* name(void) + { + return "mult16x16"; + } +}; + +template +class MatrixMultNxN : public GateFuncWithCache +{ +protected: +public: + MatrixMultNxN(uint_t nq) : GateFuncWithCache(nq) + { + ; + } + + __host__ __device__ void run_with_cache(uint_t _tid,uint_t _idx,thrust::complex* _cache) const + { + uint_t j,threadID; + thrust::complex q,r; + thrust::complex m; + uint_t mat_size,irow; + thrust::complex* vec; + thrust::complex* pMat; + + vec = this->data_; + pMat = this->matrix_; + + mat_size = 1ull << this->nqubits_; + irow = _tid & (mat_size - 1); + + r = 0.0; + for(j=0;j +class MatrixMultNxN_LU : public GateFuncBase +{ +protected: + int nqubits; + uint_t matSize; + int nswap; +public: + MatrixMultNxN_LU(const cvector_t& mat,const reg_t &qb,cvector_t& matLU,reg_t& params) + { + uint_t i,j,k,imax; + std::complex c0,c1; + double d,dmax; + uint_t* pSwap; + + nqubits = qb.size(); + matSize = 1ull << nqubits; + + matLU = mat; + params.resize(nqubits + matSize*2); + + for(k=0;k dmax){ + dmax = d; + imax = j; + } + } + if(imax != i){ + j = params[nqubits + imax]; + params[nqubits + imax] = params[nqubits + i]; + params[nqubits + i] = j; + } + + if(dmax != 0){ + c0 = matLU[(i << nqubits) + params[nqubits + i]]; + 
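//------------------------------------------------------------------------------
// A miniature, real-valued sketch of the factorization this constructor builds:
// LU with partial pivoting done once on the host, so the kernel can apply U,
// then L, then a row permutation instead of a dense NxN multiply per group.
// Row-major doubles here for brevity; the actual code works on complex matrices.
//------------------------------------------------------------------------------
#include <cmath>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  const int n = 2;
  std::vector<double> A = {0.0, 1.0,   // an X-like matrix: column 0 needs
                           1.0, 0.0};  // pivoting before elimination
  std::vector<int> piv = {0, 1};
  for (int i = 0; i < n; ++i) {
    int imax = i;                      // partial pivot: largest magnitude
    for (int j = i + 1; j < n; ++j)
      if (std::fabs(A[piv[j] * n + i]) > std::fabs(A[piv[imax] * n + i]))
        imax = j;
    std::swap(piv[i], piv[imax]);
    for (int j = i + 1; j < n; ++j) {
      A[piv[j] * n + i] /= A[piv[i] * n + i];   // L entry below the diagonal
      for (int k = i + 1; k < n; ++k)
        A[piv[j] * n + k] -= A[piv[j] * n + i] * A[piv[i] * n + k];
    }
  }
  std::printf("pivot order = {%d, %d}\n", piv[0], piv[1]);  // {1, 0}
}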
+ for(j=i+1;j q,qt; + thrust::complex m; + thrust::complex r; + uint_t j,k,l,iq; + uint_t ii,idx,t; + uint_t mask,offset_j,offset_k; + thrust::complex* vec; + thrust::complex* pMat; + uint_t* qubits; + uint_t* pivot; + uint_t* table; + + vec = this->data_; + pMat = this->matrix_; + qubits = this->params_; + + pivot = qubits + nqubits; + table = pivot + matSize; + + idx = 0; + ii = i; + for(j=0;j> iq) & 1) != 0) + offset_k += (1ull << qubits[iq]); + } + q = vec[offset_k+idx]; + + r += m*q; + } + offset_j = 0; + for(iq=0;iq> iq) & 1) != 0) + offset_j += (1ull << qubits[iq]); + } + vec[offset_j+idx] = r; + } + + //mult L + for(j=matSize-1;j>0;j--){ + offset_j = 0; + for(iq=0;iq> iq) & 1) != 0) + offset_j += (1ull << qubits[iq]); + } + r = vec[offset_j+idx]; + + for(k=0;k> iq) & 1) != 0) + offset_k += (1ull << qubits[iq]); + } + q = vec[offset_k+idx]; + + r += m*q; + } + offset_j = 0; + for(iq=0;iq> iq) & 1) != 0) + offset_j += (1ull << qubits[iq]); + } + vec[offset_j+idx] = r; + } + + //swap results + if(nswap > 0){ + offset_j = 0; + for(iq=0;iq> iq) & 1) != 0) + offset_j += (1ull << qubits[iq]); + } + q = vec[offset_j+idx]; + k = pivot[table[0]]; + for(j=1;j> iq) & 1) != 0) + offset_j += (1ull << qubits[iq]); + } + qt = vec[offset_j+idx]; + + offset_k = 0; + for(iq=0;iq> iq) & 1) != 0) + offset_k += (1ull << qubits[iq]); + } + vec[offset_k+idx] = q; + q = qt; + k = pivot[table[j]]; + } + offset_k = 0; + for(iq=0;iq> iq) & 1) != 0) + offset_k += (1ull << qubits[iq]); + } + vec[offset_k+idx] = q; + } + } + const char* name(void) + { + return "multNxN"; + } +}; + +template +class MatrixMult2x2Controlled : public GateFuncBase +{ +protected: + thrust::complex m0,m1,m2,m3; + uint_t mask; + uint_t cmask; + uint_t offset; + int nqubits; +public: + MatrixMult2x2Controlled(const cvector_t& mat,const reg_t &qubits) + { + int i; + m0 = mat[0]; + m1 = mat[1]; + m2 = mat[2]; + m3 = mat[3]; + nqubits = qubits.size(); + + offset = 1ull << qubits[nqubits-1]; + mask = (1ull << qubits[nqubits-1]) - 1; + cmask = 0; + for(i=0;i q0,q1; + thrust::complex* vec0; + thrust::complex* vec1; + + vec0 = this->data_; + + vec1 = vec0 + offset; + + i1 = i & mask; + i0 = (i - i1) << 1; + i0 += i1; + + if((i0 & cmask) == cmask){ + q0 = vec0[i0]; + q1 = vec1[i0]; + + vec0[i0] = m0 * q0 + m2 * q1; + vec1[i0] = m1 * q0 + m3 * q1; + } + } + const char* name(void) + { + return "matrix_Cmult2x2"; + } +}; + +//------------------------------------------------------------------------------ +// Diagonal matrix multiplication +//------------------------------------------------------------------------------ +template +class DiagonalMult2x2 : public GateFuncBase +{ +protected: + thrust::complex m0,m1; + int qubit; +public: + + DiagonalMult2x2(const cvector_t& mat,int q) + { + qubit = q; + m0 = mat[0]; + m1 = mat[1]; + } + + bool is_diagonal(void) + { + return true; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex q; + thrust::complex* vec; + thrust::complex m; + uint_t gid; + + vec = this->data_; + gid = this->base_index_; + + q = vec[i]; + if((((i + gid) >> qubit) & 1) == 0){ + m = m0; + } + else{ + m = m1; + } + + vec[i] = m * q; + } + const char* name(void) + { + return "diagonal_mult2x2"; + } +}; + +template +class DiagonalMult4x4 : public GateFuncBase +{ +protected: + thrust::complex m0,m1,m2,m3; + int qubit0; + int qubit1; +public: + + DiagonalMult4x4(const cvector_t& mat,int q0,int q1) + { + qubit0 = q0; + qubit1 = q1; + m0 = mat[0]; + m1 = mat[1]; + m2 = mat[2]; + m3 = mat[3]; + } + + bool 
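/* added commentary, not from the original patch: functors that report
   is_diagonal() == true only read and write vec[i] itself, so the kernel
   launcher can sweep every amplitude of the chunk directly instead of
   splitting the index into strided pairs as the 2x2/4x4 multiplication
   functors above do. */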
is_diagonal(void) + { + return true; + } + int qubits_count(void) + { + return 2; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex q; + thrust::complex* vec; + thrust::complex m; + uint_t gid; + + vec = this->data_; + gid = this->base_index_; + + q = vec[i]; + if((((i+gid) >> qubit1) & 1) == 0){ + if((((i+gid) >> qubit0) & 1) == 0){ + m = m0; + } + else{ + m = m1; + } + } + else{ + if((((i+gid) >> qubit0) & 1) == 0){ + m = m2; + } + else{ + m = m3; + } + } + + vec[i] = m * q; + } + const char* name(void) + { + return "diagonal_mult4x4"; + } +}; + +template +class DiagonalMultNxN : public GateFuncBase +{ +protected: + int nqubits; +public: + DiagonalMultNxN(const reg_t &qb) + { + nqubits = qb.size(); + } + + bool is_diagonal(void) + { + return true; + } + int qubits_count(void) + { + return nqubits; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + uint_t j,im; + thrust::complex* vec; + thrust::complex q; + thrust::complex m; + thrust::complex* pMat; + uint_t* qubits; + uint_t gid; + + vec = this->data_; + gid = this->base_index_; + + pMat = this->matrix_; + qubits = this->params_; + + im = 0; + for(j=0;j> qubits[j]) & 1) != 0){ + im += (1 << j); + } + } + + q = vec[i]; + m = pMat[im]; + + vec[i] = m * q; + } + const char* name(void) + { + return "diagonal_multNxN"; + } +}; + +template +class DiagonalMult2x2Controlled : public GateFuncBase +{ +protected: + thrust::complex m0,m1; + uint_t mask; + uint_t cmask; + int nqubits; +public: + DiagonalMult2x2Controlled(const cvector_t& mat,const reg_t &qubits) + { + int i; + nqubits = qubits.size(); + + m0 = mat[0]; + m1 = mat[1]; + + mask = (1ull << qubits[nqubits-1]) - 1; + cmask = 0; + for(i=0;i* vec; + thrust::complex q0; + thrust::complex m; + + vec = this->data_; + gid = this->base_index_; + + if(((i + gid) & cmask) == cmask){ + if((i + gid) & mask){ + m = m1; + } + else{ + m = m0; + } + + q0 = vec[i]; + vec[i] = m*q0; + } + } + const char* name(void) + { + return "diagonal_Cmult2x2"; + } +}; + +//------------------------------------------------------------------------------ +// Permutation +//------------------------------------------------------------------------------ +template +class Permutation : public GateFuncBase +{ +protected: + uint_t nqubits; + uint_t npairs; + +public: + Permutation(const reg_t& qubits_sorted,const reg_t& qubits,const std::vector> &pairs,reg_t& params) + { + uint_t j,k; + uint_t offset0,offset1; + + nqubits = qubits.size(); + npairs = pairs.size(); + + params.resize(nqubits + npairs*2); + + for(j=0;j> k) & 1) != 0){ + offset0 += (1ull << qubits[k]); + } + if(((pairs[j].second >> k) & 1) != 0){ + offset1 += (1ull << qubits[k]); + } + } + params[nqubits + j*2 ] = offset0; + params[nqubits + j*2+1] = offset1; + } + } + int qubits_count(void) + { + return nqubits; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex* vec; + thrust::complex q0; + thrust::complex q1; + uint_t j; + uint_t ii,idx,t; + uint_t* mask; + uint_t* pairs; + + vec = this->data_; + mask = this->params_; + pairs = mask + nqubits; + + idx = 0; + ii = i; + for(j=0;j +class CX_func : public GateFuncBase +{ +protected: + uint_t offset; + uint_t mask; + uint_t cmask; + int nqubits; + int qubit_t; +public: + + CX_func(const reg_t &qubits) + { + int i; + nqubits = qubits.size(); + + qubit_t = qubits[nqubits-1]; + offset = 1ull << qubit_t; + mask = offset - 1; + + cmask = 0; + for(i=0;i q0,q1; + thrust::complex* vec0; + thrust::complex* vec1; + + vec0 = 
this->data_; + vec1 = vec0 + offset; + + i1 = i & mask; + i0 = (i - i1) << 1; + i0 += i1; + + if((i0 & cmask) == cmask){ + q0 = vec0[i0]; + q1 = vec1[i0]; + + vec0[i0] = q1; + vec1[i0] = q0; + } + } + const char* name(void) + { + return "CX"; + } +}; + +//------------------------------------------------------------------------------ +// Y gate +//------------------------------------------------------------------------------ +template +class CY_func : public GateFuncBase +{ +protected: + uint_t mask; + uint_t cmask; + uint_t offset; + int nqubits; + int qubit_t; +public: + CY_func(const reg_t &qubits) + { + int i; + nqubits = qubits.size(); + + qubit_t = qubits[nqubits-1]; + offset = (1ull << qubit_t); + mask = (1ull << qubit_t) - 1; + + cmask = 0; + for(i=0;i q0,q1; + thrust::complex* vec0; + thrust::complex* vec1; + + vec0 = this->data_; + + vec1 = vec0 + offset; + + i1 = i & mask; + i0 = (i - i1) << 1; + i0 += i1; + + if((i0 & cmask) == cmask){ + q0 = vec0[i0]; + q1 = vec1[i0]; + + vec0[i0] = thrust::complex(q1.imag(),-q1.real()); + vec1[i0] = thrust::complex(-q0.imag(),q0.real()); + } + } + const char* name(void) + { + return "CY"; + } +}; + +//------------------------------------------------------------------------------ +// Swap gate +//------------------------------------------------------------------------------ +template +class CSwap_func : public GateFuncBase +{ +protected: + uint_t mask0; + uint_t mask1; + uint_t cmask; + int nqubits; + int qubit_t0; + int qubit_t1; + uint_t offset1; + uint_t offset2; +public: + + CSwap_func(const reg_t &qubits) + { + int i; + nqubits = qubits.size(); + + if(qubits[nqubits-2] < qubits[nqubits-1]){ + qubit_t0 = qubits[nqubits-2]; + qubit_t1 = qubits[nqubits-1]; + } + else{ + qubit_t1 = qubits[nqubits-2]; + qubit_t0 = qubits[nqubits-1]; + } + mask0 = (1ull << qubit_t0) - 1; + mask1 = (1ull << qubit_t1) - 1; + + offset1 = 1ull << qubit_t0; + offset2 = 1ull << qubit_t1; + + cmask = 0; + for(i=0;i q1,q2; + thrust::complex* vec1; + thrust::complex* vec2; + + vec1 = this->data_; + + vec2 = vec1 + offset2; + vec1 = vec1 + offset1; + + i0 = i & mask0; + i2 = (i - i0) << 1; + i1 = i2 & mask1; + i2 = (i2 - i1) << 1; + + i0 = i0 + i1 + i2; + + if((i0 & cmask) == cmask){ + q1 = vec1[i0]; + q2 = vec2[i0]; + vec1[i0] = q2; + vec2[i0] = q1; + } + } + const char* name(void) + { + return "CSWAP"; + } +}; + +//swap operator between chunks +template +class CSwapChunk_func : public GateFuncBase +{ +protected: + uint_t mask; + thrust::complex* vec0; + thrust::complex* vec1; + bool write_back_; + bool swap_all_; +public: + + CSwapChunk_func(const reg_t &qubits,uint_t block_bits,thrust::complex* pVec0,thrust::complex* pVec1,bool wb) + { + int i; + int nqubits; + int qubit_t; + nqubits = qubits.size(); + + if(qubits[nqubits-2] < qubits[nqubits-1]){ + qubit_t = qubits[nqubits-2]; + } + else{ + qubit_t = qubits[nqubits-1]; + } + mask = (1ull << qubit_t) - 1; + + vec0 = pVec0; + vec1 = pVec1; + + write_back_ = wb; + if(qubit_t >= block_bits) + swap_all_ = true; + else + swap_all_ = false; + } + + bool batch_enable(void) + { + return false; + } + bool is_diagonal(void) + { + return swap_all_; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + uint_t i0,i1; + thrust::complex q0,q1; + + i0 = i & mask; + i1 = (i - i0) << 1; + i0 += i1; + + q0 = vec0[i0]; + q1 = vec1[i0]; + vec0[i0] = q1; + if(write_back_) + vec1[i0] = q0; + } + const char* name(void) + { + return "Chunk SWAP"; + } +}; + + 
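+//------------------------------------------------------------------------------
+// Note on index decomposition (added commentary, not in the original patch):
+// the pair-wise functors above all derive the amplitude-pair indices from a
+// flat thread index i that runs over half of the chunk:
+//
+//   i1 = i & mask;        // mask = (1ull << qubit) - 1, bits below the target
+//   i0 = (i - i1) << 1;   // bits at/above the target, shifted up by one
+//   i0 += i1;             // recombine; bit `qubit` of i0 is now zero
+//
+// so vec[i0] and vec[i0 + (1ull << qubit)] form one gate pair. A minimal
+// host-side sketch of the same mapping, assuming a plain array `state` of
+// length data_size (illustrative only, not part of the simulator API):
+//
+//   for(uint_t i = 0; i < (data_size >> 1); i++){
+//     uint_t i1 = i & ((1ull << qubit) - 1);
+//     uint_t i0 = ((i - i1) << 1) + i1;
+//     // state[i0] and state[i0 + (1ull << qubit)] are updated together
+//   }
+//------------------------------------------------------------------------------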
+//------------------------------------------------------------------------------ +// Phase gate +//------------------------------------------------------------------------------ +template +class phase_func : public GateFuncBase +{ +protected: + thrust::complex phase; + uint_t mask; + int nqubits; +public: + phase_func(const reg_t &qubits,thrust::complex p) + { + int i; + nqubits = qubits.size(); + phase = p; + + mask = 0; + for(i=0;i* vec; + thrust::complex q0; + + vec = this->data_; + gid = this->base_index_; + + if(((i+gid) & mask) == mask){ + q0 = vec[i]; + vec[i] = q0 * phase; + } + } + const char* name(void) + { + return "phase"; + } +}; + +//------------------------------------------------------------------------------ +// Norm functions +//------------------------------------------------------------------------------ +template +class norm_func : public GateFuncBase +{ +protected: +public: + norm_func(void) + { + + } + bool is_diagonal(void) + { + return true; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + thrust::complex q; + thrust::complex* vec; + double d; + + vec = this->data_; + q = vec[i]; + d = (double)(q.real()*q.real() + q.imag()*q.imag()); + return d; + } + + const char* name(void) + { + return "norm"; + } +}; + +template +class trace_func : public GateFuncBase +{ +protected: + uint_t rows_; +public: + trace_func(uint_t nrow) + { + rows_ = nrow; + } + bool is_diagonal(void) + { + return true; + } + uint_t size(int num_qubits) + { + this->chunk_bits_ = num_qubits; + return rows_; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + thrust::complex q; + thrust::complex* vec; + + uint_t iChunk = (i / rows_); + uint_t lid = i - (iChunk * rows_); + uint_t idx = (iChunk << this->chunk_bits_) + lid*(rows_ + 1); + + vec = this->data_; + q = vec[idx]; + return q.real(); + } + + const char* name(void) + { + return "trace"; + } +}; + +template +class NormMatrixMultNxN : public GateFuncSumWithCache +{ +protected: +public: + NormMatrixMultNxN(uint_t nq) : GateFuncSumWithCache(nq) + { + ; + } + + __host__ __device__ double run_with_cache_sum(uint_t _tid,uint_t _idx,thrust::complex* _cache) const + { + uint_t j; + thrust::complex q,r; + thrust::complex m; + uint_t mat_size,irow; + thrust::complex* vec; + thrust::complex* pMat; + + vec = this->data_; + pMat = this->matrix_; + + mat_size = 1ull << this->nqubits_; + irow = _tid & (mat_size - 1); + + r = 0.0; + for(j=0;j +class NormDiagonalMultNxN : public GateFuncBase +{ +protected: + int nqubits; +public: + NormDiagonalMultNxN(const reg_t &qb) + { + nqubits = qb.size(); + } + + bool is_diagonal(void) + { + return true; + } + int qubits_count(void) + { + return nqubits; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + uint_t im,j,gid; + thrust::complex q; + thrust::complex m,r; + thrust::complex* pMat; + thrust::complex* vec; + uint_t* qubits; + + vec = this->data_; + pMat = this->matrix_; + qubits = this->params_; + gid = this->base_index_; + + im = 0; + for(j=0;j +class NormMatrixMult2x2 : public GateFuncBase +{ +protected: + thrust::complex m0,m1,m2,m3; + int qubit; + uint_t mask; + uint_t offset; +public: + NormMatrixMult2x2(const cvector_t &mat,int q) + { + qubit = q; + m0 = mat[0]; + m1 = mat[1]; + m2 = mat[2]; + m3 = mat[3]; + + offset = 1ull << qubit; + mask = (1ull << qubit) - 1; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + uint_t i0,i1; + thrust::complex* vec; + thrust::complex q0,q1; + thrust::complex r0,r1; + double sum = 
0.0; + + vec = this->data_; + + i1 = i & mask; + i0 = (i - i1) << 1; + i0 += i1; + + q0 = vec[i0]; + q1 = vec[offset+i0]; + + r0 = m0 * q0 + m2 * q1; + sum += r0.real()*r0.real() + r0.imag()*r0.imag(); + r1 = m1 * q0 + m3 * q1; + sum += r1.real()*r1.real() + r1.imag()*r1.imag(); + return sum; + } + const char* name(void) + { + return "Norm_mult2x2"; + } +}; + +template +class NormDiagonalMult2x2 : public GateFuncBase +{ +protected: + thrust::complex m0,m1; + int qubit; +public: + NormDiagonalMult2x2(cvector_t &mat,int q) + { + qubit = q; + m0 = mat[0]; + m1 = mat[1]; + } + + bool is_diagonal(void) + { + return true; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + uint_t gid; + thrust::complex* vec; + thrust::complex q; + thrust::complex m,r; + + vec = this->data_; + gid = this->base_index_; + + q = vec[i]; + if((((i+gid) >> qubit) & 1) == 0){ + m = m0; + } + else{ + m = m1; + } + + r = m * q; + + return (r.real()*r.real() + r.imag()*r.imag()); + } + const char* name(void) + { + return "Norm_diagonal_mult2x2"; + } +}; + +//------------------------------------------------------------------------------ +// Probabilities +//------------------------------------------------------------------------------ +template +class probability_func : public GateFuncBase +{ +protected: + uint_t mask; + uint_t cmask; +public: + probability_func(const reg_t &qubits,int i) + { + int k; + int nq = qubits.size(); + + mask = 0; + cmask = 0; + for(k=0;k> k) & 1) != 0){ + cmask |= (1ull << qubits[k]); + } + } + } + + bool is_diagonal(void) + { + return true; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + thrust::complex q; + thrust::complex* vec; + double ret; + + vec = this->data_; + + ret = 0.0; + + if((i & mask) == cmask){ + q = vec[i]; + ret = q.real()*q.real() + q.imag()*q.imag(); + } + return ret; + } + + const char* name(void) + { + return "probabilities"; + } +}; + +template +class probability_1qubit_func : public GateFuncBase +{ +protected: + uint_t offset; +public: + probability_1qubit_func(const uint_t qubit) + { + offset = 1ull << qubit; + } + + __host__ __device__ thrust::complex operator()(const uint_t &i) const + { + uint_t i0,i1; + thrust::complex q0,q1; + thrust::complex* vec0; + thrust::complex* vec1; + thrust::complex ret; + double d0,d1; + + vec0 = this->data_; + vec1 = vec0 + offset; + + i1 = i & (offset - 1); + i0 = (i - i1) << 1; + i0 += i1; + + q0 = vec0[i0]; + q1 = vec1[i0]; + + d0 = (double)(q0.real()*q0.real() + q0.imag()*q0.imag()); + d1 = (double)(q1.real()*q1.real() + q1.imag()*q1.imag()); + + ret = thrust::complex(d0,d1); + return ret; + } + + const char* name(void) + { + return "probabilities_1qubit"; + } +}; + +//------------------------------------------------------------------------------ +// Expectation values +//------------------------------------------------------------------------------ +inline __host__ __device__ uint_t pop_count_kernel(uint_t val) +{ + uint_t count = val; + count = (count & 0x5555555555555555) + ((count >> 1) & 0x5555555555555555); + count = (count & 0x3333333333333333) + ((count >> 2) & 0x3333333333333333); + count = (count & 0x0f0f0f0f0f0f0f0f) + ((count >> 4) & 0x0f0f0f0f0f0f0f0f); + count = (count & 0x00ff00ff00ff00ff) + ((count >> 8) & 0x00ff00ff00ff00ff); + count = (count & 0x0000ffff0000ffff) + ((count >> 16) & 0x0000ffff0000ffff); + count = (count & 0x00000000ffffffff) + ((count >> 32) & 0x00000000ffffffff); + return count; +} + +//special case Z only +template +class expval_pauli_Z_func : public 
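/* added commentary, not in the original patch: pop_count_kernel above is the
   classic tree-reduction popcount. Each masked add sums adjacent 1-, 2-, 4-,
   8-, 16- and 32-bit fields in parallel, e.g. pop_count_kernel(0b1011) == 3.
   The expectation-value functors below use the parity of
   pop_count_kernel(i & z_mask_) to decide the sign that a Pauli-Z string on
   the qubits in z_mask_ contributes for basis state i. */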
GateFuncBase +{ +protected: + uint_t z_mask_; + +public: + expval_pauli_Z_func(uint_t z) + { + z_mask_ = z; + } + + bool is_diagonal(void) + { + return true; + } + bool batch_enable(void) + { + return false; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + thrust::complex* vec; + thrust::complex q0; + double ret = 0.0; + + vec = this->data_; + + q0 = vec[i]; + ret = q0.real()*q0.real() + q0.imag()*q0.imag(); + + if(z_mask_ != 0){ + if(pop_count_kernel(i & z_mask_) & 1) + ret = -ret; + } + + return ret; + } + const char* name(void) + { + return "expval_pauli_Z"; + } +}; + +template +class expval_pauli_XYZ_func : public GateFuncBase +{ +protected: + uint_t x_mask_; + uint_t z_mask_; + uint_t mask_l_; + uint_t mask_u_; + thrust::complex phase_; +public: + expval_pauli_XYZ_func(uint_t x,uint_t z,uint_t x_max,std::complex p) + { + x_mask_ = x; + z_mask_ = z; + phase_ = p; + + mask_u_ = ~((1ull << (x_max+1)) - 1); + mask_l_ = (1ull << x_max) - 1; + } + bool batch_enable(void) + { + return false; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + thrust::complex* vec; + thrust::complex q0; + thrust::complex q1; + thrust::complex q0p; + thrust::complex q1p; + double d0,d1,ret = 0.0; + uint_t idx0,idx1; + + vec = this->data_; + + idx0 = ((i << 1) & mask_u_) | (i & mask_l_); + idx1 = idx0 ^ x_mask_; + + q0 = vec[idx0]; + q1 = vec[idx1]; + q0p = q1 * phase_; + q1p = q0 * phase_; + d0 = q0.real()*q0p.real() + q0.imag()*q0p.imag(); + d1 = q1.real()*q1p.real() + q1.imag()*q1p.imag(); + + if(z_mask_ != 0){ + if(pop_count_kernel(idx0 & z_mask_) & 1) + ret = -d0; + else + ret = d0; + if(pop_count_kernel(idx1 & z_mask_) & 1) + ret -= d1; + else + ret += d1; + } + else{ + ret = d0 + d1; + } + + return ret; + } + const char* name(void) + { + return "expval_pauli_XYZ"; + } +}; + +template +class expval_pauli_inter_chunk_func : public GateFuncBase +{ +protected: + uint_t x_mask_; + uint_t z_mask_; + thrust::complex phase_; + thrust::complex* pair_chunk_; + uint_t z_count_; + uint_t z_count_pair_; +public: + expval_pauli_inter_chunk_func(uint_t x,uint_t z,std::complex p,thrust::complex* pair_chunk,uint_t zc,uint_t zcp) + { + x_mask_ = x; + z_mask_ = z; + phase_ = p; + + pair_chunk_ = pair_chunk; + z_count_ = zc; + z_count_pair_ = zcp; + } + + bool is_diagonal(void) + { + return true; + } + bool batch_enable(void) + { + return false; + } + + __host__ __device__ double operator()(const uint_t &i) const + { + thrust::complex* vec; + thrust::complex q0; + thrust::complex q1; + thrust::complex q0p; + thrust::complex q1p; + double d0,d1,ret = 0.0; + uint_t ip; + + vec = this->data_; + + ip = i ^ x_mask_; + q0 = vec[i]; + q1 = pair_chunk_[ip]; + q0p = q1 * phase_; + q1p = q0 * phase_; + d0 = q0.real()*q0p.real() + q0.imag()*q0p.imag(); + d1 = q1.real()*q1p.real() + q1.imag()*q1p.imag(); + + if((pop_count_kernel(i & z_mask_) + z_count_) & 1) + ret = -d0; + else + ret = d0; + if((pop_count_kernel(ip & z_mask_) + z_count_pair_) & 1) + ret -= d1; + else + ret += d1; + + return ret; + } + const char* name(void) + { + return "expval_pauli_inter_chunk"; + } +}; + +//------------------------------------------------------------------------------ +// Pauli application +//------------------------------------------------------------------------------ +template +class multi_pauli_func : public GateFuncBase +{ +protected: + uint_t x_mask_; + uint_t z_mask_; + uint_t mask_l_; + uint_t mask_u_; + thrust::complex phase_; + uint_t nqubits_; +public: + multi_pauli_func(uint_t x,uint_t 
z,uint_t x_max,std::complex p) + { + x_mask_ = x; + z_mask_ = z; + phase_ = p; + + mask_u_ = ~((1ull << (x_max+1)) - 1); + mask_l_ = (1ull << x_max) - 1; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex* vec; + thrust::complex q0; + thrust::complex q1; + uint_t idx0,idx1; + + vec = this->data_; + + idx0 = ((i << 1) & mask_u_) | (i & mask_l_); + idx1 = idx0 ^ x_mask_; + + q0 = vec[idx0]; + q1 = vec[idx1]; + + if(z_mask_ != 0){ + if(pop_count_kernel(idx0 & z_mask_) & 1) + q0 *= -1; + + if(pop_count_kernel(idx1 & z_mask_) & 1) + q1 *= -1; + } + vec[idx0] = q1 * phase_; + vec[idx1] = q0 * phase_; + } + const char* name(void) + { + return "multi_pauli"; + } +}; + +//special case Z only +template +class multi_pauli_Z_func : public GateFuncBase +{ +protected: + uint_t z_mask_; + thrust::complex phase_; +public: + multi_pauli_Z_func(uint_t z,std::complex p) + { + z_mask_ = z; + phase_ = p; + } + + bool is_diagonal(void) + { + return true; + } + + __host__ __device__ void operator()(const uint_t &i) const + { + thrust::complex* vec; + thrust::complex q0; + + vec = this->data_; + + q0 = vec[i]; + + if(z_mask_ != 0){ + if(pop_count_kernel(i & z_mask_) & 1) + q0 = -q0; + } + vec[i] = q0 * phase_; + } + const char* name(void) + { + return "multi_pauli_Z"; + } +}; + + +//------------------------------------------------------------------------------ +} // end namespace Chunk +} // end namespace QV +} // end namespace AER +//------------------------------------------------------------------------------ + +//------------------------------------------------------------------------------ +#endif // end module diff --git a/src/simulators/statevector/qubitvector.hpp b/src/simulators/statevector/qubitvector.hpp index 79fad5745b..ee037cb5fb 100755 --- a/src/simulators/statevector/qubitvector.hpp +++ b/src/simulators/statevector/qubitvector.hpp @@ -41,6 +41,12 @@ namespace QV { template using cvector_t = std::vector>; template using cdict_t = std::map>; +enum class Rotation { + x, y, z, + xx, yy, zz, + zx, +}; + //============================================================================ // QubitVector class //============================================================================ @@ -159,6 +165,8 @@ class QubitVector { void set_max_matrix_bits(int_t bits){} + void synchronize(void){} + //----------------------------------------------------------------------- // Check point operations //----------------------------------------------------------------------- @@ -256,6 +264,9 @@ class QubitVector { // If N=3 this implements an optimized Fredkin gate void apply_mcswap(const reg_t &qubits); + //apply rotation around axis + void apply_rotation(const reg_t &qubits, const Rotation r, const double theta); + //swap between chunk void apply_chunk_swap(const reg_t &qubits, QubitVector &chunk, bool write_back = true); void apply_chunk_swap(const reg_t &qubits, uint_t remote_chunk_index); @@ -389,6 +400,11 @@ class QubitVector { // Get the qubit threshold for activating OpenMP. 
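+  // Added note (not in the original patch): on the CPU QubitVector the
+  // cuStateVec_enable() setter below is a no-op, kept only so callers can
+  // toggle cuStateVec unconditionally; the Thrust/GPU class further down
+  // latches the flag and forwards it when chunks are allocated.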
uint_t get_omp_threshold() {return omp_threshold_;} + //cuStateVec + void cuStateVec_enable(bool flg) + { + } + //----------------------------------------------------------------------- // Optimization configuration settings //----------------------------------------------------------------------- @@ -1576,6 +1592,37 @@ void QubitVector::apply_mcu(const reg_t &qubits, } // end switch } +template +void QubitVector::apply_rotation(const reg_t &qubits, const Rotation r, const double theta) +{ + switch(r){ + case Rotation::x: + apply_mcu(qubits, Linalg::VMatrix::rx(theta)); + break; + case Rotation::y: + apply_mcu(qubits, Linalg::VMatrix::ry(theta)); + break; + case Rotation::z: + apply_mcu(qubits, Linalg::VMatrix::rz(theta)); + break; + case Rotation::xx: + apply_matrix(qubits, Linalg::VMatrix::rxx(theta)); + break; + case Rotation::yy: + apply_matrix(qubits, Linalg::VMatrix::ryy(theta)); + break; + case Rotation::zz: + apply_diagonal_matrix(qubits, Linalg::VMatrix::rzz_diag(theta)); + break; + case Rotation::zx: + apply_matrix(qubits, Linalg::VMatrix::rzx(theta)); + break; + default: + throw std::invalid_argument( + "QubitVector::invalid rotation axis."); + } +} + template void QubitVector::apply_chunk_swap(const reg_t &qubits, QubitVector &src, bool write_back) { diff --git a/src/simulators/statevector/qubitvector_thrust.hpp b/src/simulators/statevector/qubitvector_thrust.hpp index fdbbbbe2b5..3c4ca7c334 100644 --- a/src/simulators/statevector/qubitvector_thrust.hpp +++ b/src/simulators/statevector/qubitvector_thrust.hpp @@ -162,6 +162,11 @@ class QubitVectorThrust { void set_max_matrix_bits(int_t bits); + void synchronize(void) + { + chunk_.synchronize(); + } + //----------------------------------------------------------------------- // Check point operations //----------------------------------------------------------------------- @@ -267,6 +272,9 @@ class QubitVectorThrust { // If N=3 this implements an optimized Fredkin gate void apply_mcswap(const reg_t &qubits); + //apply rotation around axis + void apply_rotation(const reg_t &qubits, const Rotation r, const double theta); + //swap between chunk void apply_chunk_swap(const reg_t &qubits, QubitVectorThrust &chunk, bool write_back = true); void apply_chunk_swap(const reg_t &qubits, uint_t remote_chunk_index); @@ -420,6 +428,12 @@ class QubitVectorThrust { // Get the qubit threshold for activating OpenMP. 
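+  // Added note (not in the original patch): here the flag is latched in
+  // cuStateVec_enable() below, copied to cloned shot/chunk vectors in
+  // chunk_setup(), and passed to ChunkManager::Allocate() so the manager can
+  // pick a cuStateVec-backed chunk container when Aer was built with
+  // cuQuantum support. A hypothetical call sequence (names as in this patch,
+  // illustrative only):
+  //
+  //   QubitVectorThrust<double> qv;
+  //   qv.cuStateVec_enable(true);                   // record the preference
+  //   qv.chunk_setup(chunk_bits, num_qubits, 0, 1); // Allocate(...) sees it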
uint_t get_omp_threshold() {return omp_threshold_;} + //cuStateVec + void cuStateVec_enable(bool flg) + { + cuStateVec_enable_ = flg; + } + //----------------------------------------------------------------------- // Optimization configuration settings //----------------------------------------------------------------------- @@ -430,7 +444,6 @@ class QubitVectorThrust { // Get the sample_measure index size int get_sample_measure_index_size() {return sample_measure_index_size_;} - protected: //----------------------------------------------------------------------- @@ -439,11 +452,11 @@ class QubitVectorThrust { size_t num_qubits_; size_t data_size_; - mutable Chunk chunk_; - mutable Chunk buffer_chunk_; - mutable Chunk send_chunk_; - mutable Chunk recv_chunk_; - std::shared_ptr> chunk_manager_ = nullptr; + mutable Chunk::Chunk chunk_; + mutable Chunk::Chunk buffer_chunk_; + mutable Chunk::Chunk send_chunk_; + mutable Chunk::Chunk recv_chunk_; + std::shared_ptr> chunk_manager_ = nullptr; mutable thrust::host_vector> checkpoint_; @@ -451,6 +464,7 @@ class QubitVectorThrust { bool multi_chunk_distribution_; bool multi_shots_; bool enable_batch_; + bool cuStateVec_enable_ = false; bool register_blocking_; @@ -739,166 +753,15 @@ AER::Vector> QubitVectorThrust::move_to_vector() return AER::Vector>::copy_from_buffer(data_size_, &ret[0]); } + //------------------------------------------------------------------------------ // State initialize component //------------------------------------------------------------------------------ -template -class initialize_component_1qubit_func : public GateFuncBase -{ -protected: - thrust::complex s0,s1; - uint_t mask; - uint_t offset; -public: - initialize_component_1qubit_func(int qubit,thrust::complex state0,thrust::complex state1) - { - s0 = state0; - s1 = state1; - - mask = (1ull << qubit) - 1; - offset = 1ull << qubit; - } - - virtual __host__ __device__ void operator()(const uint_t &i) const - { - uint_t i0,i1; - thrust::complex q0; - thrust::complex* vec0; - thrust::complex* vec1; - - vec0 = this->data_; - vec1 = vec0 + offset; - - i1 = i & mask; - i0 = (i - i1) << 1; - i0 += i1; - - q0 = vec0[i0]; - - vec0[i0] = s0*q0; - vec1[i0] = s1*q0; - } - - const char* name(void) - { - return "initialize_component 1 qubit"; - } -}; - -template -class initialize_component_func : public GateFuncBase -{ -protected: - int nqubits; - uint_t matSize; -public: - initialize_component_func(const cvector_t& mat,const reg_t &qb) - { - nqubits = qb.size(); - matSize = 1ull << nqubits; - } - - int qubits_count(void) - { - return nqubits; - } - __host__ __device__ void operator()(const uint_t &i) const - { - thrust::complex* vec; - thrust::complex q0; - thrust::complex q; - thrust::complex* state; - uint_t* qubits; - uint_t* qubits_sorted; - uint_t j,k; - uint_t ii,idx,t; - uint_t mask; - - //get parameters from iterator - vec = this->data_; - state = this->matrix_; - qubits = this->params_; - qubits_sorted = qubits + nqubits; - - idx = 0; - ii = i; - for(j=0;j> j) & 1) != 0) - ii += (1ull << qubits[j]); - } - q = q0 * state[k]; - vec[ii] = q; - } - } - - const char* name(void) - { - return "initialize_component"; - } -}; - -template -class initialize_large_component_func : public GateFuncBase -{ -protected: - int num_qubits_; - uint_t mask_; - uint_t cmask_; - thrust::complex init_; -public: - initialize_large_component_func(thrust::complex m,const reg_t& qubits,int i) - { - num_qubits_ = qubits.size(); - init_ = m; - - mask_ = 0; - cmask_ = 0; - for(int k=0;k> k) & 1) != 0){ 
- cmask_ |= (1ull << qubits[k]); - } - } - } - bool is_diagonal(void) - { - return true; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - thrust::complex* vec; - thrust::complex q; - vec = this->data_; - if((i & mask_) == cmask_){ - q = vec[i]; - vec[i] = init_*q; - } - } - const char* name(void) - { - return "initialize_large_component"; - } -}; - template void QubitVectorThrust::initialize_component(const reg_t &qubits, const cvector_t &state0) { if(qubits.size() == 1){ - apply_function(initialize_component_1qubit_func(qubits[0],state0[0],state0[1]) ); + apply_function(Chunk::initialize_component_1qubit_func(qubits[0],state0[0],state0[1]) ); } else if(qubits.size() <= chunk_.container()->matrix_bits()){ auto qubits_sorted = qubits; @@ -912,14 +775,14 @@ void QubitVectorThrust::initialize_component(const reg_t &qubits, const // chunk_.StoreMatrix(state0); // chunk_.StoreUintParams(qubits_param); - apply_function(initialize_component_func(state0,qubits_sorted), state0, qubits_param ); + apply_function(Chunk::initialize_component_func(state0,qubits_sorted), state0, qubits_param ); } else{ //if initial state is larger that matrix buffer, set one by one. uint_t DIM = 1ull << qubits.size(); uint_t i; for(i=0;i(state0[i],qubits,i) ); + apply_function(Chunk::initialize_large_component_func(state0[i],qubits,i) ); } } } @@ -927,29 +790,6 @@ void QubitVectorThrust::initialize_component(const reg_t &qubits, const //------------------------------------------------------------------------------ // Utility //------------------------------------------------------------------------------ - -template -class ZeroClear : public GateFuncBase -{ -protected: -public: - ZeroClear() {} - bool is_diagonal(void) - { - return true; - } - __host__ __device__ void operator()(const uint_t &i) const - { - thrust::complex* vec; - vec = this->data_; - vec[i] = 0.0; - } - const char* name(void) - { - return "zero"; - } -}; - template void QubitVectorThrust::zero() { @@ -957,14 +797,13 @@ void QubitVectorThrust::zero() DebugMsg("zero"); #endif - apply_function(ZeroClear(), cvector_t(), reg_t()); + apply_function(Chunk::ZeroClear(), cvector_t(), reg_t()); #ifdef AER_DEBUG DebugMsg("zero done"); #endif } - template bool QubitVectorThrust::chunk_setup(int chunk_bits,int num_qubits,uint_t chunk_index,uint_t num_local_chunks) { @@ -987,8 +826,8 @@ bool QubitVectorThrust::chunk_setup(int chunk_bits,int num_qubits,uint_t //only first chunk call allocation function if(chunk_bits > 0 && num_qubits > 0){ - chunk_manager_ = std::make_shared>(); - chunk_manager_->Allocate(chunk_bits,num_qubits,num_local_chunks,max_matrix_bits_); + chunk_manager_ = std::make_shared>(); + chunk_manager_->Allocate(chunk_bits,num_qubits,num_local_chunks,chunk_index_,max_matrix_bits_, cuStateVec_enable_); } multi_chunk_distribution_ = false; @@ -1020,6 +859,7 @@ bool QubitVectorThrust::chunk_setup(QubitVectorThrust& base,cons base.multi_shots_ = true; } } + cuStateVec_enable_ = base.cuStateVec_enable_; //set global chunk ID / shot ID chunk_index_ = chunk_index; @@ -1289,47 +1129,6 @@ bool QubitVectorThrust::enable_batch(bool flg) //------------------------------------------------------------------------------ // Initialization //------------------------------------------------------------------------------ - -template -class initialize_kernel : public GateFuncBase -{ -protected: - int num_qubits_state_; - uint_t offset_; - thrust::complex init_val_; -public: - initialize_kernel(thrust::complex v,int nqs,uint_t offset) - { - 
num_qubits_state_ = nqs; - offset_ = offset; - init_val_ = v; - } - - bool is_diagonal(void) - { - return true; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - thrust::complex* vec; - uint_t iChunk = (i >> num_qubits_state_); - - vec = this->data_; - - if(i == iChunk * offset_){ - vec[i] = init_val_; - } - else{ - vec[i] = 0.0; - } - } - const char* name(void) - { - return "initialize"; - } -}; - template void QubitVectorThrust::initialize() { @@ -1338,7 +1137,7 @@ void QubitVectorThrust::initialize() if(multi_chunk_distribution_){ if(chunk_index_ == 0){ - apply_function(initialize_kernel(t,chunk_manager_->chunk_bits(),(1ull << chunk_manager_->num_qubits()))); + apply_function(Chunk::initialize_kernel(t,chunk_manager_->chunk_bits(),(1ull << chunk_manager_->num_qubits()))); } else{ zero(); @@ -1346,7 +1145,7 @@ void QubitVectorThrust::initialize() chunk_.synchronize(); } else{ - apply_function(initialize_kernel(t,chunk_manager_->chunk_bits(),(1ull << chunk_manager_->chunk_bits()))); + apply_function(Chunk::initialize_kernel(t,chunk_manager_->chunk_bits(),(1ull << chunk_manager_->chunk_bits()))); } #ifdef AER_DEBUG @@ -1600,1035 +1399,81 @@ void QubitVectorThrust::set_json_chop_threshold(double threshold) { * MATRIX MULTIPLICATION * ******************************************************************************/ -template -class MatrixMult2x2 : public GateFuncBase -{ -protected: - thrust::complex m0,m1,m2,m3; - int qubit; - uint_t mask; - uint_t offset0; - -public: - MatrixMult2x2(const cvector_t& mat,int q) - { - qubit = q; - m0 = mat[0]; - m1 = mat[1]; - m2 = mat[2]; - m3 = mat[3]; - - mask = (1ull << qubit) - 1; - - offset0 = 1ull << qubit; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - uint_t i0,i1; - thrust::complex q0,q1; - thrust::complex* vec0; - thrust::complex* vec1; - - vec0 = this->data_; - vec1 = vec0 + offset0; - - i1 = i & mask; - i0 = (i - i1) << 1; - i0 += i1; - q0 = vec0[i0]; - q1 = vec1[i0]; - vec0[i0] = m0 * q0 + m2 * q1; - vec1[i0] = m1 * q0 + m3 * q1; - } - const char* name(void) - { - return "mult2x2"; - } -}; +template +void QubitVectorThrust::apply_matrix(const reg_t &qubits, + const cvector_t &mat) +{ + if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) + return; //first chunk execute all in batch + if(qubits.size() == 1 && register_blocking_) + chunk_.queue_blocked_gate('u',qubits[0],0,&mat[0]); + else + chunk_.apply_matrix(qubits,0,mat,chunk_.container()->num_chunks()); +} template -class MatrixMult4x4 : public GateFuncBase +void QubitVectorThrust::apply_multiplexer(const reg_t &control_qubits, + const reg_t &target_qubits, + const cvector_t &mat) { -protected: - thrust::complex m00,m10,m20,m30; - thrust::complex m01,m11,m21,m31; - thrust::complex m02,m12,m22,m32; - thrust::complex m03,m13,m23,m33; - uint_t mask0; - uint_t mask1; - uint_t offset0; - uint_t offset1; + const size_t control_count = control_qubits.size(); + const size_t target_count = target_qubits.size(); + const uint_t DIM = 1ull << (target_count+control_count); + const uint_t columns = 1ull << target_count; + const uint_t blocks = 1ull << control_count; -public: - MatrixMult4x4(const cvector_t& mat,int qubit0,int qubit1) - { - m00 = mat[0]; - m01 = mat[1]; - m02 = mat[2]; - m03 = mat[3]; - - m10 = mat[4]; - m11 = mat[5]; - m12 = mat[6]; - m13 = mat[7]; - - m20 = mat[8]; - m21 = mat[9]; - m22 = mat[10]; - m23 = mat[11]; - - m30 = mat[12]; - m31 = mat[13]; - m32 = mat[14]; - m33 = mat[15]; - - offset0 = 
1ull << qubit0; - offset1 = 1ull << qubit1; - if(qubit0 < qubit1){ - mask0 = offset0 - 1; - mask1 = offset1 - 1; - } - else{ - mask0 = offset1 - 1; - mask1 = offset0 - 1; - } - } + auto qubits = target_qubits; + for (const auto &q : control_qubits) {qubits.push_back(q);} + size_t N = qubits.size(); - int qubits_count(void) - { - return 2; + cvector_t matMP(DIM*DIM,0.0); + uint_t b,i,j; + + //make DIMxDIM matrix + for(b = 0; b < blocks; b++){ + for(i = 0; i < columns; i++){ + for(j = 0; j < columns; j++){ + matMP[(i+b*columns) + DIM*(b*columns+j)] += mat[i+b*columns + DIM * j]; + } + } } - __host__ __device__ void operator()(const uint_t &i) const - { - uint_t i0,i1,i2; - thrust::complex* vec0; - thrust::complex* vec1; - thrust::complex* vec2; - thrust::complex* vec3; - thrust::complex q0,q1,q2,q3; - vec0 = this->data_; +#ifdef AER_DEBUG + DebugMsg("apply_multiplexer",control_qubits); + DebugMsg(" ",target_qubits); +#endif - i0 = i & mask0; - i2 = (i - i0) << 1; - i1 = i2 & mask1; - i2 = (i2 - i1) << 1; + apply_matrix(qubits,matMP); +} - i0 = i0 + i1 + i2; - vec1 = vec0 + offset0; - vec2 = vec0 + offset1; - vec3 = vec2 + offset0; +template +void QubitVectorThrust::apply_diagonal_matrix(const reg_t &qubits, + const cvector_t &diag) +{ + if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) + return; //first chunk execute all in batch - q0 = vec0[i0]; - q1 = vec1[i0]; - q2 = vec2[i0]; - q3 = vec3[i0]; + const int_t N = qubits.size(); + if(N == 1 && register_blocking_) + chunk_.queue_blocked_gate('d',qubits[0],0,&diag[0]); + else + chunk_.apply_diagonal_matrix(qubits,0,diag,chunk_.container()->num_chunks()); +} - vec0[i0] = m00 * q0 + m10 * q1 + m20 * q2 + m30 * q3; - vec1[i0] = m01 * q0 + m11 * q1 + m21 * q2 + m31 * q3; - vec2[i0] = m02 * q0 + m12 * q1 + m22 * q2 + m32 * q3; - vec3[i0] = m03 * q0 + m13 * q1 + m23 * q2 + m33 * q3; - } - const char* name(void) - { - return "mult4x4"; - } -}; template -class MatrixMult8x8 : public GateFuncBase +void QubitVectorThrust::apply_permutation_matrix(const reg_t& qubits, + const std::vector> &pairs) { -protected: - uint_t offset0; - uint_t offset1; - uint_t offset2; - uint_t mask0; - uint_t mask1; - uint_t mask2; + if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) + return; //first chunk execute all in batch -public: - MatrixMult8x8(const reg_t &qubit,const reg_t &qubit_ordered) - { - offset0 = (1ull << qubit[0]); - offset1 = (1ull << qubit[1]); - offset2 = (1ull << qubit[2]); - - mask0 = (1ull << qubit_ordered[0]) - 1; - mask1 = (1ull << qubit_ordered[1]) - 1; - mask2 = (1ull << qubit_ordered[2]) - 1; - } - - int qubits_count(void) - { - return 3; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - uint_t i0,i1,i2,i3; - thrust::complex* vec; - thrust::complex q0,q1,q2,q3,q4,q5,q6,q7; - thrust::complex m0,m1,m2,m3,m4,m5,m6,m7; - thrust::complex* pMat; - - vec = this->data_; - pMat = this->matrix_; - - i0 = i & mask0; - i3 = (i - i0) << 1; - i1 = i3 & mask1; - i3 = (i3 - i1) << 1; - i2 = i3 & mask2; - i3 = (i3 - i2) << 1; - - i0 = i0 + i1 + i2 + i3; - - q0 = vec[i0]; - q1 = vec[i0 + offset0]; - q2 = vec[i0 + offset1]; - q3 = vec[i0 + offset1 + offset0]; - q4 = vec[i0 + offset2]; - q5 = vec[i0 + offset2 + offset0]; - q6 = vec[i0 + offset2 + offset1]; - q7 = vec[i0 + offset2 + offset1 + offset0]; - - m0 = pMat[0]; - m1 = pMat[8]; - m2 = pMat[16]; - m3 = pMat[24]; - m4 = pMat[32]; - m5 = pMat[40]; - m6 = pMat[48]; - m7 = pMat[56]; - - vec[i0] = m0 * 
q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; - - m0 = pMat[1]; - m1 = pMat[9]; - m2 = pMat[17]; - m3 = pMat[25]; - m4 = pMat[33]; - m5 = pMat[41]; - m6 = pMat[49]; - m7 = pMat[57]; - - vec[i0 + offset0] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; - - m0 = pMat[2]; - m1 = pMat[10]; - m2 = pMat[18]; - m3 = pMat[26]; - m4 = pMat[34]; - m5 = pMat[42]; - m6 = pMat[50]; - m7 = pMat[58]; - - vec[i0 + offset1] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; - - m0 = pMat[3]; - m1 = pMat[11]; - m2 = pMat[19]; - m3 = pMat[27]; - m4 = pMat[35]; - m5 = pMat[43]; - m6 = pMat[51]; - m7 = pMat[59]; - - vec[i0 + offset1 + offset0] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; - - m0 = pMat[4]; - m1 = pMat[12]; - m2 = pMat[20]; - m3 = pMat[28]; - m4 = pMat[36]; - m5 = pMat[44]; - m6 = pMat[52]; - m7 = pMat[60]; - - vec[i0 + offset2] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; - - m0 = pMat[5]; - m1 = pMat[13]; - m2 = pMat[21]; - m3 = pMat[29]; - m4 = pMat[37]; - m5 = pMat[45]; - m6 = pMat[53]; - m7 = pMat[61]; - - vec[i0 + offset2 + offset0] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; - - m0 = pMat[6]; - m1 = pMat[14]; - m2 = pMat[22]; - m3 = pMat[30]; - m4 = pMat[38]; - m5 = pMat[46]; - m6 = pMat[54]; - m7 = pMat[62]; - - vec[i0 + offset2 + offset1] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; - - m0 = pMat[7]; - m1 = pMat[15]; - m2 = pMat[23]; - m3 = pMat[31]; - m4 = pMat[39]; - m5 = pMat[47]; - m6 = pMat[55]; - m7 = pMat[63]; - - vec[i0 + offset2 + offset1 + offset0] = m0 * q0 + m1 * q1 + m2 * q2 + m3 * q3 + m4 * q4 + m5 * q5 + m6 * q6 + m7 * q7; - } - const char* name(void) - { - return "mult8x8"; - } -}; - -template -class MatrixMult16x16 : public GateFuncBase -{ -protected: - uint_t offset0; - uint_t offset1; - uint_t offset2; - uint_t offset3; - uint_t mask0; - uint_t mask1; - uint_t mask2; - uint_t mask3; -public: - MatrixMult16x16(const reg_t &qubit,const reg_t &qubit_ordered) - { - offset0 = (1ull << qubit[0]); - offset1 = (1ull << qubit[1]); - offset2 = (1ull << qubit[2]); - offset3 = (1ull << qubit[3]); - - mask0 = (1ull << qubit_ordered[0]) - 1; - mask1 = (1ull << qubit_ordered[1]) - 1; - mask2 = (1ull << qubit_ordered[2]) - 1; - mask3 = (1ull << qubit_ordered[3]) - 1; - } - - int qubits_count(void) - { - return 4; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - uint_t i0,i1,i2,i3,i4,offset,f0,f1,f2; - thrust::complex* vec; - thrust::complex q0,q1,q2,q3,q4,q5,q6,q7; - thrust::complex q8,q9,q10,q11,q12,q13,q14,q15; - thrust::complex r; - thrust::complex* pMat; - int j; - - vec = this->data_; - pMat = this->matrix_; - - i0 = i & mask0; - i4 = (i - i0) << 1; - i1 = i4 & mask1; - i4 = (i4 - i1) << 1; - i2 = i4 & mask2; - i4 = (i4 - i2) << 1; - i3 = i4 & mask3; - i4 = (i4 - i3) << 1; - - i0 = i0 + i1 + i2 + i3 + i4; - - q0 = vec[i0]; - q1 = vec[i0 + offset0]; - q2 = vec[i0 + offset1]; - q3 = vec[i0 + offset1 + offset0]; - q4 = vec[i0 + offset2]; - q5 = vec[i0 + offset2 + offset0]; - q6 = vec[i0 + offset2 + offset1]; - q7 = vec[i0 + offset2 + offset1 + offset0]; - q8 = vec[i0 + offset3]; - q9 = vec[i0 + offset3 + offset0]; - q10 = vec[i0 + offset3 + offset1]; - q11 = vec[i0 + offset3 + offset1 + offset0]; - q12 = vec[i0 + offset3 + offset2]; - q13 = vec[i0 + offset3 + offset2 + offset0]; - q14 = vec[i0 + offset3 + offset2 + offset1]; - 
q15 = vec[i0 + offset3 + offset2 + offset1 + offset0]; - - offset = 0; - f0 = 0; - f1 = 0; - f2 = 0; - for(j=0;j<16;j++){ - r = pMat[0+j]*q0; - r += pMat[16+j]*q1; - r += pMat[32+j]*q2; - r += pMat[48+j]*q3; - r += pMat[64+j]*q4; - r += pMat[80+j]*q5; - r += pMat[96+j]*q6; - r += pMat[112+j]*q7; - r += pMat[128+j]*q8; - r += pMat[144+j]*q9; - r += pMat[160+j]*q10; - r += pMat[176+j]*q11; - r += pMat[192+j]*q12; - r += pMat[208+j]*q13; - r += pMat[224+j]*q14; - r += pMat[240+j]*q15; - - offset = offset3 * (((uint_t)j >> 3) & 1) + - offset2 * (((uint_t)j >> 2) & 1) + - offset1 * (((uint_t)j >> 1) & 1) + - offset0 * ((uint_t)j & 1); - - vec[i0 + offset] = r; - } - } - const char* name(void) - { - return "mult16x16"; - } -}; - -template -class MatrixMultNxN : public GateFuncWithCache -{ -protected: -public: - MatrixMultNxN(uint_t nq) : GateFuncWithCache(nq) - { - ; - } - - __host__ __device__ void run_with_cache(uint_t _tid,uint_t _idx,thrust::complex* _cache) const - { - uint_t j,threadID; - thrust::complex q,r; - thrust::complex m; - uint_t mat_size,irow; - thrust::complex* vec; - thrust::complex* pMat; - - vec = this->data_; - pMat = this->matrix_; - - mat_size = 1ull << this->nqubits_; - irow = _tid & (mat_size - 1); - - r = 0.0; - for(j=0;j -class MatrixMultNxN_LU : public GateFuncBase -{ -protected: - int nqubits; - uint_t matSize; - int nswap; -public: - MatrixMultNxN_LU(const cvector_t& mat,const reg_t &qb,cvector_t& matLU,reg_t& params) - { - uint_t i,j,k,imax; - std::complex c0,c1; - double d,dmax; - uint_t* pSwap; - - nqubits = qb.size(); - matSize = 1ull << nqubits; - - matLU = mat; - params.resize(nqubits + matSize*2); - - for(k=0;k dmax){ - dmax = d; - imax = j; - } - } - if(imax != i){ - j = params[nqubits + imax]; - params[nqubits + imax] = params[nqubits + i]; - params[nqubits + i] = j; - } - - if(dmax != 0){ - c0 = matLU[(i << nqubits) + params[nqubits + i]]; - - for(j=i+1;j q,qt; - thrust::complex m; - thrust::complex r; - uint_t j,k,l,iq; - uint_t ii,idx,t; - uint_t mask,offset_j,offset_k; - thrust::complex* vec; - thrust::complex* pMat; - uint_t* qubits; - uint_t* pivot; - uint_t* table; - - vec = this->data_; - pMat = this->matrix_; - qubits = this->params_; - - pivot = qubits + nqubits; - table = pivot + matSize; - - idx = 0; - ii = i; - for(j=0;j> iq) & 1) != 0) - offset_k += (1ull << qubits[iq]); - } - q = vec[offset_k+idx]; - - r += m*q; - } - offset_j = 0; - for(iq=0;iq> iq) & 1) != 0) - offset_j += (1ull << qubits[iq]); - } - vec[offset_j+idx] = r; - } - - //mult L - for(j=matSize-1;j>0;j--){ - offset_j = 0; - for(iq=0;iq> iq) & 1) != 0) - offset_j += (1ull << qubits[iq]); - } - r = vec[offset_j+idx]; - - for(k=0;k> iq) & 1) != 0) - offset_k += (1ull << qubits[iq]); - } - q = vec[offset_k+idx]; - - r += m*q; - } - offset_j = 0; - for(iq=0;iq> iq) & 1) != 0) - offset_j += (1ull << qubits[iq]); - } - vec[offset_j+idx] = r; - } - - //swap results - if(nswap > 0){ - offset_j = 0; - for(iq=0;iq> iq) & 1) != 0) - offset_j += (1ull << qubits[iq]); - } - q = vec[offset_j+idx]; - k = pivot[table[0]]; - for(j=1;j> iq) & 1) != 0) - offset_j += (1ull << qubits[iq]); - } - qt = vec[offset_j+idx]; - - offset_k = 0; - for(iq=0;iq> iq) & 1) != 0) - offset_k += (1ull << qubits[iq]); - } - vec[offset_k+idx] = q; - q = qt; - k = pivot[table[j]]; - } - offset_k = 0; - for(iq=0;iq> iq) & 1) != 0) - offset_k += (1ull << qubits[iq]); - } - vec[offset_k+idx] = q; - } - } - const char* name(void) - { - return "multNxN"; - } -}; - - - -template -void QubitVectorThrust::apply_matrix(const 
reg_t &qubits, - const cvector_t &mat) -{ - if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) - return; //first chunk execute all in batch - - const size_t N = qubits.size(); - auto qubits_sorted = qubits; - std::sort(qubits_sorted.begin(), qubits_sorted.end()); - - if(N == 1){ - if(register_blocking_){ - chunk_.queue_blocked_gate('u',qubits[0],0,&mat[0]); - } - else{ - apply_function(MatrixMult2x2(mat,qubits[0])); - } - } - else if(N == 2){ - apply_function(MatrixMult4x4(mat,qubits[0],qubits[1])); - } - else if(N <= 10){ - int i; - for(i=0;i(N), mat, qubits_sorted); - } - else{ - cvector_t matLU; - reg_t params; - MatrixMultNxN_LU f(mat,qubits_sorted,matLU,params); - -// chunk_.StoreMatrix(matLU); -// chunk_.StoreUintParams(params); - - apply_function(f, matLU, params); - } - -} - -template -void QubitVectorThrust::apply_multiplexer(const reg_t &control_qubits, - const reg_t &target_qubits, - const cvector_t &mat) -{ - const size_t control_count = control_qubits.size(); - const size_t target_count = target_qubits.size(); - const uint_t DIM = 1ull << (target_count+control_count); - const uint_t columns = 1ull << target_count; - const uint_t blocks = 1ull << control_count; - - auto qubits = target_qubits; - for (const auto &q : control_qubits) {qubits.push_back(q);} - size_t N = qubits.size(); - - cvector_t matMP(DIM*DIM,0.0); - uint_t b,i,j; - - //make DIMxDIM matrix - for(b = 0; b < blocks; b++){ - for(i = 0; i < columns; i++){ - for(j = 0; j < columns; j++){ - matMP[(i+b*columns) + DIM*(b*columns+j)] += mat[i+b*columns + DIM * j]; - } - } - } - -#ifdef AER_DEBUG - DebugMsg("apply_multiplexer",control_qubits); - DebugMsg(" ",target_qubits); -#endif - - apply_matrix(qubits,matMP); -} - -template -class DiagonalMult2x2 : public GateFuncBase -{ -protected: - thrust::complex m0,m1; - int qubit; -public: - - DiagonalMult2x2(const cvector_t& mat,int q) - { - qubit = q; - m0 = mat[0]; - m1 = mat[1]; - } - - bool is_diagonal(void) - { - return true; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - thrust::complex q; - thrust::complex* vec; - thrust::complex m; - uint_t gid; - - vec = this->data_; - gid = this->base_index_; - - q = vec[i]; - if((((i + gid) >> qubit) & 1) == 0){ - m = m0; - } - else{ - m = m1; - } - - vec[i] = m * q; - } - const char* name(void) - { - return "diagonal_mult2x2"; - } -}; - -template -class DiagonalMult4x4 : public GateFuncBase -{ -protected: - thrust::complex m0,m1,m2,m3; - int qubit0; - int qubit1; -public: - - DiagonalMult4x4(const cvector_t& mat,int q0,int q1) - { - qubit0 = q0; - qubit1 = q1; - m0 = mat[0]; - m1 = mat[1]; - m2 = mat[2]; - m3 = mat[3]; - } - - bool is_diagonal(void) - { - return true; - } - int qubits_count(void) - { - return 2; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - thrust::complex q; - thrust::complex* vec; - thrust::complex m; - uint_t gid; - - vec = this->data_; - gid = this->base_index_; - - q = vec[i]; - if((((i+gid) >> qubit1) & 1) == 0){ - if((((i+gid) >> qubit0) & 1) == 0){ - m = m0; - } - else{ - m = m1; - } - } - else{ - if((((i+gid) >> qubit0) & 1) == 0){ - m = m2; - } - else{ - m = m3; - } - } - - vec[i] = m * q; - } - const char* name(void) - { - return "diagonal_mult4x4"; - } -}; - -template -class DiagonalMultNxN : public GateFuncBase -{ -protected: - int nqubits; -public: - DiagonalMultNxN(const reg_t &qb) - { - nqubits = qb.size(); - } - - bool is_diagonal(void) - { - return true; - } - int qubits_count(void) - { - return 
nqubits; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - uint_t j,im; - thrust::complex* vec; - thrust::complex q; - thrust::complex m; - thrust::complex* pMat; - uint_t* qubits; - uint_t gid; - - vec = this->data_; - gid = this->base_index_; - - pMat = this->matrix_; - qubits = this->params_; - - im = 0; - for(j=0;j> qubits[j]) & 1) != 0){ - im += (1 << j); - } - } - - q = vec[i]; - m = pMat[im]; - - vec[i] = m * q; - } - const char* name(void) - { - return "diagonal_multNxN"; - } -}; - -template -void QubitVectorThrust::apply_diagonal_matrix(const reg_t &qubits, - const cvector_t &diag) -{ - if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) - return; //first chunk execute all in batch - - const int_t N = qubits.size(); - - if(N == 1){ - if(register_blocking_){ - chunk_.queue_blocked_gate('d',qubits[0],0,&diag[0]); - } - else{ - apply_function(DiagonalMult2x2(diag,qubits[0])); - } - } - else if(N == 2){ - apply_function(DiagonalMult4x4(diag,qubits[0],qubits[1])); - } - else{ -// chunk_.StoreMatrix(diag); -// chunk_.StoreUintParams(qubits); - - apply_function(DiagonalMultNxN(qubits), diag, qubits); - } -} - - -template -class Permutation : public GateFuncBase -{ -protected: - uint_t nqubits; - uint_t npairs; - -public: - Permutation(const reg_t& qubits_sorted,const reg_t& qubits,const std::vector> &pairs,reg_t& params) - { - uint_t j,k; - uint_t offset0,offset1; - - nqubits = qubits.size(); - npairs = pairs.size(); - - params.resize(nqubits + npairs*2); - - for(j=0;j> k) & 1) != 0){ - offset0 += (1ull << qubits[k]); - } - if(((pairs[j].second >> k) & 1) != 0){ - offset1 += (1ull << qubits[k]); - } - } - params[nqubits + j*2 ] = offset0; - params[nqubits + j*2+1] = offset1; - } - } - int qubits_count(void) - { - return nqubits; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - thrust::complex* vec; - thrust::complex q0; - thrust::complex q1; - uint_t j; - uint_t ii,idx,t; - uint_t* mask; - uint_t* pairs; - - vec = this->data_; - mask = this->params_; - pairs = mask + nqubits; - - idx = 0; - ii = i; - for(j=0;j -void QubitVectorThrust::apply_permutation_matrix(const reg_t& qubits, - const std::vector> &pairs) -{ - const size_t N = qubits.size(); - auto qubits_sorted = qubits; - std::sort(qubits_sorted.begin(), qubits_sorted.end()); - - reg_t params; - Permutation f(qubits_sorted,qubits,pairs,params); -// chunk_.StoreUintParams(params); - - apply_function(f, cvector_t(), params); -} + chunk_.apply_permutation(qubits,pairs,chunk_.container()->num_chunks()); +} /******************************************************************************* @@ -2640,70 +1485,6 @@ void QubitVectorThrust::apply_permutation_matrix(const reg_t& qubits, //------------------------------------------------------------------------------ // Multi-controlled gates //------------------------------------------------------------------------------ - -template -class CX_func : public GateFuncBase -{ -protected: - uint_t offset; - uint_t mask; - uint_t cmask; - int nqubits; - int qubit_t; -public: - - CX_func(const reg_t &qubits) - { - int i; - nqubits = qubits.size(); - - qubit_t = qubits[nqubits-1]; - offset = 1ull << qubit_t; - mask = offset - 1; - - cmask = 0; - for(i=0;i q0,q1; - thrust::complex* vec0; - thrust::complex* vec1; - - vec0 = this->data_; - vec1 = vec0 + offset; - - i1 = i & mask; - i0 = (i - i1) << 1; - i0 += i1; - - if((i0 & cmask) == cmask){ - q0 = vec0[i0]; - q1 = vec1[i0]; - - vec0[i0] = q1; - vec1[i0] = q0; - } - 
} - const char* name(void) - { - return "CX"; - } -}; - template void QubitVectorThrust::apply_mcx(const reg_t &qubits) { @@ -2719,74 +1500,11 @@ void QubitVectorThrust::apply_mcx(const reg_t &qubits) chunk_.queue_blocked_gate('x',qubits[qubits.size()-1],mask); } else{ - apply_function(CX_func(qubits)); + chunk_.apply_X(qubits, chunk_.container()->num_chunks()); } } -template -class CY_func : public GateFuncBase -{ -protected: - uint_t mask; - uint_t cmask; - uint_t offset; - int nqubits; - int qubit_t; -public: - CY_func(const reg_t &qubits) - { - int i; - nqubits = qubits.size(); - - qubit_t = qubits[nqubits-1]; - offset = (1ull << qubit_t); - mask = (1ull << qubit_t) - 1; - - cmask = 0; - for(i=0;i q0,q1; - thrust::complex* vec0; - thrust::complex* vec1; - - vec0 = this->data_; - - vec1 = vec0 + offset; - - i1 = i & mask; - i0 = (i - i1) << 1; - i0 += i1; - - if((i0 & cmask) == cmask){ - q0 = vec0[i0]; - q1 = vec1[i0]; - - vec0[i0] = thrust::complex(q1.imag(),-q1.real()); - vec1[i0] = thrust::complex(-q0.imag(),q0.real()); - } - } - const char* name(void) - { - return "CY"; - } -}; - template void QubitVectorThrust::apply_mcy(const reg_t &qubits) { @@ -2802,163 +1520,18 @@ void QubitVectorThrust::apply_mcy(const reg_t &qubits) chunk_.queue_blocked_gate('y',qubits[qubits.size()-1],mask); } else{ - apply_function(CY_func(qubits)); + chunk_.apply_Y(qubits, chunk_.container()->num_chunks()); } } -template -class CSwap_func : public GateFuncBase -{ -protected: - uint_t mask0; - uint_t mask1; - uint_t cmask; - int nqubits; - int qubit_t0; - int qubit_t1; - uint_t offset1; - uint_t offset2; -public: - - CSwap_func(const reg_t &qubits) - { - int i; - nqubits = qubits.size(); - - if(qubits[nqubits-2] < qubits[nqubits-1]){ - qubit_t0 = qubits[nqubits-2]; - qubit_t1 = qubits[nqubits-1]; - } - else{ - qubit_t1 = qubits[nqubits-2]; - qubit_t0 = qubits[nqubits-1]; - } - mask0 = (1ull << qubit_t0) - 1; - mask1 = (1ull << qubit_t1) - 1; - - offset1 = 1ull << qubit_t0; - offset2 = 1ull << qubit_t1; - - cmask = 0; - for(i=0;i q1,q2; - thrust::complex* vec1; - thrust::complex* vec2; - - vec1 = this->data_; - - vec2 = vec1 + offset2; - vec1 = vec1 + offset1; - - i0 = i & mask0; - i2 = (i - i0) << 1; - i1 = i2 & mask1; - i2 = (i2 - i1) << 1; - - i0 = i0 + i1 + i2; - - if((i0 & cmask) == cmask){ - q1 = vec1[i0]; - q2 = vec2[i0]; - vec1[i0] = q2; - vec2[i0] = q1; - } - } - const char* name(void) - { - return "CSWAP"; - } -}; - template void QubitVectorThrust::apply_mcswap(const reg_t &qubits) { - apply_function(CSwap_func(qubits)); -} - - -//swap operator between chunks -template -class CSwapChunk_func : public GateFuncBase -{ -protected: - uint_t mask; - thrust::complex* vec0; - thrust::complex* vec1; - bool write_back_; - bool swap_all_; -public: - - CSwapChunk_func(const reg_t &qubits,uint_t block_bits,thrust::complex* pVec0,thrust::complex* pVec1,bool wb) - { - int i; - int nqubits; - int qubit_t; - nqubits = qubits.size(); - - if(qubits[nqubits-2] < qubits[nqubits-1]){ - qubit_t = qubits[nqubits-2]; - } - else{ - qubit_t = qubits[nqubits-1]; - } - mask = (1ull << qubit_t) - 1; - - vec0 = pVec0; - vec1 = pVec1; - - write_back_ = wb; - if(qubit_t >= block_bits) - swap_all_ = true; - else - swap_all_ = false; - } - - bool batch_enable(void) - { - return false; - } - bool is_diagonal(void) - { - return swap_all_; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - uint_t i0,i1; - thrust::complex q0,q1; - - i0 = i & mask; - i1 = (i - i0) << 1; - i0 += i1; + if(((multi_chunk_distribution_ 
&& chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) + return; //first chunk execute all in batch - q0 = vec0[i0]; - q1 = vec1[i0]; - vec0[i0] = q1; - if(write_back_) - vec1[i0] = q0; - } - const char* name(void) - { - return "Chunk SWAP"; - } -}; + chunk_.apply_swap(qubits,qubits.size()-2,chunk_.container()->num_chunks()); +} template void QubitVectorThrust::apply_chunk_swap(const reg_t &qubits, QubitVectorThrust &src, bool write_back) @@ -2976,7 +1549,7 @@ void QubitVectorThrust::apply_chunk_swap(const reg_t &qubits, QubitVecto thrust::complex* pChunk0; thrust::complex* pChunk1; - Chunk bufferChunk; + Chunk::Chunk bufferChunk; bool exec_on_src = false; if(chunk_.device() >= 0){ @@ -3020,13 +1593,13 @@ void QubitVectorThrust::apply_chunk_swap(const reg_t &qubits, QubitVecto } if(exec_on_src){ - src.apply_function(CSwapChunk_func(qubits,num_qubits_,pChunk0,pChunk1,true)); + src.apply_function(Chunk::CSwapChunk_func(qubits,num_qubits_,pChunk0,pChunk1,true)); src.chunk_.synchronize(); //should be synchronized here if(bufferChunk.is_mapped()) bufferChunk.CopyOut(chunk_); } else{ - apply_function(CSwapChunk_func(qubits,num_qubits_,pChunk0,pChunk1,true)); + apply_function(Chunk::CSwapChunk_func(qubits,num_qubits_,pChunk0,pChunk1,true)); chunk_.synchronize(); //should be synchronized here if(bufferChunk.is_mapped()) bufferChunk.CopyOut(src.chunk_); @@ -3058,7 +1631,7 @@ void QubitVectorThrust::apply_chunk_swap(const reg_t &qubits, uint_t rem else{ thrust::complex* pLocal; thrust::complex* pRemote; - Chunk buffer; + Chunk::Chunk buffer; #ifdef AER_DISABLE_GDR if(chunk_.device() >= 0){ //if there is no GPUDirectRDMA support, copy chunk from CPU @@ -3083,64 +1656,21 @@ void QubitVectorThrust::apply_chunk_swap(const reg_t &qubits, uint_t rem DebugMsg("chunk swap (process)",qubits); #endif - chunk_.Execute(CSwapChunk_func(qubits,num_qubits_,pLocal,pRemote,false),1); + chunk_.Execute(Chunk::CSwapChunk_func(qubits,num_qubits_,pLocal,pRemote,false),1); chunk_.synchronize(); //should be synchronized here - if(buffer.is_mapped()){ - chunk_manager_->UnmapBufferChunk(buffer); - } - } - - release_recv_buffer(); - -#ifdef AER_DISABLE_GDR - release_send_buffer(); -#endif -} - -template -class phase_func : public GateFuncBase -{ -protected: - thrust::complex phase; - uint_t mask; - int nqubits; -public: - phase_func(const reg_t &qubits,thrust::complex p) - { - int i; - nqubits = qubits.size(); - phase = p; - - mask = 0; - for(i=0;i* vec; - thrust::complex q0; - - vec = this->data_; - gid = this->base_index_; - - if(((i+gid) & mask) == mask){ - q0 = vec[i]; - vec[i] = q0 * phase; + if(buffer.is_mapped()){ + chunk_manager_->UnmapBufferChunk(buffer); } } - const char* name(void) - { - return "phase"; - } -}; + + release_recv_buffer(); + +#ifdef AER_DISABLE_GDR + release_send_buffer(); +#endif +} + template void QubitVectorThrust::apply_mcphase(const reg_t &qubits, const std::complex phase) @@ -3157,140 +1687,10 @@ void QubitVectorThrust::apply_mcphase(const reg_t &qubits, const std::co chunk_.queue_blocked_gate('p',qubits[qubits.size()-1],mask,&phase); } else{ - apply_function(phase_func(qubits,*(thrust::complex*)&phase) ); + chunk_.apply_phase(qubits,qubits.size()-1,phase,chunk_.container()->num_chunks()); } } -template -class DiagonalMult2x2Controlled : public GateFuncBase -{ -protected: - thrust::complex m0,m1; - uint_t mask; - uint_t cmask; - int nqubits; -public: - DiagonalMult2x2Controlled(const cvector_t& mat,const reg_t &qubits) - { - int i; - nqubits = qubits.size(); - - m0 = mat[0]; - m1 = 
mat[1]; - - mask = (1ull << qubits[nqubits-1]) - 1; - cmask = 0; - for(i=0;i* vec; - thrust::complex q0; - thrust::complex m; - - vec = this->data_; - gid = this->base_index_; - - if(((i + gid) & cmask) == cmask){ - if((i + gid) & mask){ - m = m1; - } - else{ - m = m0; - } - - q0 = vec[i]; - vec[i] = m*q0; - } - } - const char* name(void) - { - return "diagonal_Cmult2x2"; - } -}; - -template -class MatrixMult2x2Controlled : public GateFuncBase -{ -protected: - thrust::complex m0,m1,m2,m3; - uint_t mask; - uint_t cmask; - uint_t offset; - int nqubits; -public: - MatrixMult2x2Controlled(const cvector_t& mat,const reg_t &qubits) - { - int i; - m0 = mat[0]; - m1 = mat[1]; - m2 = mat[2]; - m3 = mat[3]; - nqubits = qubits.size(); - - offset = 1ull << qubits[nqubits-1]; - mask = (1ull << qubits[nqubits-1]) - 1; - cmask = 0; - for(i=0;i q0,q1; - thrust::complex* vec0; - thrust::complex* vec1; - - vec0 = this->data_; - - vec1 = vec0 + offset; - - i1 = i & mask; - i0 = (i - i1) << 1; - i0 += i1; - - if((i0 & cmask) == cmask){ - q0 = vec0[i0]; - q1 = vec1[i0]; - - vec0[i0] = m0 * q0 + m2 * q1; - vec1[i0] = m1 * q0 + m3 * q1; - } - } - const char* name(void) - { - return "matrix_Cmult2x2"; - } -}; template void QubitVectorThrust::apply_mcu(const reg_t &qubits, @@ -3329,7 +1729,7 @@ void QubitVectorThrust::apply_mcu(const reg_t &qubits, chunk_.queue_blocked_gate('d',qubits[qubits.size()-1],mask,&diag[0]); } else{ - apply_function(DiagonalMult2x2Controlled(diag,qubits) ); + chunk_.apply_diagonal_matrix(qubits,qubits.size()-1,diag,chunk_.container()->num_chunks()); } } } @@ -3349,12 +1749,20 @@ void QubitVectorThrust::apply_mcu(const reg_t &qubits, chunk_.queue_blocked_gate('u',qubits[qubits.size()-1],mask,&mat[0]); } else{ - apply_function(MatrixMult2x2Controlled(mat,qubits) ); + chunk_.apply_matrix(qubits,qubits.size()-1,mat,chunk_.container()->num_chunks()); } } } } +template +void QubitVectorThrust::apply_rotation(const reg_t &qubits, const Rotation r, const double theta) +{ + if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) + return; //first chunk execute all in batch + + chunk_.apply_rotation(qubits,r,theta,chunk_.container()->num_chunks()); +} //------------------------------------------------------------------------------ // Single-qubit matrices @@ -3377,7 +1785,8 @@ void QubitVectorThrust::apply_matrix(const uint_t qubit, chunk_.queue_blocked_gate('u',qubit,0,&mat[0]); } else{ - apply_function(MatrixMult2x2(mat,qubit)); + reg_t qubits = {qubit}; + chunk_.apply_matrix(qubits,0,mat,chunk_.container()->num_chunks()); } } @@ -3393,7 +1802,7 @@ void QubitVectorThrust::apply_diagonal_matrix(const uint_t qubit, } else{ reg_t qubits = {qubit}; - apply_function(DiagonalMult2x2(diag,qubits[0])); + chunk_.apply_diagonal_matrix(qubits,0,diag,chunk_.container()->num_chunks()); } } @@ -3402,50 +1811,21 @@ void QubitVectorThrust::apply_diagonal_matrix(const uint_t qubit, * NORMS * ******************************************************************************/ -template -class norm_func : public GateFuncBase -{ -protected: -public: - norm_func(void) - { - - } - bool is_diagonal(void) - { - return true; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - thrust::complex q; - thrust::complex* vec; - double d; - - vec = this->data_; - q = vec[i]; - d = (double)(q.real()*q.real() + q.imag()*q.imag()); - return d; - } - - const char* name(void) - { - return "norm"; - } -}; - template double QubitVectorThrust::norm() const { double ret; + uint_t 
count = 1; + #ifdef AER_THRUST_CUDA if((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_){ if(chunk_.pos() != 0) return 0.0; //first chunk execute all in batch + count = chunk_.container()->num_chunks(); } #endif - apply_function_sum(&ret,norm_func()); + ret = chunk_.norm(count); #ifdef AER_DEBUG DebugMsg("norm",ret); @@ -3454,48 +1834,6 @@ double QubitVectorThrust::norm() const return ret; } -template -class NormMatrixMultNxN : public GateFuncSumWithCache -{ -protected: -public: - NormMatrixMultNxN(uint_t nq) : GateFuncSumWithCache(nq) - { - ; - } - - __host__ __device__ double run_with_cache_sum(uint_t _tid,uint_t _idx,thrust::complex* _cache) const - { - uint_t j; - thrust::complex q,r; - thrust::complex m; - uint_t mat_size,irow; - thrust::complex* vec; - thrust::complex* pMat; - - vec = this->data_; - pMat = this->matrix_; - - mat_size = 1ull << this->nqubits_; - irow = _tid & (mat_size - 1); - - r = 0.0; - for(j=0;j double QubitVectorThrust::norm(const reg_t &qubits, const cvector_t &mat) const @@ -3516,63 +1854,11 @@ double QubitVectorThrust::norm(const reg_t &qubits, const cvector_t(N)); + apply_function_sum(&ret,Chunk::NormMatrixMultNxN(N)); return ret; } } -template -class NormDiagonalMultNxN : public GateFuncBase -{ -protected: - int nqubits; -public: - NormDiagonalMultNxN(const reg_t &qb) - { - nqubits = qb.size(); - } - - bool is_diagonal(void) - { - return true; - } - int qubits_count(void) - { - return nqubits; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - uint_t im,j,gid; - thrust::complex q; - thrust::complex m,r; - thrust::complex* pMat; - thrust::complex* vec; - uint_t* qubits; - - vec = this->data_; - pMat = this->matrix_; - qubits = this->params_; - gid = this->base_index_; - - im = 0; - for(j=0;j double QubitVectorThrust::norm_diagonal(const reg_t &qubits, const cvector_t &mat) const { @@ -3587,7 +1873,7 @@ double QubitVectorThrust::norm_diagonal(const reg_t &qubits, const cvect chunk_.StoreUintParams(qubits); double ret; - apply_function_sum(&ret,NormDiagonalMultNxN(qubits) ); + apply_function_sum(&ret,Chunk::NormDiagonalMultNxN(qubits) ); return ret; } } @@ -3595,118 +1881,20 @@ double QubitVectorThrust::norm_diagonal(const reg_t &qubits, const cvect //------------------------------------------------------------------------------ // Single-qubit specialization //------------------------------------------------------------------------------ -template -class NormMatrixMult2x2 : public GateFuncBase -{ -protected: - thrust::complex m0,m1,m2,m3; - int qubit; - uint_t mask; - uint_t offset; -public: - NormMatrixMult2x2(const cvector_t &mat,int q) - { - qubit = q; - m0 = mat[0]; - m1 = mat[1]; - m2 = mat[2]; - m3 = mat[3]; - - offset = 1ull << qubit; - mask = (1ull << qubit) - 1; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - uint_t i0,i1; - thrust::complex* vec; - thrust::complex q0,q1; - thrust::complex r0,r1; - double sum = 0.0; - - vec = this->data_; - - i1 = i & mask; - i0 = (i - i1) << 1; - i0 += i1; - - q0 = vec[i0]; - q1 = vec[offset+i0]; - - r0 = m0 * q0 + m2 * q1; - sum += r0.real()*r0.real() + r0.imag()*r0.imag(); - r1 = m1 * q0 + m3 * q1; - sum += r1.real()*r1.real() + r1.imag()*r1.imag(); - return sum; - } - const char* name(void) - { - return "Norm_mult2x2"; - } -}; - template double QubitVectorThrust::norm(const uint_t qubit, const cvector_t &mat) const { double ret; - apply_function_sum(&ret,NormMatrixMult2x2(mat,qubit)); - - return ret; -} - - -template -class NormDiagonalMult2x2 : 
public GateFuncBase -{ -protected: - thrust::complex m0,m1; - int qubit; -public: - NormDiagonalMult2x2(cvector_t &mat,int q) - { - qubit = q; - m0 = mat[0]; - m1 = mat[1]; - } - - bool is_diagonal(void) - { - return true; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - uint_t gid; - thrust::complex* vec; - thrust::complex q; - thrust::complex m,r; - - vec = this->data_; - gid = this->base_index_; - - q = vec[i]; - if((((i+gid) >> qubit) & 1) == 0){ - m = m0; - } - else{ - m = m1; - } - - r = m * q; + apply_function_sum(&ret,Chunk::NormMatrixMult2x2(mat,qubit)); - return (r.real()*r.real() + r.imag()*r.imag()); - } - const char* name(void) - { - return "Norm_diagonal_mult2x2"; - } -}; + return ret; +} template double QubitVectorThrust::norm_diagonal(const uint_t qubit, const cvector_t &mat) const { double ret; - apply_function_sum(&ret,NormDiagonalMult2x2(mat,qubit)); + apply_function_sum(&ret,Chunk::NormDiagonalMult2x2(mat,qubit)); return ret; } @@ -3746,101 +1934,6 @@ std::vector QubitVectorThrust::probabilities() const { return probs; } - -template -class probability_func : public GateFuncBase -{ -protected: - uint_t mask; - uint_t cmask; -public: - probability_func(const reg_t &qubits,int i) - { - int k; - int nq = qubits.size(); - - mask = 0; - cmask = 0; - for(k=0;k> k) & 1) != 0){ - cmask |= (1ull << qubits[k]); - } - } - } - - bool is_diagonal(void) - { - return true; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - thrust::complex q; - thrust::complex* vec; - double ret; - - vec = this->data_; - - ret = 0.0; - - if((i & mask) == cmask){ - q = vec[i]; - ret = q.real()*q.real() + q.imag()*q.imag(); - } - return ret; - } - - const char* name(void) - { - return "probabilities"; - } -}; - -template -class probability_1qubit_func : public GateFuncBase -{ -protected: - uint_t offset; -public: - probability_1qubit_func(const uint_t qubit) - { - offset = 1ull << qubit; - } - - __host__ __device__ thrust::complex operator()(const uint_t &i) const - { - uint_t i0,i1; - thrust::complex q0,q1; - thrust::complex* vec0; - thrust::complex* vec1; - thrust::complex ret; - double d0,d1; - - vec0 = this->data_; - vec1 = vec0 + offset; - - i1 = i & (offset - 1); - i0 = (i - i1) << 1; - i0 += i1; - - q0 = vec0[i0]; - q1 = vec1[i0]; - - d0 = (double)(q0.real()*q0.real() + q0.imag()*q0.imag()); - d1 = (double)(q1.real()*q1.real() + q1.imag()*q1.imag()); - - ret = thrust::complex(d0,d1); - return ret; - } - - const char* name(void) - { - return "probabilities_1qubit"; - } -}; - template std::vector QubitVectorThrust::probabilities(const reg_t &qubits) const { @@ -3848,25 +1941,7 @@ std::vector QubitVectorThrust::probabilities(const reg_t &qubits const int_t DIM = 1 << N; std::vector probs(DIM, 0.); - if(N == 1){ //special case for 1 qubit (optimized for measure) - apply_function_sum2(&probs[0],probability_1qubit_func(qubits[0])); - -#ifdef AER_DEBUG - DebugMsg("probabilities",probs); -#endif - return probs; - } - - auto qubits_sorted = qubits; - std::sort(qubits_sorted.begin(), qubits_sorted.end()); - if ((N == num_qubits_) && (qubits == qubits_sorted)) - return probabilities(); - - - int i; - for(i=0;i(qubits,i)); - } + chunk_.probabilities(probs, qubits); #ifdef AER_DEBUG DebugMsg("probabilities",probs); @@ -3881,7 +1956,7 @@ std::vector QubitVectorThrust::probabilities(const reg_t &qubits #define QV_RESET_TARGET_PROB 3 template -class reset_after_measure_func : public GateFuncBase +class reset_after_measure_func : public Chunk::GateFuncBase { 
protected: int num_qubits_; @@ -3933,7 +2008,7 @@ class reset_after_measure_func : public GateFuncBase }; template -class set_probability_buffer_for_reset_func : public GateFuncBase +class set_probability_buffer_for_reset_func : public Chunk::GateFuncBase { protected: uint_t reduce_buf_size_; @@ -3974,7 +2049,7 @@ class set_probability_buffer_for_reset_func : public GateFuncBase }; template -class check_measure_probability_func : public GateFuncBase +class check_measure_probability_func : public Chunk::GateFuncBase { protected: int num_qubits_; @@ -4116,7 +2191,7 @@ void QubitVectorThrust::apply_batched_measure(const reg_t& qubits,std::v chunk_.keep_conditional(true); //total probability - apply_function_sum(nullptr,norm_func(),true); + apply_function_sum(nullptr,Chunk::norm_func(),true); apply_function(set_probability_buffer_for_reset_func(chunk_.probability_buffer(),chunk_.container()->num_chunks(), chunk_.reduce_buffer(),chunk_.reduce_buffer_size()) ); @@ -4142,7 +2217,7 @@ void QubitVectorThrust::apply_batched_measure(const reg_t& qubits,std::v //loop for probability for(i=0;i(qubits,i),true); + apply_function_sum(nullptr,Chunk::probability_func(qubits,i),true); apply_function(check_measure_probability_func(qubits.size(),chunk_.probability_buffer(),chunk_.container()->num_chunks(), chunk_.reduce_buffer(),chunk_.reduce_buffer_size(), @@ -4161,7 +2236,7 @@ void QubitVectorThrust::apply_batched_measure(const reg_t& qubits,std::v } template -class reset_func : public GateFuncBase +class reset_func : public Chunk::GateFuncBase { protected: int num_qubits_; @@ -4254,7 +2329,7 @@ void QubitVectorThrust::apply_batched_reset(const reg_t& qubits,std::vec chunk_.keep_conditional(true); //total probability - apply_function_sum(nullptr,norm_func(),true); + apply_function_sum(nullptr,Chunk::norm_func(),true); apply_function(set_probability_buffer_for_reset_func(chunk_.probability_buffer(),chunk_.container()->num_chunks(), chunk_.reduce_buffer(),chunk_.reduce_buffer_size()) ); @@ -4268,7 +2343,7 @@ void QubitVectorThrust::apply_batched_reset(const reg_t& qubits,std::vec chunk_.StoreUintParams(qubits); for(i=0;i(qubits,i),true); + apply_function_sum(nullptr,Chunk::probability_func(qubits,i),true); apply_function(check_measure_probability_func(qubits.size(),chunk_.probability_buffer(),chunk_.container()->num_chunks(), chunk_.reduce_buffer(),chunk_.reduce_buffer_size(), @@ -4325,7 +2400,7 @@ void QubitVectorThrust::get_creg(ClassicalRegister& creg) } template -class set_creg_func : public GateFuncBase +class set_creg_func : public Chunk::GateFuncBase { protected: uint_t reg_set_; @@ -4375,7 +2450,7 @@ void QubitVectorThrust::store_cmemory(uint_t qubit,int val) } template -class set_batched_creg_func : public GateFuncBase +class set_batched_creg_func : public Chunk::GateFuncBase { protected: int_t reg_set_; @@ -4437,7 +2512,7 @@ int_t QubitVectorThrust::set_batched_system_conditional(int_t src_reg, r } template -class copy_creg_func : public GateFuncBase +class copy_creg_func : public Chunk::GateFuncBase { protected: uint_t reg_dest_; @@ -4492,9 +2567,11 @@ reg_t QubitVectorThrust::sample_measure(const std::vector &rnds) { uint_t count = 1; #ifdef AER_THRUST_CUDA - if(((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_) && chunk_.pos() != 0) - return reg_t(); //first chunk execute all in batch - count = chunk_.container()->num_chunks(); + if((multi_chunk_distribution_ && chunk_.device() >= 0) || enable_batch_){ + if(chunk_.pos() != 0) + return reg_t(); //first chunk execute all in batch 
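+      // chunks other than the first (pos != 0) returned above; chunk 0 now runs
+      // the batched sampling on behalf of every chunk in its container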
+ count = chunk_.container()->num_chunks(); + } #endif #ifdef AER_DEBUG @@ -4516,136 +2593,12 @@ reg_t QubitVectorThrust::sample_measure(const std::vector &rnds) * ******************************************************************************/ -inline __host__ __device__ uint_t pop_count_kernel(uint_t val) -{ - uint_t count = val; - count = (count & 0x5555555555555555) + ((count >> 1) & 0x5555555555555555); - count = (count & 0x3333333333333333) + ((count >> 2) & 0x3333333333333333); - count = (count & 0x0f0f0f0f0f0f0f0f) + ((count >> 4) & 0x0f0f0f0f0f0f0f0f); - count = (count & 0x00ff00ff00ff00ff) + ((count >> 8) & 0x00ff00ff00ff00ff); - count = (count & 0x0000ffff0000ffff) + ((count >> 16) & 0x0000ffff0000ffff); - count = (count & 0x00000000ffffffff) + ((count >> 32) & 0x00000000ffffffff); - return count; -} - -//special case Z only -template -class expval_pauli_Z_func : public GateFuncBase -{ -protected: - uint_t z_mask_; - -public: - expval_pauli_Z_func(uint_t z) - { - z_mask_ = z; - } - - bool is_diagonal(void) - { - return true; - } - bool batch_enable(void) - { - return false; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - thrust::complex* vec; - thrust::complex q0; - double ret = 0.0; - - vec = this->data_; - - q0 = vec[i]; - ret = q0.real()*q0.real() + q0.imag()*q0.imag(); - - if(z_mask_ != 0){ - if(pop_count_kernel(i & z_mask_) & 1) - ret = -ret; - } - - return ret; - } - const char* name(void) - { - return "expval_pauli_Z"; - } -}; - -template -class expval_pauli_XYZ_func : public GateFuncBase -{ -protected: - uint_t x_mask_; - uint_t z_mask_; - uint_t mask_l_; - uint_t mask_u_; - thrust::complex phase_; -public: - expval_pauli_XYZ_func(uint_t x,uint_t z,uint_t x_max,std::complex p) - { - x_mask_ = x; - z_mask_ = z; - phase_ = p; - - mask_u_ = ~((1ull << (x_max+1)) - 1); - mask_l_ = (1ull << x_max) - 1; - } - bool batch_enable(void) - { - return false; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - thrust::complex* vec; - thrust::complex q0; - thrust::complex q1; - thrust::complex q0p; - thrust::complex q1p; - double d0,d1,ret = 0.0; - uint_t idx0,idx1; - - vec = this->data_; - - idx0 = ((i << 1) & mask_u_) | (i & mask_l_); - idx1 = idx0 ^ x_mask_; - - q0 = vec[idx0]; - q1 = vec[idx1]; - q0p = q1 * phase_; - q1p = q0 * phase_; - d0 = q0.real()*q0p.real() + q0.imag()*q0p.imag(); - d1 = q1.real()*q1p.real() + q1.imag()*q1p.imag(); - - if(z_mask_ != 0){ - if(pop_count_kernel(idx0 & z_mask_) & 1) - ret = -d0; - else - ret = d0; - if(pop_count_kernel(idx1 & z_mask_) & 1) - ret -= d1; - else - ret += d1; - } - else{ - ret = d0 + d1; - } - - return ret; - } - const char* name(void) - { - return "expval_pauli_XYZ"; - } -}; - template double QubitVectorThrust::expval_pauli(const reg_t &qubits, const std::string &pauli,const complex_t initial_phase) const { + return chunk_.expval_pauli(qubits,pauli,initial_phase); + uint_t x_mask, z_mask, num_y, x_max; std::tie(x_mask, z_mask, num_y, x_max) = pauli_masks_and_phase(qubits, pauli); @@ -4657,7 +2610,7 @@ double QubitVectorThrust::expval_pauli(const reg_t &qubits, double ret; // specialize x_max == 0 if(x_mask == 0) { - apply_function_sum(&ret, expval_pauli_Z_func(z_mask) ); + apply_function_sum(&ret, Chunk::expval_pauli_Z_func(z_mask) ); return ret; } @@ -4665,77 +2618,10 @@ double QubitVectorThrust::expval_pauli(const reg_t &qubits, // This is (-1j) ** number of Y terms modulo 4 auto phase = std::complex(initial_phase); add_y_phase(num_y, phase); - apply_function_sum(&ret, 
expval_pauli_XYZ_func(x_mask, z_mask, x_max, phase) ); + apply_function_sum(&ret, Chunk::expval_pauli_XYZ_func(x_mask, z_mask, x_max, phase) ); return ret; } -template -class expval_pauli_inter_chunk_func : public GateFuncBase -{ -protected: - uint_t x_mask_; - uint_t z_mask_; - thrust::complex phase_; - thrust::complex* pair_chunk_; - uint_t z_count_; - uint_t z_count_pair_; -public: - expval_pauli_inter_chunk_func(uint_t x,uint_t z,std::complex p,thrust::complex* pair_chunk,uint_t zc,uint_t zcp) - { - x_mask_ = x; - z_mask_ = z; - phase_ = p; - - pair_chunk_ = pair_chunk; - z_count_ = zc; - z_count_pair_ = zcp; - } - - bool is_diagonal(void) - { - return true; - } - bool batch_enable(void) - { - return false; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - thrust::complex* vec; - thrust::complex q0; - thrust::complex q1; - thrust::complex q0p; - thrust::complex q1p; - double d0,d1,ret = 0.0; - uint_t ip; - - vec = this->data_; - - ip = i ^ x_mask_; - q0 = vec[i]; - q1 = pair_chunk_[ip]; - q0p = q1 * phase_; - q1p = q0 * phase_; - d0 = q0.real()*q0p.real() + q0.imag()*q0p.imag(); - d1 = q1.real()*q1p.real() + q1.imag()*q1p.imag(); - - if((pop_count_kernel(i & z_mask_) + z_count_) & 1) - ret = -d0; - else - ret = d0; - if((pop_count_kernel(ip & z_mask_) + z_count_pair_) & 1) - ret -= d1; - else - ret += d1; - - return ret; - } - const char* name(void) - { - return "expval_pauli_inter_chunk"; - } -}; template double QubitVectorThrust::expval_pauli(const reg_t &qubits, @@ -4749,7 +2635,7 @@ double QubitVectorThrust::expval_pauli(const reg_t &qubits, //get pointer to pairing chunk (copy if needed) double ret; thrust::complex* pair_ptr; - Chunk buffer; + Chunk::Chunk buffer; if(pair_chunk.data() == this->data()){ #ifdef AER_DISABLE_GDR @@ -4797,7 +2683,7 @@ double QubitVectorThrust::expval_pauli(const reg_t &qubits, auto phase = std::complex(initial_phase); add_y_phase(num_y, phase); - apply_function_sum(&ret, expval_pauli_inter_chunk_func(x_mask, z_mask, phase, pair_ptr,z_count,z_count_pair) ); + apply_function_sum(&ret, Chunk::expval_pauli_inter_chunk_func(x_mask, z_mask, phase, pair_ptr,z_count,z_count_pair) ); if(buffer.is_mapped()){ chunk_manager_->UnmapBufferChunk(buffer); @@ -4820,97 +2706,6 @@ double QubitVectorThrust::expval_pauli(const reg_t &qubits, * ******************************************************************************/ -template -class multi_pauli_func : public GateFuncBase -{ -protected: - uint_t x_mask_; - uint_t z_mask_; - uint_t mask_l_; - uint_t mask_u_; - thrust::complex phase_; - uint_t nqubits_; -public: - multi_pauli_func(uint_t x,uint_t z,uint_t x_max,std::complex p) - { - x_mask_ = x; - z_mask_ = z; - phase_ = p; - - mask_u_ = ~((1ull << (x_max+1)) - 1); - mask_l_ = (1ull << x_max) - 1; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - thrust::complex* vec; - thrust::complex q0; - thrust::complex q1; - uint_t idx0,idx1; - - vec = this->data_; - - idx0 = ((i << 1) & mask_u_) | (i & mask_l_); - idx1 = idx0 ^ x_mask_; - - q0 = vec[idx0]; - q1 = vec[idx1]; - - if(z_mask_ != 0){ - if(pop_count_kernel(idx0 & z_mask_) & 1) - q0 *= -1; - - if(pop_count_kernel(idx1 & z_mask_) & 1) - q1 *= -1; - } - vec[idx0] = q1 * phase_; - vec[idx1] = q0 * phase_; - } - const char* name(void) - { - return "multi_pauli"; - } -}; - -//special case Z only -template -class multi_pauli_Z_func : public GateFuncBase -{ -protected: - uint_t z_mask_; - thrust::complex phase_; -public: - multi_pauli_Z_func(uint_t z,std::complex p) - { - z_mask_ 
= z; - phase_ = p; - } - - bool is_diagonal(void) - { - return true; - } - - __host__ __device__ void operator()(const uint_t &i) const - { - thrust::complex* vec; - thrust::complex q0; - - vec = this->data_; - - q0 = vec[i]; - - if(z_mask_ != 0){ - if(pop_count_kernel(i & z_mask_) & 1) - q0 = -q0; - } - vec[i] = q0 * phase_; - } - const char* name(void) - { - return "multi_pauli_Z"; - } -}; template void QubitVectorThrust::apply_pauli(const reg_t &qubits, @@ -4928,16 +2723,16 @@ void QubitVectorThrust::apply_pauli(const reg_t &qubits, add_y_phase(num_y, phase); if(x_mask == 0){ - apply_function(multi_pauli_Z_func(z_mask, phase)); + apply_function(Chunk::multi_pauli_Z_func(z_mask, phase)); } else{ - apply_function(multi_pauli_func(x_mask, z_mask, x_max, phase) ); + apply_function(Chunk::multi_pauli_func(x_mask, z_mask, x_max, phase) ); } } //batched Pauli operation used for Pauli noise template -class batched_pauli_func : public GateFuncBase +class batched_pauli_func : public Chunk::GateFuncBase { protected: thrust::complex coeff_; @@ -4995,10 +2790,10 @@ class batched_pauli_func : public GateFuncBase phase = thrust::complex(-coeff_.imag(),coeff_.real()); if(z_mask_ != 0){ - if(pop_count_kernel(idx0 & z_mask_) & 1) + if(Chunk::pop_count_kernel(idx0 & z_mask_) & 1) q0 *= -1; - if(pop_count_kernel(idx1 & z_mask_) & 1) + if(Chunk::pop_count_kernel(idx1 & z_mask_) & 1) q1 *= -1; } if(x_mask_ == 0){ @@ -5023,7 +2818,7 @@ void QubitVectorThrust::apply_batched_pauli_ops(const std::vector::apply_batched_pauli_ops(const std::vector -class MatrixMult2x2_conditional : public GateFuncBase +class MatrixMult2x2_conditional : public Chunk::GateFuncBase { protected: thrust::complex m0,m1,m2,m3; @@ -5122,13 +2917,13 @@ class MatrixMult2x2_conditional : public GateFuncBase }; template -class MatrixMultNxN_conditional : public GateFuncWithCache +class MatrixMultNxN_conditional : public Chunk::GateFuncWithCache { protected: uint_t prob_buf_size_; double* probs_; public: - MatrixMultNxN_conditional(uint_t nq,double* probs,uint_t prob_size) : GateFuncWithCache(nq) + MatrixMultNxN_conditional(uint_t nq,double* probs,uint_t prob_size) : Chunk::GateFuncWithCache(nq) { probs_ = probs; prob_buf_size_ = prob_size; @@ -5169,7 +2964,7 @@ class MatrixMultNxN_conditional : public GateFuncWithCache }; template -class check_kraus_probability_func : public GateFuncBase +class check_kraus_probability_func : public Chunk::GateFuncBase { protected: uint_t reduce_buf_size_; @@ -5277,7 +3072,7 @@ void QubitVectorThrust::apply_batched_kraus(const reg_t &qubits, cvector_t vmat = Utils::vectorize_matrix(kmats[i]); chunk_.set_conditional(system_reg); - apply_function_sum(nullptr,NormMatrixMult2x2(vmat,qubits[0]),true); + apply_function_sum(nullptr,Chunk::NormMatrixMult2x2(vmat,qubits[0]),true); apply_function(check_kraus_probability_func(chunk_.probability_buffer(),chunk_.container()->num_chunks(), chunk_.reduce_buffer(),chunk_.reduce_buffer_size() ) ); @@ -5299,7 +3094,7 @@ void QubitVectorThrust::apply_batched_kraus(const reg_t &qubits, chunk_.set_conditional(system_reg); chunk_.StoreMatrix(Utils::vectorize_matrix(kmats[i])); - apply_function_sum(nullptr,NormMatrixMultNxN(N),true); + apply_function_sum(nullptr,Chunk::NormMatrixMultNxN(N),true); apply_function(check_kraus_probability_func(chunk_.probability_buffer(),chunk_.container()->num_chunks(), chunk_.reduce_buffer(),chunk_.reduce_buffer_size() ) ); @@ -5315,7 +3110,7 @@ void QubitVectorThrust::apply_batched_kraus(const reg_t &qubits, } template -class bfunc_kernel : public 
GateFuncBase +class bfunc_kernel : public Chunk::GateFuncBase { protected: uint_t bfunc_num_regs_; @@ -5441,7 +3236,7 @@ void QubitVectorThrust::apply_bfunc(const Operations::Op &op) } template -class roerror_kernel : public GateFuncBase +class roerror_kernel : public Chunk::GateFuncBase { protected: uint_t num_regs_; diff --git a/src/simulators/statevector/statevector_state.hpp b/src/simulators/statevector/statevector_state.hpp index 5606be96e7..542d839fc0 100755 --- a/src/simulators/statevector/statevector_state.hpp +++ b/src/simulators/statevector/statevector_state.hpp @@ -919,49 +919,65 @@ double State::expval_pauli(const int_t iChunk, const reg_t &qubits, z_mask >>= BaseState::chunk_bits_; x_max -= BaseState::chunk_bits_; - const uint_t mask_u = ~((1ull << (x_max + 1)) - 1); - const uint_t mask_l = (1ull << x_max) - 1; - -#pragma omp parallel for if(BaseState::chunk_omp_parallel_ && on_same_process) private(i) reduction(+:expval) - for(i=0;i iChunk){ //on this process uint_t z_count,z_count_pair; z_count = AER::Utils::popcount(iChunk & z_mask); z_count_pair = AER::Utils::popcount(pair_chunk & z_mask); if(iProc == BaseState::distributed_rank_){ //pair is on the same process - expval += BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits_in_chunk, pauli_in_chunk,BaseState::qregs_[pair_chunk - BaseState::global_chunk_index_],z_count,z_count_pair,phase); + expval = BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits_in_chunk, pauli_in_chunk,BaseState::qregs_[pair_chunk - BaseState::global_chunk_index_],z_count,z_count_pair,phase); } else{ BaseState::recv_chunk(iChunk-BaseState::global_chunk_index_,pair_chunk); //refer receive buffer to calculate expectation value - expval += BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits_in_chunk, pauli_in_chunk,BaseState::qregs_[iChunk-BaseState::global_chunk_index_],z_count,z_count_pair,phase); + expval = BaseState::qregs_[iChunk-BaseState::global_chunk_index_].expval_pauli(qubits_in_chunk, pauli_in_chunk,BaseState::qregs_[iChunk-BaseState::global_chunk_index_],z_count,z_count_pair,phase); } } else if(iProc == BaseState::distributed_rank_){ //pair is on this process BaseState::send_chunk(iChunk-BaseState::global_chunk_index_,pair_chunk); } - } + return expval; + }; + expval += BaseState::apply_omp_parallel_reduction((BaseState::chunk_omp_parallel_ && on_same_process),0,BaseState::num_global_chunks_/2,apply_expval_pauli_chunk); } else{ //no exchange between chunks z_mask >>= BaseState::chunk_bits_; -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) reduction(+:expval) - for(i=0;i::apply_save_density_matrix(const int_t iChunk, const Oper } else{ double sum = 0.0; -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) reduction(+:sum) - for(int_t i=0;i::snapshot_matrix_expval(const int_t iChunk, const Operati if(!BaseState::multi_chunk_distribution_) BaseState::qregs_[iChunk].checkpoint(); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i::snapshot_matrix_expval(const int_t iChunk, const Operati if(!BaseState::multi_chunk_distribution_) BaseState::qregs_[iChunk].revert(true); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i::snapshot_matrix_expval(const int_t iChunk, const Operati if(!BaseState::multi_chunk_distribution_) apply_diagonal_matrix(iChunk, sub_qubits, vmat); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t 
i=0;i::snapshot_matrix_expval(const int_t iChunk, const Operati exp_im += exp_tmp.imag(); } else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) reduction(+:exp_re,exp_im) - for(int_t i=0;i::snapshot_matrix_expval(const int_t iChunk, const Operati if(!BaseState::multi_chunk_distribution_) BaseState::qregs_[iChunk].revert(false); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i::snapshot_density_matrix(const int_t iChunk, const Operat reduced_state[0] = BaseState::qregs_[iChunk].norm(); else{ double sum = 0.0; -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) reduction(+:sum) - for(int_t i=0;i::apply_gate(const int_t iChunk, const Operations::Op &op) BaseState::qregs_[iChunk].apply_mcu(op.qubits, Linalg::VMatrix::r(op.params[0], op.params[1])); break; case Gates::mcrx: - BaseState::qregs_[iChunk].apply_mcu(op.qubits, Linalg::VMatrix::rx(op.params[0])); + BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::x, std::real(op.params[0])); break; case Gates::mcry: - BaseState::qregs_[iChunk].apply_mcu(op.qubits, Linalg::VMatrix::ry(op.params[0])); + BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::y, std::real(op.params[0])); break; case Gates::mcrz: - BaseState::qregs_[iChunk].apply_mcu(op.qubits, Linalg::VMatrix::rz(op.params[0])); + BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::z, std::real(op.params[0])); break; case Gates::rxx: - BaseState::qregs_[iChunk].apply_matrix(op.qubits, Linalg::VMatrix::rxx(op.params[0])); + BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::xx, std::real(op.params[0])); break; case Gates::ryy: - BaseState::qregs_[iChunk].apply_matrix(op.qubits, Linalg::VMatrix::ryy(op.params[0])); + BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::yy, std::real(op.params[0])); break; case Gates::rzz: - BaseState::qregs_[iChunk].apply_diagonal_matrix(op.qubits, Linalg::VMatrix::rzz_diag(op.params[0])); + BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::zz, std::real(op.params[0])); break; case Gates::rzx: - BaseState::qregs_[iChunk].apply_matrix(op.qubits, Linalg::VMatrix::rzx(op.params[0])); + BaseState::qregs_[iChunk].apply_rotation(op.qubits, QV::Rotation::zx, std::real(op.params[0])); break; case Gates::id: break; @@ -1664,47 +1729,90 @@ rvector_t State::measure_probs(const int_t iChunk, const reg_t &qubi BaseState::qubits_inout(qubits,qubits_in_chunk,qubits_out_chunk); + if(BaseState::chunk_omp_parallel_){ #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i,j,k) - for(i=0;i 0){ - auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); + for(i=0;i 0){ + auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); - if(qubits_in_chunk.size() == qubits.size()){ - for(j=0;j> i_in) & 1) << k); - i_in++; - } - else{ - if((((i + BaseState::global_chunk_index_) << BaseState::chunk_bits_) >> qubits[k]) & 1){ - idx += 1ull << k; + else{ + for(j=0;j> i_in) & 1) << k); + i_in++; + } + else{ + if((((i + BaseState::global_chunk_index_) << BaseState::chunk_bits_) >> qubits[k]) & 1){ + idx += 1ull << k; + } } } - } #pragma omp atomic - sum[idx] += chunkSum[j]; + sum[idx] += chunkSum[j]; + } + } + } + else{ //there is no bit in chunk + auto nr = std::real(BaseState::qregs_[i].norm()); + int idx = 0; + for(k=0;k> qubits_out_chunk[k]) & 1){ + idx += 1ull << k; + } } +#pragma omp atomic + sum[idx] += nr; } } - else{ //there is no bit in chunk - auto nr = std::real(BaseState::qregs_[i].norm()); - int 
idx = 0; - for(k=0;k> qubits_out_chunk[k]) & 1){ - idx += 1ull << k; + } + else{ + for(i=0;i 0){ + auto chunkSum = BaseState::qregs_[i].probabilities(qubits_in_chunk); + + if(qubits_in_chunk.size() == qubits.size()){ + for(j=0;j> i_in) & 1) << k); + i_in++; + } + else{ + if((((i + BaseState::global_chunk_index_) << BaseState::chunk_bits_) >> qubits[k]) & 1){ + idx += 1ull << k; + } + } + } + sum[idx] += chunkSum[j]; + } } } -#pragma omp atomic - sum[idx] += nr; + else{ //there is no bit in chunk + auto nr = std::real(BaseState::qregs_[i].norm()); + int idx = 0; + for(k=0;k> qubits_out_chunk[k]) & 1){ + idx += 1ull << k; + } + } + sum[idx] += nr; + } } } @@ -1753,10 +1861,18 @@ void State::measure_reset_update(const int_t iChunk, const std::vect if(!BaseState::multi_chunk_distribution_) BaseState::qregs_[iChunk].apply_diagonal_matrix(qubits, mdiag); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) - for(int_t ig=0;ig 1){ +#pragma omp parallel for + for(int_t ig=0;ig::measure_reset_update(const int_t iChunk, const std::vect if(!BaseState::multi_chunk_distribution_) BaseState::qregs_[iChunk].apply_diagonal_matrix(qubits, mdiag); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) - for(int_t ig=0;ig 1){ +#pragma omp parallel for + for(int_t ig=0;ig State::sample_measure(const reg_t &qubits, else{ std::vector chunkSum(BaseState::qregs_.size()+1,0); double sum,localSum; + //calculate per chunk sum if(BaseState::chunk_omp_parallel_){ #pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(i) @@ -1944,9 +2069,14 @@ void State::apply_initialize(const int_t iChunk, const reg_t &qubits BaseState::qubits_inout(qubits,qubits_in_chunk,qubits_out_chunk); if(qubits_out_chunk.size() == 0){ //no qubits outside of chunk -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i::apply_initialize(const int_t iChunk, const reg_t &qubits perm[i] = 1.0; } -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i 0){ //then scatter outside chunk @@ -2008,9 +2144,14 @@ void State::apply_initialize(const int_t iChunk, const reg_t &qubits } //initialize by params -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i::apply_kraus(const int_t iChunk, const reg_t &qubits, } else{ p = 0.0; -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) reduction(+:p) - for(int_t i=0;i::apply_kraus(const int_t iChunk, const reg_t &qubits, if(!BaseState::multi_chunk_distribution_) apply_matrix(iChunk, qubits, vmat); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) - for(int_t ig=0;ig 1){ +#pragma omp parallel for + for(int_t ig=0;ig::apply_kraus(const int_t iChunk, const reg_t &qubits, if(!BaseState::multi_chunk_distribution_) apply_matrix(iChunk, qubits, vmat); else{ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_ && BaseState::num_groups_ > 1) - for(int_t ig=0;ig 1){ +#pragma omp parallel for + for(int_t ig=0;ig::initialize_qreg(uint_t num_qubits) } if(BaseState::multi_chunk_distribution_){ -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) - for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_)); - icol = (BaseState::global_chunk_index_ + iChunk) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); - if(irow == icol) - BaseState::qregs_[iChunk].initialize(); - else - BaseState::qregs_[iChunk].zero(); + if(BaseState::chunk_omp_parallel_){ 
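+      // explicit branch instead of '#pragma omp parallel for if(...)': the OpenMP
+      // parallel region is created only when chunk-level threading is enabled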
+#pragma omp parallel for private(iChunk) + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_)); + icol = (BaseState::global_chunk_index_ + iChunk) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + if(irow == icol) + BaseState::qregs_[iChunk].initialize(); + else + BaseState::qregs_[iChunk].zero(); + } + } + else{ + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_)); + icol = (BaseState::global_chunk_index_ + iChunk) - (irow << ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + if(irow == icol) + BaseState::qregs_[iChunk].initialize(); + else + BaseState::qregs_[iChunk].zero(); + } } } else{ @@ -441,21 +454,40 @@ void State::initialize_qreg(uint_t num_qubits, auto input = unitary.copy_to_matrix(); uint_t mask = (1ull << (BaseState::chunk_bits_)) - 1; -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) - for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))); - uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)); - - //copy part of state for this chunk - uint_t i,row,col; - cvector_t tmp(1ull << BaseState::chunk_bits_); - for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ - uint_t icol = i >> (BaseState::chunk_bits_); - uint_t irow = i & mask; - uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; - tmp[i] = input[idx]; + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(iChunk) + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << BaseState::chunk_bits_); + for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ + uint_t icol = i >> (BaseState::chunk_bits_); + uint_t irow = i & mask; + uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; + tmp[i] = input[idx]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); + } + } + else{ + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << BaseState::chunk_bits_); + for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ + uint_t icol = i >> (BaseState::chunk_bits_); + uint_t irow = i & mask; + uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; + tmp[i] = input[idx]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); } - BaseState::qregs_[iChunk].initialize_from_vector(tmp); } } else{ @@ -489,21 +521,40 @@ void State::initialize_qreg(uint_t num_qubits, BaseState::qregs_[iChunk].set_num_qubits(BaseState::chunk_bits_); } -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) private(iChunk) - for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))); - uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)); - - //copy part of state for this chunk - uint_t i,row,col; - cvector_t tmp(1ull << BaseState::chunk_bits_); - 
for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ - uint_t icol = i >> (BaseState::chunk_bits_); - uint_t irow = i & mask; - uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; - tmp[i] = unitary[idx]; + if(BaseState::chunk_omp_parallel_){ +#pragma omp parallel for private(iChunk) + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << BaseState::chunk_bits_); + for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ + uint_t icol = i >> (BaseState::chunk_bits_); + uint_t irow = i & mask; + uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; + tmp[i] = unitary[idx]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); + } + } + else{ + for(iChunk=0;iChunk> ((BaseState::num_qubits_ - BaseState::chunk_bits_))); + uint_t icol_chunk = ((iChunk + BaseState::global_chunk_index_) & ((1ull << ((BaseState::num_qubits_ - BaseState::chunk_bits_)))-1)); + + //copy part of state for this chunk + uint_t i,row,col; + cvector_t tmp(1ull << BaseState::chunk_bits_); + for(i=0;i<(1ull << BaseState::chunk_bits_);i++){ + uint_t icol = i >> (BaseState::chunk_bits_); + uint_t irow = i & mask; + uint_t idx = ((icol+(irow_chunk << BaseState::chunk_bits_)) << (BaseState::num_qubits_)) + (icol_chunk << BaseState::chunk_bits_) + irow; + tmp[i] = unitary[idx]; + } + BaseState::qregs_[iChunk].initialize_from_vector(tmp); } - BaseState::qregs_[iChunk].initialize_from_vector(tmp); } } else{ @@ -673,7 +724,7 @@ void State::apply_matrix(const int_t iChunk, const reg_t &qubi template void State::apply_diagonal_matrix(const int_t iChunk, const reg_t &qubits, const cvector_t &diag) { - if(BaseState::thrust_optimization_){ + if(BaseState::thrust_optimization_ || !BaseState::multi_chunk_distribution_){ //GPU computes all chunks in one kernel, so pass qubits and diagonal matrix as is reg_t qubits_chunk = qubits; for(uint_t i;i void State::apply_global_phase() { if (BaseState::has_global_phase_) { -#pragma omp parallel for if(BaseState::chunk_omp_parallel_) - for(int_t i=0;i::set_num_qubits(size_t num_qubits) { BaseVector::set_num_qubits(2 * num_qubits); } -template -class trace_func : public GateFuncBase -{ -protected: - uint_t rows_; -public: - trace_func(uint_t nrow) - { - rows_ = nrow; - } - bool is_diagonal(void) - { - return true; - } - uint_t size(int num_qubits) - { - this->chunk_bits_ = num_qubits; - return rows_; - } - - __host__ __device__ double operator()(const uint_t &i) const - { - thrust::complex q; - thrust::complex* vec; - - uint_t iChunk = (i / rows_); - uint_t lid = i - (iChunk * rows_); - uint_t idx = (iChunk << this->chunk_bits_) + lid*(rows_ + 1); - - vec = this->data_; - q = vec[idx]; - return q.real(); - } - - const char* name(void) - { - return "trace"; - } -}; template std::complex UnitaryMatrixThrust::trace() const { thrust::complex sum; - double ret; - BaseVector::apply_function_sum(&ret,trace_func(rows_),false); - sum = ret; + sum = BaseVector::chunk_.trace(rows_, 1); #ifdef AER_DEBUG BaseVector::DebugMsg("trace",sum); diff --git a/test/terra/backends/aer_simulator/test_options.py b/test/terra/backends/aer_simulator/test_options.py index cf5e31ab8a..e96f2a1719 100644 --- 
a/test/terra/backends/aer_simulator/test_options.py
+++ b/test/terra/backends/aer_simulator/test_options.py
@@ -91,7 +91,9 @@ def test_device_option(self, method, device):
         result = backend.run(qc).result()
         value = result.results[0].metadata.get('device', None)
 
-        self.assertEqual(value, device)
+        # the device is reported as 'GPU_cuStateVec' when cuStateVec is enabled,
+        # so check for a substring match instead of strict equality
+        self.assertTrue(value in device)
 
     @data('automatic', 'statevector', 'density_matrix', 'stabilizer',
           'matrix_product_state', 'extended_stabilizer')
diff --git a/test/terra/backends/aer_simulator/test_wrapper_qasm_simulator.py b/test/terra/backends/aer_simulator/test_wrapper_qasm_simulator.py
index 57c2422168..5f79d43f83 100644
--- a/test/terra/backends/aer_simulator/test_wrapper_qasm_simulator.py
+++ b/test/terra/backends/aer_simulator/test_wrapper_qasm_simulator.py
@@ -30,6 +30,9 @@ class TestQasmSimulator(SimulatorTestCase):
     def test_legacy_methods(self, method, device):
         """Test legacy device method options."""
         backend = self.backend()
+        # GPU_cuStateVec is converted to GPU
+        if device == "GPU_cuStateVec":
+            device = "GPU"
         legacy_method = f"{method}_{device.lower()}"
         backend.set_options(method=legacy_method)
         self.assertEqual(backend.options.method, method)
diff --git a/test/terra/backends/simulator_test_case.py b/test/terra/backends/simulator_test_case.py
index 331fb1fcf8..9f3fa91484 100644
--- a/test/terra/backends/simulator_test_case.py
+++ b/test/terra/backends/simulator_test_case.py
@@ -18,6 +18,10 @@ import itertools as it
 from qiskit.providers.aer import AerSimulator
 from test.terra.common import QiskitAerTestCase
+from qiskit.circuit import QuantumCircuit
+from qiskit.compiler import assemble
+from qiskit.providers.aer.backends.backend_utils import cpp_execute
+from qiskit.providers.aer.backends.controller_wrappers import aer_controller_execute
 
 
 class SimulatorTestCase(QiskitAerTestCase):
@@ -30,7 +34,11 @@ def backend(self, **options):
         """Return AerSimulator backend using current class options"""
         sim_options = self.OPTIONS.copy()
         for key, val in options.items():
-            sim_options[key] = val
+            if key == 'device' and 'cuStateVec' in val:
+                sim_options['device'] = 'GPU'
+                sim_options['cuStateVec_enable'] = True
+            else:
+                sim_options[key] = val
         return self.BACKEND(**sim_options)
 
 
@@ -66,12 +74,39 @@ def _method_device(methods):
     if not methods:
         methods = AerSimulator().available_methods()
     available_devices = AerSimulator().available_devices()
+    # add a special test device for cuStateVec if it is available
+    cuStateVec = check_cuStateVec(available_devices)
+
     gpu_methods = ['statevector', 'density_matrix', 'unitary']
     data_args = []
    for method in methods:
        if method in gpu_methods:
            for device in available_devices:
                data_args.append((method, device))
+            # add test cases for cuStateVec, if available, using the special device 'GPU_cuStateVec'
+            # 'GPU_cuStateVec' is used only inside the tests and is not an actual Aer device;
+            # it is converted to device='GPU' with the option cuStateVec_enable=True
+            if cuStateVec:
+                data_args.append((method, 'GPU_cuStateVec'))
        else:
            data_args.append((method, 'CPU'))
    return data_args
+
+def check_cuStateVec(devices):
+    """Return True if the system supports cuStateVec, False otherwise"""
+    if 'GPU' in devices:
+        dummy_circ = QuantumCircuit(1)
+        dummy_circ.i(0)
+        qobj = assemble(dummy_circ,
+                        optimization_level=0,
+                        shots=1,
+                        method="statevector",
+                        device="GPU",
+                        cuStateVec_enable=True)
+        # run a dummy circuit to check whether Aer is built with cuStateVec
+        result = 
cpp_execute(aer_controller_execute(), qobj) + return result.get('success', False) + else: + return False + +
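
---
Usage note (not part of the patch): a minimal sketch of enabling cuStateVec
from Python, assuming an Aer build with cuStateVec support. The option names
device='GPU' and cuStateVec_enable=True are the ones exercised by the tests
above; the circuit and shot count are illustrative only.

    from qiskit import QuantumCircuit, transpile
    from qiskit.providers.aer import AerSimulator

    # request the GPU statevector method with the cuStateVec backend enabled
    backend = AerSimulator(method="statevector", device="GPU",
                           cuStateVec_enable=True)

    qc = QuantumCircuit(2)
    qc.h(0)
    qc.cx(0, 1)
    qc.measure_all()

    result = backend.run(transpile(qc, backend), shots=100).result()
    print(result.get_counts())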
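A second sketch, using a hypothetical helper that mirrors test_legacy_methods
above: the test-only 'GPU_cuStateVec' device collapses to 'GPU' before the
legacy method name is formed.

    def legacy_method_name(method: str, device: str) -> str:
        # GPU_cuStateVec is converted to GPU, as in test_legacy_methods
        if device == "GPU_cuStateVec":
            device = "GPU"
        return f"{method}_{device.lower()}"

    assert legacy_method_name("statevector", "GPU_cuStateVec") == "statevector_gpu"
    assert legacy_method_name("density_matrix", "CPU") == "density_matrix_cpu"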