Add experimental support of cuQuantum (#1400)

* Add cuStateVec support Added support for cuQuantum, NVIDIA's APIs for quantum computing, to accelerate statevector, density matrix and unitary simulators by using GPUs. To include cuQuantum, a custom build of Aer is necessary, setting the path of the cuQuantum library to CUSTATEVEC_ROOT (a binary distribution will be available after the official release of cuQuantum, which is now Beta 2 (0.1.0)). cuStateVec of cuQuantum is enabled by setting device='GPU' and cuStateVec_threshold options. cuStateVec is enabled when the number of qubits of the input circuit is equal to or greater than cuStateVec_threshold. Since cuQuantum is a beta version, there are some limitations: - cuStateVec is not thread safe, so multi-chunk parallelization (cache blocking) is done by a single thread (slow) - Multi-shots parallelization is disabled (single thread, slow) - Multi-shots batched optimization is not supported for cuStateVec Co-authored-by: Christopher J. Wood <cjwood@us.ibm.com> Co-authored-by: Hiroshi Horii <hhorii@users.noreply.github.com>
Qiskit · Mar 1, 2022 · db91e7d · db91e7d
1 parent 8c400ca
commit db91e7d
Show file tree

Hide file tree

Showing 26 changed files with 5,214 additions and 3,316 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -257,6 +257,15 @@ if(AER_THRUST_SUPPORTED)
 
 		set(AER_COMPILER_DEFINITIONS ${AER_COMPILER_DEFINITIONS} THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CUDA)
 		set(THRUST_DEPENDENT_LIBS "")
+		if(CUSTATEVEC_ROOT)
+			set(AER_COMPILER_DEFINITIONS ${AER_COMPILER_DEFINITIONS} AER_CUSTATEVEC)
+			set(AER_COMPILER_FLAGS "${AER_COMPILER_FLAGS} -I${CUSTATEVEC_ROOT}/include")
+            if(CUSTATEVEC_STATIC)
+				set(THRUST_DEPENDANT_LIBS "-L${CUSTATEVEC_ROOT}/lib -L${CUSTATEVEC_ROOT}/lib64 -lcustatevec_static -L${CUDA_TOOLKIT_ROOT_DIR}/lib64 -lcublas")
+			else()
+				set(THRUST_DEPENDANT_LIBS "-L${CUSTATEVEC_ROOT}/lib -L${CUSTATEVEC_ROOT}/lib64 -lcustatevec")
+			endif()
+		endif()
 	elseif(AER_THRUST_BACKEND STREQUAL "TBB")
 		message(STATUS "TBB Support found!")
 		set(THRUST_DEPENDENT_LIBS AER_DEPENDENCY_PKG::tbb)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -643,6 +643,34 @@ Few notes on GPU builds:
 3. We don't need NVIDIA® drivers for building, but we need them for running simulations
 4. Only Linux platforms are supported
 
+Qiskit Aer now supports cuQuantum, a set of optimized quantum computing APIs from NVIDIA®.
+The cuStateVec APIs can be used to accelerate the statevector, density_matrix and unitary methods.
+Because cuQuantum is currently a beta version, some operations are not accelerated by cuStateVec.
+
+To build Qiskit Aer with cuStateVec support, set `CUSTATEVEC_ROOT` to the path of the cuQuantum root directory, as follows.
+
+For example,
+
+    qiskit-aer$ python ./setup.py bdist_wheel -- -DAER_THRUST_BACKEND=CUDA -DCUSTATEVEC_ROOT=path_to_cuQuantum
+
+If you want to link the cuQuantum library statically, also set `CUSTATEVEC_STATIC` (e.g. `-DCUSTATEVEC_STATIC=True`) when invoking setup.py.
+Otherwise you also have to set the environment variable LD_LIBRARY_PATH to include the path to the cuQuantum libraries.
+
+To run with cuStateVec, set the `device='GPU'` option on AerSimulator and pass `cuStateVec_enable=True` as an option to the execute method.
+
+```
+sim = AerSimulator(method='statevector', device='GPU')
+results = execute(circuit,sim,cuStateVec_enable=True).result()
+```
+
+You can also accelerate density matrix and unitary matrix simulations.
+```
+sim = AerSimulator(method='density_matrix', device='GPU')
+results = execute(circuit,sim,cuStateVec_enable=True).result()
+```
+
+
+
 ### Building with MPI support
 
 Qiskit Aer can parallelize its simulation on the cluster systems by using MPI. 

diff --git a/qiskit/providers/aer/backends/aer_simulator.py b/qiskit/providers/aer/backends/aer_simulator.py
@@ -148,6 +148,10 @@ class AerSimulator(AerBackend):
     initialization or with :meth:`set_options`. The list of supported devices
     for the current system can be returned using :meth:`available_devices`.
 
+    If AerSimulator is built with cuStateVec support, cuStateVec APIs are enabled
+    by setting ``cuStateVec_enable=True``. This is an experimental implementation
+    based on cuQuantum Beta 2.
+
     **Additional Backend Options**
 
     The following simulator specific backend options are supported
@@ -216,6 +220,11 @@ class AerSimulator(AerBackend):
       values (16 Bytes). If set to 0, the maximum will be automatically
       set to the system memory size (Default: 0).
 
+    * ``cuStateVec_enable`` (bool): This option enables acceleration by the
+      cuStateVec library of cuQuantum from NVIDIA, which has highly optimized
+      kernels for GPUs (Default: False). This option will be ignored
+      if AerSimulator is not built with cuStateVec support.
+
     * ``blocking_enable`` (bool): This option enables parallelization with
       multiple GPUs or multiple processes with MPI (CPU/GPU). This option
       is only available for ``"statevector"``, ``"density_matrix"`` and
@@ -514,6 +523,8 @@ def _default_options(cls):
             memory=None,
             noise_model=None,
             seed_simulator=None,
+            # cuStateVec (cuQuantum) option
+            cuStateVec_enable=False,
             # cache blocking for multi-GPUs/MPI options
             blocking_qubits=None,
             blocking_enable=False,

diff --git a/qiskit/providers/aer/backends/qasm_simulator.py b/qiskit/providers/aer/backends/qasm_simulator.py
@@ -339,9 +339,9 @@ class QasmSimulator(AerBackend):
     }
 
     _SIMULATION_METHODS = [
-        'automatic', 'statevector', 'statevector_gpu',
+        'automatic', 'statevector', 'statevector_gpu', 'statevector_custatevec',
         'statevector_thrust', 'density_matrix',
-        'density_matrix_gpu', 'density_matrix_thrust',
+        'density_matrix_gpu', 'density_matrix_custatevec', 'density_matrix_thrust',
         'stabilizer', 'matrix_product_state', 'extended_stabilizer'
     ]
 
@@ -595,7 +595,8 @@ def _basis_gates(self):
     def _method_basis_gates(self):
         """Return method basis gates and custom instructions"""
         method = self._options.get('method', None)
-        if method in ['density_matrix', 'density_matrix_gpu', 'density_matrix_thrust']:
+        if method in ['density_matrix', 'density_matrix_gpu',
+                      'density_matrix_custatevec', 'density_matrix_thrust']:
             return sorted([
                 'u1', 'u2', 'u3', 'u', 'p', 'r', 'rx', 'ry', 'rz', 'id', 'x',
                 'y', 'z', 'h', 's', 'sdg', 'sx', 'sxdg', 't', 'tdg', 'swap', 'cx',
@@ -628,15 +629,17 @@ def _custom_instructions(self):
             return self._options_configuration['custom_instructions']
 
         method = self._options.get('method', None)
-        if method in ['statevector', 'statevector_gpu', 'statevector_thrust']:
+        if method in ['statevector', 'statevector_gpu',
+                      'statevector_custatevec', 'statevector_thrust']:
             return sorted([
                 'quantum_channel', 'qerror_loc', 'roerror', 'kraus', 'snapshot', 'save_expval',
                 'save_expval_var', 'save_probabilities', 'save_probabilities_dict',
                 'save_amplitudes', 'save_amplitudes_sq', 'save_state',
                 'save_density_matrix', 'save_statevector', 'save_statevector_dict',
                 'set_statevector'
             ])
-        if method in ['density_matrix', 'density_matrix_gpu', 'density_matrix_thrust']:
+        if method in ['density_matrix', 'density_matrix_gpu',
+                      'density_matrix_custatevec', 'density_matrix_thrust']:
             return sorted([
                 'quantum_channel', 'qerror_loc', 'roerror', 'kraus', 'superop', 'snapshot',
                 'save_expval', 'save_expval_var', 'save_probabilities', 'save_probabilities_dict',
@@ -666,10 +669,12 @@ def _custom_instructions(self):
     def _set_method_config(self, method=None):
         """Set non-basis gate options when setting method"""
         # Update configuration description and number of qubits
-        if method in ['statevector', 'statevector_gpu', 'statevector_thrust']:
+        if method in ['statevector', 'statevector_gpu',
+                      'statevector_custatevec', 'statevector_thrust']:
             description = 'A C++ statevector simulator with noise'
             n_qubits = MAX_QUBITS_STATEVECTOR
-        elif method in ['density_matrix', 'density_matrix_gpu', 'density_matrix_thrust']:
+        elif method in ['density_matrix', 'density_matrix_gpu',
+                        'density_matrix_custatevec', 'density_matrix_thrust']:
             description = 'A C++ density matrix simulator with noise'
             n_qubits = MAX_QUBITS_STATEVECTOR // 2
         elif method == 'matrix_product_state':

diff --git a/releasenotes/notes/cuQuantum-support-d33abe5b1cb778a8.yaml b/releasenotes/notes/cuQuantum-support-d33abe5b1cb778a8.yaml
@@ -0,0 +1,13 @@
+---
+features:
+  - |
+    Added support for cuQuantum, NVIDIA's APIs for quantum computing,
+    to accelerate statevector, density matrix and unitary simulators
+    by using GPUs.
+    This is an experimental implementation for cuQuantum Beta 2 (0.1.0).
+    cuStateVec APIs are used for acceleration instead of Aer's own implementations
+    when Aer is built with the path of cuQuantum set in ``CUSTATEVEC_ROOT``
+    (a binary distribution is not currently available).
+    cuStateVec is enabled by setting the ``device='GPU'`` and
+    ``cuStateVec_enable=True`` options. The ``cuStateVec_enable`` option
+    defaults to ``False``, so cuStateVec is not used unless it is requested.
diff --git a/src/controllers/aer_controller.hpp b/src/controllers/aer_controller.hpp
@@ -377,6 +377,8 @@ class Controller {
   int_t batched_shots_gpu_max_qubits_ = 16;   //multi-shot parallelization is applied if qubits is less than max qubits
   bool enable_batch_multi_shots_ = false;   //multi-shot parallelization can be applied
 
+  //settings for cuStateVec
+  bool cuStateVec_enable_ = false;
 };
 
 //=========================================================================
@@ -466,6 +468,12 @@ void Controller::set_config(const json_t &config) {
     JSON::get_value(batched_shots_gpu_max_qubits_, "batched_shots_gpu_max_qubits", config);
   }
 
+  //cuStateVec configs
+  cuStateVec_enable_ = false;
+  if(JSON::check_key("cuStateVec_enable", config)) {
+    JSON::get_value(cuStateVec_enable_, "cuStateVec_enable", config);
+  }
+
   // Override automatic simulation method with a fixed method
   std::string method;
   if (JSON::get_value(method, "method", config)) {
@@ -489,6 +497,9 @@ void Controller::set_config(const json_t &config) {
     }
   }
 
+  if(method_ == Method::density_matrix || method_ == Method::unitary)
+    batched_shots_gpu_max_qubits_ /= 2;
+
   // Override automatic simulation method with a fixed method
   if (JSON::get_value(sim_device_name_, "device", config)) {
     if (sim_device_name_ == "CPU") {
@@ -502,18 +513,37 @@ void Controller::set_config(const json_t &config) {
 #endif
     } else if (sim_device_name_ == "GPU") {
 #ifndef AER_THRUST_CUDA
-        throw std::runtime_error(
-            "Simulation device \"GPU\" is not supported on this system");
+      throw std::runtime_error(
+          "Simulation device \"GPU\" is not supported on this system");
 #else
-        int nDev;
-        if (cudaGetDeviceCount(&nDev) != cudaSuccess) {
-            cudaGetLastError();
-            throw std::runtime_error("No CUDA device available!");
-        }
 
-        sim_device_ = Device::GPU;
+#ifndef AER_CUSTATEVEC
+      if(cuStateVec_enable_){
+        //Aer is not built for cuStateVec
+        throw std::runtime_error(
+            "Simulation device \"GPU\" does not supported cuStateVec on this system");
+      }
 #endif
+      int nDev;
+      if (cudaGetDeviceCount(&nDev) != cudaSuccess) {
+          cudaGetLastError();
+          throw std::runtime_error("No CUDA device available!");
+      }
+      sim_device_ = Device::GPU;
+
+#ifdef AER_CUSTATEVEC
+      if(cuStateVec_enable_){
+        //initialize custatevevtor handle once before actual calculation (takes long time at first call)
+        custatevecStatus_t err;
+        custatevecHandle_t stHandle;
+        err = custatevecCreate(&stHandle);
+        if(err == CUSTATEVEC_STATUS_SUCCESS){
+          custatevecDestroy(stHandle);
+        }
       }
+#endif
+#endif
+    }
     else {
       throw std::runtime_error(std::string("Invalid simulation device (\"") +
                                sim_device_name_ + std::string("\")."));
@@ -636,9 +666,16 @@ void Controller::set_parallelization_circuit(const Circuit &circ,
                                              const Method method)  
 {
   enable_batch_multi_shots_ = false;
-  if(batched_shots_gpu_ && sim_device_ == Device::GPU && circ.shots > 1 && max_batched_states_ >= num_gpus_ && 
-              batched_shots_gpu_max_qubits_ >= circ.num_qubits ){
-    enable_batch_multi_shots_ = true;
+  if(batched_shots_gpu_ && sim_device_ == Device::GPU && 
+     circ.shots > 1 && max_batched_states_ >= num_gpus_ && 
+     batched_shots_gpu_max_qubits_ >= circ.num_qubits ){
+      enable_batch_multi_shots_ = true;
+  }
+
+  if(sim_device_ == Device::GPU && cuStateVec_enable_){
+    enable_batch_multi_shots_ = false;    //cuStateVec does not support batch execution of multi-shots
+    parallel_shots_ = 1;    //cuStateVec is currently not thread safe
+    return;
   }
 
   if(explicit_parallelization_)
@@ -785,6 +822,7 @@ size_t Controller::get_gpu_memory_mb() {
   }
   num_gpus_ = nDev;
 #endif
+
 #ifdef AER_MPI
   // get minimum memory size per process
   uint64_t locMem, minMem;
@@ -866,7 +904,6 @@ Result Controller::execute(const inputdata_t &input_qobj) {
     auto time_taken =
         std::chrono::duration<double>(myclock_t::now() - timer_start).count();
     result.metadata.add(time_taken, "time_taken");
-
     return result;
   } catch (std::exception &e) {
     // qobj was invalid, return valid output containing error message
@@ -959,7 +996,7 @@ Result Controller::execute(std::vector<Circuit> &circuits,
     const int NUM_RESULTS = result.results.size();
     //following looks very similar but we have to separate them to avoid omp nested loops that causes performance degradation
     //(DO NOT use if statement in #pragma omp)
-    if (parallel_experiments_ == 1) {
+    if (parallel_experiments_ == 1 || sim_device_ == Device::ThrustCPU) {
       for (int j = 0; j < NUM_RESULTS; ++j) {
         set_parallelization_circuit(circuits[j], noise_model, methods[j]);
         run_circuit(circuits[j], noise_model,methods[j],
@@ -1439,7 +1476,7 @@ void Controller::run_circuit_without_sampled_noise(Circuit &circ,
   // Check if measure sampler and optimization are valid
   if (can_sample) {
     // Implement measure sampler
-    if (parallel_shots_ <= 1) {
+    if (parallel_shots_ <= 1 || sim_device_ == Device::GPU || sim_device_ == Device::ThrustCPU) {
       state.set_max_matrix_qubits(max_bits);
       RngEngine rng;
       rng.set_seed(circ.seed);
@@ -1460,7 +1497,7 @@ void Controller::run_circuit_without_sampled_noise(Circuit &circ,
         shot_state.set_parallelization(parallel_state_update_);
         shot_state.set_global_phase(circ.global_phase_angle);
 
-        state.set_max_matrix_qubits(max_bits);
+        shot_state.set_max_matrix_qubits(max_bits);
 
         RngEngine rng;
         rng.set_seed(circ.seed + i);
@@ -1736,7 +1773,12 @@ void Controller::measure_sampler(
     shots_or_index = shots;
   else
     shots_or_index = shot_index;
+
+  auto timer_start = myclock_t::now();
   auto all_samples = state.sample_measure(meas_qubits, shots_or_index, rng);
+  auto time_taken =
+      std::chrono::duration<double>(myclock_t::now() - timer_start).count();
+  result.metadata.add(time_taken, "sample_measure_time");
 
   // Make qubit map of position in vector of measured qubits
   std::unordered_map<uint_t, uint_t> qubit_map;