Skip to content

Commit

Permalink
C++ simulator performance improvements (#329)
Browse files Browse the repository at this point in the history
* C++ simulator performance: make the swap-gate run in native C++

It was defined as a BasicMathGate before which made it run as python
code through the emulate_math_wrapper. The new variant just uses its
matrix representation to run it in native code.

* C++ simulator performance: add dedicated C++ code for common math gates

The BasicMathGate uses a C++ python wrapper (emulate_math_wrapper) to allow generic
calculations which makes it very slow. This detects some math gates and
provides a native C++ implementation for it.

* C++ simulator performance: use larger memory alignment

* C++ simulator performance: recycle large StateVector memory buffers

This avoids costly std::vector copying/reallocations by using some
static std::vector to reuse the allocated buffer (just by std::swap'ing
a vector into a buffer for later use when it would be deallocated
otherwise).

* C++ simulator performance: improve compiler flags

* Add test coverage for constant math emulation

* Revert "Add test coverage for constant math emulation"

This reverts commit 3bb8a2c.

* Add test coverage for constant math emulation
  • Loading branch information
melven authored and Takishima committed Aug 20, 2019
1 parent 9db3383 commit f27286f
Show file tree
Hide file tree
Showing 6 changed files with 196 additions and 45 deletions.
154 changes: 114 additions & 40 deletions projectq/backends/_sim/_cppkernels/simulator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ class Simulator{
public:
using calc_type = double;
using complex_type = std::complex<calc_type>;
using StateVector = std::vector<complex_type, aligned_allocator<complex_type,64>>;
using StateVector = std::vector<complex_type, aligned_allocator<complex_type,512>>;
using Map = std::map<unsigned, unsigned>;
using RndEngine = std::mt19937;
using Term = std::vector<std::pair<unsigned, char>>;
Expand All @@ -55,11 +55,18 @@ class Simulator{
void allocate_qubit(unsigned id){
if (map_.count(id) == 0){
map_[id] = N_++;
auto newvec = StateVector(1UL << N_);
#pragma omp parallel for schedule(static)
StateVector newvec; // avoid large memory allocations
if( tmpBuff1_.capacity() >= (1UL << N_) )
std::swap(newvec, tmpBuff1_);
newvec.resize(1UL << N_);
#pragma omp parallel for schedule(static)
for (std::size_t i = 0; i < newvec.size(); ++i)
newvec[i] = (i < vec_.size())?vec_[i]:0.;
vec_ = std::move(newvec);
std::swap(vec_, newvec);
// recycle large memory
std::swap(tmpBuff1_, newvec);
if( tmpBuff1_.capacity() < tmpBuff2_.capacity() )
std::swap(tmpBuff1_, tmpBuff2_);
}
else
throw(std::runtime_error(
Expand Down Expand Up @@ -113,12 +120,18 @@ class Simulator{
}
}
else{
StateVector newvec((1UL << (N_-1)));
#pragma omp parallel for schedule(static)
StateVector newvec; // avoid costly memory reallocations
if( tmpBuff1_.capacity() >= (1UL << (N_-1)) )
std::swap(tmpBuff1_, newvec);
newvec.resize((1UL << (N_-1)));
#pragma omp parallel for schedule(static) if(0)
for (std::size_t i = 0; i < vec_.size(); i += 2*delta)
std::copy_n(&vec_[i + static_cast<std::size_t>(value)*delta],
delta, &newvec[i/2]);
vec_ = std::move(newvec);
std::swap(vec_, newvec);
std::swap(tmpBuff1_, newvec);
if( tmpBuff1_.capacity() < tmpBuff2_.capacity() )
std::swap(tmpBuff1_, tmpBuff2_);

for (auto& p : map_){
if (p.second > pos)
Expand Down Expand Up @@ -189,8 +202,8 @@ class Simulator{
}

template <class M>
void apply_controlled_gate(M const& m, std::vector<unsigned> ids,
std::vector<unsigned> ctrl){
void apply_controlled_gate(M const& m, const std::vector<unsigned>& ids,
const std::vector<unsigned>& ctrl){
auto fused_gates = fused_gates_;
fused_gates.insert(m, ids, ctrl);

Expand All @@ -209,46 +222,85 @@ class Simulator{
}

template <class F, class QuReg>
void emulate_math(F const& f, QuReg quregs, std::vector<unsigned> ctrl,
unsigned num_threads=1){
void emulate_math(F const& f, QuReg quregs, const std::vector<unsigned>& ctrl,
bool parallelize = false){
run();
auto ctrlmask = get_control_mask(ctrl);

for (unsigned i = 0; i < quregs.size(); ++i)
for (unsigned j = 0; j < quregs[i].size(); ++j)
quregs[i][j] = map_[quregs[i][j]];

StateVector newvec(vec_.size(), 0.);
std::vector<int> res(quregs.size());

#pragma omp parallel for schedule(static) firstprivate(res) num_threads(num_threads)
for (std::size_t i = 0; i < vec_.size(); ++i){
if ((ctrlmask&i) == ctrlmask){
for (unsigned qr_i = 0; qr_i < quregs.size(); ++qr_i){
res[qr_i] = 0;
for (unsigned qb_i = 0; qb_i < quregs[qr_i].size(); ++qb_i)
res[qr_i] |= ((i >> quregs[qr_i][qb_i])&1) << qb_i;
}
f(res);
auto new_i = i;
for (unsigned qr_i = 0; qr_i < quregs.size(); ++qr_i){
for (unsigned qb_i = 0; qb_i < quregs[qr_i].size(); ++qb_i){
if (!(((new_i >> quregs[qr_i][qb_i])&1) == ((res[qr_i] >> qb_i)&1)))
new_i ^= (1UL << quregs[qr_i][qb_i]);
}
}
newvec[new_i] += vec_[i];
}
else
newvec[i] += vec_[i];
StateVector newvec; // avoid costly memory reallocations
if( tmpBuff1_.capacity() >= vec_.size() )
std::swap(newvec, tmpBuff1_);
newvec.resize(vec_.size());
#pragma omp parallel for schedule(static)
for (std::size_t i = 0; i < vec_.size(); i++)
newvec[i] = 0;

//#pragma omp parallel reduction(+:newvec[:newvec.size()]) if(parallelize) // requires OpenMP 4.5
{
std::vector<int> res(quregs.size());
//#pragma omp for schedule(static)
for (std::size_t i = 0; i < vec_.size(); ++i){
if ((ctrlmask&i) == ctrlmask){
for (unsigned qr_i = 0; qr_i < quregs.size(); ++qr_i){
res[qr_i] = 0;
for (unsigned qb_i = 0; qb_i < quregs[qr_i].size(); ++qb_i)
res[qr_i] |= ((i >> quregs[qr_i][qb_i])&1) << qb_i;
}
f(res);
auto new_i = i;
for (unsigned qr_i = 0; qr_i < quregs.size(); ++qr_i){
for (unsigned qb_i = 0; qb_i < quregs[qr_i].size(); ++qb_i){
if (!(((new_i >> quregs[qr_i][qb_i])&1) == ((res[qr_i] >> qb_i)&1)))
new_i ^= (1UL << quregs[qr_i][qb_i]);
}
}
newvec[new_i] += vec_[i];
}
else
newvec[i] += vec_[i];
}
}
vec_ = std::move(newvec);
std::swap(vec_, newvec);
std::swap(tmpBuff1_, newvec);
}

// faster version without calling python
template<class QuReg>
inline void emulate_math_addConstant(int a, const QuReg& quregs, const std::vector<unsigned>& ctrl)
{
emulate_math([a](std::vector<int> &res){for(auto& x: res) x = x + a;}, quregs, ctrl, true);
}

// faster version without calling python
template<class QuReg>
inline void emulate_math_addConstantModN(int a, int N, const QuReg& quregs, const std::vector<unsigned>& ctrl)
{
emulate_math([a,N](std::vector<int> &res){for(auto& x: res) x = (x + a) % N;}, quregs, ctrl, true);
}

// faster version without calling python
template<class QuReg>
inline void emulate_math_multiplyByConstantModN(int a, int N, const QuReg& quregs, const std::vector<unsigned>& ctrl)
{
emulate_math([a,N](std::vector<int> &res){for(auto& x: res) x = (x * a) % N;}, quregs, ctrl, true);
}

calc_type get_expectation_value(TermsDict const& td, std::vector<unsigned> const& ids){
run();
calc_type expectation = 0.;
auto current_state = vec_;

StateVector current_state; // avoid costly memory reallocations
if( tmpBuff1_.capacity() >= vec_.size() )
std::swap(tmpBuff1_, current_state);
current_state.resize(vec_.size());
#pragma omp parallel for schedule(static)
for (std::size_t i = 0; i < vec_.size(); ++i)
current_state[i] = vec_[i];

for (auto const& term : td){
auto const& coefficient = term.second;
apply_term(term.first, ids, {});
Expand All @@ -260,17 +312,29 @@ class Simulator{
auto const a2 = std::real(vec_[i]);
auto const b2 = std::imag(vec_[i]);
delta += a1 * a2 - b1 * b2;
// reset vec_
vec_[i] = current_state[i];
}
expectation += coefficient * delta;
vec_ = current_state;
}
std::swap(current_state, tmpBuff1_);
return expectation;
}

void apply_qubit_operator(ComplexTermsDict const& td, std::vector<unsigned> const& ids){
run();
auto new_state = StateVector(vec_.size(), 0.);
auto current_state = vec_;
StateVector new_state, current_state; // avoid costly memory reallocations
if( tmpBuff1_.capacity() >= vec_.size() )
std::swap(tmpBuff1_, new_state);
if( tmpBuff2_.capacity() >= vec_.size() )
std::swap(tmpBuff2_, current_state);
new_state.resize(vec_.size());
current_state.resize(vec_.size());
#pragma omp parallel for schedule(static)
for (std::size_t i = 0; i < vec_.size(); ++i){
new_state[i] = 0;
current_state[i] = vec_[i];
}
for (auto const& term : td){
auto const& coefficient = term.second;
apply_term(term.first, ids, {});
Expand All @@ -280,7 +344,9 @@ class Simulator{
vec_[i] = current_state[i];
}
}
vec_ = std::move(new_state);
std::swap(vec_, new_state);
std::swap(tmpBuff1_, new_state);
std::swap(tmpBuff2_, current_state);
}

calc_type get_probability(std::vector<bool> const& bit_string,
Expand Down Expand Up @@ -452,6 +518,8 @@ class Simulator{
#pragma omp parallel
kernel(vec_, ids[4], ids[3], ids[2], ids[1], ids[0], m, ctrlmask);
break;
default:
throw std::invalid_argument("Gates with more than 5 qubits are not supported!");
}

fused_gates_ = Fusion();
Expand Down Expand Up @@ -500,6 +568,12 @@ class Simulator{
unsigned fusion_qubits_min_, fusion_qubits_max_;
RndEngine rnd_eng_;
std::function<double()> rng_;

// large array buffers to avoid costly reallocations
static StateVector tmpBuff1_, tmpBuff2_;
};

Simulator::StateVector Simulator::tmpBuff1_;
Simulator::StateVector Simulator::tmpBuff2_;

#endif
3 changes: 3 additions & 0 deletions projectq/backends/_sim/_cppsim.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@ PYBIND11_PLUGIN(_cppsim) {
.def("measure_qubits", &Simulator::measure_qubits_return)
.def("apply_controlled_gate", &Simulator::apply_controlled_gate<MatrixType>)
.def("emulate_math", &emulate_math_wrapper<QuRegs>)
.def("emulate_math_addConstant", &Simulator::emulate_math_addConstant<QuRegs>)
.def("emulate_math_addConstantModN", &Simulator::emulate_math_addConstantModN<QuRegs>)
.def("emulate_math_multiplyByConstantModN", &Simulator::emulate_math_multiplyByConstantModN<QuRegs>)
.def("get_expectation_value", &Simulator::get_expectation_value)
.def("apply_qubit_operator", &Simulator::apply_qubit_operator)
.def("emulate_time_evolution", &Simulator::emulate_time_evolution)
Expand Down
28 changes: 25 additions & 3 deletions projectq/backends/_sim/_simulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,12 @@
TimeEvolution)
from projectq.types import WeakQubitRef

FALLBACK_TO_PYSIM = False
try:
from ._cppsim import Simulator as SimulatorBackend
except ImportError:
from ._pysim import Simulator as SimulatorBackend
FALLBACK_TO_PYSIM = True


class Simulator(BasicEngine):
Expand Down Expand Up @@ -384,14 +386,34 @@ def _handle(self, cmd):
ID = cmd.qubits[0][0].id
self._simulator.deallocate_qubit(ID)
elif isinstance(cmd.gate, BasicMathGate):
# improve performance by using C++ code for some commomn gates
from projectq.libs.math import (AddConstant,
AddConstantModN,
MultiplyByConstantModN)
qubitids = []
for qr in cmd.qubits:
qubitids.append([])
for qb in qr:
qubitids[-1].append(qb.id)
math_fun = cmd.gate.get_math_function(cmd.qubits)
self._simulator.emulate_math(math_fun, qubitids,
[qb.id for qb in cmd.control_qubits])
if FALLBACK_TO_PYSIM:
math_fun = cmd.gate.get_math_function(cmd.qubits)
self._simulator.emulate_math(math_fun, qubitids,
[qb.id for qb in cmd.control_qubits])
else:
# individual code for different standard gates to make it faster!
if isinstance(cmd.gate, AddConstant):
self._simulator.emulate_math_addConstant(cmd.gate.a, qubitids,
[qb.id for qb in cmd.control_qubits])
elif isinstance(cmd.gate, AddConstantModN):
self._simulator.emulate_math_addConstantModN(cmd.gate.a, cmd.gate.N, qubitids,
[qb.id for qb in cmd.control_qubits])
elif isinstance(cmd.gate, MultiplyByConstantModN):
self._simulator.emulate_math_multiplyByConstantModN(cmd.gate.a, cmd.gate.N, qubitids,
[qb.id for qb in cmd.control_qubits])
else:
math_fun = cmd.gate.get_math_function(cmd.qubits)
self._simulator.emulate_math(math_fun, qubitids,
[qb.id for qb in cmd.control_qubits])
elif isinstance(cmd.gate, TimeEvolution):
op = [(list(term), coeff) for (term, coeff)
in cmd.gate.hamiltonian.terms.items()]
Expand Down
52 changes: 52 additions & 0 deletions projectq/backends/_sim/_simulator_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -683,3 +683,55 @@ def receive(command_list):
qubit1[0].id: qubit0[0].id}
assert (sim._convert_logical_to_mapped_qureg(qubit0 + qubit1) ==
qubit1 + qubit0)


def test_simulator_constant_math_emulation():
if "cpp_simulator" not in get_available_simulators():
pytest.skip("No C++ simulator")
return

results = [[[1, 1, 0, 0, 0]], [[0, 1, 0, 0, 0]], [[0, 1, 1, 1, 0]]]

import projectq.backends._sim._simulator as _sim
from projectq.backends._sim._pysim import Simulator as PySim
from projectq.backends._sim._cppsim import Simulator as CppSim
from projectq.libs.math import (AddConstant, AddConstantModN,
MultiplyByConstantModN)

def gate_filter(eng, cmd):
g = cmd.gate
if isinstance(g, BasicMathGate):
return False
return eng.next_engine.is_available(cmd)

def run_simulation(sim):
eng = MainEngine(sim, [])
quint = eng.allocate_qureg(5)
AddConstant(3) | quint
All(Measure) | quint
eng.flush()
results[0].append([int(qb) for qb in quint])

AddConstantModN(4, 5) | quint
All(Measure) | quint
eng.flush()
results[1].append([int(qb) for qb in quint])

MultiplyByConstantModN(15, 16) | quint
All(Measure) | quint
eng.flush()
results[2].append([int(qb) for qb in quint])

cppsim = Simulator(gate_fusion=False)
cppsim._simulator = CppSim(1)
run_simulation(cppsim)

_sim.FALLBACK_TO_PYSIM = True
pysim = Simulator()
pysim._simulator = PySim(1)
# run_simulation(pysim)

for result in results:
ref = result[0]
for res in result[1:]:
assert ref == res
3 changes: 1 addition & 2 deletions projectq/ops/_gates.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,10 +157,9 @@ def __str__(self):
SqrtX = SqrtXGate()


class SwapGate(SelfInverseGate, BasicMathGate):
class SwapGate(SelfInverseGate):
""" Swap gate class (swaps 2 qubits) """
def __init__(self):
BasicMathGate.__init__(self, lambda x, y: (y, x))
SelfInverseGate.__init__(self)
self.interchangeable_qubit_indices = [[0, 1]]

Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ def build_extensions(self):
opts.append('/arch:AVX')
else:
opts.append('-march=native')
opts.append('-ffast-math')

opts.append(openmp)
if ct == 'unix':
Expand Down

0 comments on commit f27286f

Please sign in to comment.