Skip to content

Commit

Permalink
Fix performance of Sabre rust<->Python boundary (#10652)
Browse files Browse the repository at this point in the history
* Fix performance of Sabre rust<->Python boundary

In #10366 the SabreLayout and SabreSwap passes were refactored to
support nested control flow blocks. As part of that refactor a new
struct SabreResult was created to store the nested results for each
block. This new class however resulted in PyO3 cloning the full swap map
on every access. Since we have at least 1 access per gate in the circuit
(and another one for each swap) this extra clone() adds a lot of extra
overhead for deep circuits. In extreme cases this regression could be
quite extreme. To address this the result format is changed to be a
tuple (as it was originally), while this is less ergonomic the advantage
it provides is that for nested objects it moves the rust object to the
pyo3 interface so we avoid a copy as Python owns the object on the
return. However, for control flow blocks we're still using the
SabreResult class as it simplifies the internal implementation (which is
why #10366 introduced the struct). The potential impact of this is
mitigated by switching to only a single clone per control flow block,
by only accessing the SabreResult object's attribute on each recursive
call. However, if it is an issue in the future we can work on flattening
the nested structure before returning it to python to avoid any clones.

Fixes #10650

* Simplify recursive call logic in _apply_sabre_result

This commit simplifies the logic in the recursive call logic in
_apply_sabre_result to always use a tuple so we avoid an isinstance
check.

Co-authored-by: Kevin Hartman <kevin@hart.mn>

---------

Co-authored-by: Kevin Hartman <kevin@hart.mn>
  • Loading branch information
mtreinish and kevinhartman committed Aug 17, 2023
1 parent 998ce07 commit e6c431e
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 12 deletions.
19 changes: 15 additions & 4 deletions crates/accelerate/src/sabre_layout.rs
Expand Up @@ -12,6 +12,7 @@
#![allow(clippy::too_many_arguments)]

use ndarray::prelude::*;
use numpy::IntoPyArray;
use numpy::PyReadonlyArray2;
use pyo3::prelude::*;
use pyo3::wrap_pyfunction;
Expand All @@ -24,10 +25,12 @@ use crate::getenv_use_multiple_threads;
use crate::nlayout::NLayout;
use crate::sabre_swap::neighbor_table::NeighborTable;
use crate::sabre_swap::sabre_dag::SabreDAG;
use crate::sabre_swap::{build_swap_map_inner, Heuristic, SabreResult};
use crate::sabre_swap::swap_map::SwapMap;
use crate::sabre_swap::{build_swap_map_inner, Heuristic, NodeBlockResults, SabreResult};

#[pyfunction]
pub fn sabre_layout_and_routing(
py: Python,
dag: &SabreDAG,
neighbor_table: &NeighborTable,
distance_matrix: PyReadonlyArray2<f64>,
Expand All @@ -36,7 +39,7 @@ pub fn sabre_layout_and_routing(
num_swap_trials: usize,
num_layout_trials: usize,
seed: Option<u64>,
) -> ([NLayout; 2], SabreResult) {
) -> ([NLayout; 2], (SwapMap, PyObject, NodeBlockResults)) {
let run_in_parallel = getenv_use_multiple_threads();
let outer_rng = match seed {
Some(seed) => Pcg64Mcg::seed_from_u64(seed),
Expand All @@ -47,7 +50,7 @@ pub fn sabre_layout_and_routing(
.take(num_layout_trials)
.collect();
let dist = distance_matrix.as_array();
if run_in_parallel && num_layout_trials > 1 {
let res = if run_in_parallel && num_layout_trials > 1 {
seed_vec
.into_par_iter()
.enumerate()
Expand Down Expand Up @@ -91,7 +94,15 @@ pub fn sabre_layout_and_routing(
})
.min_by_key(|(_, result)| result.map.map.values().map(|x| x.len()).sum::<usize>())
.unwrap()
}
};
(
res.0,
(
res.1.map,
res.1.node_order.into_pyarray(py).into(),
res.1.node_block_results,
),
)
}

fn layout_trial(
Expand Down
12 changes: 9 additions & 3 deletions crates/accelerate/src/sabre_swap/mod.rs
Expand Up @@ -208,12 +208,13 @@ fn cmap_from_neighor_table(neighbor_table: &NeighborTable) -> DiGraph<(), ()> {
/// Run sabre swap on a circuit
///
/// Returns:
/// (SwapMap, gate_order): A tuple where the first element is a mapping of
/// (SwapMap, gate_order, node_block_results): A tuple where the first element is a mapping of
/// DAGCircuit node ids to a list of virtual qubit swaps that should be
/// added before that operation. The second element is a numpy array of
/// node ids that represents the traversal order used by sabre.
#[pyfunction]
pub fn build_swap_map(
py: Python,
num_qubits: usize,
dag: &SabreDAG,
neighbor_table: &NeighborTable,
Expand All @@ -223,9 +224,9 @@ pub fn build_swap_map(
num_trials: usize,
seed: Option<u64>,
run_in_parallel: Option<bool>,
) -> SabreResult {
) -> (SwapMap, PyObject, NodeBlockResults) {
let dist = distance_matrix.as_array();
build_swap_map_inner(
let res = build_swap_map_inner(
num_qubits,
dag,
neighbor_table,
Expand All @@ -235,6 +236,11 @@ pub fn build_swap_map(
layout,
num_trials,
run_in_parallel,
);
(
res.map,
res.node_order.into_pyarray(py).into(),
res.node_block_results,
)
}

Expand Down
17 changes: 12 additions & 5 deletions qiskit/transpiler/passes/routing/sabre_swap.py
Expand Up @@ -341,11 +341,14 @@ def empty_dag(node, block):
return out

def apply_inner(out_dag, current_layout, qubit_indices_inner, result, id_to_node):
for node_id in result.node_order:
node_order = result[1]
swap_map = result[0]
node_block_results = result[2]
for node_id in node_order:
node = id_to_node[node_id]
if isinstance(node.op, ControlFlowOp):
# Handle control flow op and continue.
block_results = result.node_block_results[node_id]
block_results = node_block_results[node_id]
mapped_block_dags = []
idle_qubits = set(out_dag.qubits)
for block, block_result in zip(node.op.blocks, block_results):
Expand All @@ -360,7 +363,11 @@ def apply_inner(out_dag, current_layout, qubit_indices_inner, result, id_to_node
mapped_block_dag,
mapped_block_layout,
block_qubit_indices,
block_result.result,
(
block_result.result.map,
block_result.result.node_order,
block_result.result.node_block_results,
),
block_id_to_node,
)

Expand Down Expand Up @@ -396,9 +403,9 @@ def apply_inner(out_dag, current_layout, qubit_indices_inner, result, id_to_node
continue

# If we get here, the node isn't a control-flow gate.
if node_id in result.map:
if node_id in swap_map:
process_swaps(
result.map[node_id],
swap_map[node_id],
out_dag,
current_layout,
canonical_register,
Expand Down
@@ -0,0 +1,6 @@
---
fixes:
- |
Fixed a performance regression in the :class:`~.SabreLayout` and
:class:`~.SabreSwap` transpiler passes.
Fixed `#10650 <https://github.com/Qiskit/qiskit-terra/issues/10650>`__

0 comments on commit e6c431e

Please sign in to comment.