From e6c431e30682fe7112e2a7079e520383baa982ec Mon Sep 17 00:00:00 2001
From: Matthew Treinish
Date: Thu, 17 Aug 2023 11:23:45 -0400
Subject: [PATCH] Fix performance of Sabre rust<->Python boundary (#10652)

* Fix performance of Sabre rust<->Python boundary

In #10366 the SabreLayout and SabreSwap passes were refactored to support
nested control flow blocks. As part of that refactor a new struct,
SabreResult, was created to store the nested results for each block. This
new struct, however, resulted in PyO3 cloning the full swap map on every
attribute access. Since we have at least one access per gate in the circuit
(and another for each swap), this extra clone() adds a lot of overhead for
deep circuits; in extreme cases the regression could be quite severe.

To address this, the result format is changed back to a tuple (as it was
originally). While this is less ergonomic, it moves the nested Rust objects
across the PyO3 interface so that Python owns them on return and no copy is
made. However, for control flow blocks we still use the SabreResult class,
as it simplifies the internal implementation (which is why #10366
introduced the struct). The potential impact of this is mitigated by doing
only a single clone per control flow block, by accessing the SabreResult
object's attributes only once per recursive call. If this becomes an issue
in the future we can work on flattening the nested structure before
returning it to Python to avoid any clones.

Fixes #10650

* Simplify recursive call logic in _apply_sabre_result

This commit simplifies the recursive call logic in _apply_sabre_result to
always use a tuple so we avoid an isinstance check.

Co-authored-by: Kevin Hartman

---------

Co-authored-by: Kevin Hartman
---
 crates/accelerate/src/sabre_layout.rs          | 19 +++++++++++++++----
 crates/accelerate/src/sabre_swap/mod.rs        | 12 +++++++++---
 .../transpiler/passes/routing/sabre_swap.py    | 17 ++++++++++++-----
 ...ix-sabre-performance-1b4503b6ecde6d13.yaml  |  6 ++++++
 4 files changed, 42 insertions(+), 12 deletions(-)
 create mode 100644 releasenotes/notes/fix-sabre-performance-1b4503b6ecde6d13.yaml

diff --git a/crates/accelerate/src/sabre_layout.rs b/crates/accelerate/src/sabre_layout.rs
index 7cc1ab29979..e587ff74787 100644
--- a/crates/accelerate/src/sabre_layout.rs
+++ b/crates/accelerate/src/sabre_layout.rs
@@ -12,6 +12,7 @@
 #![allow(clippy::too_many_arguments)]
 
 use ndarray::prelude::*;
+use numpy::IntoPyArray;
 use numpy::PyReadonlyArray2;
 use pyo3::prelude::*;
 use pyo3::wrap_pyfunction;
@@ -24,10 +25,12 @@ use crate::getenv_use_multiple_threads;
 use crate::nlayout::NLayout;
 use crate::sabre_swap::neighbor_table::NeighborTable;
 use crate::sabre_swap::sabre_dag::SabreDAG;
-use crate::sabre_swap::{build_swap_map_inner, Heuristic, SabreResult};
+use crate::sabre_swap::swap_map::SwapMap;
+use crate::sabre_swap::{build_swap_map_inner, Heuristic, NodeBlockResults, SabreResult};
 
 #[pyfunction]
 pub fn sabre_layout_and_routing(
+    py: Python,
     dag: &SabreDAG,
     neighbor_table: &NeighborTable,
     distance_matrix: PyReadonlyArray2<f64>,
@@ -36,7 +39,7 @@ pub fn sabre_layout_and_routing(
     num_swap_trials: usize,
     num_layout_trials: usize,
     seed: Option<u64>,
-) -> ([NLayout; 2], SabreResult) {
+) -> ([NLayout; 2], (SwapMap, PyObject, NodeBlockResults)) {
     let run_in_parallel = getenv_use_multiple_threads();
     let outer_rng = match seed {
         Some(seed) => Pcg64Mcg::seed_from_u64(seed),
@@ -47,7 +50,7 @@ pub fn sabre_layout_and_routing(
         .take(num_layout_trials)
         .collect();
     let dist = distance_matrix.as_array();
-    if run_in_parallel && num_layout_trials > 1 {
+    let res = if run_in_parallel && num_layout_trials > 1 {
         seed_vec
             .into_par_iter()
             .enumerate()
@@ -91,7 +94,15 @@ pub fn sabre_layout_and_routing(
             })
             .min_by_key(|(_, result)| result.map.map.values().map(|x| x.len()).sum::<usize>())
             .unwrap()
-    }
+    };
+    (
+        res.0,
+        (
+            res.1.map,
+            res.1.node_order.into_pyarray(py).into(),
+            res.1.node_block_results,
+        ),
+    )
 }
 
 fn layout_trial(
diff --git a/crates/accelerate/src/sabre_swap/mod.rs b/crates/accelerate/src/sabre_swap/mod.rs
index 0d83f901bf1..e18fe9688d3 100644
--- a/crates/accelerate/src/sabre_swap/mod.rs
+++ b/crates/accelerate/src/sabre_swap/mod.rs
@@ -208,12 +208,13 @@ fn cmap_from_neighor_table(neighbor_table: &NeighborTable) -> DiGraph<(), ()> {
 /// Run sabre swap on a circuit
 ///
 /// Returns:
-///     (SwapMap, gate_order): A tuple where the first element is a mapping of
+///     (SwapMap, gate_order, node_block_results): A tuple where the first element is a mapping of
 ///     DAGCircuit node ids to a list of virtual qubit swaps that should be
 ///     added before that operation. The second element is a numpy array of
 ///     node ids that represents the traversal order used by sabre.
 #[pyfunction]
 pub fn build_swap_map(
+    py: Python,
     num_qubits: usize,
     dag: &SabreDAG,
     neighbor_table: &NeighborTable,
@@ -223,9 +224,9 @@ pub fn build_swap_map(
     num_trials: usize,
     seed: Option<u64>,
     run_in_parallel: Option<bool>,
-) -> SabreResult {
+) -> (SwapMap, PyObject, NodeBlockResults) {
     let dist = distance_matrix.as_array();
-    build_swap_map_inner(
+    let res = build_swap_map_inner(
         num_qubits,
         dag,
         neighbor_table,
@@ -235,6 +236,11 @@ pub fn build_swap_map(
         layout,
         num_trials,
         run_in_parallel,
+    );
+    (
+        res.map,
+        res.node_order.into_pyarray(py).into(),
+        res.node_block_results,
     )
 }
 
diff --git a/qiskit/transpiler/passes/routing/sabre_swap.py b/qiskit/transpiler/passes/routing/sabre_swap.py
index ea669aba0be..0c66cb9670d 100644
--- a/qiskit/transpiler/passes/routing/sabre_swap.py
+++ b/qiskit/transpiler/passes/routing/sabre_swap.py
@@ -341,11 +341,14 @@ def empty_dag(node, block):
         return out
 
     def apply_inner(out_dag, current_layout, qubit_indices_inner, result, id_to_node):
-        for node_id in result.node_order:
+        node_order = result[1]
+        swap_map = result[0]
+        node_block_results = result[2]
+        for node_id in node_order:
             node = id_to_node[node_id]
             if isinstance(node.op, ControlFlowOp):
                 # Handle control flow op and continue.
-                block_results = result.node_block_results[node_id]
+                block_results = node_block_results[node_id]
                 mapped_block_dags = []
                 idle_qubits = set(out_dag.qubits)
                 for block, block_result in zip(node.op.blocks, block_results):
@@ -360,7 +363,11 @@ def apply_inner(out_dag, current_layout, qubit_indices_inner, result, id_to_node
                         mapped_block_dag,
                         mapped_block_layout,
                         block_qubit_indices,
-                        block_result.result,
+                        (
+                            block_result.result.map,
+                            block_result.result.node_order,
+                            block_result.result.node_block_results,
+                        ),
                         block_id_to_node,
                     )
 
@@ -396,9 +403,9 @@ def apply_inner(out_dag, current_layout, qubit_indices_inner, result, id_to_node
                 continue
 
             # If we get here, the node isn't a control-flow gate.
-            if node_id in result.map:
+            if node_id in swap_map:
                 process_swaps(
-                    result.map[node_id],
+                    swap_map[node_id],
                     out_dag,
                     current_layout,
                     canonical_register,
diff --git a/releasenotes/notes/fix-sabre-performance-1b4503b6ecde6d13.yaml b/releasenotes/notes/fix-sabre-performance-1b4503b6ecde6d13.yaml
new file mode 100644
index 00000000000..9d36940582c
--- /dev/null
+++ b/releasenotes/notes/fix-sabre-performance-1b4503b6ecde6d13.yaml
@@ -0,0 +1,6 @@
+---
+fixes:
+  - |
+    Fixed a performance regression in the :class:`~.SabreLayout` and
+    :class:`~.SabreSwap` transpiler passes.
+    Fixed `#10650 `__
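
For readers unfamiliar with the PyO3 behaviour the commit message describes, the sketch below contrasts the two return styles. It is a minimal, hypothetical example, not part of the patch: the names DemoResult, demo_tuple, and demo are illustrative only, and it assumes the pyo3 0.19-era API that Qiskit used at the time. Exposing a field with #[pyo3(get)] makes the generated getter clone the map and convert it to a fresh Python dict on every attribute access from Python, while returning a plain tuple converts the value once at the boundary and hands ownership to Python.

// Hypothetical sketch of the clone-per-access vs. convert-once patterns.
use std::collections::HashMap;

use pyo3::prelude::*;
use pyo3::wrap_pyfunction;

/// Exposing the map through `#[pyo3(get)]` means the generated getter
/// clones the whole HashMap and builds a new Python dict every time
/// Python reads the `swaps` attribute of a returned instance.
#[pyclass]
struct DemoResult {
    #[pyo3(get)]
    swaps: HashMap<usize, Vec<(usize, usize)>>,
}

/// Returning a tuple instead converts the map to a Python dict exactly
/// once, when the value crosses the boundary; Python then owns it and
/// later lookups are plain dict accesses with no further cloning.
#[pyfunction]
fn demo_tuple() -> (HashMap<usize, Vec<(usize, usize)>>, Vec<usize>) {
    let swaps = HashMap::from([(2, vec![(0, 1)])]);
    let node_order = vec![0, 1, 2];
    (swaps, node_order)
}

#[pymodule]
fn demo(_py: Python, m: &PyModule) -> PyResult<()> {
    m.add_class::<DemoResult>()?;
    m.add_function(wrap_pyfunction!(demo_tuple, m)?)?;
    Ok(())
}

This mirrors the trade-off the patch makes: the tuple form is used on the hot path where Python reads the result once per gate, while the SabreResult-style class is kept only for nested control-flow blocks, where its attributes are read a single time per recursive call.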