# Quantum Circuits Dataset Generation

This notebook creates graph-structured datasets of quantum circuits for evaluating the learnability of full output distributions using graph neural networks. Datasets are generated for both Class A (variational ansatz) and Class B (QAOA-like) circuit families across multiple qubit sizes (2–5), under both noiseless and hardware-calibrated noisy conditions. Each quantum circuit is converted to a directed acyclic graph with engineered node and global attributes, including gate type, parameter encoding, qubit-specific noise calibrations, and Laplacian eigenvector positional encodings. All simulations use Qiskit Aer and, for noisy cases, real hardware calibration data is saved alongside the dataset to enable reproducibility. Data is organized for direct compatibility with PyTorch Geometric, supporting training and evaluation of GNN models for quantum output distribution learning.

In [1]:
# Suppress warnings
# NOTE: with no category or module filter, this hides every warning raised in the
# kernel from this point on, including warnings emitted by the libraries imported
# in the following cells.
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Library Imports
import os
import numpy as np
import random
import torch
from torch_geometric.data import Data
from qiskit import QuantumCircuit, transpile
from qiskit.converters import circuit_to_dag
from qiskit_aer import AerSimulator
from qiskit_ibm_runtime import QiskitRuntimeService
from qiskit_aer.noise import NoiseModel
from scipy.sparse.linalg import eigsh
import datetime
import time
import json

## Global Seeding

In [3]:
def set_global_seeds(seed: int):
    """
    Seed the Python `random`, NumPy (legacy global) and PyTorch generators.

    Args:
        seed: Integer handed unchanged to each of the three seeding functions.

    Prints a one-line confirmation once all three generators are seeded.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    print(f"Global seeds set to {seed} (random, numpy, torch)")

# Set default seed
set_global_seeds(42)

Global seeds set to 42 (random, numpy, torch)


## Directory Utilities

In [4]:
def ensure_dataset_dir(n_qubits, noise_type, circuit_class=None, root='../datasets'):
    """
    Create (if missing) and return the output folder for one dataset slice.

    The layout is {root}/{n_qubits}-qubit/{noise_type}[/{circuit_class}].
    When circuit_class is falsy the path stops at the noise_type level, which
    is where the calibration snapshot is written.
    """
    target = os.path.join(root, f"{n_qubits}-qubit", noise_type)
    if circuit_class:
        target = os.path.join(target, circuit_class)
    os.makedirs(target, exist_ok=True)
    return target


def save_dataset(data_list, n_qubits, noise_type, circuit_class, root='../datasets'):
    """
    Write data_list (a list of PyG Data objects) to disk with torch.save.

    The file is named dataset_{n_qubits}q_{noise_type}_{circuit_class}.pt and
    lives in the folder returned by ensure_dataset_dir for the same arguments.
    """
    out_dir = ensure_dataset_dir(n_qubits, noise_type, circuit_class, root=root)
    save_path = os.path.join(out_dir, f"dataset_{n_qubits}q_{noise_type}_{circuit_class}.pt")
    torch.save(data_list, save_path)
    print(f"Saved dataset ({len(data_list)} samples) to {save_path}")

## Gate Encoding and Constants

In [5]:
# Gate vocabulary; a gate's position in this list is its one-hot slot.
GATE_TYPES = ['rx', 'ry', 'rz', 'h', 'x', 'y', 'z', 'cx']
GATE_TYPE_IDX = dict(zip(GATE_TYPES, range(len(GATE_TYPES))))

def one_hot_gate(name):
    """
    Encode a gate name as a one-hot list of floats over GATE_TYPES.

    Names that are not in GATE_TYPES map to an all-zero vector.
    """
    hot = GATE_TYPE_IDX.get(name, -1)
    return [1.0 if slot == hot else 0.0 for slot in range(len(GATE_TYPES))]

## Quantum Circuit Generators

In [6]:
def generate_variational_ansatz(n_qubits):
    """
    Build a random Class A (variational ansatz) circuit.

    - Number of layers: max(1, n_qubits // 2).
    - Each layer applies an RY with a fresh angle drawn uniformly from
      [0, 2*pi] to every qubit, visiting the qubits in a re-shuffled order.
    - Each layer then entangles sparsely with CX(i, (i + 1) % n_qubits) for
      every even i, so the last pair wraps to qubit 0 when n_qubits is odd.
    """
    circuit = QuantumCircuit(n_qubits)
    visit_order = list(range(n_qubits))
    n_layers = max(1, n_qubits // 2)

    for _layer in range(n_layers):
        # Rotation sub-layer: the same list is shuffled in place every layer,
        # so the ordering carries over and is permuted again.
        random.shuffle(visit_order)
        for target in visit_order:
            circuit.ry(random.uniform(0, 2 * np.pi), target)

        # Entangling sub-layer over alternating neighbouring pairs.
        for control in range(0, n_qubits, 2):
            circuit.cx(control, (control + 1) % n_qubits)

    return circuit


def generate_qaoa_like(n_qubits):
    """
    Build a random Class B (QAOA-like) circuit.

    - Starts with a Hadamard on every qubit.
    - Repeats max(1, n_qubits // 2) rounds. Each round draws gamma, applies
      CX - RZ(2*gamma) - CX on every alternating pair (i, (i + 1) % n_qubits)
      for even i, then draws beta and applies RX(2*beta) to every qubit.
    - gamma and beta are drawn uniformly from [0, 2*pi].
    """
    circuit = QuantumCircuit(n_qubits)
    circuit.h(range(n_qubits))

    rounds = max(1, n_qubits // 2)
    pairs = [(ctrl, (ctrl + 1) % n_qubits) for ctrl in range(0, n_qubits, 2)]

    for _round in range(rounds):
        # Cost-like block: one shared gamma per round.
        gamma = random.uniform(0, 2 * np.pi)
        for ctrl, tgt in pairs:
            circuit.cx(ctrl, tgt)
            circuit.rz(2 * gamma, tgt)
            circuit.cx(ctrl, tgt)

        # Mixer-like block: one shared beta per round.
        beta = random.uniform(0, 2 * np.pi)
        for target in range(n_qubits):
            circuit.rx(2 * beta, target)

    return circuit

## Noise Model and Calibration Retrieval

In [7]:
def _iter_gate_error_slots(n_qubits):
    """Yield (key, gate_name, qubits) for every gate/qubit slot stored in per_gate_errors."""
    for g in GATE_TYPES:
        if g == "cx":
            # Every ordered pair of distinct qubits gets its own 'cx_{q0}_{q1}' key.
            for q0 in range(n_qubits):
                for q1 in range(n_qubits):
                    if q0 != q1:
                        yield f"cx_{q0}_{q1}", g, [q0, q1]
        else:
            for q in range(n_qubits):
                yield f"{g}_{q}", g, [q]


def _zero_calibration(n_qubits):
    """Return all-zero (per_qubit_calib, per_gate_errors) tables for n_qubits qubits."""
    per_qubit_calib = [[0.0, 0.0, 0.0] for _ in range(n_qubits)]
    per_gate_errors = {key: 0.0 for key, _, _ in _iter_gate_error_slots(n_qubits)}
    return per_qubit_calib, per_gate_errors


def get_noise_model_and_calib(n_qubits, backend_name="ibm_sherbrooke", noisy=True):
    """
    Get noise model, per-qubit calibration (T1, T2, readout), and gate errors.

    Args:
        n_qubits: Number of qubits; calibration is read for qubit indices
            0 .. n_qubits - 1 of the backend.
        backend_name: Name passed to QiskitRuntimeService.backend().
        noisy: When False, no backend is contacted and all-zero tables are
            returned together with a None noise model.

    Returns:
        - noise_model: NoiseModel built from the backend, or None when
          noisy=False or when loading the backend failed.
        - per_qubit_calib: List[[T1, T2, readout]] with T1/T2 divided by 1e5.
        - per_gate_errors: Dict[str, float] (e.g. 'cx_0_1', 'rx_0').

    Any exception raised while contacting the backend or reading its
    properties is caught, reported with print(), and answered with the same
    all-zero tables as the noiseless case (best-effort fallback).
    """
    if not noisy:
        per_qubit_calib, per_gate_errors = _zero_calibration(n_qubits)
        return None, per_qubit_calib, per_gate_errors

    try:
        token = os.getenv("IBM-QUANTUM-THESIS-WORK")
        if not token:
            raise RuntimeError("IBM-QUANTUM-THESIS-WORK token not set.")

        # NOTE: the saved account is overwritten on every call to this function.
        QiskitRuntimeService.save_account(token=token, instance="THESIS-WORK", overwrite=True)
        service = QiskitRuntimeService(channel="ibm_quantum_platform")
        backend = service.backend(backend_name)
        noise_model = NoiseModel.from_backend(backend)
        properties = backend.properties()

        per_qubit_calib = []
        for q in range(n_qubits):
            # Missing (falsy) calibration values are treated as 0.0.
            t1 = properties.t1(q) or 0.0
            t2 = properties.t2(q) or 0.0
            readout_err = properties.readout_error(q) or 0.0
            # NOTE(review): the 1e5 divisor rescales T1/T2; their units are not
            # visible here, so confirm the intended feature range.
            per_qubit_calib.append([t1/1e5, t2/1e5, readout_err])

        per_gate_errors = {}
        for key, gate, qubits in _iter_gate_error_slots(n_qubits):
            try:
                err = properties.gate_error(gate, qubits)
            except Exception:
                # NOTE(review): a gate/qubit combination the backend does not
                # report is recorded as error 0.0, which is indistinguishable
                # from a perfect gate; confirm the backend exposes these names.
                err = 0.0
            per_gate_errors[key] = err

        print(f"Loaded noise errors from {backend_name}")
        return noise_model, per_qubit_calib, per_gate_errors

    except Exception as e:
        print(f"Could not load real hardware noise: {e}")
        per_qubit_calib, per_gate_errors = _zero_calibration(n_qubits)
        return None, per_qubit_calib, per_gate_errors

In [8]:
# Calibration Snapshot Saving
def save_calibration_snapshot(per_qubit_calib, per_gate_errors, n_qubits, backend_name, root='../datasets'):
    """
    Dump the calibration tables to a timestamped JSON file.

    The file goes to {root}/{n_qubits}-qubit/noisy/ and is named
    calibration_{n_qubits}q_noisy_{backend_name}_{timestamp}.json, where the
    timestamp is the local time of the call (YYYYMMDD_HHMMSS). The JSON holds
    the backend name, that timestamp, n_qubits, the per-qubit calibration
    list and the per-gate error dict.
    """
    out_dir = ensure_dataset_dir(n_qubits, "noisy", root=root)
    stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    path = os.path.join(out_dir, f"calibration_{n_qubits}q_noisy_{backend_name}_{stamp}.json")

    snapshot = {}
    snapshot["backend"] = backend_name
    snapshot["timestamp"] = stamp
    snapshot["n_qubits"] = n_qubits
    snapshot["per_qubit_calib"] = per_qubit_calib
    snapshot["per_gate_errors"] = per_gate_errors

    with open(path, 'w') as handle:
        json.dump(snapshot, handle, indent=2)
    print(f"Saved calibration snapshot to {path}")

## Laplacian Eigenvector Encoding

In [9]:
def laplacian_eigenvectors_from_edge_index(edge_index, num_nodes, k=6):
    """
    Compute k non-trivial Laplacian eigenvectors for a graph.

    Edges are treated as undirected: each (i, j) entry also sets (j, i), so
    the combinatorial Laplacian L = D - A is symmetric, as a symmetric
    eigensolver requires. Self-loops are ignored. The decomposition is done
    densely with np.linalg.eigh, which returns eigenvalues in ascending order;
    the first eigenvector (smallest eigenvalue) is dropped as the trivial one.

    Args:
        edge_index: Integer array of shape [2, num_edges]; row 0 holds source
            node indices and row 1 destination node indices.
        num_nodes: Number of nodes in the graph.
        k: Number of eigenvector columns to return.

    Returns:
        float32 array of shape [num_nodes, k]. When the graph has fewer than
        k + 1 nodes the missing trailing columns are zero. If the
        decomposition fails, the whole array is zero.
    """
    k = max(int(k), 0)
    if num_nodes <= 0 or k == 0:
        return np.zeros((max(int(num_nodes), 0), k), dtype=np.float32)

    # Symmetric adjacency matrix
    A = np.zeros((num_nodes, num_nodes))
    edge_index = np.asarray(edge_index)
    if edge_index.ndim == 2 and edge_index.shape[1] > 0:
        src = edge_index[0].astype(np.intp)
        dst = edge_index[1].astype(np.intp)
        keep = src != dst  # drop self-loops
        A[src[keep], dst[keep]] = 1.0
        A[dst[keep], src[keep]] = 1.0

    L = np.diag(A.sum(axis=1)) - A

    lap_vecs = np.zeros((num_nodes, k))
    try:
        _, vecs = np.linalg.eigh(L)
    except np.linalg.LinAlgError as e:
        print(f"Laplacian eigendecomposition failed: {e}")
        return lap_vecs.astype(np.float32)

    # Skip column 0 (trivial eigenvector) and keep as many as are available.
    usable = min(k, num_nodes - 1)
    lap_vecs[:, :usable] = vecs[:, 1:usable + 1]
    return lap_vecs.astype(np.float32)

## Circuit to Graph Conversion (DAG)

In [10]:
def circuit_to_graph(qc, per_qubit_calib, per_gate_errors, n_qubits, class_label, k_lap=6):
    """
    Convert a circuit into a PyG Data graph built from its operation DAG.

    Nodes are the circuit's operations in topological order; edges are the
    DAG edges between two operations (edges touching input/output nodes are
    skipped).

    Node features (in order): [sin, cos] of the first gate parameter (zeros
    when the gate has none), one-hot gate type, one-hot of the first qubit
    over MAX_QUBITS slots, that qubit's calibration triple, the gate error,
    normalised out-degree, normalised topological position, and k_lap
    Laplacian eigenvector entries.

    Edge features: [normalised index distance, 1.0 if either endpoint is a
    CX, gate error of the source operation].

    Global features `u`: scaled depth, scaled CX count, entanglement flag,
    scaled single-qubit gate count, edge density, mean calibration triple,
    and mean gate error.

    Args:
        qc: The circuit to convert.
        per_qubit_calib: List of [T1, T2, readout] per qubit.
        per_gate_errors: Dict keyed like 'cx_0_1' / 'rx_0'.
        n_qubits: Currently unused by this function.
        class_label: Stored on the result as an int in `.class_label`.
        k_lap: Number of Laplacian eigenvector columns appended per node.

    Returns:
        A Data object, or None when the circuit has no operations or the
        assembled feature rows do not have the expected width.
    """
    MAX_QUBITS = 5
    num_lap = k_lap
    # sin/cos (2) + gate one-hot + qubit one-hot + calibration (3)
    # + [gate_err, deg_norm, norm_layer] (3) + Laplacian columns.
    # Derived from its parts so that a non-default k_lap is accepted.
    expected_dim = 2 + len(GATE_TYPES) + MAX_QUBITS + 3 + 3 + num_lap

    def gate_error_for(op_node):
        """Look up the calibrated error of one operation, defaulting to 0.0."""
        name = op_node.name.lower()
        if name == "cx" and len(op_node.qargs) == 2:
            q0 = qc.find_bit(op_node.qargs[0]).index
            q1 = qc.find_bit(op_node.qargs[1]).index
            key = f"cx_{q0}_{q1}"
        else:
            q = qc.find_bit(op_node.qargs[0]).index if op_node.qargs else 0
            key = f"{name}_{q}"
        return per_gate_errors.get(key, 0.0)

    dag = circuit_to_dag(qc)
    nodes = list(dag.topological_op_nodes())
    if not nodes:
        return None

    node_indices = {node: i for i, node in enumerate(nodes)}
    num_nodes = len(nodes)
    ops = qc.count_ops()
    single_count = sum(ops.get(g, 0) for g in ['rx','ry','rz','h','x','y','z'])
    two_count = ops.get('cx', 0)

    edges, edge_attrs = [], []
    for src, dst, _ in dag.edges():
        # Only keep edges whose both endpoints are operations.
        if src not in node_indices or dst not in node_indices:
            continue
        i, j = node_indices[src], node_indices[dst]
        dist = abs(i - j) / max(num_nodes - 1, 1)
        is_cx = 1.0 if src.name.lower() == "cx" or dst.name.lower() == "cx" else 0.0
        edges.append([i, j])
        edge_attrs.append([dist, is_cx, gate_error_for(src)])

    edge_index = torch.tensor(edges, dtype=torch.long).T if edges else torch.empty((2,0), dtype=torch.long)
    edge_attr = torch.tensor(edge_attrs, dtype=torch.float) if edge_attrs else torch.empty((0,3), dtype=torch.float)

    # Only outgoing edges are counted.
    degs = [0] * num_nodes
    for i, j in edges:
        degs[i] += 1
    max_deg = max(degs) or 1

    feats = []
    for idx, node in enumerate(nodes):
        gate_name = node.name.lower()
        qubit = qc.find_bit(node.qargs[0]).index if node.qargs else 0
        angle = float(node.op.params[0]) if hasattr(node.op, 'params') and node.op.params else None
        sincos = [np.sin(angle), np.cos(angle)] if angle is not None else [0.0, 0.0]
        gate_type = one_hot_gate(gate_name)
        qubit_onehot = [1.0 if i == qubit else 0.0 for i in range(MAX_QUBITS)]
        calib = per_qubit_calib[qubit] if per_qubit_calib and qubit < len(per_qubit_calib) else [0.0, 0.0, 0.0]
        gate_err = gate_error_for(node)
        norm_layer = idx / max(num_nodes-1, 1)
        deg_norm = degs[idx] / max_deg
        feats.append(sincos + gate_type + qubit_onehot + calib + [gate_err, deg_norm, norm_layer])

    lap_pos = laplacian_eigenvectors_from_edge_index(edge_index.numpy(), num_nodes, k=num_lap)
    if lap_pos.shape[1] < num_lap:
        # Small graphs may yield fewer eigenvectors; zero-pad to a fixed width.
        lap_pos = np.pad(lap_pos, ((0, 0), (0, num_lap - lap_pos.shape[1])), mode='constant')

    x_rows = []
    for i, f in enumerate(feats):
        lap = lap_pos[i] if i < len(lap_pos) else [0.0]*num_lap
        x_rows.append(f + list(lap))

    if len(x_rows) == 0 or len(x_rows[0]) != expected_dim:
        return None

    x = torch.tensor(x_rows, dtype=torch.float)

    depth = qc.depth() / 200.0
    cnots = two_count / 100.0
    entangled = 1.0 if two_count > 0 else 0.0
    single_n = single_count / 50.0
    nN = num_nodes
    # NOTE(review): `edges` holds each directed DAG edge once, so halving the
    # count looks like a leftover from an undirected formulation; confirm
    # the intended density definition before changing it.
    unique_e = len(edges) / 2
    density = unique_e / (nN*(nN-1)/2) if nN > 1 else 0.0
    avg_calib = np.mean(np.array(per_qubit_calib), axis=0) if per_qubit_calib else [0.0, 0.0, 0.0]
    avg_gate_err = np.mean(list(per_gate_errors.values())) if per_gate_errors else 0.0
    u = torch.tensor([depth, cnots, entangled, single_n, density] + list(avg_calib) + [avg_gate_err], dtype=torch.float)

    pos = torch.arange(num_nodes, dtype=torch.long)
    data = Data(
        x=x, edge_index=edge_index, edge_attr=edge_attr, pos=pos, u=u
    )
    data.class_label = int(class_label)
    return data

## Dataset Generation (Per Class)

In [11]:
def generate_dataset(
    n_qubits,
    circuit_class,
    num_samples,
    noise_type,
    class_params=None,
    backend_name="ibm_sherbrooke",
    shots=1024,
    batch_size=100,
    root='../datasets',
    seed_simulator=None
):
    """
    Generate and save dataset for one class (classA/classB), qubit count, and noise setting.
    Stores to datasets/{n_qubit}-qubit/{noisy|noiseless}/{classA|classB}/
    Adds .class_label: 0 (A), 1 (B).

    Args:
        n_qubits: Number of qubits of every generated circuit.
        circuit_class: "classA" selects generate_variational_ansatz; any other
            value selects generate_qaoa_like.
        num_samples: Number of circuits to generate (circuits whose graph
            conversion returns None are skipped, so fewer may be saved).
        noise_type: "noisy" requests the backend noise model; anything else
            simulates without noise.
        class_params: Optional keyword arguments forwarded to the generator.
        backend_name: Backend used for the noise model and calibration.
        shots: Shots per simulated circuit.
        batch_size: Number of circuits per progress message.
        root: Dataset root directory.
        seed_simulator: Optional integer. When given, circuit number i (0-based,
            counted over all generated circuits) is simulated with
            seed_simulator + i so the sampled distributions are repeatable.
            When None (default) the simulator picks its own seed, as before.

    Returns:
        (data_list, calibration) where calibration is
        (per_qubit_calib, per_gate_errors) for "noisy" and None otherwise.
    """
    if circuit_class == "classA":
        build_circuit, class_label = generate_variational_ansatz, 0
    else:
        build_circuit, class_label = generate_qaoa_like, 1
    gen_kwargs = class_params or {}

    noisy = (noise_type == "noisy")
    noise_model, per_qubit_calib, per_gate_errors = get_noise_model_and_calib(
        n_qubits, backend_name=backend_name, noisy=noisy)
    if noisy and noise_model is None:
        # The calibration loader falls back to zeros instead of raising; make
        # sure that does not pass unnoticed in a dataset labelled "noisy".
        print(
            f"WARNING: noise model for {backend_name} is unavailable; "
            f"the '{noise_type}' dataset for {n_qubits}q/{circuit_class} "
            "will be simulated WITHOUT noise."
        )
    simulator = AerSimulator()

    run_options = {"shots": shots}
    if noise_model is not None:
        run_options["noise_model"] = noise_model

    dim = 2 ** n_qubits
    eps = 1e-6

    data_list = []
    for b in range((num_samples + batch_size - 1) // batch_size):
        bs = min(batch_size, num_samples - b*batch_size)
        before = len(data_list)
        for k in range(bs):
            qc = build_circuit(n_qubits, **gen_kwargs)

            # Build the graph first so a rejected circuit costs no simulation.
            # The graph uses the original (untranspiled) gate names.
            graph_data = circuit_to_graph(qc, per_qubit_calib, per_gate_errors, n_qubits, class_label, k_lap=6)
            if graph_data is None:
                continue

            # NOTE(review): the circuit is rewritten to u1/u2/u3/cx before
            # simulation; confirm that the backend-derived noise model attaches
            # errors to these gate names, otherwise gate noise may not apply.
            transp = transpile(qc, basis_gates=['u1', 'u2', 'u3', 'cx'], optimization_level=0)
            qc_meas = transp.copy()
            qc_meas.measure_all()

            if seed_simulator is not None:
                run_options["seed_simulator"] = seed_simulator + b*batch_size + k
            job = simulator.run([qc_meas], **run_options)
            result = job.result()
            counts = result.get_counts()
            if isinstance(counts, list): counts = counts[0]

            # Empirical distribution; the bitstring is reversed before parsing
            # so the leftmost character after reversal is the most significant bit.
            y_np = np.zeros(dim)
            total = sum(counts.values())
            for bits, c in counts.items():
                idx = int(bits.replace(' ', '')[::-1], 2)
                y_np[idx] = c / total
            # Mix with the uniform distribution so no entry is exactly zero.
            y_np = (1 - eps) * y_np + eps / dim

            graph_data.y = torch.tensor(y_np, dtype=torch.float)
            data_list.append(graph_data)
        generated = len(data_list) - before
        print(f"Batch {b+1} Generated {generated} samples for {n_qubits}q, {noise_type}, {circuit_class}")

    save_dataset(data_list, n_qubits, noise_type, circuit_class, root=root)
    calibration = (per_qubit_calib, per_gate_errors) if noisy else None
    return data_list, calibration

## Main Dataset Generation Loop

In [12]:
# Settings
# These names are notebook-level globals; the summary cell further down reads
# QUBIT_RANGE, NOISE_TYPES and CIRCUIT_CLASSES.
NUM_SAMPLES_PER_OUTCOME_PER_CLASS = 600  # Per class (classA, classB)
CIRCUIT_CLASSES = {"classA": {}, "classB": {}}  # class name -> extra kwargs for its generator
QUBIT_RANGE = [2, 3, 4, 5]
NOISE_TYPES = ["noiseless", "noisy"]
SHOTS_PER_CIRCUIT = 1024

start_time = time.time()

for n_qubits in QUBIT_RANGE:
    # The sample count grows with the size of the output distribution:
    # 600 circuits per possible outcome (2 ** n_qubits outcomes) and per class.
    num_outcomes = 2 ** n_qubits
    samples_per_class = NUM_SAMPLES_PER_OUTCOME_PER_CLASS * num_outcomes

    for noise_type in NOISE_TYPES:
        # For noisy save calibration snapshot
        if noise_type == "noisy":
            # Fetch calibration and save snapshot before data gen
            # NOTE(review): generate_dataset() calls get_noise_model_and_calib()
            # again for each class, so the snapshot written here comes from a
            # separate fetch than the calibration actually used for generation.
            # If the fetch fails, the zero-filled fallback tables are what gets saved.
            _, per_qubit_calib, per_gate_errors = get_noise_model_and_calib(n_qubits, noisy=True)
            save_calibration_snapshot(
                per_qubit_calib,
                per_gate_errors,
                n_qubits,
                backend_name="ibm_sherbrooke"
            )

        # One dataset file per (qubit count, noise type, circuit class).
        for circuit_class, class_params in CIRCUIT_CLASSES.items():
            print(f"\nGenerating {samples_per_class} samples for {n_qubits}q, {noise_type}, {circuit_class}")
            generate_dataset(
                n_qubits=n_qubits,
                circuit_class=circuit_class,
                num_samples=samples_per_class,
                noise_type=noise_type,
                class_params=class_params,
                shots=SHOTS_PER_CIRCUIT
            )

end_time = time.time()
print(f"\nTotal datasets generation time: {(end_time - start_time) / 3600:.2f} hours")


Generating 2400 samples for 2q, noiseless, classA
Batch 1 Generated 100 samples for 2q, noiseless, classA
Batch 2 Generated 100 samples for 2q, noiseless, classA
Batch 3 Generated 100 samples for 2q, noiseless, classA
Batch 4 Generated 100 samples for 2q, noiseless, classA
Batch 5 Generated 100 samples for 2q, noiseless, classA
Batch 6 Generated 100 samples for 2q, noiseless, classA
Batch 7 Generated 100 samples for 2q, noiseless, classA
Batch 8 Generated 100 samples for 2q, noiseless, classA
Batch 9 Generated 100 samples for 2q, noiseless, classA
Batch 10 Generated 100 samples for 2q, noiseless, classA
Batch 11 Generated 100 samples for 2q, noiseless, classA
Batch 12 Generated 100 samples for 2q, noiseless, classA
Batch 13 Generated 100 samples for 2q, noiseless, classA
Batch 14 Generated 100 samples for 2q, noiseless, classA
Batch 15 Generated 100 samples for 2q, noiseless, classA
Batch 16 Generated 100 samples for 2q, noiseless, classA
Batch 17 Generated 100 samples for 2q, noisele

## Dataset Summary/Verification

In [None]:
def _list_matching(folder, prefix="", suffix=""):
    """Return the sorted file names in folder matching prefix/suffix ([] if folder is missing)."""
    if not os.path.isdir(folder):
        return []
    return sorted(
        f for f in os.listdir(folder)
        if f.startswith(prefix) and f.endswith(suffix)
    )


def print_dataset_summary(root='../datasets'):
    """
    Print a summary of generated datasets and calibration files.

    For every qubit count in QUBIT_RANGE and noise type in NOISE_TYPES this
    prints the number of calibration snapshots (noisy only) and the number of
    .pt dataset files per circuit class, each followed by the path of the
    first matching file in sorted order.

    The function only reads the directory tree: folders that do not exist are
    reported as containing 0 files and are not created. Paths follow the
    {root}/{n_qubits}-qubit/{noise_type}[/{circuit_class}] layout used by
    ensure_dataset_dir.
    """
    for n_qubits in QUBIT_RANGE:
        print(f"\n{n_qubits}-QUBIT")
        for noise_type in NOISE_TYPES:
            noise_dir = os.path.join(root, f"{n_qubits}-qubit", noise_type)
            if noise_type == "noisy":
                cals = _list_matching(noise_dir, prefix="calibration_", suffix=".json")
                print(f"  {noise_type}: {len(cals)} calibration snapshots")
                if cals:
                    print(f" {os.path.join(noise_dir, cals[0])}")
            for circuit_class in CIRCUIT_CLASSES.keys():
                folder = os.path.join(noise_dir, circuit_class)
                files = _list_matching(folder, suffix='.pt')
                print(f"  {noise_type}/{circuit_class}: {len(files)} dataset files")
                if files:
                    print(f" {os.path.join(folder, files[0])}")

print_dataset_summary()


2-QUBIT
  noiseless/classA: 2 dataset files
 ../datasets\2-qubit\noiseless\classA\dataset_2q_noiseless_classA.pt
  noiseless/classB: 2 dataset files
 ../datasets\2-qubit\noiseless\classB\dataset_2q_noiseless_classB.pt
  noisy: 1 calibration snapshots
 ../datasets\2-qubit\noisy\calibration_2q_noisy_ibm_sherbrooke_20250714_212038.json
  noisy/classA: 2 dataset files
 ../datasets\2-qubit\noisy\classA\dataset_2q_noisy_classA.pt
  noisy/classB: 2 dataset files
 ../datasets\2-qubit\noisy\classB\dataset_2q_noisy_classB.pt

3-QUBIT
  noiseless/classA: 2 dataset files
 ../datasets\3-qubit\noiseless\classA\dataset_3q_noiseless_classA.pt
  noiseless/classB: 2 dataset files
 ../datasets\3-qubit\noiseless\classB\dataset_3q_noiseless_classB.pt
  noisy: 1 calibration snapshots
 ../datasets\3-qubit\noisy\calibration_3q_noisy_ibm_sherbrooke_20250714_225749.json
  noisy/classA: 2 dataset files
 ../datasets\3-qubit\noisy\classA\dataset_3q_noisy_classA.pt
  noisy/classB: 2 dataset files
 ../datasets\3-qu