<a href="https://colab.research.google.com/github/SoftLocked/QuGPT/blob/main/QuGPT_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install qiskit

Collecting qiskit
  Downloading qiskit-2.3.0-cp310-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (12 kB)
Collecting rustworkx>=0.15.0 (from qiskit)
  Downloading rustworkx-0.17.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting stevedore>=3.0.0 (from qiskit)
  Downloading stevedore-5.6.0-py3-none-any.whl.metadata (2.3 kB)
Downloading qiskit-2.3.0-cp310-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (8.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m62.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rustworkx-0.17.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading stevedore-5.6.0-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling coll

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import math
import random
from dataclasses import dataclass
from typing import List, Tuple, Dict, Set
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from qiskit import QuantumCircuit
from qiskit.quantum_info import Operator

# Token vocabulary (25 tokens)

In [3]:
@dataclass
class GateInfo:
  '''
  Describes one token of the circuit vocabulary.

  A token is either a special marker (<pad>, <start>, <end>) or a single
  gate bound to specific qubit(s), e.g. "h_q0" or "cx_q0_q1".
  '''
  name: str       # Unique token name (e.g. "h_q0"); this is the key used to look up the token ID
  qasm_name: str  # The gate's name in qasm 2.0, without the qubit suffix (e.g. "h")
  qubits: tuple   # Qubit indices the gate acts on (e.g. (0,) or (0, 1)); () for special tokens
  is_t_gate: bool # True for T and Tdg only; these are the tokens the loss penalizes
  description: str  # Human-readable explanation, printed when the vocabulary is listed

In [4]:
def build_vocabulary() -> Tuple[Dict[int, GateInfo], Dict[str, int], int, set]:
  '''
  Assemble the Clifford+T token vocabulary for 2-qubit circuits.

  Token IDs are assigned in insertion order:
    0-2    special tokens (<pad>, <start>, <end>)
    3-14   1-qubit Clifford gates, each on q0 then q1
    15-20  2-qubit Clifford gates, each as q0->q1 then q1->q0
    21-24  T / Tdg, each on q0 then q1

  Returns:
    token_to_gate:    dict[int, GateInfo], token ID -> gate description
    gate_to_token:    dict[str, int], token name (e.g. "h_q0") -> token ID
    vocab_size:       int, total number of tokens
    t_gate_token_ids: set of token IDs whose gate is T or Tdg
  '''

  # (name, description) tables; the qubit suffixes are added further down
  special_tokens = (
    ("<pad>", "Padding"),
    ("<start>", "Start of sequence"),
    ("<end>", "End of sequence"),
  )
  single_qubit_cliffords = (
    ("h",   "Hadamard — maps X↔Z, creates superposition"),
    ("s",   "S = √Z — phase gate, quarter-turn around Z"),
    ("sdg", "S† — inverse of S"),
    ("x",   "Pauli X — bit flip"),
    ("y",   "Pauli Y — bit + phase flip"),
    ("z",   "Pauli Z — phase flip"),
  )
  two_qubit_cliffords = (
    ("cx",   "CNOT — flips target if control is |1⟩"),
    ("cz",   "CZ — applies Z to target if control is |1⟩"),
    ("swap", "SWAP — exchanges the two qubits"),
  )
  t_family = (
    ("t", "T = √S"),
    ("tdg", "T† = inverse of T"),
  )

  def on_each_qubit(table, is_t_gate):
    '''Expand a table of 1-qubit gates into one GateInfo per (gate, qubit).'''
    return [
      GateInfo(
        name=f"{gate_name}_q{q}",
        qasm_name=gate_name,
        qubits=(q,),
        is_t_gate=is_t_gate,
        description=f"{desc} on q{q}",
      )
      for gate_name, desc in table
      for q in (0, 1)
    ]

  # The concatenation order below fixes the token IDs; do not reorder.
  vocabulary = [
    GateInfo(name, name, (), False, description)
    for name, description in special_tokens
  ]
  vocabulary += on_each_qubit(single_qubit_cliffords, False)
  vocabulary += [
    GateInfo(
      name=f"{gate_name}_q{q0}_q{q1}",
      qasm_name=gate_name,
      qubits=(q0, q1),
      is_t_gate=False,
      description=f"{desc}: q{q0}→q{q1}",
    )
    for gate_name, desc in two_qubit_cliffords
    for q0, q1 in ((0, 1), (1, 0))
  ]
  vocabulary += on_each_qubit(t_family, True)

  # Lookup tables in both directions
  token_to_gate = dict(enumerate(vocabulary))
  gate_to_token = {gate.name: token_id for token_id, gate in token_to_gate.items()}

  # IDs of the T/Tdg tokens, so their use can be penalized
  t_gate_token_ids = {
    token_id for token_id, gate in token_to_gate.items() if gate.is_t_gate
  }

  return token_to_gate, gate_to_token, len(vocabulary), t_gate_token_ids

# Define constants
# The vocabulary is built once here; the lookups below are module-level
# globals read by the model, the loss and the data generator.
TOKEN_TO_GATE, GATE_TO_TOKEN, VOCAB_SIZE, T_GATE_TOKENS = build_vocabulary()
# IDs of the three special tokens (they are added to the vocabulary first,
# so they take IDs 0, 1 and 2).
PAD_TOKEN   = GATE_TO_TOKEN['<pad>']
START_TOKEN = GATE_TO_TOKEN['<start>']
END_TOKEN   = GATE_TO_TOKEN['<end>']


In [5]:
# Dump the whole vocabulary, one token per line, as a visual sanity check
print(f"Vocabulary: {VOCAB_SIZE} tokens")
print(f"T-gate token IDs: {T_GATE_TOKENS}\n")
for token_id, gate in TOKEN_TO_GATE.items():
    # Flag the tokens that the loss treats as costly
    if gate.is_t_gate:
        t_marker = " ← T GATE (expensive!)"
    else:
        t_marker = ""
    print(f"  [{token_id:2d}] {gate.name:18s}  qubits={str(gate.qubits):10s}  {gate.description}{t_marker}")


Vocabulary: 25 tokens
T-gate token IDs: {24, 21, 22, 23}

  [ 0] <pad>               qubits=()          Padding
  [ 1] <start>             qubits=()          Start of sequence
  [ 2] <end>               qubits=()          End of sequence
  [ 3] h_q0                qubits=(0,)        Hadamard — maps X↔Z, creates superposition on q0
  [ 4] h_q1                qubits=(1,)        Hadamard — maps X↔Z, creates superposition on q1
  [ 5] s_q0                qubits=(0,)        S = √Z — phase gate, quarter-turn around Z on q0
  [ 6] s_q1                qubits=(1,)        S = √Z — phase gate, quarter-turn around Z on q1
  [ 7] sdg_q0              qubits=(0,)        S† — inverse of S on q0
  [ 8] sdg_q1              qubits=(1,)        S† — inverse of S on q1
  [ 9] x_q0                qubits=(0,)        Pauli X — bit flip on q0
  [10] x_q1                qubits=(1,)        Pauli X — bit flip on q1
  [11] y_q0                qubits=(0,)        Pauli Y — bit + phase flip on q0
  [12] y_q1          

# Input Representation

In [6]:
def unitary_to_tensor(U: np.ndarray) -> torch.Tensor:
  '''
  Flatten a complex matrix into a 1-D float32 tensor of (real, imag) pairs.

  Entries are visited in row-major order and each contributes two
  consecutive values, so a 4x4 unitary becomes 32 numbers laid out as
  [re(U00), im(U00), re(U01), im(U01), ...].
  '''
  # Append a trailing axis of length 2 holding (real, imag) for every entry
  re_im_pairs = np.stack((U.real, U.imag), axis=-1)
  return torch.from_numpy(re_im_pairs).to(torch.float32).reshape(-1)

In [7]:
# Layout check for unitary_to_tensor: entry k (1..16, row-major) is k + kj,
# so the flattened output should read 1, 1, 2, 2, ..., 16, 16.
# The matrix is just a readable fixture; it is not an actual unitary.
test_unitary = np.arange(1, 17).reshape(4, 4) * (1 + 1j)
test_tensor = unitary_to_tensor(test_unitary)
print(test_tensor)

tensor([ 1.,  1.,  2.,  2.,  3.,  3.,  4.,  4.,  5.,  5.,  6.,  6.,  7.,  7.,
         8.,  8.,  9.,  9., 10., 10., 11., 11., 12., 12., 13., 13., 14., 14.,
        15., 15., 16., 16.])


# Encoder (read the unitary)

In [8]:
# The encoder sees the unitary as a sequence of 4 tokens, one per matrix row
class UnitaryEncoder(nn.Module):
  '''
  Transformer encoder over the rows of a flattened 4x4 unitary.

  Shapes: (batch, 32) in, (batch, 4, d_model) out.

  The 32 input values are regrouped into 4 rows of 8 values each. Every row
  is lifted to d_model by a small MLP, tagged with a learned row-position
  embedding, run through a TransformerEncoder and finally layer-normalised.
  '''

  def __init__(self, d_model=128, nhead=4, num_layers=4,
               dim_feedforward=512, dropout=0.1):
    super().__init__()

    # Per-row MLP: 8 -> d_model -> d_model -> d_model
    row_mlp = [
      nn.Linear(8, d_model),
      nn.GELU(),
      nn.Linear(d_model, d_model),
      nn.GELU(),
      nn.Linear(d_model, d_model),
    ]
    self.row_proj = nn.Sequential(*row_mlp)

    # One learned vector per row index (0..3)
    self.pos_emb = nn.Embedding(4, d_model)

    self.transformer = nn.TransformerEncoder(
      nn.TransformerEncoderLayer(
        d_model=d_model,
        nhead=nhead,
        dim_feedforward=dim_feedforward,
        dropout=dropout,
        activation='gelu',
        batch_first=True,
      ),
      num_layers,
    )

    self.norm = nn.LayerNorm(d_model)

  def forward(self, unitary_flat: torch.Tensor) -> torch.Tensor:
    '''Encode a (batch, 32) tensor into (batch, 4, d_model) row features.'''
    batch = unitary_flat.size(0)
    row_tokens = unitary_flat.view(batch, 4, 8)
    row_index = torch.arange(4, device=row_tokens.device)

    hidden = self.row_proj(row_tokens) + self.pos_emb(row_index)
    hidden = self.transformer(hidden)
    return self.norm(hidden)

# Decoder (generate circuit tokens)

In [9]:
class GateDecoder(nn.Module):
    '''
    Autoregressive decoder: generates one gate token at a time.

    Token embeddings (scaled by sqrt(d_model)) plus learned positional
    embeddings are run through a causally-masked TransformerDecoder that
    cross-attends to the encoder output; a linear head then scores every
    vocabulary entry at each position.

    Args:
        vocab_size:      number of tokens (rows of the token embedding and
                         width of the output head).
        d_model:         embedding / hidden width.
        nhead:           attention heads per decoder layer.
        num_layers:      number of decoder layers.
        dim_feedforward: hidden width of each layer's feed-forward block.
        max_seq_len:     number of learned positions, i.e. the longest target
                         sequence that `forward` accepts.
        dropout:         dropout probability inside the decoder layers.
    '''

    def __init__(self, vocab_size, d_model=128, nhead=4, num_layers=4,
                 dim_feedforward=512, max_seq_len=200, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.tok_emb = nn.Embedding(vocab_size, d_model)
        self.pos_emb = nn.Embedding(max_seq_len, d_model)

        dec_layer = nn.TransformerDecoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward,
            dropout=dropout, activation='gelu', batch_first=True,
        )
        self.transformer = nn.TransformerDecoder(dec_layer, num_layers)
        self.norm = nn.LayerNorm(d_model)

        # Single output head: which gate comes next?
        self.gate_head = nn.Linear(d_model, vocab_size)

    def forward(self, tgt_tokens: torch.Tensor,
                encoder_output: torch.Tensor) -> torch.Tensor:
        '''
        Args:
            tgt_tokens:     (batch, seq_len)
            encoder_output: (batch, 4, d_model)
        Returns:
            gate_logits: (batch, seq_len, vocab_size)
        Raises:
            IndexError: if seq_len exceeds the number of learned positions
                        (max_seq_len).
        '''
        S = tgt_tokens.size(1)
        device = tgt_tokens.device

        # Fail early with a readable message; otherwise the positional
        # embedding lookup below raises an opaque "index out of range" error.
        max_len = self.pos_emb.num_embeddings
        if S > max_len:
            raise IndexError(
                f"tgt_tokens has sequence length {S}, but the decoder only has "
                f"positional embeddings for max_seq_len={max_len}"
            )

        x = self.tok_emb(tgt_tokens) * math.sqrt(self.d_model)
        x = x + self.pos_emb(torch.arange(S, device=device))

        # Causal mask: position i may only attend to positions <= i
        mask = nn.Transformer.generate_square_subsequent_mask(S, device=device)
        x = self.norm(self.transformer(tgt=x, memory=encoder_output, tgt_mask=mask))
        return self.gate_head(x)

# Put together the full model

In [10]:
class CliffordTSynthesizer(nn.Module):
  '''
  Full encoder-decoder model: unitary matrix in, Clifford+T gate logits out.

  The encoder turns the flattened unitary into per-row features; the decoder
  attends to them while predicting the gate sequence token by token.
  The width, head count, feed-forward size and dropout are shared by both
  halves; the layer counts are set independently.
  '''

  def __init__(self, d_model=128, nhead=4, num_encoder_layers=4,
               num_decoder_layers=5, dim_feedforward=512,
               max_seq_len=200, dropout=0.1):
    super().__init__()

    # Hyper-parameters common to encoder and decoder
    shared = dict(
      d_model=d_model,
      nhead=nhead,
      dim_feedforward=dim_feedforward,
      dropout=dropout,
    )

    self.encoder = UnitaryEncoder(num_layers=num_encoder_layers, **shared)
    self.decoder = GateDecoder(
      VOCAB_SIZE,
      num_layers=num_decoder_layers,
      max_seq_len=max_seq_len,
      **shared,
    )

  def forward(self, unitary_flat: torch.Tensor,
              tgt_tokens: torch.Tensor) -> torch.Tensor:
    '''
    Training forward pass (the target tokens are supplied by the caller).

    Returns gate_logits: (batch, seq_len, vocab_size)
    '''
    memory = self.encoder(unitary_flat)
    gate_logits = self.decoder(tgt_tokens, memory)
    return gate_logits


# Loss Function

The loss is a weighted sum of two terms, L_gate and L_T_count:

1. L = L_gate + λ_T · L_T_count

    - L_gate: standard cross-entropy for next-gate prediction.

    - L_T_count: a soft penalty on the model's tendency to predict T gates. At each decoding position, we look at the probability the model assigns to T/Tdg tokens. Summing these across the sequence gives the "expected T-count", which we penalize.

        - This is differentiable (unlike counting discrete tokens), so the model can learn to avoid T gates through gradient descent.

2. The λ_T weight controls the tradeoff:
    - λ_T = 0:    pure accuracy, no T minimization
    - λ_T = 0.1:  mild preference for fewer T gates
    - λ_T = 0.5+: aggressive T reduction (may hurt accuracy)

During training, we anneal λ_T from 0 → target value so the model
first learns correct circuits, then learns to compress T-count.

In [11]:
class CliffordTLoss(nn.Module):
    '''
    Next-gate cross-entropy plus a differentiable penalty on T-gate usage.

    total = gate_loss + (t_penalty_weight * t_penalty_scale) * t_penalty

    The penalty is the probability mass the model puts on T/Tdg tokens,
    summed over the non-padding positions of each sequence and averaged over
    the batch (an "expected T-count").
    '''

    def __init__(self, t_gate_token_ids: Set[int], t_penalty_weight: float = 0.1):
        super().__init__()
        # Padding positions do not contribute to the cross-entropy
        self.ce_loss = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
        # Kept as a sorted list so it can be used directly as a tensor index
        self.t_gate_token_ids = sorted(t_gate_token_ids)
        self.t_penalty_weight = t_penalty_weight

    def forward(self, gate_logits: torch.Tensor,
                target_tokens: torch.Tensor,
                t_penalty_scale: float = 1.0) -> Tuple[torch.Tensor, ...]:
        '''
        Args:
            gate_logits:     (batch, seq_len, vocab_size)
            target_tokens:   (batch, seq_len)
            t_penalty_scale: multiplier for annealing (0→1 during training)

        Returns:
            total_loss, gate_loss, t_penalty
        '''
        _, _, vocab = gate_logits.shape

        # Term 1: cross-entropy against the reference gate sequence
        flat_logits = gate_logits.reshape(-1, vocab)
        flat_targets = target_tokens.reshape(-1)
        gate_loss = self.ce_loss(flat_logits, flat_targets)

        # Term 2: probability mass on T/Tdg at each position -> (batch, seq_len)
        t_mass = F.softmax(gate_logits, dim=-1)[..., self.t_gate_token_ids].sum(dim=-1)

        # Zero out padding positions, sum per sequence, then average the batch
        is_real_token = (target_tokens != PAD_TOKEN).float()
        t_penalty = (t_mass * is_real_token).sum(dim=-1).mean()

        # The annealing scale multiplies the configured penalty weight
        weight = self.t_penalty_weight * t_penalty_scale
        return gate_loss + weight * t_penalty, gate_loss, t_penalty


# Training data generation

Generate random Clifford+T circuits, compute their unitaries, and use each (unitary, gate sequence) pair as a training example.

The sampling is biased so that the dataset covers a range of T-gate counts.

In [None]:
def apply_gate(qc: QuantumCircuit, gate_name: str, qubits: tuple):
    '''
    Append one parameter-free gate to `qc`, chosen by name.

    Args:
        qc:        circuit to modify in place.
        gate_name: one of h, s, sdg, x, y, z, t, tdg (1 qubit) or
                   cx, cz, swap (2 qubits).
        qubits:    qubit indices; only the first 1 or 2 entries are used.

    Raises:
        KeyError: if `gate_name` is not one of the supported gates.
    '''
    # Number of qubit operands each supported gate takes
    operand_count = {
        "h": 1, "s": 1, "sdg": 1, "x": 1, "y": 1, "z": 1,
        "t": 1, "tdg": 1,
        "cx": 2, "cz": 2, "swap": 2,
    }

    q = list(qubits)
    arity = operand_count[gate_name]

    # The circuit method has the same name as the gate
    gate_method = getattr(qc, gate_name)
    gate_method(*[q[i] for i in range(arity)])

In [None]:
# Token names grouped by gate family, used for weighted sampling.
# Names follow the vocabulary's convention: "<gate>_q<i>" for 1-qubit gates
# and "<gate>_q<i>_q<j>" for 2-qubit gates.
CLIFFORD_1Q_TOKENS = [
    f"{gate}_q{qubit}"
    for gate in ("h", "s", "sdg", "x", "y", "z")
    for qubit in (0, 1)
]
T_GATE_TOKENS_LIST = [
    f"{gate}_q{qubit}"
    for gate in ("t", "tdg")
    for qubit in (0, 1)
]
CLIFFORD_2Q_TOKENS = [
    f"{gate}_q{first}_q{second}"
    for gate in ("cx", "cz", "swap")
    for first, second in ((0, 1), (1, 0))
]
# Every gate token name: 1-qubit Cliffords, then T/Tdg, then 2-qubit Cliffords
ALL_GATE_TOKENS = [*CLIFFORD_1Q_TOKENS, *T_GATE_TOKENS_LIST, *CLIFFORD_2Q_TOKENS]

In [None]:
def generate_random_sample(max_gates: int = 20,
                           max_t_count: int = 6) -> Tuple[np.ndarray, List[int]]:
    '''
    Generate a random Clifford+T circuit with a capped T-count.

    The circuit has between 1 and `max_gates` gates (uniformly chosen). Each
    gate's family is picked with ONE weighted draw:

      - T budget not yet used up:  20% T/Tdg, 30% 2-qubit Clifford,
                                   50% 1-qubit Clifford
      - T budget used up:           0% T/Tdg, 30% 2-qubit Clifford,
                                   70% 1-qubit Clifford

    A gate is then chosen uniformly within the family. `max_t_count` is an
    upper bound only: a sample may contain fewer T/Tdg gates, including none.

    Args:
        max_gates:   largest number of gates in the circuit (must be >= 1).
        max_t_count: largest number of T/Tdg gates allowed.

    Returns:
        unitary:   4×4 complex numpy array
        token_ids: list of ints [START, gate1, ..., gateN, END]
    '''

    qc = QuantumCircuit(2)
    token_ids = [START_TOKEN]
    t_count = 0
    num_gates = random.randint(1, max_gates)

    # Gate families, in the same order as the weight tuples below
    families = (T_GATE_TOKENS_LIST, CLIFFORD_2Q_TOKENS, CLIFFORD_1Q_TOKENS)
    T_FAMILY = 0

    for _ in range(num_gates):
        # A single weighted draw keeps the shares at exactly 20/30/50.
        # (Chaining two independent random() calls would instead give
        # 20% / 24% / 56%, because the second test only runs 80% of the time.)
        if t_count < max_t_count:
            weights = (0.2, 0.3, 0.5)
        else:
            # T budget spent: its share goes to the 1-qubit Cliffords
            weights = (0.0, 0.3, 0.7)

        family = random.choices((0, 1, 2), weights=weights)[0]
        if family == T_FAMILY:
            t_count += 1
        gate_name = random.choice(families[family])

        token_id = GATE_TO_TOKEN[gate_name]
        gate_info = TOKEN_TO_GATE[token_id]
        apply_gate(qc, gate_info.qasm_name, gate_info.qubits)
        token_ids.append(token_id)

    token_ids.append(END_TOKEN)
    unitary = Operator(qc).data
    return unitary, token_ids

In [None]:
def generate_dataset_with_varied_t_counts(num_samples: int = 10000
                                           ) -> List[Tuple[np.ndarray, List[int]]]:
    '''
    Build a training set that mixes circuits generated under different
    T-gate budgets, so that T-free and low-T examples are well represented.

    Each sample is generated under one randomly chosen profile:
      - 30%: pure Clifford  (max_t_count = 0, max_gates = 15)
      - 30%: low T          (max_t_count = 2, max_gates = 15)
      - 25%: medium T       (max_t_count = 4, max_gates = 20)
      - 15%: higher T       (max_t_count = 8, max_gates = 25)

    The T figures are upper limits handed to generate_random_sample, not
    guaranteed counts: a sample from any profile may contain fewer T gates
    than its limit, including none.

    Returns:
        list of (unitary, token_ids) pairs, one per sample.
    '''

    # (upper bound on the random draw, max_t_count, max_gates)
    profiles = (
        (0.30, 0, 15),   # Pure Clifford
        (0.60, 2, 15),   # Low T
        (0.85, 4, 20),   # Medium T
    )
    higher_t_profile = (8, 25)  # used when the draw is >= 0.85

    print(f"Generating {num_samples} training samples...")
    data = []
    for done in range(1, num_samples + 1):
        # Progress line every 2000 samples
        if done % 2000 == 0:
            print(f"  {done}/{num_samples}")

        draw = random.random()
        max_t, max_g = next(
            ((t_limit, gate_limit)
             for upper, t_limit, gate_limit in profiles if draw < upper),
            higher_t_profile,
        )

        U, tokens = generate_random_sample(max_gates=max_g, max_t_count=max_t)
        data.append((U, tokens))

    return data