# Version 2 of the QTE model

## Install and Imports

In [2]:
!which python

/opt/anaconda3/envs/qte_py310/bin/python


In [3]:
import qiskit
qiskit.__version__

'1.3.2'

In [4]:
###############################################################################
# CELL 1: Install and Imports
###############################################################################

# If running in Google Colab or a fresh environment, uncomment to install needed packages:
# !pip install datasets transformers qiskit qiskit-ibm-runtime nltk --quiet

import logging
import warnings
import numpy as np
import nltk

# Hugging Face Datasets & Transformers
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel

# NLTK for tokenization (optional, since HF can also tokenize)
nltk.download('punkt', quiet=True)

# Dimensionality reduction
from sklearn.decomposition import PCA

# Qiskit
import qiskit
from qiskit import QuantumCircuit
#from qiskit.providers.fake_provider import FakeAthens --> dont really need this because we are using a real backend
from qiskit.primitives import Sampler
from qiskit_ibm_runtime import QiskitRuntimeService
from qiskit.circuit import Parameter
from qiskit.circuit.library import RZGate, CRZGate

warnings.filterwarnings("ignore", category=RuntimeWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

logging.basicConfig(level=logging.INFO)


## Phase 1 - Dataset Integration (CulturaX)

In [5]:
###############################################################################
# CELL 2: Phase 1 - Dataset Integration (CulturaX)
###############################################################################
# 
# This cell demonstrates how to load, split, and preprocess the CulturaX dataset.
# We'll show an example in English, but you can replace "en" with other languages
# or iterate over multiple subsets for multilingual analysis.
###############################################################################

def load_culturax_dataset(language="en", use_auth_token=True, sample_size=None):
    """
    Loads and returns the CulturaX dataset in the specified language.
    Optionally downsamples the dataset to 'sample_size' examples for testing.
    """
    logging.info("Loading CulturaX dataset from Hugging Face...")

    # If 'uonlp/CulturaX' requires authentication, ensure you have a valid token set.
    dataset = load_dataset("uonlp/CulturaX", language, use_auth_token=use_auth_token)
    
    # Example: split into train/test
    # Some versions of CulturaX might have pre-defined splits. Adjust as needed.
    if "train" not in dataset:
        dataset = dataset["data"].train_test_split(test_size=0.2)
    else:
        dataset = dataset
    
    train_ds = dataset["train"]
    test_ds = dataset["test"]

    # Optional: further downsample (if sample_size is provided)
    if sample_size is not None:
        logging.info(f"Downsampling dataset to {sample_size} samples for quick tests.")
        train_ds = train_ds.select(range(min(sample_size, len(train_ds))))
        test_ds = test_ds.select(range(min(sample_size // 4, len(test_ds))))  # smaller test

    return train_ds, test_ds


class TransformerEmbedder:
    """
    A simple wrapper that uses a pre-trained Transformer (e.g., DistilBERT) from Hugging Face
    to produce embeddings for each text.
    """
    def __init__(self, model_name="distilbert-base-uncased", device="cpu"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.model.eval()
        self.device = device
        self.model.to(device)

    def tokenize(self, text: str):
        # Optional: Use NLTK or rely purely on HF tokenizer. We'll do a minimal approach here.
        tokens = nltk.word_tokenize(text.lower())
        return tokens

    def get_embeddings(self, text: str):
        """
        Returns a 2D array of shape (seq_len, hidden_dim) representing subword embeddings.
        """
        inputs = self.tokenizer(text, return_tensors="pt", max_length=256, truncation=True)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with np.no_grad():
            outputs = self.model(**inputs)
        last_hidden = outputs.last_hidden_state.squeeze(0)  # (seq_len, hidden_dim)
        return last_hidden.cpu().numpy()

def preprocess_dataset(dataset, model_name="distilbert-base-uncased", device="cpu"):
    """
    Applies the TransformerEmbedder to each entry of the dataset's 'text' column.
    Stores the resulting embeddings in a new column 'embeddings'.
    """
    embedder = TransformerEmbedder(model_name=model_name, device=device)
    
    def _process_batch(batch):
        texts = batch["text"]
        all_embeddings = []
        for txt in texts:
            # Extract embeddings
            emb = embedder.get_embeddings(txt)
            all_embeddings.append(emb)
        batch["embeddings"] = all_embeddings
        return batch
    
    # We map the process onto the entire dataset (vectorized approach).
    dataset = dataset.map(_process_batch, batched=True, batch_size=8)
    return dataset


## Phase 2 - Model Enhancement & Training Pipeline 

In [6]:
###############################################################################
# CELL 3: Phase 2 - Model Enhancement & Training Pipeline
# 
# This includes:
#   1) Dimensionality reduction (PCA).
#   2) Quantum circuits (amplitude encoding, positional encoding).
#   3) Hierarchical grouping and quantum entangling.
###############################################################################

class DimensionalityReducer:
    """
    PCA-based approach to reduce embeddings to a dimension = 2^n_qubits.
    If the user wants target_power=3 => 2^3=8 dims, for example.
    """
    def __init__(self, target_power=3):
        self.target_dim = 2 ** target_power
        self.pca = PCA(n_components=self.target_dim)

    def fit_transform(self, embeddings):
        # embeddings can be shape: (seq_len, hidden_dim)
        seq_len = embeddings.shape[0]
        if seq_len < self.target_dim:
            self.pca.n_components = max(1, seq_len)
            logging.warning(f"Reducing PCA dimension to {self.pca.n_components} due to limited seq_len.")
        reduced = self.pca.fit_transform(embeddings)
        return reduced


class QuantumEncoder:
    """
    Encodes a single vector (dim = 2^n_qubits) into an amplitude-encoded state,
    adds sinusoidal positional encoding.
    """
    def __init__(self, n_qubits, backend=None):
        self.n_qubits = n_qubits
        if backend is None:
            # By default, use a mock backend. We'll override for real quantum hardware.
            backend = FakeAthens()
        self.backend = backend
        self.sampler = Sampler()

    def amplitude_encode(self, vector):
        # Trim or pad to 2^n_qubits
        dim = 2 ** self.n_qubits
        vec = vector
        if len(vec) < dim:
            tmp = np.zeros(dim)
            tmp[: len(vec)] = vec
            vec = tmp
        elif len(vec) > dim:
            vec = vec[:dim]
        # Normalize
        norm = np.linalg.norm(vec)
        if norm < 1e-8:
            vec = np.ones(dim) / np.sqrt(dim)
        else:
            vec /= norm
        return vec

    def prepare_circuit(self, vector):
        qc = QuantumCircuit(self.n_qubits)
        amp_vec = self.amplitude_encode(vector)
        qc.initialize(amp_vec, range(self.n_qubits))
        return qc

    def sinusoidal_position_encoding(self, qc, position):
        """
        Applies RZ rotations with angles derived from sin/cos of 'position'.
        """
        for q_idx in range(self.n_qubits):
            angle = np.sin(position*0.1 + q_idx) + np.cos(position*0.05 + q_idx)
            angle *= (np.pi / 4.0)
            qc.rz(angle, q_idx)

    def encode_with_position(self, vector, position):
        qc = self.prepare_circuit(vector)
        self.sinusoidal_position_encoding(qc, position)
        return qc


class ParametricEntangler:
    """
    Demonstrates parametric CRZ gates between adjacent qubits. 
    You could train/tune these parameters in a hybrid quantum-classical loop.
    """
    def __init__(self, n_qubits):
        self.n_qubits = n_qubits
        self.params = [Parameter(f"theta_{i}_{i+1}") for i in range(n_qubits-1)]

    def apply(self, qc: QuantumCircuit, param_values=None):
        if param_values is None:
            param_values = [np.random.uniform(0, np.pi/2) for _ in self.params]
        for i in range(self.n_qubits - 1):
            crz_gate = CRZGate(self.params[i])
            qc.append(crz_gate, [i, i+1])
            qc = qc.assign_parameters({self.params[i]: param_values[i]}, inplace=False)
        return qc


class HierarchicalQuantumEncoder:
    """
    Splits a sequence of vectors -> groups them -> amplitude encodes each group 
    -> entangles groups -> optionally merges them at a higher level.
    """
    def __init__(self, base_encoder: QuantumEncoder, group_size=2, multi_level=True):
        self.base_encoder = base_encoder
        self.n_qubits = base_encoder.n_qubits
        self.group_size = group_size
        self.multi_level = multi_level
        self.entangler = ParametricEntangler(n_qubits=self.n_qubits)
        self.sampler = base_encoder.sampler

    def _merge_vectors(self, vectors):
        # Simple average
        return np.mean(vectors, axis=0) if len(vectors) else None

    def build_group_circuit(self, group_vectors, group_positions):
        merged_vec = self._merge_vectors(group_vectors)
        avg_pos = np.mean(group_positions)
        return self.base_encoder.encode_with_position(merged_vec, avg_pos)

    def build_level_circuit(self, vectors, positions):
        """
        1) Group the vectors
        2) Build subcircuits
        3) Compose them into a single circuit
        4) Parametric entangle consecutive groups
        """
        group_circuits = []
        for start_idx in range(0, len(vectors), self.group_size):
            end_idx = start_idx + self.group_size
            sub_vecs = vectors[start_idx:end_idx]
            sub_pos = positions[start_idx:end_idx]
            group_circuits.append(self.build_group_circuit(sub_vecs, sub_pos))

        num_groups = len(group_circuits)
        total_qubits = num_groups * self.n_qubits
        qc_level = QuantumCircuit(total_qubits)

        # Place each group circuit
        for i, sub_qc in enumerate(group_circuits):
            offset = i * self.n_qubits
            qc_level.compose(sub_qc, qubits=range(offset, offset+self.n_qubits), inplace=True)

        # Entangle consecutive groups
        for g_idx in range(num_groups - 1):
            ctrl_q = (g_idx+1)*self.n_qubits - 1
            targ_q = (g_idx+1)*self.n_qubits
            # Param values
            angle_value = np.random.uniform(0, np.pi/2)
            # We'll just use the first param for demonstration
            param_gate = CRZGate(self.entangler.params[0])
            qc_level.append(param_gate, [ctrl_q, targ_q])
            qc_level = qc_level.assign_parameters({self.entangler.params[0]: angle_value}, inplace=False)

        return qc_level, group_circuits

    def encode_multilevel(self, vectors, positions):
        # First level
        level1_qc, subcircuits = self.build_level_circuit(vectors, positions)
        if not self.multi_level or len(subcircuits) <= 1:
            level1_qc.measure_all()
            job = self.sampler.run([level1_qc])
            res = job.result()
            quasi = res.quasi_dists[0]
            probs = np.array([quasi.get(i, 0.0) for i in range(2**level1_qc.num_qubits)])
            probs = probs / np.sum(probs) if np.sum(probs) > 0 else probs
            return probs

        # Second level example
        # In a real pipeline, you'd store the merged vectors at each group to re-encode them.
        # We'll do a placeholder approach.
        group_level_vectors = []
        group_level_positions = []
        for i, _ in enumerate(subcircuits):
            # Dummy spike vector
            v = np.zeros(2**self.n_qubits)
            v[0] = 1.0
            group_level_vectors.append(v)
            group_level_positions.append(i)

        level2_qc, _ = self.build_level_circuit(group_level_vectors, group_level_positions)
        level2_qc.measure_all()
        job = self.sampler.run([level2_qc])
        res = job.result()
        quasi = res.quasi_dists[0]
        probs = np.array([quasi.get(i, 0.0) for i in range(2**level2_qc.num_qubits)])
        probs = probs / np.sum(probs) if np.sum(probs) > 0 else probs
        return probs


## Phase 3 - Quantum Backend Configuration & Batch Processing

In [7]:
###############################################################################
# CELL 4: Phase 3 - Quantum Backend Configuration & Batch Processing
#
# 1) Integrate with IBM Quantum or local simulator.
# 2) Provide functions to process a dataset in batches, building quantum circuits.
# 3) Provide a skeleton for parallelizing circuit execution on HPC or real quantum hardware.
###############################################################################

def get_ibm_backend(min_qubits=5):
    """
    Example function to get a real IBM quantum backend with at least 'min_qubits' qubits available.
    Adjust QiskitRuntimeService credentials as needed.
    """
    try:
        service = QiskitRuntimeService(channel="ibm_quantum")
        backends = service.backends(filters=lambda b: not b.configuration().simulator
                                    and b.configuration().num_qubits >= min_qubits
                                    and b.status().operational==True)
        if backends:
            backend = sorted(backends, key=lambda b: b.status().pending_jobs)[0]
            logging.info(f"Using backend {backend.name}")
            return backend
        else:
            logging.warning("No suitable real backend found. Falling back to FakeAthens.")
            return FakeAthens()
    except Exception as e:
        logging.warning(f"Failed to load real IBM backend: {e}")
        return FakeAthens()


def process_batch_with_quantum_encoder(batch_embeddings, n_qubits=3, group_size=4, multi_level=True, backend=None):
    """
    Takes a list of (seq_len, embed_dim) arrays (one per example in the batch),
    and produces final quantum probability distributions for each example.
    """
    results = []
    base_encoder = QuantumEncoder(n_qubits=n_qubits, backend=backend)
    hierarchical_encoder = HierarchicalQuantumEncoder(
        base_encoder=base_encoder, group_size=group_size, multi_level=multi_level
    )

    for emb_2d in batch_embeddings:
        if emb_2d is None or len(emb_2d) == 0:
            results.append(None)
            continue
        
        # 1) Flatten or reduce each example's embeddings dimension to 2^n_qubits
        #    We'll do an additional PCA step for each example
        #    (Alternatively, you could do a single PCA over the entire dataset.)
        reducer = DimensionalityReducer(target_power=n_qubits)
        try:
            reduced = reducer.fit_transform(emb_2d)  # shape => (seq_len, 2^n_qubits)
        except ValueError as ve:
            logging.error(f"PCA Error: {ve}")
            results.append(None)
            continue

        # 2) positions = simple [0..seq_len-1]
        positions = list(range(len(reduced)))

        # 3) encode with hierarchical quantum circuit
        prob_dist = hierarchical_encoder.encode_multilevel(reduced, positions)
        results.append(prob_dist)
    return results


## Phase 4 - Putting It All Together (Demo Pipeline)

In [8]:


from datasets import load_dataset

ds = load_dataset("PleIAs/common_corpus")


Resolving data files:   0%|          | 0/10009 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/10000 [00:00<?, ?files/s]

subset_14_4.parquet:  41%|####1     | 136M/332M [00:00<?, ?B/s]

subset_14_5.parquet:   0%|          | 0.00/326M [00:00<?, ?B/s]

subset_14_6.parquet:   0%|          | 0.00/330M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [8]:
###############################################################################
# Updated Cell: Load Metadata and Small Subset for Testing
#
#   1) Load a small portion of Common Corpus dataset (metadata or specific rows)
#   2) Preprocess (HF embeddings for each text)
#   3) Dimensionality reduce & quantum-encode in batches
#   4) Compare classical & quantum results, etc.
###############################################################################

from datasets import load_dataset

def load_common_corpus_metadata(sample_size=100):
    """
    Load metadata or a small subset of the Common Corpus dataset.
    :param sample_size: Number of samples to load for demonstration/testing purposes.
    :return: Small subset of the dataset for testing.
    """
    logging.info("Loading Common Corpus dataset metadata...")

    # Load a small subset of the dataset
    dataset = load_dataset("PleIAs/common_corpus", split=f"train[:{sample_size}]")

    logging.info(f"Loaded {len(dataset)} samples from the dataset.")
    return dataset


def preprocess_dataset(dataset, model_name="distilbert-base-uncased", device="cpu"):
    """
    Preprocesses a dataset using Hugging Face Transformers.
    Extracts embeddings for each text sample.
    """
    logging.info(f"Preprocessing dataset with model: {model_name}...")
    embedder = TransformerEmbedder(model_name=model_name, device=device)
    processed_data = []
    
    for sample in dataset:
        text = sample["text"]  # Assuming the dataset contains a "text" field
        embeddings = embedder.get_token_embeddings(text)
        processed_data.append({"text": text, "embeddings": embeddings})
    
    return processed_data


def classical_baseline(embeddings):
    """
    Example classical baseline function. 
    Computes the average of embeddings as a simple baseline.
    """
    return np.mean(embeddings, axis=(0, 1)) if len(embeddings) else None


def main_pipeline_demo(sample_size=10, use_ibm=False):
    """
    Demonstrates the entire pipeline with minimal data.
    1) Load Common Corpus dataset metadata or small subset
    2) Preprocess to get Hugging Face embeddings
    3) Quantum encode in batches
    4) Compare with classical baseline
    """
    # Step 1: Load metadata or subset
    dataset = load_common_corpus_metadata(sample_size=sample_size)
    
    # Step 2: Preprocess the dataset
    logging.info("Preprocessing dataset with Transformers...")
    dataset = preprocess_dataset(dataset, model_name="distilbert-base-uncased", device="cpu")

    # Step 3: Choose the quantum backend
    backend = get_ibm_backend(min_qubits=5) if use_ibm else FakeAthens()

    # Step 4: Process batches with the quantum encoder
    embeddings = [sample["embeddings"] for sample in dataset]
    quantum_results = process_batch_with_quantum_encoder(
        batch_embeddings=embeddings,
        n_qubits=3,
        group_size=4,
        multi_level=True,
        backend=backend
    )

    # Step 5: Compute classical baseline
    classical_result = classical_baseline(embeddings)

    # Step 6: Log and display results
    logging.info("===== SAMPLE OUTPUT =====")
    logging.info(f"Classical baseline representation shape: {classical_result.shape if classical_result is not None else None}")
    for i, qr in enumerate(quantum_results[:3]):
        if qr is not None:
            logging.info(f"Sample {i} -> Quantum prob dist length: {len(qr)}; sum={np.sum(qr):.3f}")
        else:
            logging.info(f"Sample {i} -> No quantum result (likely empty)")

    return quantum_results, classical_result

# --- Let's run the pipeline on a small demonstration ---
if __name__ == "__main__":
    quantum_dists, cls_baseline = main_pipeline_demo(sample_size=100, use_ibm=False)


INFO:root:Loading Common Corpus dataset metadata...


Resolving data files:   0%|          | 0/10009 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/10000 [00:00<?, ?files/s]

subset_100_1.parquet:   0%|          | 0.00/328M [00:00<?, ?B/s]

subset_100_10.parquet:   0%|          | 0.00/330M [00:00<?, ?B/s]

subset_100_2.parquet:   0%|          | 0.00/332M [00:00<?, ?B/s]

subset_100_3.parquet:   0%|          | 0.00/326M [00:00<?, ?B/s]

subset_100_4.parquet:   0%|          | 0.00/330M [00:00<?, ?B/s]

subset_100_5.parquet:   0%|          | 0.00/322M [00:00<?, ?B/s]

subset_100_6.parquet:   0%|          | 0.00/331M [00:00<?, ?B/s]

subset_100_7.parquet:   0%|          | 0.00/328M [00:00<?, ?B/s]

subset_100_8.parquet:   0%|          | 0.00/331M [00:00<?, ?B/s]

subset_100_9.parquet:   0%|          | 0.00/334M [00:00<?, ?B/s]

subset_10_1.parquet:   0%|          | 0.00/326M [00:00<?, ?B/s]

subset_10_10.parquet:   0%|          | 0.00/332M [00:00<?, ?B/s]

subset_10_2.parquet:   0%|          | 0.00/331M [00:00<?, ?B/s]

subset_10_3.parquet:   0%|          | 0.00/326M [00:00<?, ?B/s]

subset_10_4.parquet:   0%|          | 0.00/330M [00:00<?, ?B/s]

subset_10_5.parquet:   0%|          | 0.00/331M [00:00<?, ?B/s]

subset_10_6.parquet:   0%|          | 0.00/328M [00:00<?, ?B/s]

subset_10_7.parquet:   0%|          | 0.00/328M [00:00<?, ?B/s]

subset_10_8.parquet:   0%|          | 0.00/332M [00:00<?, ?B/s]

subset_10_9.parquet:   0%|          | 0.00/336M [00:00<?, ?B/s]

subset_11_1.parquet:   0%|          | 0.00/330M [00:00<?, ?B/s]

subset_11_10.parquet:   0%|          | 0.00/329M [00:00<?, ?B/s]

subset_11_2.parquet:   0%|          | 0.00/335M [00:00<?, ?B/s]

subset_11_3.parquet:   0%|          | 0.00/335M [00:00<?, ?B/s]

subset_11_4.parquet:   0%|          | 0.00/327M [00:00<?, ?B/s]

subset_11_5.parquet:   0%|          | 0.00/335M [00:00<?, ?B/s]

subset_11_6.parquet:   0%|          | 0.00/332M [00:00<?, ?B/s]

subset_11_7.parquet:   0%|          | 0.00/330M [00:00<?, ?B/s]

subset_11_8.parquet:   0%|          | 0.00/328M [00:00<?, ?B/s]

subset_11_9.parquet:   0%|          | 0.00/327M [00:00<?, ?B/s]

subset_12_1.parquet:   0%|          | 0.00/331M [00:00<?, ?B/s]

subset_12_10.parquet:   0%|          | 0.00/329M [00:00<?, ?B/s]

subset_12_2.parquet:   0%|          | 0.00/329M [00:00<?, ?B/s]

subset_12_3.parquet:   0%|          | 0.00/333M [00:00<?, ?B/s]

subset_12_4.parquet:   0%|          | 0.00/332M [00:00<?, ?B/s]

subset_12_5.parquet:   0%|          | 0.00/330M [00:00<?, ?B/s]

subset_12_6.parquet:   0%|          | 0.00/331M [00:00<?, ?B/s]

subset_12_7.parquet:   0%|          | 0.00/329M [00:00<?, ?B/s]

subset_12_8.parquet:   0%|          | 0.00/329M [00:00<?, ?B/s]

subset_12_9.parquet:   0%|          | 0.00/334M [00:00<?, ?B/s]

subset_13_1.parquet:   0%|          | 0.00/324M [00:00<?, ?B/s]

subset_13_10.parquet:   0%|          | 0.00/331M [00:00<?, ?B/s]

subset_13_2.parquet:   0%|          | 0.00/332M [00:00<?, ?B/s]

subset_13_3.parquet:   0%|          | 0.00/332M [00:00<?, ?B/s]

subset_13_4.parquet:   0%|          | 0.00/331M [00:00<?, ?B/s]

subset_13_5.parquet:   0%|          | 0.00/328M [00:00<?, ?B/s]

subset_13_6.parquet:   0%|          | 0.00/334M [00:00<?, ?B/s]

subset_13_7.parquet:   0%|          | 0.00/334M [00:00<?, ?B/s]

subset_13_8.parquet:   0%|          | 0.00/333M [00:00<?, ?B/s]

subset_13_9.parquet:   0%|          | 0.00/327M [00:00<?, ?B/s]

subset_14_1.parquet:   0%|          | 0.00/329M [00:00<?, ?B/s]

subset_14_10.parquet:   0%|          | 0.00/330M [00:00<?, ?B/s]

subset_14_2.parquet:   0%|          | 0.00/329M [00:00<?, ?B/s]

subset_14_3.parquet:   0%|          | 0.00/330M [00:00<?, ?B/s]

subset_14_4.parquet:   0%|          | 0.00/332M [00:00<?, ?B/s]