<a href="https://colab.research.google.com/github/Vaishnavi481/QML-and-Classical-ML/blob/main/QMLvsML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [49]:
# @title Install required libraries
# Qiskit & Qiskit Machine Learning for quantum models
# scikit-learn, pandas, numpy, matplotlib for classical ML and plotting

!pip install -q qiskit qiskit-aer qiskit-machine-learning scikit-learn pandas numpy matplotlib
!pip install -q qiskit-ibm-runtime qiskit-algorithms



In [50]:
# @title Imports, configuration, and result structure

import time
from dataclasses import dataclass, asdict
from typing import List, Dict, Tuple, Optional

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from qiskit import transpile


# --- scikit-learn: datasets, preprocessing, models, metrics ---
from sklearn.datasets import load_iris, load_breast_cancer, load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# --- Qiskit Machine Learning: quantum kernel machinery ---
from qiskit.circuit.library import ZZFeatureMap
from qiskit_machine_learning.kernels import FidelityQuantumKernel

# Single seed shared by the data splits, PCA and the classical models below,
# so that repeated runs produce the same splits and fits.
RANDOM_STATE = 42
# Also seed NumPy's legacy global RNG for any code that draws from it implicitly.
np.random.seed(RANDOM_STATE)


@dataclass
class ExperimentResult:
    """
    Record of a single experiment run.

    One instance describes one (dataset, model, train_fraction) combination:
    the test metrics, the wall-clock timings and, where they apply, the
    quantum-specific settings. The experiment driver converts instances with
    ``asdict`` and stacks them into a Pandas DataFrame.
    """
    # --- What was run ---
    dataset: str
    model: str
    train_fraction: float

    # --- Test-set quality metrics ---
    accuracy: float
    f1_macro: float

    # --- Wall-clock timings, in seconds ---
    train_time_sec: float
    test_time_sec: float

    # --- Optional bookkeeping; the quantum fields stay None for classical models ---
    n_qubits: Optional[int] = None
    n_shots: Optional[int] = None
    n_train_samples: Optional[int] = None
    n_test_samples: Optional[int] = None

In [51]:
!pip install -q qiskit-ibm-runtime qiskit-algorithms

In [52]:
# @title Initialize IBM Quantum account and Sampler for kernels

from qiskit_ibm_runtime import QiskitRuntimeService, SamplerV2 as Sampler
from qiskit_algorithms.state_fidelities import ComputeUncompute

import os
from getpass import getpass

# SECURITY: credentials must never be stored in the notebook. An API token was
# previously committed in this cell; it must be treated as compromised and
# revoked/rotated in the IBM Quantum dashboard.
#
# The token is read from the QISKIT_IBM_TOKEN environment variable, with an
# interactive prompt as a fallback. The instance CRN is read from
# QISKIT_IBM_INSTANCE; when it is unset, None is passed and no instance is pinned.
ibm_token = os.environ.get("QISKIT_IBM_TOKEN") or getpass("IBM Quantum API token: ")
ibm_instance = os.environ.get("QISKIT_IBM_INSTANCE") or None

# Persist the account locally and make it the default for QiskitRuntimeService().
QiskitRuntimeService.save_account(
    channel="ibm_quantum_platform",
    instance=ibm_instance,
    token=ibm_token,
    set_as_default=True,
    overwrite=True,
)
service = QiskitRuntimeService()

# Pick the least busy operational real device (simulators excluded).
backend = service.least_busy(operational=True, simulator=False)
print(f"Using IBM backend: {backend.name}")

# Build a SamplerV2 on this backend
sampler_ibm = Sampler(mode=backend)

# Build a ComputeUncompute fidelity object using this sampler
# NOTE(review): confirm that the installed qiskit-algorithms ComputeUncompute
# accepts a V2 sampler; older releases expect a V1 sampler.
fidelity_ibm = ComputeUncompute(sampler=sampler_ibm)


In [53]:
# @title Dataset loading utilities (Iris, Breast Cancer, Wine, Banknote)

def load_iris_data() -> Tuple[np.ndarray, np.ndarray]:
    """
    Load the Iris dataset that ships with scikit-learn.

    Returns
    -------
    (X, y)
        Feature matrix and class labels
        (150 samples, 4 numeric features, 3 classes).
    """
    bunch = load_iris()
    return bunch.data, bunch.target


def load_breast_cancer_data() -> Tuple[np.ndarray, np.ndarray]:
    """
    Breast Cancer Wisconsin (Diagnostic)
    - 569 samples, 30 numeric features, binary labels (0/1)

    Returns the feature matrix and the label vector.
    """
    bc = load_breast_cancer()
    X, y = bc.data, bc.target
    return X, y


def load_wine_data() -> Tuple[np.ndarray, np.ndarray]:
    """
    Wine dataset bundled with scikit-learn
    - 178 samples, 13 numeric features, 3 classes

    Returns the feature matrix and the label vector.
    """
    wine = load_wine()
    X, y = wine.data, wine.target
    return X, y


def load_banknote_data(
    url: str = (
        "https://archive.ics.uci.edu/ml/machine-learning-databases/"
        "00267/data_banknote_authentication.txt"
    ),
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Banknote Authentication dataset
    - 4 numeric features, binary labels in the last column

    The file is downloaded from ``url`` on every call, so network access is
    required.

    NOTE(review): only the tail of this loader (column comment and X/y slicing)
    survived in the notebook; the download line and the default URL are a
    reconstruction - confirm the data source before relying on the results.

    Returns the feature matrix and the label vector.
    """
    # The raw file has no header row; the names follow the original column comment.
    df = pd.read_csv(
        url,
        header=None,
        names=["variance", "skewness", "curtosis", "entropy", "class"],
    )

    # Columns: variance, skewness, curtosis, entropy, class
    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values
    return X, y


In [54]:
# @title QSVM using IBM Runtime backend (noisy or high-shot simulator)

def run_quantum_kernel_svm_ibm(
    dataset_name: str,
    X_train: np.ndarray,
    X_val: np.ndarray,
    X_test: np.ndarray,
    y_train: np.ndarray,
    y_val: np.ndarray,
    y_test: np.ndarray,
    train_fraction: float,
    max_features: int = 4,
    C_values: List[float] = (0.1, 1.0, 10.0),
) -> List[ExperimentResult]:
    """
    Quantum kernel SVM whose kernel entries are evaluated through the
    module-level ``fidelity_ibm`` object (IBM Runtime sampler + ComputeUncompute).

    Steps:
    - Optionally subsample the training set (stratified) to ``train_fraction``.
    - PCA + scaling via ``get_quantum_features`` (at most ``max_features`` features).
    - Evaluate the train and validation kernel matrices ONCE; they do not depend
      on C, so re-evaluating them per candidate would only repeat backend jobs.
    - Fit one precomputed-kernel SVC per value in ``C_values`` and keep the one
      with the highest validation accuracy (first one wins on ties).
    - Evaluate the selected classifier on the test set.

    Timing convention (kept comparable with ``run_quantum_kernel_svm``, where
    the kernel is evaluated inside fit/predict):
    - ``train_time_sec`` = train-kernel evaluation + SVC fit for the selected C.
    - ``test_time_sec``  = test-kernel evaluation + SVC predict.

    Parameters
    ----------
    C_values : iterable of float
        Candidate SVC regularisation strengths; must not be empty.

    Returns
    -------
    A list with a single ExperimentResult for the selected classifier.

    Raises
    ------
    ValueError
        If ``C_values`` is empty.
    """
    candidate_Cs = list(C_values)
    if not candidate_Cs:
        # Fail before any backend job is submitted.
        raise ValueError("C_values must contain at least one value of C to try.")

    results: List[ExperimentResult] = []

    # Subsample train set
    if train_fraction < 1.0:
        X_train_sub, _, y_train_sub, _ = train_test_split(
            X_train,
            y_train,
            train_size=train_fraction,
            stratify=y_train,
            random_state=RANDOM_STATE,
        )
    else:
        X_train_sub, y_train_sub = X_train, y_train

    # PCA + [0, π] scaling for quantum features
    X_train_q, X_val_q, X_test_q, n_features_q = get_quantum_features(
        X_train_sub,
        X_val,
        X_test,
        max_features=max_features,
    )

    # Feature map + IBM-based fidelity kernel
    feature_map = ZZFeatureMap(
        feature_dimension=n_features_q,
        reps=2,
        entanglement="full",
    )

    # FidelityQuantumKernel that uses the IBM sampler-based fidelity
    qkernel_ibm = FidelityQuantumKernel(
        feature_map=feature_map,
        fidelity=fidelity_ibm,
    )

    print(
        f"[QUANTUM-IBM] Dataset={dataset_name}, train_fraction={train_fraction}, "
        f"n_qubits={n_features_q}, backend={backend.name}"
    )

    # --- Kernel matrices: independent of C, so evaluate them exactly once ---
    start_kernel = time.time()
    K_train = qkernel_ibm.evaluate(x_vec=X_train_q)
    train_kernel_time = time.time() - start_kernel

    K_val = qkernel_ibm.evaluate(x_vec=X_val_q, y_vec=X_train_q)

    # --- Tiny hyperparameter search on C, using validation set ---
    best_C = None
    best_val_acc = -np.inf
    best_clf = None
    best_fit_time = np.nan

    for C in candidate_Cs:
        print(f"  - Trying C={C} ... (kernel matrices on IBM backend)")

        clf = SVC(kernel="precomputed", C=C)

        start_train = time.time()
        clf.fit(K_train, y_train_sub)
        end_train = time.time()

        y_val_pred = clf.predict(K_val)
        val_acc = accuracy_score(y_val, y_val_pred)

        print(f"    -> val_acc={val_acc:.3f}, train_time={end_train - start_train:.2f}s")

        # Strict '>' keeps the first candidate on ties.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_C = C
            best_clf = clf
            best_fit_time = end_train - start_train

    # --- Evaluate on test set with best C ---
    # Only the test-vs-train kernel is new; the train kernel is not needed again.
    start_test = time.time()
    K_test = qkernel_ibm.evaluate(x_vec=X_test_q, y_vec=X_train_q)
    y_test_pred = best_clf.predict(K_test)
    end_test = time.time()

    acc = accuracy_score(y_test, y_test_pred)
    f1 = f1_score(y_test, y_test_pred, average="macro")

    print(
        f"  [IBM-QSVM result] C={best_C}, test_acc={acc:.3f}, "
        f"test_f1={f1:.3f}, val_acc={best_val_acc:.3f}"
    )

    results.append(
        ExperimentResult(
            dataset=dataset_name,
            model="QSVM_FidelityKernel_IBM",
            train_fraction=train_fraction,
            accuracy=acc,
            f1_macro=f1,
            train_time_sec=train_kernel_time + best_fit_time,
            test_time_sec=end_test - start_test,
            n_qubits=n_features_q,
            n_shots=None,            # not configured here; the sampler's own default applies
            n_train_samples=X_train_sub.shape[0],
            n_test_samples=X_test.shape[0],
        )
    )

    return results


In [55]:
# @title Train/val/test split and quantum feature preprocessing

def train_val_test_split(
    X: np.ndarray,
    y: np.ndarray,
    train_size: float = 0.6,
    val_size: float = 0.2,
    test_size: float = 0.2,
    random_state: int = RANDOM_STATE,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Stratified three-way split of a dataset.

    The three fractions must add up to 1 (default 60% / 20% / 20%).
    The split is done in two stratified stages that share ``random_state``:
    train vs. remainder, then the remainder into validation vs. test.

    Returns
    -------
    X_train, X_val, X_test, y_train, y_val, y_test
    """
    assert abs(train_size + val_size + test_size - 1.0) < 1e-8

    # Stage 1: carve out the training portion.
    X_train, X_rest, y_train, y_rest = train_test_split(
        X, y, train_size=train_size, stratify=y, random_state=random_state
    )

    # Stage 2: validation's share of what is left decides the second cut.
    val_share_of_rest = val_size / (val_size + test_size)
    X_val, X_test, y_val, y_test = train_test_split(
        X_rest,
        y_rest,
        train_size=val_share_of_rest,
        stratify=y_rest,
        random_state=random_state,
    )

    return X_train, X_val, X_test, y_train, y_val, y_test


def get_quantum_features(
    X_train: np.ndarray,
    X_val: np.ndarray,
    X_test: np.ndarray,
    max_features: int = 4,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, int]:
    """
    Prepare features for quantum models.

    Steps:
    1. If the number of features exceeds ``max_features``, reduce it with PCA
       (``max_features`` components). PCA is fitted on the training data only
       and then applied to validation and test data.
    2. Min-max scale each feature so that the TRAINING data spans [0, π], for
       use as rotation angles in the quantum feature map. The same scaling is
       applied to validation/test data without clipping, so those values can
       fall outside [0, π] when they exceed the training range.
    3. Return transformed train/val/test + the final feature dimension.

    This function is intentionally separate so classical models
    can still use the full feature space if desired.

    Returns
    -------
    X_train_q, X_val_q, X_test_q, out_dim
    """
    n_features = X_train.shape[1]

    # Optional PCA for dimensionality reduction (uses the module-level PCA import)
    if n_features > max_features:
        pca = PCA(n_components=max_features, random_state=RANDOM_STATE)
        X_train_red = pca.fit_transform(X_train)
        X_val_red = pca.transform(X_val)
        X_test_red = pca.transform(X_test)
        out_dim = max_features
    else:
        X_train_red = X_train
        X_val_red = X_val
        X_test_red = X_test
        out_dim = n_features

    # Scale features to [0, π] (fitted on train only, to avoid leaking val/test statistics)
    scaler = MinMaxScaler(feature_range=(0.0, np.pi))
    X_train_q = scaler.fit_transform(X_train_red)
    X_val_q = scaler.transform(X_val_red)
    X_test_q = scaler.transform(X_test_red)

    return X_train_q, X_val_q, X_test_q, out_dim


In [56]:
# @title Classical model definitions and runner

def build_classical_models() -> Dict[str, Pipeline]:
    """
    Build the set of classical baseline models:
      - Logistic Regression
      - SVM with RBF kernel
      - Random Forest
      - Small MLP

    Each model is wrapped in a Pipeline that applies StandardScaler
    before the classifier.

    Returns
    -------
    Dict mapping the model name to a fresh, unfitted Pipeline.
    """
    models = {
        # `multi_class` is deliberately not passed: the argument is deprecated in
        # recent scikit-learn releases and "auto" was the default behaviour anyway.
        "LogReg": LogisticRegression(
            max_iter=1000,
            random_state=RANDOM_STATE,
        ),
        "SVM_RBF": SVC(
            kernel="rbf",
            probability=False,
            random_state=RANDOM_STATE,
        ),
        "RandomForest": RandomForestClassifier(
            n_estimators=200,
            random_state=RANDOM_STATE,
        ),
        "MLP": MLPClassifier(
            hidden_layer_sizes=(64, 32),
            activation="relu",
            max_iter=500,
            random_state=RANDOM_STATE,
        ),
    }

    # Standardise features inside the pipeline so scaling is fitted on training data only.
    return {
        name: Pipeline([("scaler", StandardScaler()), ("clf", clf)])
        for name, clf in models.items()
    }


def run_classical_models(
    dataset_name: str,
    X_train: np.ndarray,
    X_val: np.ndarray,
    X_test: np.ndarray,
    y_train: np.ndarray,
    y_val: np.ndarray,
    y_test: np.ndarray,
    train_fraction: float,
) -> List[ExperimentResult]:
    """
    Fit and score every classical baseline for one dataset / train_fraction.

    The training set is first reduced (stratified) to ``train_fraction`` when
    that fraction is below 1. Each pipeline from ``build_classical_models`` is
    then fitted on the reduced set and scored on the test set; accuracy,
    macro F1 and wall-clock fit/predict times are recorded.

    The validation arrays are accepted for signature parity with the quantum
    runners but are not used here.

    Returns one ExperimentResult per model.
    """
    # Default to the full training set; shrink it only in the low-data regime.
    X_fit, y_fit = X_train, y_train
    if train_fraction < 1.0:
        X_fit, _, y_fit, _ = train_test_split(
            X_train,
            y_train,
            train_size=train_fraction,
            stratify=y_train,
            random_state=RANDOM_STATE,
        )

    collected: List[ExperimentResult] = []

    for model_name, pipeline in build_classical_models().items():
        print(f"[CLASSICAL] Dataset={dataset_name}, Model={model_name}, train_fraction={train_fraction}")

        tic = time.time()
        pipeline.fit(X_fit, y_fit)
        fit_seconds = time.time() - tic

        tic = time.time()
        predictions = pipeline.predict(X_test)
        predict_seconds = time.time() - tic

        record = ExperimentResult(
            dataset=dataset_name,
            model=model_name,
            train_fraction=train_fraction,
            accuracy=accuracy_score(y_test, predictions),
            f1_macro=f1_score(y_test, predictions, average="macro"),
            train_time_sec=fit_seconds,
            test_time_sec=predict_seconds,
            n_qubits=None,
            n_shots=None,
            n_train_samples=X_fit.shape[0],
            n_test_samples=X_test.shape[0],
        )
        collected.append(record)

    return collected


In [57]:
# @title Quantum Kernel SVM (QSVM-style) runner

def run_quantum_kernel_svm(
    dataset_name: str,
    X_train: np.ndarray,
    X_val: np.ndarray,
    X_test: np.ndarray,
    y_train: np.ndarray,
    y_val: np.ndarray,
    y_test: np.ndarray,
    train_fraction: float,
    max_features: int = 4,
) -> List[ExperimentResult]:
    """
    Quantum kernel SVM experiment (ideal: FidelityQuantumKernel with its
    default fidelity, no explicit sampler or backend).

    This is the single definition of the runner; the notebook previously
    defined the same function twice and the second copy shadowed the first.

    Steps:
    - Optionally subsample the training set (stratified) to ``train_fraction``.
    - PCA + scaling via ``get_quantum_features`` (at most ``max_features`` features).
    - Build a ZZFeatureMap with one qubit per quantum feature.
    - Create a FidelityQuantumKernel from this feature map.
    - Train an SVC that uses ``qkernel.evaluate`` as a callable kernel.
    - Evaluate on the test set.

    Because the kernel is evaluated inside ``fit``/``predict``, the recorded
    train and test times include kernel evaluation.

    The validation arrays are only passed through preprocessing; no
    hyperparameter search is performed here.

    Returns a list with a single ExperimentResult for QSVM.
    """
    results: List[ExperimentResult] = []

    # 1. Subsample for low-data regimes
    if train_fraction < 1.0:
        X_train_sub, _, y_train_sub, _ = train_test_split(
            X_train,
            y_train,
            train_size=train_fraction,
            stratify=y_train,
            random_state=RANDOM_STATE,
        )
    else:
        X_train_sub, y_train_sub = X_train, y_train

    # 2. Quantum feature preprocessing (PCA + scaling to [0, π])
    X_train_q, X_val_q, X_test_q, n_features_q = get_quantum_features(
        X_train_sub,
        X_val,
        X_test,
        max_features=max_features,
    )

    # 3. Feature map: encodes classical data into quantum states with entangling ZZ interactions
    feature_map = ZZFeatureMap(
        feature_dimension=n_features_q,
        reps=2,
        entanglement="full",
    )

    # 4. FidelityQuantumKernel with its default fidelity
    qkernel = FidelityQuantumKernel(feature_map=feature_map)

    print(
        f"[QUANTUM] Dataset={dataset_name}, Model=QSVM_FidelityKernel, "
        f"train_fraction={train_fraction}, n_qubits={n_features_q}"
    )

    # 5. Train SVC with callable kernel = quantum kernel evaluate function
    qsvc = SVC(kernel=qkernel.evaluate)

    start_train = time.time()
    qsvc.fit(X_train_q, y_train_sub)
    end_train = time.time()

    start_test = time.time()
    y_pred = qsvc.predict(X_test_q)
    end_test = time.time()

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")

    results.append(
        ExperimentResult(
            dataset=dataset_name,
            model="QSVM_FidelityKernel",
            train_fraction=train_fraction,
            accuracy=acc,
            f1_macro=f1,
            train_time_sec=end_train - start_train,
            test_time_sec=end_test - start_test,
            n_qubits=n_features_q,
            n_shots=None,  # no shot count is configured for the default fidelity
            n_train_samples=X_train_sub.shape[0],
            n_test_samples=X_test.shape[0],
        )
    )

    return results


In [None]:
# @title Run all experiments across datasets and training fractions

def run_all_experiments(
    output_csv: str = "results_qml_vs_classical.csv",
    train_fractions: Optional[List[float]] = None,
) -> pd.DataFrame:
    """
    Top-level driver for the whole study.

    Every dataset is loaded and split once into train/val/test; that split is
    shared by all models. For each training fraction the classical baselines
    and the quantum kernel SVM are run, and their results are accumulated.

    The accumulated results are written to ``output_csv`` and returned as a
    DataFrame (one row per dataset / model / train_fraction).
    """
    # Fractions of the training set to actually use
    if train_fractions is None:
        train_fractions = [0.1, 0.3, 0.6, 1.0]

    # Dataset name -> zero-argument loader returning (X, y)
    datasets_config = {
        "iris": load_iris_data,
        "breast_cancer": load_breast_cancer_data,
        "banknote": load_banknote_data,
        "wine": load_wine_data,
    }

    collected: List[ExperimentResult] = []

    for ds_name, load_fn in datasets_config.items():
        print(f"\n================ Dataset: {ds_name} ================")
        X, y = load_fn()

        # One split per dataset, reused by every model and fraction
        X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(X, y)
        shared_args = dict(
            dataset_name=ds_name,
            X_train=X_train,
            X_val=X_val,
            X_test=X_test,
            y_train=y_train,
            y_val=y_val,
            y_test=y_test,
        )

        for frac in train_fractions:
            # Classical baselines first ...
            collected.extend(run_classical_models(train_fraction=frac, **shared_args))

            # ... then the quantum kernel SVM (2–4 qubits as per plan)
            collected.extend(
                run_quantum_kernel_svm(train_fraction=frac, max_features=4, **shared_args)
            )

    # Convert to DataFrame and save
    df = pd.DataFrame([asdict(record) for record in collected])
    df.to_csv(output_csv, index=False)
    print(f"\n[INFO] Saved all results to {output_csv}")
    return df


# Actually run everything
df_results = run_all_experiments()

# --- Quick sanity check and overall summary ---

# Take a peek at the first few rows
# (a bare `df_results.head()` in the middle of a cell displays nothing, so use display()).
display(df_results.head())

print("\nModels:", sorted(df_results["model"].unique()))
print("Datasets:", sorted(df_results["dataset"].unique()))
print("Train fractions:", sorted(df_results["train_fraction"].unique()))

# Overall mean accuracy / F1 by model (across datasets & fractions)
summary_overall = (
    df_results.groupby("model")[["accuracy", "f1_macro"]]
    .mean()
    .sort_values("accuracy", ascending=False)
    .round(3)
)
print("\nOverall mean performance by model:")
display(summary_overall)




[CLASSICAL] Dataset=iris, Model=LogReg, train_fraction=0.1
[CLASSICAL] Dataset=iris, Model=SVM_RBF, train_fraction=0.1
[CLASSICAL] Dataset=iris, Model=RandomForest, train_fraction=0.1




[CLASSICAL] Dataset=iris, Model=MLP, train_fraction=0.1
[QUANTUM] Dataset=iris, Model=QSVM_FidelityKernel, train_fraction=0.1, n_qubits=4
[CLASSICAL] Dataset=iris, Model=LogReg, train_fraction=0.3
[CLASSICAL] Dataset=iris, Model=SVM_RBF, train_fraction=0.3
[CLASSICAL] Dataset=iris, Model=RandomForest, train_fraction=0.3




[CLASSICAL] Dataset=iris, Model=MLP, train_fraction=0.3
[QUANTUM] Dataset=iris, Model=QSVM_FidelityKernel, train_fraction=0.3, n_qubits=4
[CLASSICAL] Dataset=iris, Model=LogReg, train_fraction=0.6
[CLASSICAL] Dataset=iris, Model=SVM_RBF, train_fraction=0.6
[CLASSICAL] Dataset=iris, Model=RandomForest, train_fraction=0.6




[CLASSICAL] Dataset=iris, Model=MLP, train_fraction=0.6
[QUANTUM] Dataset=iris, Model=QSVM_FidelityKernel, train_fraction=0.6, n_qubits=4
[CLASSICAL] Dataset=iris, Model=LogReg, train_fraction=1.0
[CLASSICAL] Dataset=iris, Model=SVM_RBF, train_fraction=1.0
[CLASSICAL] Dataset=iris, Model=RandomForest, train_fraction=1.0




[CLASSICAL] Dataset=iris, Model=MLP, train_fraction=1.0
[QUANTUM] Dataset=iris, Model=QSVM_FidelityKernel, train_fraction=1.0, n_qubits=4

[CLASSICAL] Dataset=breast_cancer, Model=LogReg, train_fraction=0.1
[CLASSICAL] Dataset=breast_cancer, Model=SVM_RBF, train_fraction=0.1
[CLASSICAL] Dataset=breast_cancer, Model=RandomForest, train_fraction=0.1




[CLASSICAL] Dataset=breast_cancer, Model=MLP, train_fraction=0.1
[QUANTUM] Dataset=breast_cancer, Model=QSVM_FidelityKernel, train_fraction=0.1, n_qubits=4
[CLASSICAL] Dataset=breast_cancer, Model=LogReg, train_fraction=0.3
[CLASSICAL] Dataset=breast_cancer, Model=SVM_RBF, train_fraction=0.3
[CLASSICAL] Dataset=breast_cancer, Model=RandomForest, train_fraction=0.3




[CLASSICAL] Dataset=breast_cancer, Model=MLP, train_fraction=0.3
[QUANTUM] Dataset=breast_cancer, Model=QSVM_FidelityKernel, train_fraction=0.3, n_qubits=4
[CLASSICAL] Dataset=breast_cancer, Model=LogReg, train_fraction=0.6
[CLASSICAL] Dataset=breast_cancer, Model=SVM_RBF, train_fraction=0.6
[CLASSICAL] Dataset=breast_cancer, Model=RandomForest, train_fraction=0.6




[CLASSICAL] Dataset=breast_cancer, Model=MLP, train_fraction=0.6
[QUANTUM] Dataset=breast_cancer, Model=QSVM_FidelityKernel, train_fraction=0.6, n_qubits=4
[CLASSICAL] Dataset=breast_cancer, Model=LogReg, train_fraction=1.0
[CLASSICAL] Dataset=breast_cancer, Model=SVM_RBF, train_fraction=1.0
[CLASSICAL] Dataset=breast_cancer, Model=RandomForest, train_fraction=1.0




[CLASSICAL] Dataset=breast_cancer, Model=MLP, train_fraction=1.0
[QUANTUM] Dataset=breast_cancer, Model=QSVM_FidelityKernel, train_fraction=1.0, n_qubits=4


In [None]:
# @title Main results table (train_fraction = 1.0) and LaTeX export

# Filter to full-training runs
# Keep only the runs that used the whole training split.
df_full = df_results.loc[df_results["train_fraction"] == 1.0].copy()

# One row per (dataset, model); the mean collapses repeated runs if there are any.
# Metrics are rounded to three decimals for display.
table_full = (
    df_full
    .groupby(["dataset", "model"])[["accuracy", "f1_macro"]]
    .mean()
    .round(3)
    .reset_index()
)

print("Main results (full training data):")
display(table_full)


In [None]:
# @title LaTeX table: dataset × model (full training data)

# Render the (dataset, model) table as LaTeX source, three decimals per number,
# without the DataFrame's integer index.
latex_full = table_full.to_latex(
    float_format="%.3f",
    index=False,
    label="tab:full_results",
    caption="Classification accuracy and macro-F1 at full training data for all datasets and models.",
)
print(latex_full)


In [None]:
# @title Wide LaTeX table: models as rows, datasets as columns (accuracy)

# Pivot on accuracy
# Mean test accuracy per (model, dataset), reshaped so that
# models are rows and datasets are columns.
acc_pivot = (
    df_full
    .groupby(["model", "dataset"])["accuracy"]
    .agg("mean")
    .unstack(level="dataset")
    .round(3)
)

print("Accuracy (full training) per model and dataset:")
display(acc_pivot)

# Same table as LaTeX source (the model index is kept as the first column).
latex_acc = acc_pivot.to_latex(
    float_format="%.3f",
    label="tab:accuracy_full_wide",
    caption="Test accuracy at full training fraction, for each model and dataset.",
)
print(latex_acc)


In [None]:
# @title Quick accuracy plots: accuracy vs train_fraction per dataset

def plot_accuracy_vs_fraction(df: pd.DataFrame, dataset_name: str):
    """
    Draw one accuracy-vs-train_fraction curve per model for a single dataset.

    ``df`` is the results table; only rows whose ``dataset`` equals
    ``dataset_name`` are plotted. The figure is shown, not returned.
    """
    rows = df[df["dataset"] == dataset_name]

    plt.figure(figsize=(7, 4))
    for model_name in sorted(rows["model"].unique()):
        # Sort by fraction so the line is drawn left to right.
        curve = rows[rows["model"] == model_name].sort_values("train_fraction")
        plt.plot(
            curve["train_fraction"],
            curve["accuracy"],
            marker="o",
            label=model_name,
        )

    plt.title(f"Accuracy vs Train Fraction ({dataset_name})")
    plt.xlabel("Train fraction of training set")
    plt.ylabel("Accuracy")
    plt.grid(True)
    plt.legend()
    plt.show()


# One figure per dataset, in alphabetical order.
for ds in sorted(df_results["dataset"].unique()):
    plot_accuracy_vs_fraction(df_results, ds)


In [None]:
# @title Contrast table: accuracy at 10% vs 100% train fraction

# Training fractions to contrast (must match values used in the experiment run).
low_frac = 0.1
high_frac = 1.0

df_low = df_results[df_results["train_fraction"] == low_frac]
df_high = df_results[df_results["train_fraction"] == high_frac]

# Mean across any internal randomness
df_low_mean = (
    df_low.groupby(["dataset", "model"])[["accuracy", "f1_macro"]]
    .mean()
    .rename(columns={"accuracy": "acc_low", "f1_macro": "f1_low"})
)
# The source column is "f1_macro"; renaming a non-existent "f1_high" key left the
# column unrenamed and made the "f1_high" lookup below fail with a KeyError.
df_high_mean = (
    df_high.groupby(["dataset", "model"])[["accuracy", "f1_macro"]]
    .mean()
    .rename(columns={"accuracy": "acc_high", "f1_macro": "f1_high"})
)

# Both frames are indexed by (dataset, model), so the join aligns the two regimes row by row.
contrast = (
    df_low_mean
    .join(df_high_mean)
    .reset_index()
)

for col in ["acc_low", "acc_high", "f1_low", "f1_high"]:
    contrast[col] = contrast[col].round(3)

print(f"Accuracy/F1 at {int(low_frac*100)}% vs {int(high_frac*100)}% of training data:")
display(contrast)


In [None]:
# @title Plot accuracy and macro-F1 vs train fraction (per dataset) and save as PDF

def plot_acc_f1_vs_fraction(df: pd.DataFrame, dataset_name: str, save_pdf: bool = True):
    """
    Two stacked panels for one dataset, sharing the x axis:
      - upper panel: accuracy vs train_fraction, one line per model
      - lower panel: macro F1 vs train_fraction, one line per model

    With save_pdf=True the figure is also written to
    '{dataset_name}_acc_f1_vs_fraction.pdf'. The figure is shown, not returned.
    """
    sub = df[df["dataset"] == dataset_name].copy()
    models = sorted(sub["model"].unique())

    fig, axes = plt.subplots(2, 1, figsize=(7, 6), sharex=True)
    ax_acc, ax_f1 = axes[0], axes[1]

    # (axis, results column, y label, panel title) for each panel
    panels = (
        (ax_acc, "accuracy", "Accuracy", f"{dataset_name}: Accuracy vs training fraction"),
        (ax_f1, "f1_macro", "Macro F1", f"{dataset_name}: Macro F1 vs training fraction"),
    )

    for ax, metric_col, y_label, panel_title in panels:
        for m in models:
            mdata = sub[sub["model"] == m].sort_values("train_fraction")
            ax.plot(
                mdata["train_fraction"],
                mdata[metric_col],
                marker="o",
                linestyle="-",
                label=m,
            )
        ax.set_ylabel(y_label)
        ax.set_title(panel_title)
        ax.grid(True)

    # Only the accuracy panel carries the legend; only the bottom panel labels x.
    ax_acc.legend(loc="best")
    ax_f1.set_xlabel("Train fraction of training split")

    fig.tight_layout()

    if save_pdf:
        filename = f"{dataset_name}_acc_f1_vs_fraction.pdf"
        fig.savefig(filename, bbox_inches="tight")
        print(f"[INFO] Saved figure to {filename}")

    plt.show()


# One figure (and PDF) per dataset, in alphabetical order.
for ds in sorted(df_results["dataset"].unique()):
    plot_acc_f1_vs_fraction(df_results, ds, save_pdf=True)


In [None]:
# @title Plot training time vs train fraction (per dataset)

def plot_train_time_vs_fraction(df: pd.DataFrame, dataset_name: str, save_pdf: bool = False):
    """
    Plot training time (seconds) against train_fraction for one dataset,
    one line per model.

    With save_pdf=True the figure is also written to
    '{dataset_name}_train_time_vs_fraction.pdf'. The figure is shown, not returned.
    """
    rows = df[df["dataset"] == dataset_name].copy()

    plt.figure(figsize=(7, 4))
    for model_name in sorted(rows["model"].unique()):
        # Sort by fraction so the line is drawn left to right.
        timings = rows[rows["model"] == model_name].sort_values("train_fraction")
        plt.plot(
            timings["train_fraction"],
            timings["train_time_sec"],
            marker="o",
            linestyle="-",
            label=model_name,
        )

    plt.title(f"{dataset_name}: Training time vs training fraction")
    plt.xlabel("Train fraction of training split")
    plt.ylabel("Training time (seconds)")
    plt.grid(True)
    plt.legend(loc="best")
    plt.tight_layout()

    if save_pdf:
        filename = f"{dataset_name}_train_time_vs_fraction.pdf"
        plt.savefig(filename, bbox_inches="tight")
        print(f"[INFO] Saved figure to {filename}")

    plt.show()


# One figure per dataset, in alphabetical order (nothing is written to disk).
for ds in sorted(df_results["dataset"].unique()):
    plot_train_time_vs_fraction(df_results, ds, save_pdf=False)
