This cell is responsible for managing imports.

In [1]:
from qiskit import Aer          # for simulator backend
from qiskit_machine_learning.algorithms import QSVC # quantum support vector classifier class
from qiskit_machine_learning.kernels.quantum_kernel import QuantumKernel # wraps feature map and backend combination to give to QSVC
import qiskit.circuit.library   # for feature maps
import numpy as np
import joblib                   # for persistence

# for data sets and data set processing
import sklearn.datasets
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from qiskit_machine_learning.datasets.dataset_helper import features_and_labels_transform

The functions in this cell are responsible for loading, preparing, and splitting data sets for input to the QSVM classifier.

In [2]:
def extract_binary_classes(feature_array, label_array, classes=None):
    """Reduce a labelled data set to a two-class (binary) problem.

    Parameters
    ----------
    feature_array : array-like
        Feature vectors, one per sample (first axis indexes samples).
    label_array : array-like
        Labels, one per sample, in the same order as ``feature_array``.
    classes : sequence of two labels, optional
        The two classes to keep. The first is mapped to label 0 and the
        second to label 1. When omitted, the two smallest distinct labels
        (in sorted order) are used, so the selection is reproducible.

    Returns
    -------
    (features, labels) : tuple of numpy arrays
        Only the samples belonging to the two chosen classes, in their
        original order, with labels re-encoded as 0 and 1.

    Raises
    ------
    ValueError
        If two distinct classes cannot be selected.
    """
    feature_array = np.asarray(feature_array)
    label_array = np.asarray(label_array)

    if classes is None:
        # np.unique returns the distinct labels sorted, unlike set() whose
        # iteration order is an implementation detail.
        classes = np.unique(label_array)[:2]
    if len(classes) != 2:
        raise ValueError(f"binary classification needs exactly 2 classes, got {len(classes)}")

    negative_class, positive_class = classes
    if negative_class == positive_class:
        raise ValueError("the two classes to extract must be distinct")

    is_negative = label_array == negative_class
    is_positive = label_array == positive_class
    keep = is_negative | is_positive

    # Among kept samples, "is the second class" is exactly the 0/1 encoding.
    return (feature_array[keep], is_positive[keep].astype(int))

def process_dataset(dataset, qubit_count=4, binary_classification=True):
    """Prepare a data set's feature vectors and labels for the classifier.

    `dataset` must expose `.data` (feature vectors) and `.target` (labels).
    When `binary_classification` is True the data set is first cut down to
    2 classes. The features are then standardized, reduced with PCA to
    `qubit_count` components, and rescaled to the range (-1, +1). Finally the
    samples are grouped by label and passed through
    `features_and_labels_transform`, whose result is returned as
    `(feature_vectors, labels)`.

    The pipeline is adapted from the qiskit_machine_learning data set loading
    code (see the qiskit_machine_learning.datasets.digits source).

    NOTE(review): every transformer is fitted on the full data set passed in,
    i.e. before any train/test split is made by the caller.
    """
    samples, targets = dataset.data, dataset.target

    # optionally keep only two classes
    if binary_classification:
        samples, targets = extract_binary_classes(samples, targets)

    # zero mean / unit variance
    samples = StandardScaler().fit(samples).transform(samples)
    # one feature per qubit
    samples = PCA(n_components=qubit_count).fit(samples).transform(samples)
    # squeeze into (-1, +1)
    samples = MinMaxScaler((-1, 1)).fit(samples).transform(samples)

    # group the processed samples by label, as the qiskit_machine_learning
    # loaders do, before handing them to its transform helper
    samples_by_class = {}
    for class_label in list(set(targets)):
        samples_by_class[class_label] = np.array(
            [sample for sample, sample_label in zip(samples, targets) if sample_label == class_label]
        )
    samples, targets = features_and_labels_transform(samples_by_class, targets, one_hot=False)

    return samples, targets

def cross_fold_sets(data, labels, k=5, seed=22):
    """Generate the train/test partitions for shuffled k-fold cross-validation.

    Yields one tuple per fold, in the order
    `(train_features, train_labels, test_features, test_labels)`, each a
    numpy array. The shuffle is seeded with `seed`, so calls with the same
    seed and inputs should yield the same subsets.
    """
    splitter = KFold(n_splits=k, shuffle=True, random_state=seed)
    data_array = np.asarray(data)
    label_array = np.asarray(labels)
    for train_idx, test_idx in splitter.split(data, labels):
        # index arrays select the rows of each subset directly
        yield (
            data_array[train_idx],
            label_array[train_idx],
            data_array[test_idx],
            label_array[test_idx],
        )

This cell defines a function that builds and trains a QSVM classifier from a given specification: the feature map to use, the data to train on, and the backend to run the training on.

In [3]:
def make_classifier(feature_map_instance, training_features, training_labels, backend, batch_size=1000):
    """Create, train, and return a QSVM classifier.

    Parameters
    ----------
    feature_map_instance
        Feature map circuit used by the quantum kernel.
    training_features, training_labels
        Training data passed to `QSVC.fit`.
    backend
        Quantum backend handed to the kernel as its `quantum_instance`.
    batch_size : int, optional
        Batch size forwarded to `QuantumKernel`. Defaults to 1000, which the
        original author notes is the QuantumKernel default.

    Returns
    -------
    The fitted `QSVC` instance.
    """
    # The kernel ties the feature map to the backend it is evaluated on.
    quantum_kernel = QuantumKernel(feature_map=feature_map_instance, batch_size=batch_size, quantum_instance=backend)
    qsvc = QSVC(quantum_kernel=quantum_kernel)
    qsvc.fit(training_features, training_labels)
    return qsvc

Like the cell above, this cell defines a function that takes a specification for a classifier, but rather than returning only the classifier it trains it and returns its generalisation metrics (training and testing accuracy) together with the trained model.

In [4]:
# TODO: finalize what generalisation metrics should be used and calculate them (maybe also look at margin size)
def process_combination(feature_map_class, data_split_tuple, repetitions, backend_instance, qubit_count=4):
    """Train one classifier configuration and measure how well it generalises.

    `feature_map_class` is instantiated with `qubit_count` as its feature
    dimension and `repetitions` as its `reps`. `data_split_tuple` is a
    `(train_features, train_labels, test_features, test_labels)` tuple, and
    `backend_instance` is the backend the classifier is trained on.

    Returns `(train_accuracy, test_accuracy, classifier)`: the classifier's
    score on the training data, its score on the testing data, and the
    trained classifier itself.
    """
    # one feature per qubit
    feature_map_instance = feature_map_class(feature_dimension=qubit_count, reps=repetitions)
    train_features, train_labels, test_features, test_labels = data_split_tuple

    classifier = make_classifier(feature_map_instance, train_features, train_labels, backend_instance)

    # accuracy on seen vs. unseen data is the generalisation measure
    return (
        classifier.score(train_features, train_labels),
        classifier.score(test_features, test_labels),
        classifier,
    )

This cell defines a function that collects the generalisation information of all tested classifier configurations.

In [5]:
# TODO: make k-fold validation gather statistics of results in this
# function or later, rather than treating each of the k runs as a separate
# combination.
# Progress counter shared with the nested helper below. It is a module-level
# global because the helper rebinds it.
# NOTE(review): a local counter declared `nonlocal` in the helper would avoid the global.
combination_number = 0 # must be global to work around python scope limitations for nested functions
def process_all_combinations(do_binary_classification=True):
    """Runs all experiments.

    For every data set, every cross-validation split, and every feature map,
    a classifier is trained and scored. The ZZ feature map is tried with 2, 3,
    4, and 5 encoding repetitions; the other feature maps only with 2.

    Parameters:
        do_binary_classification: forwarded to process_dataset as its
            binary_classification argument.

    Returns:
        A dict mapping (dataset_name, split_number, feature_map_name, repetitions)
        to (train_accuracy, test_accuracy). The trained models are not kept.

    Side effects:
        Resets and increments the global combination_number, and prints a
        summary line, one line per data set, and one progress line per
        trained classifier.
    """
    global combination_number
    # get a list of datasets loaded into memory
    datasets = [sklearn.datasets.load_breast_cancer(),
                sklearn.datasets.load_digits(),
                sklearn.datasets.load_iris(),
                sklearn.datasets.load_wine()]
    # corresponding human-readable names for recording results
    # (must stay in the same order as the datasets list above)
    dataset_names = ["cancer", "digits", "iris", "wine"]

    # get a list of feature maps
    feature_map_classes = [qiskit.circuit.library.PauliFeatureMap,
                           qiskit.circuit.library.ZFeatureMap,
                           qiskit.circuit.library.ZZFeatureMap]
    # corresponding human-readable names for recording results
    # (must stay in the same order as the feature map list above)
    feature_map_names = ["Pauli", "Z", "ZZ"]

    # Number of qubits to simulate / number of features to reduce to.
    qubit_count = 4

    # create a quantum backend
    backend = Aer.get_backend("aer_simulator_statevector") # should configure this to mimic IBMQ backend
    #backend.set_options(device='GPU')                      # enable GPU acceleration (comment this line to disable)

    # Define choice of k for k-fold cross-validation. This
    # number determines how many equally sized disjoint
    # subsets to split the dataset into, after which each
    # is used as the testing set in turn with the remaining
    # subsets being used as the training set.
    cross_validation_splits = 5
    
    # Do some output to the user to give them a sense of how long running the experiments will take
    number_of_investigated_repetition_values = 4 # for trying depth = 2, 3, 4, and 5
    # Per data set and split: one run for each feature map other than ZZ,
    # plus one run per investigated repetition value for ZZ.
    combination_count = len(datasets) * cross_validation_splits * (number_of_investigated_repetition_values + len(feature_map_classes)-1)
    print(f"Running with {len(datasets)} datasets, {len(feature_map_classes)} feature maps, {number_of_investigated_repetition_values} different encoding repetitions for the ZZ feature map, and {cross_validation_splits}-fold cross validation, requiring the training of {combination_count} classifiers in total.")
    
    ## Process each combination.
    results = {}
    combination_number = 0      # set global combination number to 0
    # For each dataset
    for dataset, dataset_name in zip(datasets, dataset_names):
        # Perform dimensionality reduction and scaling on the features,
        # and transform the labels as done in the qiskit_machine_learning
        # data set loading source code. Also prepares the data for
        # binary rather than multi-class classification if enabled.
        features, labels = process_dataset(dataset, binary_classification=do_binary_classification, qubit_count=qubit_count)
        print(f"{dataset_name} dataset: {len(features)} feature vectors.")
        # For each k-fold split of the data into training and testing sets
        for (split_number, split_tuple) in enumerate(cross_fold_sets(features, labels, k=cross_validation_splits)):
            # For each feature map
            for feature_map_class, feature_map_name in zip(feature_map_classes, feature_map_names):
                # Helper that closes over the current loop variables. It is
                # called right away below, while those variables still hold
                # this iteration's values.
                def process_with_repetitions(repetitions):
                    global combination_number # use reference to global variable
                    # show a debug / progress message
                    print(f"{combination_number}: Processing data set {dataset_name}, feature map {feature_map_name}, cross-validation split {split_number}, and {repetitions} encoding repetitions...")
                    combination_number += 1 # for debug / progress output
                    # run experiment for this combination
                    train_accuracy, test_accuracy, model = process_combination(feature_map_class, split_tuple, repetitions, backend, qubit_count)
                    # record results using human-readable names and values (other than the trained classifier)
                    results[(dataset_name, split_number, feature_map_name, repetitions)] = (train_accuracy, test_accuracy)
                # For ZZ feature map, try multiple encoding repetitions. For other feature maps,
                # just do 2 repetitions.
                if feature_map_name == "ZZ":
                    for reps in range(2, 2+number_of_investigated_repetition_values):
                        process_with_repetitions(reps)
                else:
                    process_with_repetitions(2)
    return results

This cell defines a function that performs the experiments and reports their results.

In [6]:
def main():
    """Run every experiment, print one line per result, and return the results dict."""
    results = process_all_combinations()
    for combination in results.keys():
        print(f"Combination {combination} has results {results[combination]}.")
    return results

This cell can be evaluated to actually perform the experiments. It can take a few hours to run so loading pre-computed results is preferred.

In [None]:
#results = main()                # uncomment this when you want to run it (to prevent running accidentally)

The next 3 cells should be run selectively to save and load pre-computed results.

In [7]:
def save_results():
    """Saves the global `results` to "results.z", overwriting previous results.

    Does nothing when there are no results to save, i.e. when the global
    `results` variable has not been defined yet or is None.
    """
    # Look the variable up explicitly so that a fresh kernel (where `results`
    # was never assigned) is treated as "nothing to save" instead of raising
    # NameError.
    current_results = globals().get("results")
    if current_results is not None:
        joblib.dump(current_results, "results.z")
def load_results():
    """Reads "results.z" and stores its contents in the global `results` variable.

    NOTE(review): joblib.load is pickle-based, so only load files you created
    yourself or otherwise trust.
    """
    global results
    loaded = joblib.load("results.z")
    results = loaded

In [13]:
#save_results() # uncomment this when you want to run it (to prevent accidentally running it)

In [8]:
#load_results() # uncomment this when you want to run it (to prevent accidentally running it)

These 2 cells read the results variable and combine cross-validation runs to get statistics.

In [14]:
def combine_cross_validations(results=None):
    """Combine the per-split accuracies of each configuration into statistics.

    Parameters
    ----------
    results : mapping, optional
        Maps (dataset_name, split_number, feature_map_name, repetitions) to
        (train_accuracy, test_accuracy). When omitted, the module-level
        `results` variable is used.

    Returns
    -------
    dict
        Maps (dataset_name, feature_map_name, repetitions) to
        ((train_mean, train_std, train_var), (test_mean, test_std, test_var)),
        computed over all cross-validation splits of that configuration.

    Raises
    ------
    NameError
        If no `results` argument is given and the global does not exist.
    """
    if results is None:
        try:
            results = globals()["results"]
        except KeyError:
            raise NameError("name 'results' is not defined; run main() or load_results() first") from None

    # Group the accuracies by configuration in a single pass, dropping the
    # split number from the key.
    grouped = {}
    for (d_name, _run, f_name, rep), (train_accuracy, test_accuracy) in results.items():
        train_list, test_list = grouped.setdefault((d_name, f_name, rep), ([], []))
        train_list.append(train_accuracy)
        test_list.append(test_accuracy)

    stats = {}
    for combination, (train_accuracies, test_accuracies) in grouped.items():
        stats[combination] = ((np.mean(train_accuracies), np.std(train_accuracies), np.var(train_accuracies)),
                              (np.mean(test_accuracies), np.std(test_accuracies), np.var(test_accuracies)))
    return stats

This should only run after computing or loading a value for the results variable.

In [21]:
# Aggregate the cross-validation runs and print the statistics for every
# configuration. Each printed pair is "training value | testing value".
stats = combine_cross_validations()
# TODO: this below loop should be made more robust if it stays in the code
# (the feature maps, repetition ranges, and data set names are hard-coded
# here, so a configuration missing from stats raises KeyError)
for feature_map in ("Pauli", "Z", "ZZ"):
    # ZZ is reported for 2-5 repetitions, the other feature maps only for 2
    for repetition in range(2, 6 if feature_map == "ZZ" else 3):
        for dataset in ("wine", "cancer", "iris", "digits"):
            key = (dataset, feature_map, repetition)
            # value[0] holds the training (mean, std, var), value[1] the testing ones
            value = stats[key]
            print(f"Stats for {dataset}, {feature_map}, {repetition} repetitions:")
            print("Mean classifier accuracies:")
            print(f"{value[0][0]} | {value[1][0]}")
            print("Standard deviations:")
            print(f"{value[0][1]} | {value[1][1]}")
            print("Variances:")
            print(f"{value[0][2]} | {value[1][2]}")
            print()

Stats for wine, Pauli, 2 repetitions:
Mean classifier accuracies:
0.9673076923076923 | 0.7461538461538462
Standard deviations:
0.014390989949130531 | 0.06249260311258431
Variances:
0.00020710059171597596 | 0.003905325443786982

Stats for cancer, Pauli, 2 repetitions:
Mean classifier accuracies:
0.9507923655292077 | 0.8541996584381307
Standard deviations:
0.008737959335838347 | 0.03808109446036603
Variances:
7.635193335476453e-05 | 0.0014501697552993203

Stats for iris, Pauli, 2 repetitions:
Mean classifier accuracies:
0.9875 | 0.7
Standard deviations:
0.007905694150420955 | 0.0894427190999916
Variances:
6.250000000000011e-05 | 0.008000000000000004

Stats for digits, Pauli, 2 repetitions:
Mean classifier accuracies:
0.992361111111111 | 0.9583333333333334
Standard deviations:
0.004049272149198111 | 0.008784104611578835
Variances:
1.639660493827149e-05 | 7.716049382716056e-05

Stats for wine, Z, 2 repetitions:
Mean classifier accuracies:
0.9442307692307693 | 0.9230769230769231
Standard de