# m_qubits_QVC_Breast_Cancer

In [44]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split 
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from sklearn import svm

import scipy
from scipy.linalg import expm
import scikitplot as skplt

from h_partitioned import *
#from W_unitary import *
#from U_unitary import *
from qiskit_algorithms.optimizers import COBYLA, ADAM, SPSA, SLSQP, POWELL, L_BFGS_B, TNC, AQGD

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size and resolution applied to every plot in this notebook.
plt.rcParams['figure.figsize'] = (6,4)
plt.rcParams['figure.dpi'] = 100
# Switch on seaborn's default plot styling.
sns.set()

### helper functions

In [45]:
def make_meshgrid(x1, x2, h=0.2):
    """Build a 2-D grid that covers two coordinate arrays with a margin.

    Each axis runs from one unit below the minimum to one unit above the
    maximum of the corresponding input, sampled with step ``h``.

    Parameters
    ----------
    x1, x2 : objects exposing ``.min()`` and ``.max()`` (e.g. numpy arrays)
        Coordinates whose range the grid has to cover.
    h : float, optional
        Step between neighbouring grid points.

    Returns
    -------
    tuple
        The two coordinate matrices produced by ``np.meshgrid``.
    """
    margin = 1
    axis_1 = np.arange(x1.min() - margin, x1.max() + margin, h)
    axis_2 = np.arange(x2.min() - margin, x2.max() + margin, h)
    grid_1, grid_2 = np.meshgrid(axis_1, axis_2)
    return grid_1, grid_2


def training_split(X_train, y_train, n_batches):
    """Split the training set into ``n_batches`` equally sized batches.

    Parameters
    ----------
    X_train : array-like
        Training samples, split along the first axis.
    y_train : array-like
        Training labels, split along the first axis.
    n_batches : int
        Number of batches to create.

    Returns
    -------
    tuple of lists
        ``(X_batches, y_batches)``, each a list of ``n_batches`` arrays as
        returned by ``np.split``.

    Raises
    ------
    ValueError
        If ``len(X_train)`` is not divisible by ``n_batches``. Previously
        this case only printed a warning and then failed with an unrelated
        ``UnboundLocalError`` at the ``return`` statement.
    """
    if len(X_train) % n_batches != 0:
        raise ValueError(
            'the training set must be divided into equally sized batches '
            '(got ' + str(len(X_train)) + ' samples for '
            + str(n_batches) + ' batches)'
        )

    X_batches = np.split(X_train, n_batches)
    y_batches = np.split(y_train, n_batches)

    return X_batches, y_batches


def k_fold_split(X, y, ele_per_split, i):
    """Return the train/test partition for fold ``i`` of a k-fold split.

    Rows ``[ele_per_split*i, ele_per_split*(i+1))`` form the test fold;
    every other row, in its original order, forms the training set.

    Parameters
    ----------
    X : 2-D numpy array
        Samples, one per row.
    y : 1-D numpy array
        Labels aligned with the rows of ``X``.
    ele_per_split : int
        Number of rows in each fold.
    i : int
        Zero-based index of the fold held out for testing.

    Returns
    -------
    tuple
        ``(k_X_train, k_X_test, k_y_train, k_y_test)``.
    """
    start = ele_per_split * i
    stop = ele_per_split * (i + 1)

    # Held-out slice.
    k_X_test = X[start:stop, :]
    k_y_test = y[start:stop]

    # Everything before and after the held-out slice.
    k_X_train = np.concatenate((X[:start, :], X[stop:, :]))
    k_y_train = np.concatenate((y[:start], y[stop:]))

    return k_X_train, k_X_test, k_y_train, k_y_test


def PCA1(X_train, X_test, y_train, y_test, n_dimensions):
    """Standardise, PCA-reduce and rescale a train/test split.

    Steps:
      1. ``StandardScaler`` fitted on ``X_train`` and applied to both sets.
      2. ``PCA`` with ``n_dimensions`` components fitted on the standardised
         ``X_train`` and applied to both sets.
      3. ``MinMaxScaler`` to the range (-1, 1), fitted on the training and
         test samples stacked together, then applied to both sets.

    NOTE(review): step 3 is fitted on train *and* test data, so the test set
    influences the final scaling of the training features. Confirm this is
    intended before relying on the reported test accuracies.

    Parameters
    ----------
    X_train, X_test : 2-D array-like
        Feature matrices of the two splits.
    y_train, y_test : array-like
        Labels; returned unchanged.
    n_dimensions : int
        Number of principal components to keep.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` with transformed features.
    """
    # 1) Standardise using the statistics of the training set only.
    standardizer = StandardScaler().fit(X_train)
    X_train, X_test = standardizer.transform(X_train), standardizer.transform(X_test)

    # 2) Reduce the feature space to `n_dimensions` principal components.
    reducer = PCA(n_dimensions).fit(X_train)
    X_train, X_test = reducer.transform(X_train), reducer.transform(X_test)

    # 3) Rescale to (-1, 1); the scaler sees both splits stacked row-wise.
    stacked = np.concatenate((X_train, X_test), axis=0)
    range_scaler = MinMaxScaler((-1, 1)).fit(stacked)
    X_train, X_test = range_scaler.transform(X_train), range_scaler.transform(X_test)

    return X_train, X_test, y_train, y_test

def PCA2(X, y, n_dimensions):
    """Standardise, PCA-reduce and rescale a single data set.

    Every transformer (``StandardScaler``, ``PCA`` with ``n_dimensions``
    components, ``MinMaxScaler`` to the range (-1, 1)) is fitted on ``X``
    itself and then applied to it, in that order.

    Parameters
    ----------
    X : 2-D array-like
        Feature matrix.
    y : array-like
        Labels; returned unchanged.
    n_dimensions : int
        Number of principal components to keep.

    Returns
    -------
    tuple
        ``(X, y)`` with transformed features.
    """
    # Zero mean / unit variance per feature.
    X = StandardScaler().fit(X).transform(X)

    # Project onto the leading `n_dimensions` principal components.
    X = PCA(n_dimensions).fit(X).transform(X)

    # Squeeze every component into the interval (-1, 1).
    X = MinMaxScaler((-1, 1)).fit(X).transform(X)

    return X, y

### dataset

In [46]:
# Load the scikit-learn breast-cancer data set and keep only its first
# `used_points` samples to keep the quantum simulation tractable.
dataset = load_breast_cancer()

# Size constraints imposed by other cells of this notebook:
#   * the training split (70% of `used_points`) must be divisible by
#     `n_batches`, because `training_split` needs equally sized batches;
#   * `used_points` must be divisible by the number of cross-validation folds.
used_points = 40

X = dataset.data[:used_points]
y = dataset.target[:used_points]

# Fixed seed so that Restart & Run All always produces the same split.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SPLIT_SEED
)

# Number of principal components kept by the preprocessing.
n_dimensions = 4
X_train, X_test, y_train, y_test = PCA1(X_train, X_test, y_train, y_test, n_dimensions)

# The training loop consumes one batch per iteration.
n_batches = 2
X_batches, y_batches = training_split(X_train, y_train, n_batches)

## Classical SVM

### linear kernel

In [47]:
# Classical baseline: linear support-vector classifier trained on the
# preprocessed training split (`fit` returns the estimator itself).
linear_kernel = svm.LinearSVC().fit(X_train, y_train)



In [48]:
# Mean accuracy of the linear SVM on the training and on the test split.
accuracy_train, accuracy_test = (
    linear_kernel.score(features, targets)
    for features, targets in ((X_train, y_train), (X_test, y_test))
)

print(accuracy_train, accuracy_test, sep='\n')

1.0
0.9166666666666666


### gaussian kernel

In [49]:
# Classical baseline: SVC with its default kernel and gamma='scale',
# trained on the preprocessed training split (`fit` returns the estimator).
gaussian_kernel = svm.SVC(gamma = 'scale').fit(X_train, y_train)

In [50]:
# Mean accuracy of the gaussian-kernel SVM on the training and test split.
accuracy_train, accuracy_test = (
    gaussian_kernel.score(features, targets)
    for features, targets in ((X_train, y_train), (X_test, y_test))
)

print(accuracy_train, accuracy_test, sep='\n')

1.0
0.9166666666666666


## Quantum SVM (explicit approach)

In [51]:
# Seed numpy's global generator so the initial parameters are reproducible.
# np.random.seed() returns None, so its result must not be stored: `seed`
# now holds the seed value itself instead of None.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
seed = RANDOM_STATE

# Arguments forwarded to `h_partitioned` by the cells below.
# NOTE(review): presumably n = number of qubits, d = circuit depth and
# n_part = number of partitions -- confirm against the h_partitioned module.
n = 4
d = 2
n_part = 2

# One random starting value in [0, 2*pi) for each of the n*d*3 parameters.
init_theta = 2*np.pi*np.random.random(n*d*3)
print('Initial parameters: '+ str(init_theta))

Initial parameters: [2.35330497 5.97351416 4.59925358 3.76148219 0.98029403 0.98014248
 0.3649501  5.44234523 3.77691701 4.44895122 0.12933619 6.09412333
 5.23039137 1.33416598 1.14243996 1.15236452 1.91161039 3.2971419
 2.71399059 1.82984665 3.84438512 0.87646578 1.83559896 2.30191935]


#### training

In [52]:
def MSE_loss(theta, data, labels):
    """Loss of the classifier with parameters ``theta`` on a data set.

    Labels equal to 0 are mapped to the target -1 and every other label to
    +1. The function then evaluates ``h_partitioned`` on each sample and
    returns the Euclidean norm of the vector ``prediction - target``.
    Despite the name, the value is the L2 norm of the errors, not their
    mean square.

    The circuit settings ``n``, ``n_part``, ``d`` and ``shots`` are read from
    the notebook's global namespace at call time.

    Parameters
    ----------
    theta : array-like
        Parameters forwarded to ``h_partitioned``.
    data : indexable
        Samples; ``data[k]`` belongs to ``labels[k]``.
    labels : sequence
        Class labels.

    Returns
    -------
    float
        ``np.linalg.norm`` of the per-sample errors.
    """
    n_samples = len(labels)

    # Encode the labels as -1 / +1 targets.
    targets = [-1 if labels[k] == 0 else 1 for k in range(n_samples)]

    # One classifier evaluation per sample.
    outputs = [
        h_partitioned(data[k], n, n_part, d, shots, theta)
        for k in range(n_samples)
    ]

    residuals = [output - target for output, target in zip(outputs, targets)]

    return np.linalg.norm(residuals)

In [53]:
# Batch-wise training: every iteration optimises the parameters on one
# training batch, starting from the parameters found on the previous batch.
epochs = n_batches   # one pass per batch
shots = 1024         # forwarded to h_partitioned (also read by MSE_loss)
theta = init_theta
thetas = []                 # optimised parameters after each batch
predictions = []            # training predictions of each batch
training_accuracies = []
testing_accuracies = []

for i in tqdm(range(0, epochs)):

    data = X_batches[i]
    labels = y_batches[i]

    # Optimise ONCE per batch. The objective already covers the whole batch,
    # so running the optimiser inside the per-sample loop (as was done
    # before) repeated the same optimisation len(data) times from the same
    # starting point and only kept the last result.
    objective_function = lambda theta: MSE_loss(theta, data, labels)
    optimizer = COBYLA(maxiter=100)
    theta_opt = optimizer.minimize(objective_function, theta).x

    # Evaluate the batch with the optimised parameters, so that training and
    # testing accuracies refer to the same parameters (the cross-validation
    # cell of this notebook follows the same convention).
    prediction = np.zeros(len(labels))
    for j in range(0, len(data)):
        prediction[j] = h_partitioned(data[j], n, n_part, d, shots, theta_opt)

    h_subtest = np.zeros(len(y_test))
    for j in range(0, len(y_test)):
        h_subtest[j] = h_partitioned(X_test[j], n, n_part, d, shots, theta_opt)

    # Labels {0, 1} are mapped to {-1, +1} via 2*y - 1; a wrong prediction
    # then contributes |difference| = 2, hence the division by 2.
    train_result = 1 - ( ((sum(np.abs(2*labels-1-prediction)))/2) / len(labels) )
    test_result = 1 -  ( ((sum(np.abs(2*y_test-1-h_subtest)))/2) / len(y_test) )
    training_accuracies.append(train_result)
    testing_accuracies.append(test_result)

    thetas.append(theta_opt)
    predictions.append(prediction)
    # Warm start for the next batch.
    theta = theta_opt

  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Per-batch accuracies collected by the training loop, plus their means.
print(f'Training accuracies: {training_accuracies}')
print(f'Training mean: {np.mean(training_accuracies)}')
print(f'Testing accuracies: {testing_accuracies}')
print(f'Testing mean: {np.mean(testing_accuracies)}')

In [None]:
# Learning curve: training vs. testing accuracy per training iteration,
# saved to 'BC_learning_partitioned.pdf'.
fig, ax = plt.subplots()
ax.plot(training_accuracies, color='blue', label='training')
ax.plot(testing_accuracies, color='red', linestyle=':',label='testing')
ax.set_xlabel('epochs')
ax.set_ylabel('accuracies')
ax.legend(loc=0, frameon=False)
fig.savefig('BC_learning_partitioned.pdf')
plt.show()

#### testing

In [None]:
# Flatten the per-batch training predictions into one list; the batches were
# cut from X_train in order, so the result lines up with y_train.
tmp = [value for batch_prediction in predictions for value in batch_prediction]

# Labels {0, 1} -> {-1, +1}; each wrong prediction contributes 2 to the sum.
train_errors = np.abs(2*y_train - 1 - tmp)
accuracy_train = 1 - (sum(train_errors) / 2) / len(y_train)
print(f'Training accuracy: {accuracy_train}')

In [None]:
# Classify every test sample with the most recently optimised parameters.
h_test = [
    h_partitioned(X_test[i], n, n_part, d, shots, theta_opt)
    for i in range(len(y_test))
]

# Labels {0, 1} -> {-1, +1}; each wrong prediction contributes 2 to the sum.
test_errors = np.abs(2*y_test - 1 - h_test)
accuracy_test = 1 - (sum(test_errors) / 2) / len(y_test)
print(f'Testing accuracy: {accuracy_test}')

### Cross-Validation

### split

In [None]:
# Number of cross-validation folds; the data must split into k equal folds.
k = 10
assert len(X)%k == 0
# Samples per fold.
ele_per_split = len(X) // k

### training

In [None]:
# k-fold cross-validation. `theta` is never updated inside the loop, so
# every fold is optimised independently from the same starting point.
theta = init_theta
k_thetas = []
k_training_accuracies = []
k_testing_accuracies = []

for i in tqdm(range(k)):

    # Hold out the i-th fold, then preprocess this particular split.
    fold = k_fold_split(X, y, ele_per_split, i)
    k_X_train, k_X_test, k_y_train, k_y_test = PCA1(*fold, n_dimensions)

    # Fit the parameters on the training part of the fold.
    objective_function = lambda params: MSE_loss(params, k_X_train, k_y_train)
    optimizer = COBYLA(maxiter=100)
    theta_opt = optimizer.minimize(objective_function, theta).x
    k_thetas.append(theta_opt)

    # Classify both parts of the fold with the optimised parameters.
    k_train_predictions = np.zeros(len(k_y_train))
    for j in range(len(k_y_train)):
        k_train_predictions[j] = h_partitioned(k_X_train[j], n, n_part, d, shots, theta_opt)

    k_test_predictions = np.zeros(len(k_y_test))
    for j in range(len(k_y_test)):
        k_test_predictions[j] = h_partitioned(k_X_test[j], n, n_part, d, shots, theta_opt)

    # Labels {0, 1} -> {-1, +1}; each wrong prediction contributes 2.
    train_errors = np.abs(2*k_y_train - 1 - k_train_predictions)
    test_errors = np.abs(2*k_y_test - 1 - k_test_predictions)
    k_train_result = 1 - (sum(train_errors) / 2) / len(k_y_train)
    k_test_result = 1 - (sum(test_errors) / 2) / len(k_y_test)

    k_training_accuracies.append(k_train_result)
    k_testing_accuracies.append(k_test_result)

In [None]:
# Per-fold accuracies collected by the cross-validation loop, plus means.
print(f'Training accuracies: {k_training_accuracies}')
print(f'Training mean: {np.mean(k_training_accuracies)}')
print(f'Testing accuracies: {k_testing_accuracies}')
print(f'Testing mean: {np.mean(k_testing_accuracies)}')

In [None]:
# Training vs. testing accuracy for every cross-validation fold, saved to
# 'BC_cv_partitioned.pdf'.
fig, ax = plt.subplots()
ax.plot(k_training_accuracies, color='blue', label='training')
ax.plot(k_testing_accuracies, color='red', linestyle=':',label='testing')
ax.set_xlabel('k')
ax.set_ylabel('accuracies')
ax.legend(loc=0, frameon=False)
fig.savefig('BC_cv_partitioned.pdf')
plt.show()

### validation

In [None]:
# Keep the parameters of the fold with the highest testing accuracy
# (the first such fold when several are tied).
max_testing_accuracy = max(k_testing_accuracies)
index = k_testing_accuracies.index(max_testing_accuracy)
k_theta_opt = k_thetas[index]

# NOTE(review): these are the same first `used_points` samples that the
# cross-validation was run on, so this is not a held-out validation set.
dataset = load_breast_cancer()
X_val, y_val = dataset.data[:used_points], dataset.target[:used_points]

# Preprocessing fitted on the validation samples themselves.
X_val, y_val = PCA2(X_val, y_val, n_dimensions)

# Classify every sample with the selected parameters.
h_val = np.zeros(len(y_val))
for row in range(len(y_val)):
    h_val[row] = h_partitioned(X_val[row], n, n_part, d, shots, k_theta_opt)

# Labels {0, 1} -> {-1, +1}; each wrong prediction contributes 2 to the sum.
val_errors = np.abs(2*y_val - 1 - h_val)
validation_accuracy = 1 - (sum(val_errors) / 2) / len(y_val)

print(f"Optimal parameters: {k_theta_opt}")
print(f"Validation accuracy: {validation_accuracy}")

In [None]:
# Convert the classifier outputs back to data-set labels.
# Everywhere else in this notebook (MSE_loss and the accuracy formulas using
# 2*y - 1) an output of +1 stands for label 1 and any other output for
# label 0. The previous mapping was inverted (+1 -> 0), which swapped the
# predicted classes in the confusion matrix relative to the reported
# accuracies.
y_val_pred = np.where(h_val == 1, 1, 0)

# Row-normalised confusion matrix of the validation predictions, saved to
# 'BC_cf_partitioned.pdf'.
skplt.metrics.plot_confusion_matrix(y_val, y_val_pred, normalize=True, title = 'Breast Cancer (after cross-validation)');
plt.savefig('BC_cf_partitioned.pdf')