
# Quantum machine learning using variational quantum circuits.

## Libraries

The libraries required to run this notebook.

In [1]:

from srcCode import dataManagement as dm
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss



## Handling the credit card dataset

### Loading the dataset

The first step is to load the dataset and split the content into feature space and class, so that it is easier to handle the information afterwards.

In [2]:

# The dataset lives outside the workspace, in a sibling folder named "datasets".
datafile_fraud = "../datasets/creditcard.csv"

# Column positions handed to dm.read_data: the feature range and the class column.
start_index, end_index, class_index = 1, 29, 30

xVals, yVals = dm.read_data(
    datafile_fraud,
    start_index=start_index,
    end_index=end_index,
    class_index=class_index,
    k_lines=4,
)

# Rows labelled as fraud (class == 1) and their index labels.
iFraud = yVals[yVals == 1]
indexesFraud = list(iFraud.index)

# Rows labelled as clean (class == 0) and their index labels.
iClean = yVals[yVals == 0]
indexesClean = list(iClean.index)

print("Fraud cases: ", len(indexesFraud))
print("Clean cases: ", len(indexesClean))

# Shift labels from {0, 1} to {-1, 1}; the float literal keeps the result as float64.
yVals = 2 * yVals - 1.0


            Time        V1        V2        V3        V4        V5        V6  \
0            0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388   
1            0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361   
...          ...       ...       ...       ...       ...       ...       ...   
284805  172788.0 -0.240440  0.530483  0.702510  0.689799 -0.377961  0.623708   
284806  172792.0 -0.533413 -0.189733  0.703337 -0.506271 -0.012546 -0.649617   

              V7        V8        V9       V10       V11       V12       V13  \
0       0.239599  0.098698  0.363787  0.090794 -0.551600 -0.617801 -0.991390   
1      -0.078803  0.085102 -0.255425 -0.166974  1.612727  1.065235  0.489095   
...          ...       ...       ...       ...       ...       ...       ...   
284805 -0.686180  0.679145  0.392087 -0.399126 -1.933849 -0.962886 -1.042082   
284806  1.577006 -0.414650  0.486180 -0.915427 -1.040458 -0.031513 -0.188093   

             V14       V15       V16  


### Preparing the train and test set

In order to balance the dataset, an oversampling technique (SMOTE) is used in combination with an undersampling one (NearMiss). SMOTE first synthesizes new minority-class (fraud) cases until they amount to 5% of the majority class, and NearMiss then undersamples the majority class (clean) down to that same count. The combination reaches a middle point where both classes are equal in size without forcing either technique to close the whole gap on its own.

In [3]:

# Hold out 25% of the samples for testing. All resampling below is applied to the
# training split only, so the test split keeps the original class distribution.
X_train, X_test, y_train, y_test = train_test_split(xVals, yVals, test_size = 0.25, random_state = 0)

# Labels were shifted to {-1, 1} when the dataset was loaded: 1 = fraud, -1 = clean.
# The messages therefore report the clean class as '-1', matching the value counted.
print("Before OverSampling, counts of label FRAUD('1') in the training set: {}".format((y_train == 1).sum()))
print("Before OverSampling, counts of label CLEAN('-1') in the training: {} \n".format((y_train == -1).sum()))

# Step 1 - oversample the minority class with SMOTE until the
# minority/majority ratio reaches 0.05.
# np.asarray replaces Series.ravel(), which is deprecated in recent pandas releases;
# the labels are already one-dimensional, so the resulting array is the same.
sm = SMOTE(sampling_strategy = 0.05, random_state = 6)
X_train_os, y_train_os = sm.fit_resample(X_train, np.asarray(y_train))

print('After OverSampling, the shape of train_X: {}'.format(X_train_os.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_os.shape))

print("After OverSampling, counts of label FRAUD('1'): {}".format((y_train_os == 1).sum()))
print("After OverSampling, counts of label CLEAN('-1'): {}".format((y_train_os == -1).sum()))

# Step 2 - undersample the majority class with NearMiss down to a 1:1 ratio.
nr = NearMiss(sampling_strategy = 1.0)
X_train_balanced, y_train_balanced = nr.fit_resample(X_train_os, np.asarray(y_train_os))

print('After Undersampling, the shape of train_X: {}'.format(X_train_balanced.shape))
print('After Undersampling, the shape of train_y: {} \n'.format(y_train_balanced.shape))

print("After Undersampling, counts of label FRAUD('1'): {}".format((y_train_balanced == 1).sum()))
print("After Undersampling, counts of label CLEAN('-1'): {}".format((y_train_balanced == -1).sum()))


Before OverSampling, counts of label FRAUD('1') in the training set: 372
Before OverSampling, counts of label CLEAN('0') in the training: 213233 

After OverSampling, the shape of train_X: (223894, 28)
After OverSampling, the shape of train_y: (223894,) 

After OverSampling, counts of label FRAUD('1'): 10661
After OverSampling, counts of label CLEAN('0'): 213233
After Undersampling, the shape of train_X: (21322, 28)
After Undersampling, the shape of train_y: (21322,) 

After Undersampling, counts of label FRAUD('1'): 10661
After Undersampling, counts of label CLEAN('0'): 10661


## Training the quantum circuit

In [4]:
# Sanity check: show the full label Series after the {0, 1} -> {-1, 1} shift applied at load time.
print(yVals)

0        -1.0
1        -1.0
2        -1.0
3        -1.0
4        -1.0
         ... 
284802   -1.0
284803   -1.0
284804   -1.0
284805   -1.0
284806   -1.0
Name: Class, Length: 284807, dtype: float64
