In [1]:
from typing import Tuple
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.metrics import recall_score, precision_score, confusion_matrix, ConfusionMatrixDisplay, classification_report, accuracy_score, f1_score
from sklearn.utils import resample
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from qiskit.circuit.library import ZZFeatureMap
from qiskit_machine_learning.kernels import FidelityQuantumKernel

In [2]:
data = pd.read_csv("/Users/miloszglowacki/Desktop/uam/ibm/algo/Project_9/dataset/creditcard.csv")

data = resample(data, n_samples=200, replace=False, stratify=data["Class"], random_state=0)

In [3]:
print("DATA COLUMNS:", data.columns)
print("\nDATA SIZE:", data.shape)
print("\nFIRST 5 ROWS:", data.head)
print(data["Class"].value_counts())

DATA COLUMNS: Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')

DATA SIZE: (200, 31)

FIRST 5 ROWS: <bound method NDFrame.head of             Time        V1        V2        V3        V4        V5        V6  \
266543  162374.0  1.915075 -0.263068 -1.765741  0.332474  0.033409 -0.754076   
102875   68408.0 -0.328174  0.719165  1.111028 -0.434435 -0.085913 -0.888919   
198910  132709.0  2.062016  0.017803 -1.041502  0.409546 -0.064367 -1.196602   
91346    63424.0 -0.531320  0.667302  1.858070 -0.530679  0.303545  0.077263   
5489      5528.0 -1.062678  0.977074  1.922641  0.026997  0.449646 -0.288029   
...          ...       ...       ...       ...       ...       ...       ...   
247071  153446.0 -1.103430  1.121015  1.295806 -0.616834  0.100726  0.536032   
176968  122985

In [4]:
def prepare_data(
    data: pd.DataFrame,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    X_test = data
    X_train = data[data["Class"] != 1]

    y_test = X_test["Class"]
    y_train = X_train["Class"]

    X_test = X_test.drop(columns=["Class"])
    X_train = X_train.drop(columns=["Class"])

    scaler = MinMaxScaler()
    scaler.fit(X_train)
    X_transformed_train = scaler.transform(X_train)
    X_transformed_test = scaler.transform(X_test)

    pca = PCA(n_components=30)
    pca.fit(X_transformed_train)
    X_transormed_train = pca.transform(X_transformed_train)
    X_transformed_test = pca.transform(X_transformed_test)

    return (X_transormed_train, y_train, X_transformed_test, y_test, scaler, pca)


def prepare_data_supervised(data: pd.DataFrame, test_size=0.3, random_state=0, pca_components=5):
    X = data.drop(columns=["Class"])
    y = data["Class"]

    X_train_df, X_test_df, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train_df)
    X_test_scaled = scaler.transform(X_test_df)

    pca = PCA(n_components=pca_components, random_state=random_state)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    # dopasowanie pod feature map: [0, pi]
    fm_scaler = MinMaxScaler(feature_range=(0, np.pi))
    X_train_final = fm_scaler.fit_transform(X_train_pca)
    X_test_final = fm_scaler.transform(X_test_pca)

    return X_train_final, y_train, X_test_final, y_test, scaler, pca, fm_scaler


def make_qsvc_dataset(df, n_norm=200, n_anom=50, random_state=0):
    df0 = df[df["Class"] == 0]
    df1 = df[df["Class"] == 1]
    if len(df1) == 0:
        raise ValueError("Brak anomalii w danych wej≈õciowych (Class==1).")

    n_anom = min(n_anom, len(df1))
    df1s = df1.sample(n=n_anom, random_state=random_state)
    df0s = df0.sample(n=n_norm, random_state=random_state)

    out = pd.concat([df0s, df1s], axis=0).sample(frac=1.0, random_state=random_state).reset_index(drop=True)
    return out

In [5]:
def eval_binary(y_true, y_pred, name="model"):
    # metryki liczone dla klasy pozytywnej = 1 (anomalia)
    p = precision_score(y_true, y_pred, pos_label=1, zero_division=0)
    r = recall_score(y_true, y_pred, pos_label=1, zero_division=0)
    f1 = f1_score(y_true, y_pred, pos_label=1, zero_division=0)
    acc = accuracy_score(y_true, y_pred)

    print(f"\n[{name}] accuracy={acc:.4f}  precision(1)={p:.4f}  recall(1)={r:.4f}  f1(1)={f1:.4f}")
    print(confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred, digits=4, zero_division=0))


def train_classical_model(X_train, y_train, X_test, y_test):
    clf_svm = OneClassSVM(kernel="rbf", degree=3, gamma=0.1, nu=0.01)
    clf_svm.fit(X_train)

    # OneClassSVM: 1=inlier (normal), -1=outlier (anomalia)
    y_predict = clf_svm.predict(X_test)
    y_pred = pd.Series(y_predict).replace({1: 0, -1: 1}).to_numpy()

    eval_binary(y_test, y_pred, name="OneClassSVM")


def train_qsvc(X_train, y_train, X_test, y_test, n_qubits=5, reps=2, entanglement="full"):
    feature_map = ZZFeatureMap(feature_dimension=n_qubits, reps=reps, entanglement=entanglement)
    quantum_kernel = FidelityQuantumKernel(feature_map=feature_map)

    K_train = quantum_kernel.evaluate(x_vec=X_train)
    model = SVC(kernel="precomputed")
    model.fit(K_train, y_train)

    K_test = quantum_kernel.evaluate(x_vec=X_test, y_vec=X_train)
    y_pred = model.predict(K_test)

    eval_binary(y_test, y_pred, name="QSVC (quantum kernel)")
    return model

In [6]:
(X_train, y_train, X_test, y_test, _, _) = prepare_data(data)

print("TRAIN", X_train[:5])
print("Y TRAIN", y_train[:5])

print("TEST", X_test[:5])
print("Y TEST", len(y_test[y_test[:] == 1]))

train_classical_model(X_train, y_train, X_test, y_test)

TRAIN [[ 4.62640659e-01 -1.78552509e-01  1.53398396e-01  1.91905576e-01
   1.14524432e-01 -2.28885409e-01  1.94929162e-01 -4.50150017e-02
  -1.99579883e-01  9.28729950e-03  2.61412919e-01  2.42209439e-01
  -1.55531247e-01  8.17148144e-02 -3.68155701e-02 -7.33500010e-03
   2.38743727e-02  9.87869835e-02  1.09019084e-01  1.11290941e-02
  -2.10935428e-02  4.24538570e-02  1.73823210e-02 -6.16913797e-02
   5.29200009e-03  5.43937478e-03  5.06209405e-03 -3.94683602e-03
   1.20982405e-03 -1.02421152e-04]
 [-1.26115452e-01  1.93318791e-01 -7.27653214e-02  2.83918706e-01
  -1.81431060e-01 -7.26536000e-03  1.41405373e-01  4.09487685e-02
  -1.34734856e-01 -1.06933115e-01  2.38537191e-01 -2.11342934e-01
   1.65118131e-03 -1.81319034e-01 -1.31995573e-01 -2.13443300e-01
  -1.70524330e-01 -1.11624615e-01  8.40591652e-04  8.19683120e-02
  -7.38152554e-02  2.12223676e-02 -4.66683105e-02 -9.35273927e-02
   6.63427068e-02 -2.77331621e-02  8.83615684e-04 -2.12285322e-02
  -2.77758250e-04  5.33674354e-04]


In [15]:
data_full = pd.read_csv("/Users/miloszglowacki/Desktop/uam/ibm/algo/Project_9/dataset/creditcard.csv")
data_qsvc = make_qsvc_dataset(data_full, n_norm=200, n_anom=50, random_state=0)
print(data_qsvc["Class"].value_counts())

n_qubits = 2

X_train, y_train, X_test, y_test, scaler, pca, fm_scaler = prepare_data_supervised(
    data_qsvc,
    test_size=0.7,
    random_state=0,
    pca_components=n_qubits,
)

qsvc_model = train_qsvc(
    X_train, y_train,
    X_test, y_test,
    n_qubits=n_qubits,
    reps=2,
    entanglement="linear",
)

Class
0    200
1     50
Name: count, dtype: int64


  feature_map = ZZFeatureMap(feature_dimension=n_qubits, reps=reps, entanglement=entanglement)



[QSVC (quantum kernel)] accuracy=0.9371  precision(1)=1.0000  recall(1)=0.6857  f1(1)=0.8136
[[140   0]
 [ 11  24]]
              precision    recall  f1-score   support

           0     0.9272    1.0000    0.9622       140
           1     1.0000    0.6857    0.8136        35

    accuracy                         0.9371       175
   macro avg     0.9636    0.8429    0.8879       175
weighted avg     0.9417    0.9371    0.9325       175

