In [1]:
# End-to-end trainable quantum-kernel SVM with Qiskit + PyTorch
# - Learns SVM params (beta, b) and quantum feature map params theta
# - Backprop: parameter-shift through the kernel into the circuit
import math, numpy as np, torch
from dataclasses import dataclass
from typing import Tuple

from qiskit import QuantumCircuit
from qiskit.quantum_info import Statevector

from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# === Save processed dataset, splits, and pipeline ===
import os, json
import joblib

import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:


# # Set the path to the file you'd like to load
# file_path = "Synthetic_Financial_datasets_log.csv"

# # Load the latest version
# df = kagglehub.load_dataset(
#   KaggleDatasetAdapter.PANDAS,
#   "sriharshaeedala/financial-fraud-detection-dataset",
#   file_path,
#   # Provide any additional arguments like 
#   # sql_query or pandas_kwargs. See the 
#   # documenation for more information:
#   # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
# )

# print("First 5 records:", df.head())


In [3]:
# # === DataFrame-in / DataFrame-out cleaner ===
# import re, numpy as np, pandas as pd
# from typing import Optional, Tuple, Dict, List

# # -------- helpers --------
# def _snake(s: str) -> str:
#     s = re.sub(r"[^\w]+", "_", str(s).strip())
#     s = re.sub(r"_+", "_", s).strip("_")
#     return s.lower()

# def standardize_colnames(df: pd.DataFrame) -> pd.DataFrame:
#     return df.rename(columns={c: _snake(c) for c in df.columns})

# def coerce_numeric_strings(s: pd.Series) -> pd.Series:
#     x = s.astype(str).str.strip()
#     x = x.str.replace(r"\(([^)]+)\)", r"-\1", regex=True)  # "(1,234)" -> -1234
#     x = x.str.replace(r"[^\d\.\-eE]", "", regex=True)      # remove currency, commas, spaces
#     return pd.to_numeric(x, errors="coerce")

# def maybe_parse_datetime(col: pd.Series) -> Tuple[bool, Dict[str, pd.Series]]:
#     dt = pd.to_datetime(col, errors="coerce", utc=True, infer_datetime_format=True)
#     if dt.notna().mean() < 0.5:
#         return False, {}
#     feats = {
#         "year": dt.dt.year.astype("Int64"),
#         "month": dt.dt.month.astype("Int8"),
#         "day": dt.dt.day.astype("Int8"),
#         "dayofweek": dt.dt.dayofweek.astype("Int8"),
#         "hour": dt.dt.hour.fillna(0).astype("Int8"),
#         "dayofyear": dt.dt.dayofyear.astype("Int16"),
#         "is_month_end": dt.dt.is_month_end.astype("Int8"),
#         "is_month_start": dt.dt.is_month_start.astype("Int8"),
#         "is_weekend": dt.dt.dayofweek.isin([5,6]).astype("Int8"),
#         "epoch_seconds": (dt.view("int64") // 10**9).astype("Int64"),
#     }
#     return True, feats

# def drop_obvious_ids(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
#     id_like = [c for c in df.columns if re.search(r"(?:^|_)(id|uuid|guid|hash|account|acct|customer|merchant)(?:_|$)", c)]
#     return df.drop(columns=id_like, errors="ignore"), id_like

# def detect_label(df: pd.DataFrame, hint: Optional[str]) -> Optional[str]:
#     if hint and hint in df.columns: return hint
#     for c in ["is_fraud","fraud","fraud_bool","fraudulent","class","target","label","y"]:
#         if c in df.columns: return c
#     for c in df.columns:
#         if "fraud" in c and df[c].nunique(dropna=True) <= 2:
#             return c
#     return None

# def split_feature_types(df: pd.DataFrame, max_onehot: int = 30) -> Dict[str, List[str]]:
#     types = {"numeric": [], "boolean": [], "cat_low": [], "cat_high": []}
#     for c in df.columns:
#         s = df[c]
#         if pd.api.types.is_bool_dtype(s) or (s.dropna().isin([0,1,True,False]).all() and s.nunique(dropna=True) <= 2):
#             types["boolean"].append(c)
#         elif pd.api.types.is_numeric_dtype(s):
#             types["numeric"].append(c)
#         else:
#             nun = s.nunique(dropna=True)
#             (types["cat_low"] if nun <= max_onehot else types["cat_high"]).append(c)
#     return types

# # -------- main: DataFrame -> cleaned DataFrame --------
# def clean_to_dataframe(
#     df_raw: pd.DataFrame,
#     label_hint: Optional[str] = None,
#     max_onehot_cardinality: int = 30,
#     clip_quantiles: Tuple[float,float] = (0.001, 0.999),
#     scale_numeric: bool = False,    # set True to z-score numeric columns
#     keep_label: bool = True         # append label back as last column if detected
# ) -> pd.DataFrame:
#     df = standardize_colnames(df_raw).copy()
#     df = df.drop_duplicates()

#     # Parse object columns: datetimes or numeric-like strings; else leave for categorical
#     new_cols = {}
#     drop_cols = []
#     for c in list(df.columns):
#         if pd.api.types.is_object_dtype(df[c]):
#             ok_dt, feats = maybe_parse_datetime(df[c])
#             if ok_dt:
#                 for k, v in feats.items():
#                     new_cols[f"{c}__{k}"] = v
#                 drop_cols.append(c)
#             else:
#                 coerced = coerce_numeric_strings(df[c])
#                 if coerced.notna().mean() >= 0.5:
#                     df[c] = coerced

#     if new_cols:
#         for k, v in new_cols.items():
#             df[k] = v
#     if drop_cols:
#         df.drop(columns=drop_cols, inplace=True, errors="ignore")

#     # Bool-like strings → 0/1
#     for c in df.select_dtypes(include="object").columns:
#         lc = df[c].str.lower()
#         mask_bool = lc.isin(["true","false","yes","no","y","n","t","f"])
#         if mask_bool.mean() > 0.8:
#             df[c] = lc.map({"true":1,"false":0,"yes":1,"no":0,"y":1,"n":0,"t":1,"f":0}).astype("Int8")

#     # Drop obvious IDs
#     df, dropped_ids = drop_obvious_ids(df)

#     # Detect & sanitize label
#     y_name = detect_label(df, label_hint)
#     y = None
#     if y_name:
#         y = df[y_name].copy()
#         if not pd.api.types.is_numeric_dtype(y):
#             if y.dropna().isin([0,1]).all():
#                 y = y.astype("Int8")
#             elif y.nunique(dropna=True) == 2:
#                 y = pd.Series(pd.Categorical(y).codes, index=y.index).astype("Int8")
#             else:
#                 y = pd.Series(pd.Categorical(y).codes, index=y.index).astype("Int16")
#         df = df.drop(columns=[y_name])

#     # Fill missing in remaining object columns so encoders work
#     for c in df.select_dtypes(include="object").columns:
#         df[c] = df[c].fillna("<<missing>>")

#     # Split types
#     types = split_feature_types(df, max_onehot_cardinality)

#     # Impute numerics with median, clip outliers
#     if types["numeric"]:
#         num_med = df[types["numeric"]].median()
#         df[types["numeric"]] = df[types["numeric"]].fillna(num_med)
#         q_lo = df[types["numeric"]].quantile(clip_quantiles[0])
#         q_hi = df[types["numeric"]].quantile(clip_quantiles[1])
#         df[types["numeric"]] = np.clip(df[types["numeric"]], q_lo, q_hi, axis=1)

#     # Impute booleans with mode and cast to Int8
#     for c in types["boolean"]:
#         mode = df[c].mode(dropna=True)
#         fillv = int(mode.iloc[0]) if not mode.empty else 0
#         df[c] = df[c].fillna(fillv).astype("Int8")

#     # One-hot encode low-card categoricals
#     if types["cat_low"]:
#         dummies = pd.get_dummies(
#             df[types["cat_low"]].astype("category"),
#             prefix=[c for c in types["cat_low"]],
#             prefix_sep="__",
#             dtype="Int8"
#         )
#         df = df.drop(columns=types["cat_low"]).join(dummies)

#     # Ordinal encode high-card categoricals
#     for c in types["cat_high"]:
#         s = df[c].astype("category")
#         df[f"{c}__ordinal"] = s.cat.codes.astype("Int32")  # stable integer codes
#     if types["cat_high"]:
#         df = df.drop(columns=types["cat_high"])

#     # Optional scaling of numeric columns (z-score), booleans untouched
#     if scale_numeric and types["numeric"]:
#         num_cols = [c for c in df.columns if c in types["numeric"]]
#         df[num_cols] = (df[num_cols] - df[num_cols].mean()) / df[num_cols].std(ddof=0)

#     # Append label at the end if requested
#     if keep_label and y is not None:
#         df[y_name] = y.values

#     # Final guarantee: all numeric dtypes
#     for c in df.columns:
#         if pd.api.types.is_object_dtype(df[c]):
#             # any leftover object (shouldn't happen) → category codes
#             df[c] = df[c].astype("category").cat.codes.astype("Int32")

#     print(f"[clean] rows={len(df_raw)} → {len(df)} after dedupe")
#     print(f"[clean] dropped ids: {dropped_ids}")
#     print(f"[types] numeric={len(types['numeric'])}, boolean={len(types['boolean'])}, "
#           f"cat_low={len(types['cat_low'])}, cat_high={len(types['cat_high'])}")
#     print(f"[encode] final shape: {df.shape}, label: {y_name!r}")
#     return df




In [4]:
# # ---- usage ----
# clean_df = clean_to_dataframe(df, label_hint=None, max_onehot_cardinality=30, scale_numeric=False)
# print(clean_df.dtypes.head())
# print(clean_df.head())

In [5]:
# os.makedirs("processed", exist_ok=True)
# clean_df.to_csv("processed/financial_fraud_processed.csv", index=False)

In [6]:
num = pd.read_csv("processed/financial_fraud_processed.csv")
num = num.head(300)

In [7]:


# Example: ready for train/test split (uses detected label if present)
from sklearn.model_selection import train_test_split

label_name = "isfraud"
if label_name is not None:
    y = num[label_name].to_numpy()
    X = num.drop(columns=[label_name]).to_numpy()
else:
    y = None
    X = num.to_numpy()

X_train, X_test, y_train, y_test = train_test_split(
    X, y if y is not None else np.zeros(len(X)),  # dummy if no label
    test_size=0.2, random_state=42, stratify=y if (y is not None and len(np.unique(y))>1) else None
)

print("Train shape:", X_train.shape, "| Test shape:", X_test.shape)

Train shape: (240, 14) | Test shape: (60, 14)


In [None]:



torch.manual_seed(0)
np.random.seed(0)

# -----------------------------
# 1) Toy data (binary labels ±1)
# -----------------------------
# X, y = make_moons(n_samples=1000, noise=0.9, random_state=0)
X = StandardScaler().fit_transform(X).astype(np.float32)
y = 2.0*y.astype(np.float32) - 1.0  # {0,1} -> {-1,+1}

Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)
Xtr = torch.tensor(Xtr)            # [n_train, d]
Xte = torch.tensor(Xte)            # [n_test, d]
ytr = torch.tensor(ytr)            # [n_train]
yte = torch.tensor(yte)            # [n_test]

n_train, d = Xtr.shape
n_qubits = d
n_layers = 2   # feel free to increase (cost grows with params × kernel evals)

# ----------------------------------------------------
# 2) A parameterized Qiskit feature map  |phi_theta(x)>
#    - Fixed data encoders (RX(x_i), RZ(x_i^2))
#    - Trainable single-qubit layers + RZZ ring per layer
#    All trainables are pure rotation angles -> clean shift rule.
# ----------------------------------------------------
@dataclass
class AnsatzShape:
    n_layers: int
    n_qubits: int

    @property
    def sizes(self):
        # For each layer and qubit: RY(theta_y), RZ(theta_z)
        # And an RZZ entangler per (layer, qubit) along a ring
        return {
            "theta_y": (self.n_layers, self.n_qubits),
            "theta_z": (self.n_layers, self.n_qubits),
            "theta_zz": (self.n_layers, self.n_qubits),
        }

    @property
    def total_params(self):
        L, Q = self.n_layers, self.n_qubits
        return L * Q * 3

def unflatten_theta(theta_flat: np.ndarray, shape: AnsatzShape):
    L, Q = shape.n_layers, shape.n_qubits
    assert theta_flat.size == shape.total_params
    k = 0
    th_y = theta_flat[k:k+L*Q].reshape(L, Q); k += L*Q
    th_z = theta_flat[k:k+L*Q].reshape(L, Q); k += L*Q
    th_zz = theta_flat[k:k+L*Q].reshape(L, Q)
    return th_y, th_z, th_zz

def build_feature_map(x_vec: np.ndarray, theta_flat: np.ndarray, shape: AnsatzShape) -> QuantumCircuit:
    """Return a circuit preparing |phi_theta(x)> from |0...0>."""
    L, Q = shape.n_layers, shape.n_qubits
    th_y, th_z, th_zz = unflatten_theta(theta_flat, shape)

    qc = QuantumCircuit(Q)
    # Fixed data encoding
    for q in range(Q):
        qc.rx(float(x_vec[q]), q)     # RX(x_i)
        qc.rz(float(x_vec[q]**2), q)  # RZ(x_i^2)

    # Trainable layers
    for l in range(L):
        for q in range(Q):
            qc.ry(float(th_y[l, q]), q)
            qc.rz(float(th_z[l, q]), q)
        # Entangling ring
        for q in range(Q):
            r = (q + 1) % Q
            qc.rzz(float(th_zz[l, q]), q, r)
    return qc

def statevector_from_circuit(qc: QuantumCircuit) -> np.ndarray:
    return Statevector.from_instruction(qc).data  # complex vector (2^n,)

def batch_states(X_np: np.ndarray, theta_np: np.ndarray, shape: AnsatzShape) -> np.ndarray:
    """Return array [N, 2^n] of complex statevectors for all samples."""
    return np.stack([statevector_from_circuit(build_feature_map(x, theta_np, shape))
                     for x in X_np], axis=0)

def kernel_from_states(S: np.ndarray) -> np.ndarray:
    """K_ij = |<phi_i|phi_j>|^2 from S [N, D]."""
    G = S @ S.conj().T             # [N,N] complex Gram
    K = np.abs(G)**2               # fidelity kernel
    # Stabilizer to keep PSD and help optimization:
    K += 1e-6 * np.eye(K.shape[0], dtype=K.dtype)
    return K.real.astype(np.float32)

# ---------------------------------------------------------
# 3) Autograd: Quantum kernel forward + parameter-shift back
# ---------------------------------------------------------
class QKernelShift(torch.autograd.Function):
    @staticmethod
    def forward(ctx, X: torch.Tensor, theta: torch.Tensor, n_layers_q: int):
        # Save static config
        shape = AnsatzShape(n_layers=int(n_layers_q), n_qubits=X.shape[1])
        ctx.shape = shape

        # Compute kernel K(theta) on CPU with Qiskit
        X_np = X.detach().cpu().numpy()
        th_np = theta.detach().cpu().numpy()
        S = batch_states(X_np, th_np, shape)
        K = kernel_from_states(S)

        # For backward
        ctx.save_for_backward(X.detach(), theta.detach())
        return torch.from_numpy(K)  # [N,N] float32

    @staticmethod
    def backward(ctx, grad_output: torch.Tensor):
        # dL/dtheta = sum_ij (dL/dK_ij) * (dK_ij/dtheta)
        X, theta = ctx.saved_tensors
        shape: AnsatzShape = ctx.shape
        X_np = X.cpu().numpy()
        th_np = theta.cpu().numpy().copy()
        gK = grad_output.detach().cpu().numpy()

        shift = math.pi/2
        P = th_np.size
        grad_theta = np.zeros_like(th_np, dtype=np.float64)

        # Helper to compute K for a given theta
        def K_of(theta_vec: np.ndarray) -> np.ndarray:
            S = batch_states(X_np, theta_vec, shape)
            return kernel_from_states(S)

        # Precompute K at current theta? Not necessary for central diff, but optional.

        # Parameter-shift loop (exact for single-parameter rotation gates)
        for p in range(P):
            th_plus = th_np.copy();   th_plus[p]  += shift
            th_minus = th_np.copy();  th_minus[p] -= shift

            Kp = K_of(th_plus)
            Km = K_of(th_minus)
            dK = 0.5*(Kp - Km)       # (f(theta+π/2) - f(theta-π/2))/2

            grad_theta[p] = np.sum(gK * dK, dtype=np.float64)

        # No gradients for X (not learning the raw inputs)
        grad_X = None
        return grad_X, torch.from_numpy(grad_theta.astype(np.float32)), None

def quantum_kernel_matrix(X: torch.Tensor, theta: torch.Tensor, n_layers_q: int) -> torch.Tensor:
    return QKernelShift.apply(X, theta, n_layers_q)  # [N,N]

# --------------------------------------------
# 4) SVM-in-primal parameters (β in R^n, b∈R)
# --------------------------------------------
shape = AnsatzShape(n_layers=n_layers, n_qubits=n_qubits)
theta = torch.nn.Parameter(0.3*torch.randn(shape.total_params))  # trainable feature map
beta = torch.nn.Parameter(torch.zeros(n_train))                  # SVM coeffs
b = torch.nn.Parameter(torch.zeros(()))                          # bias

C = 5.0     # soft-margin weight
lr = 0.05
optim = torch.optim.Adam([theta, beta, b], lr=lr)

def svm_primal_loss(K: torch.Tensor, y: torch.Tensor, beta: torch.Tensor, b: torch.Tensor, C: float):
    # scores s = K β + b
    s = K @ beta + b                  # [n]
    hinge = torch.clamp(1.0 - y * s, min=0.0)
    reg = 0.5 * (beta @ (K @ beta))   # 0.5 * β^T K β
    return reg + C * hinge.mean(), s

# -----------------------------------------
# 5) Training loop (backprop end-to-end)
# -----------------------------------------
for step in range(200):
    optim.zero_grad()

    K_tr = quantum_kernel_matrix(Xtr, theta, n_layers)  # backpropagates via parameter-shift
    loss, scores = svm_primal_loss(K_tr, ytr, beta, b, C)

    loss.backward()
    optim.step()

    if (step+1) % 1 == 0:
        with torch.no_grad():
            # Evaluate cross-kernel K(Xte, Xtr) to classify test points
            # (no gradient needed)
            # Build states once per set for efficiency
            Xtr_np = Xtr.cpu().numpy()
            Xte_np = Xte.cpu().numpy()
            th_np = theta.detach().cpu().numpy()
            S_tr = batch_states(Xtr_np, th_np, shape)
            S_te = batch_states(Xte_np, th_np, shape)
            G_te_tr = S_te @ S_tr.conj().T
            K_te_tr = (np.abs(G_te_tr)**2).astype(np.float32)

            preds = np.sign(K_te_tr @ beta.detach().cpu().numpy() + b.detach().cpu().numpy())
            acc = (preds.squeeze() == yte.cpu().numpy()).mean()
        print(f"step {step+1:3d} | loss {loss.item():.4f} | test acc {acc*100:.1f}%")

print("Done.")


step   1 | loss 5.0000 | test acc 98.9%
step   2 | loss 9.6139 | test acc 98.9%
step   3 | loss 4.2449 | test acc 98.9%
step   4 | loss 3.8314 | test acc 23.3%
step   5 | loss 5.9280 | test acc 56.7%
step   6 | loss 5.0442 | test acc 98.9%
step   7 | loss 2.7819 | test acc 98.9%
step   8 | loss 2.4403 | test acc 98.9%
step   9 | loss 3.7748 | test acc 98.9%
step  10 | loss 3.3955 | test acc 98.9%
step  11 | loss 1.9303 | test acc 98.9%
step  12 | loss 1.5944 | test acc 98.9%
step  13 | loss 2.3226 | test acc 98.9%
step  14 | loss 2.1537 | test acc 98.9%
step  15 | loss 1.1593 | test acc 98.9%
step  16 | loss 1.1328 | test acc 98.9%
step  17 | loss 1.6699 | test acc 98.9%
step  18 | loss 1.5950 | test acc 98.9%
step  19 | loss 0.9840 | test acc 98.9%
step  20 | loss 0.4177 | test acc 98.9%
step  21 | loss 1.0820 | test acc 98.9%
step  22 | loss 1.1234 | test acc 98.9%
step  23 | loss 0.1975 | test acc 98.9%
step  24 | loss 0.3473 | test acc 98.9%
step  25 | loss 0.4888 | test acc 98.9%
