In [None]:
# End-to-end trainable quantum-kernel SVM with Qiskit + PyTorch
# - Learns SVM params (beta, b) and quantum feature map params theta
# - Backprop: parameter-shift through the kernel into the circuit
import math, numpy as np, torch
from dataclasses import dataclass
from typing import Tuple

from qiskit import QuantumCircuit
from qiskit.quantum_info import Statevector

from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# === Save processed dataset, splits, and pipeline ===
import os, json
import joblib


  from .autonotebook import tqdm as notebook_tqdm


In [3]:


# Set the path to the file you'd like to load
file_path = "Synthetic_Financial_datasets_log.csv"

# Load the latest version
df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "sriharshaeedala/financial-fraud-detection-dataset",
  file_path,
  # Provide any additional arguments like 
  # sql_query or pandas_kwargs. See the 
  # documenation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

print("First 5 records:", df.head())


  df = kagglehub.load_dataset(


First 5 records:    step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065             0.0             0.0        1               0  
3    C38997010         21182.0             0.0        1               0  
4  M1230701703             0.0             0.0        0               0  


In [4]:
# === Make everything numeric: cleaner + encoder ===
import re, warnings, numpy as np, pandas as pd
from dataclasses import dataclass
from typing import Optional, List, Dict, Tuple
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold

warnings.filterwarnings("ignore", category=FutureWarning)

# ---------- helpers ----------
def _snake(s: str) -> str:
    s = re.sub(r"[^\w]+", "_", s.strip())
    s = re.sub(r"_+", "_", s).strip("_")
    return s.lower()

def standardize_colnames(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    mapping = {c: _snake(str(c)) for c in df.columns}
    df.rename(columns=mapping, inplace=True)
    return df

def detect_label(df: pd.DataFrame, label_hint: Optional[str] = None) -> Optional[str]:
    if label_hint and label_hint in df.columns:
        return label_hint
    candidates = [
        "is_fraud","fraud","fraud_bool","fraudulent","class","target","label","y"
    ]
    for c in candidates:
        if c in df.columns:
            return c
    # fallback: any boolean-ish column with 2 unique values named like fraud
    for c in df.columns:
        if "fraud" in c and df[c].nunique(dropna=True) <= 2:
            return c
    return None

def coerce_numeric_strings(s: pd.Series) -> pd.Series:
    # strip currency, commas, spaces, and parentheses for negatives
    # e.g., "(1,234.50)" -> -1234.50, "$1,234" -> 1234
    x = s.astype(str)
    x = x.str.strip()
    x = x.str.replace(r"\(([^)]+)\)", r"-\1", regex=True)
    x = x.str.replace(r"[^\d\.\-eE]", "", regex=True)
    out = pd.to_numeric(x, errors="coerce")
    return out

def maybe_parse_datetime(col: pd.Series) -> Tuple[Optional[pd.Series], Dict[str, pd.Series]]:
    dt = pd.to_datetime(col, errors="coerce", utc=True, infer_datetime_format=True)
    ok_ratio = dt.notna().mean()
    if ok_ratio < 0.5:
        return None, {}
    dt = dt.dt.tz_convert("UTC")  # fixed tz baseline
    feats = {
        "year": dt.dt.year,
        "month": dt.dt.month,
        "day": dt.dt.day,
        "dayofweek": dt.dt.dayofweek,
        "hour": dt.dt.hour.fillna(0).astype("Int64"),
        "dayofyear": dt.dt.dayofyear,
        "is_month_end": dt.dt.is_month_end.astype("Int8"),
        "is_month_start": dt.dt.is_month_start.astype("Int8"),
        "is_weekend": dt.dt.dayofweek.isin([5,6]).astype("Int8"),
        "epoch_seconds": (dt.view("int64") // 10**9)
    }
    return dt, feats

def split_feature_types(df: pd.DataFrame, max_onehot_cardinality: int = 30) -> Dict[str, List[str]]:
    types = {"numeric": [], "categorical_low": [], "categorical_high": [], "boolean": []}
    for c in df.columns:
        s = df[c]
        if pd.api.types.is_bool_dtype(s) or (s.dropna().isin([0,1,True,False]).all() and s.nunique(dropna=True)<=2):
            types["boolean"].append(c)
        elif pd.api.types.is_numeric_dtype(s):
            types["numeric"].append(c)
        elif pd.api.types.is_object_dtype(s) or pd.api.types.is_categorical_dtype(s):
            nun = s.nunique(dropna=True)
            (types["categorical_low"] if nun <= max_onehot_cardinality else types["categorical_high"]).append(c)
        else:
            # treat the rest (e.g., Int64/boolean with NA) conservatively
            if s.nunique(dropna=True) <= max_onehot_cardinality:
                types["categorical_low"].append(c)
            else:
                types["categorical_high"].append(c)
    return types

def drop_obvious_ids(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
    id_like = [c for c in df.columns if re.search(r"(?:^|_)(id|uuid|guid|hash|account|acct|customer|merchant)(?:_|$)", c)]
    return df.drop(columns=id_like, errors="ignore"), id_like

# ---------- main numericizer ----------
@dataclass
class Numericized:
    X: np.ndarray
    y: Optional[np.ndarray]
    X_df: pd.DataFrame
    feature_names: List[str]
    label_name: Optional[str]
    pipeline: ColumnTransformer
    splits: Dict[str, List[str]]
    dropped_cols: List[str]

def make_all_numeric(
    df_raw: pd.DataFrame,
    label_hint: Optional[str] = None,
    max_onehot_cardinality: int = 30,
    clip_quantiles: Tuple[float,float] = (0.001, 0.999),
    scale_numeric: bool = True
) -> Numericized:
    df = standardize_colnames(df_raw).copy()
    df = df.drop_duplicates()
    
    # Try to parse datetimes & numeric-like strings in object columns
    new_cols = {}
    datetime_orig_cols = []
    for c in list(df.columns):
        if pd.api.types.is_object_dtype(df[c]):
            # datetime?
            _, dt_feats = maybe_parse_datetime(df[c])
            if dt_feats:
                datetime_orig_cols.append(c)
                for k,v in dt_feats.items():
                    new_cols[f"{c}__{k}"] = v
                continue
            # numeric-like strings?
            coerced = coerce_numeric_strings(df[c])
            if coerced.notna().mean() >= 0.5:
                df[c] = coerced
    if new_cols:
        for k,v in new_cols.items():
            df[k] = v
        df.drop(columns=datetime_orig_cols, inplace=True, errors="ignore")

    # Convert 'yes/no', 'true/false' strings to booleans if present
    for c in df.select_dtypes(include="object").columns:
        lc = df[c].str.lower()
        mask_bool = lc.isin(["true","false","yes","no","y","n","t","f"])
        if mask_bool.mean() > 0.8:
            df[c] = lc.map({"true":1,"false":0,"yes":1,"no":0,"y":1,"n":0,"t":1,"f":0}).astype("Int8")

    # Drop obvious ID/high-leakage identifiers
    df, dropped_ids = drop_obvious_ids(df)

    # Detect label (optional)
    y_name = detect_label(df, label_hint)
    y = None
    if y_name:
        y = df[y_name].copy()
        # if label still not numeric, coerce
        if not pd.api.types.is_numeric_dtype(y):
            if y.dropna().isin([0,1]).all():
                y = y.astype(int)
            else:
                # binary map if possible
                if y.nunique(dropna=True)==2:
                    y = pd.Series(pd.Categorical(y).codes, index=y.index)
                else:
                    # multi-class: ordinal codes
                    y = pd.Series(pd.Categorical(y).codes, index=y.index)
        df = df.drop(columns=[y_name])

    # Final type split
    splits = split_feature_types(df, max_onehot_cardinality=max_onehot_cardinality)

    # Optional winsorization/clipping for numeric to tame outliers
    if splits["numeric"]:
        q_lo = df[splits["numeric"]].quantile(clip_quantiles[0])
        q_hi = df[splits["numeric"]].quantile(clip_quantiles[1])
        df[splits["numeric"]] = np.clip(df[splits["numeric"]], q_lo, q_hi, axis=1)

    # Build preprocessing
    num_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        *([("scaler", StandardScaler())] if scale_numeric else [])
    ])

    bool_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent"))  # booleans already 0/1 mostly
    ])

    # OneHot for low-card; Ordinal for high-card to avoid huge dimensionality
    try:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

    cat_low_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", ohe)
    ])

    cat_high_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ord", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))
    ])

    transformers = []
    if splits["numeric"]: transformers.append(("num", num_pipe, splits["numeric"]))
    if splits["boolean"]: transformers.append(("bool", bool_pipe, splits["boolean"]))
    if splits["categorical_low"]: transformers.append(("cat_low", cat_low_pipe, splits["categorical_low"]))
    if splits["categorical_high"]: transformers.append(("cat_high", cat_high_pipe, splits["categorical_high"]))

    pre = ColumnTransformer(transformers, remainder="drop", sparse_threshold=0.0)

    pipeline = Pipeline([
        ("pre", pre),
        ("varth", VarianceThreshold(threshold=0.0))
    ])

    X = pipeline.fit_transform(df)

    # Construct feature names (best-effort)
    feat_names = []
    pre_fitted = pipeline.named_steps["pre"]
    for name, trans, cols in pre_fitted.transformers_:
        if name == "num" or name == "bool":
            feat_names.extend(cols)
        elif name == "cat_low":
            ohe_step = trans.named_steps["ohe"]
            cats = ohe_step.categories_
            for c_name, cat_vals in zip(cols, cats):
                feat_names.extend([f"{c_name}__{str(cv)}" for cv in cat_vals])
        elif name == "cat_high":
            feat_names.extend([f"{c}__ordinal" for c in cols])

    # Align names with VarianceThreshold drop
    kept_mask = pipeline.named_steps["varth"].get_support()
    if len(feat_names) == kept_mask.shape[0]:
        feat_names = [n for n, keep in zip(feat_names, kept_mask) if keep]

    X_df = pd.DataFrame(X, columns=[str(c) for c in feat_names])

    y_arr = y.to_numpy() if y is not None else None

    print(f"[clean] Rows: {len(df_raw)} -> {len(df)} after dedupe")
    print(f"[clean] Dropped ID-like cols: {dropped_ids}")
    print(f"[types] numeric={len(splits['numeric'])}, boolean={len(splits['boolean'])}, "
          f"cat_low={len(splits['categorical_low'])}, cat_high={len(splits['categorical_high'])}")
    print(f"[encode] Final X shape: {X_df.shape}, Label: {y_name!r}")

    return Numericized(
        X=X_df.to_numpy(),
        y=y_arr,
        X_df=X_df,
        feature_names=list(X_df.columns),
        label_name=y_name,
        pipeline=pre_fitted,
        splits=splits,
        dropped_cols=dropped_ids
    )




In [None]:
# ---------- usage ----------
num = make_all_numeric(df, label_hint=None, max_onehot_cardinality=30)

  dt = pd.to_datetime(col, errors="coerce", utc=True, infer_datetime_format=True)
  dt = pd.to_datetime(col, errors="coerce", utc=True, infer_datetime_format=True)
  dt = pd.to_datetime(col, errors="coerce", utc=True, infer_datetime_format=True)
  dt = pd.to_datetime(col, errors="coerce", utc=True, infer_datetime_format=True)
  dt = pd.to_datetime(col, errors="coerce", utc=True, infer_datetime_format=True)
  dt = pd.to_datetime(col, errors="coerce", utc=True, infer_datetime_format=True)


[clean] Rows: 6362620 -> 6362620 after dedupe
[clean] Dropped ID-like cols: []
[types] numeric=8, boolean=1, cat_low=1, cat_high=0
[encode] Final X shape: (6362620, 14), Label: 'isfraud'


AttributeError: 'Numericized' object has no attribute 'to_csv'

In [None]:


os.makedirs("processed", exist_ok=True)

# 1) Full processed table (features + optional label)
proc = num.X_df.copy()
if num.y is not None:
    proc[num.label_name or "label"] = num.y

# Parquet (smaller/faster) + CSV
proc.to_parquet("processed/financial_fraud_processed.parquet", index=False)  # pip install pyarrow if needed
proc.to_csv("processed/financial_fraud_processed.csv", index=False)

In [None]:
num = pd.read_csv("financial_fraud_numeric.csv")
num = num.head(1000)

In [None]:


# Example: ready for train/test split (uses detected label if present)
from sklearn.model_selection import train_test_split
X = num.X
y = num.y  # may be None if no label found

X_train, X_test, y_train, y_test = train_test_split(
    X, y if y is not None else np.zeros(len(X)),  # dummy if no label
    test_size=0.2, random_state=42, stratify=y if (y is not None and len(np.unique(y))>1) else None
)

print("Train shape:", X_train.shape, "| Test shape:", X_test.shape)

  dt = pd.to_datetime(col, errors="coerce", utc=True, infer_datetime_format=True)
  dt = pd.to_datetime(col, errors="coerce", utc=True, infer_datetime_format=True)
  dt = pd.to_datetime(col, errors="coerce", utc=True, infer_datetime_format=True)
  dt = pd.to_datetime(col, errors="coerce", utc=True, infer_datetime_format=True)
  dt = pd.to_datetime(col, errors="coerce", utc=True, infer_datetime_format=True)
  dt = pd.to_datetime(col, errors="coerce", utc=True, infer_datetime_format=True)


[clean] Rows: 6362620 -> 6362620 after dedupe
[clean] Dropped ID-like cols: []
[types] numeric=8, boolean=1, cat_low=1, cat_high=0
[encode] Final X shape: (6362620, 14), Label: 'isfraud'
Train shape: (5090096, 14) | Test shape: (1272524, 14)


In [None]:



torch.manual_seed(0)
np.random.seed(0)

# -----------------------------
# 1) Toy data (binary labels ±1)
# -----------------------------
X = num.X
y = num.y
# X, y = make_moons(n_samples=1000, noise=0.9, random_state=0)
X = StandardScaler().fit_transform(X).astype(np.float32)
y = 2.0*y.astype(np.float32) - 1.0  # {0,1} -> {-1,+1}

Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)
Xtr = torch.tensor(Xtr)            # [n_train, d]
Xte = torch.tensor(Xte)            # [n_test, d]
ytr = torch.tensor(ytr)            # [n_train]
yte = torch.tensor(yte)            # [n_test]

n_train, d = Xtr.shape
n_qubits = d
n_layers = 2   # feel free to increase (cost grows with params × kernel evals)

# ----------------------------------------------------
# 2) A parameterized Qiskit feature map  |phi_theta(x)>
#    - Fixed data encoders (RX(x_i), RZ(x_i^2))
#    - Trainable single-qubit layers + RZZ ring per layer
#    All trainables are pure rotation angles -> clean shift rule.
# ----------------------------------------------------
@dataclass
class AnsatzShape:
    n_layers: int
    n_qubits: int

    @property
    def sizes(self):
        # For each layer and qubit: RY(theta_y), RZ(theta_z)
        # And an RZZ entangler per (layer, qubit) along a ring
        return {
            "theta_y": (self.n_layers, self.n_qubits),
            "theta_z": (self.n_layers, self.n_qubits),
            "theta_zz": (self.n_layers, self.n_qubits),
        }

    @property
    def total_params(self):
        L, Q = self.n_layers, self.n_qubits
        return L * Q * 3

def unflatten_theta(theta_flat: np.ndarray, shape: AnsatzShape):
    L, Q = shape.n_layers, shape.n_qubits
    assert theta_flat.size == shape.total_params
    k = 0
    th_y = theta_flat[k:k+L*Q].reshape(L, Q); k += L*Q
    th_z = theta_flat[k:k+L*Q].reshape(L, Q); k += L*Q
    th_zz = theta_flat[k:k+L*Q].reshape(L, Q)
    return th_y, th_z, th_zz

def build_feature_map(x_vec: np.ndarray, theta_flat: np.ndarray, shape: AnsatzShape) -> QuantumCircuit:
    """Return a circuit preparing |phi_theta(x)> from |0...0>."""
    L, Q = shape.n_layers, shape.n_qubits
    th_y, th_z, th_zz = unflatten_theta(theta_flat, shape)

    qc = QuantumCircuit(Q)
    # Fixed data encoding
    for q in range(Q):
        qc.rx(float(x_vec[q]), q)     # RX(x_i)
        qc.rz(float(x_vec[q]**2), q)  # RZ(x_i^2)

    # Trainable layers
    for l in range(L):
        for q in range(Q):
            qc.ry(float(th_y[l, q]), q)
            qc.rz(float(th_z[l, q]), q)
        # Entangling ring
        for q in range(Q):
            r = (q + 1) % Q
            qc.rzz(float(th_zz[l, q]), q, r)
    return qc

def statevector_from_circuit(qc: QuantumCircuit) -> np.ndarray:
    return Statevector.from_instruction(qc).data  # complex vector (2^n,)

def batch_states(X_np: np.ndarray, theta_np: np.ndarray, shape: AnsatzShape) -> np.ndarray:
    """Return array [N, 2^n] of complex statevectors for all samples."""
    return np.stack([statevector_from_circuit(build_feature_map(x, theta_np, shape))
                     for x in X_np], axis=0)

def kernel_from_states(S: np.ndarray) -> np.ndarray:
    """K_ij = |<phi_i|phi_j>|^2 from S [N, D]."""
    G = S @ S.conj().T             # [N,N] complex Gram
    K = np.abs(G)**2               # fidelity kernel
    # Stabilizer to keep PSD and help optimization:
    K += 1e-6 * np.eye(K.shape[0], dtype=K.dtype)
    return K.real.astype(np.float32)

# ---------------------------------------------------------
# 3) Autograd: Quantum kernel forward + parameter-shift back
# ---------------------------------------------------------
class QKernelShift(torch.autograd.Function):
    @staticmethod
    def forward(ctx, X: torch.Tensor, theta: torch.Tensor, n_layers_q: int):
        # Save static config
        shape = AnsatzShape(n_layers=int(n_layers_q), n_qubits=X.shape[1])
        ctx.shape = shape

        # Compute kernel K(theta) on CPU with Qiskit
        X_np = X.detach().cpu().numpy()
        th_np = theta.detach().cpu().numpy()
        S = batch_states(X_np, th_np, shape)
        K = kernel_from_states(S)

        # For backward
        ctx.save_for_backward(X.detach(), theta.detach())
        return torch.from_numpy(K)  # [N,N] float32

    @staticmethod
    def backward(ctx, grad_output: torch.Tensor):
        # dL/dtheta = sum_ij (dL/dK_ij) * (dK_ij/dtheta)
        X, theta = ctx.saved_tensors
        shape: AnsatzShape = ctx.shape
        X_np = X.cpu().numpy()
        th_np = theta.cpu().numpy().copy()
        gK = grad_output.detach().cpu().numpy()

        shift = math.pi/2
        P = th_np.size
        grad_theta = np.zeros_like(th_np, dtype=np.float64)

        # Helper to compute K for a given theta
        def K_of(theta_vec: np.ndarray) -> np.ndarray:
            S = batch_states(X_np, theta_vec, shape)
            return kernel_from_states(S)

        # Precompute K at current theta? Not necessary for central diff, but optional.

        # Parameter-shift loop (exact for single-parameter rotation gates)
        for p in range(P):
            th_plus = th_np.copy();   th_plus[p]  += shift
            th_minus = th_np.copy();  th_minus[p] -= shift

            Kp = K_of(th_plus)
            Km = K_of(th_minus)
            dK = 0.5*(Kp - Km)       # (f(theta+π/2) - f(theta-π/2))/2

            grad_theta[p] = np.sum(gK * dK, dtype=np.float64)

        # No gradients for X (not learning the raw inputs)
        grad_X = None
        return grad_X, torch.from_numpy(grad_theta.astype(np.float32)), None

def quantum_kernel_matrix(X: torch.Tensor, theta: torch.Tensor, n_layers_q: int) -> torch.Tensor:
    return QKernelShift.apply(X, theta, n_layers_q)  # [N,N]

# --------------------------------------------
# 4) SVM-in-primal parameters (β in R^n, b∈R)
# --------------------------------------------
shape = AnsatzShape(n_layers=n_layers, n_qubits=n_qubits)
theta = torch.nn.Parameter(0.3*torch.randn(shape.total_params))  # trainable feature map
beta = torch.nn.Parameter(torch.zeros(n_train))                  # SVM coeffs
b = torch.nn.Parameter(torch.zeros(()))                          # bias

C = 5.0     # soft-margin weight
lr = 0.05
optim = torch.optim.Adam([theta, beta, b], lr=lr)

def svm_primal_loss(K: torch.Tensor, y: torch.Tensor, beta: torch.Tensor, b: torch.Tensor, C: float):
    # scores s = K β + b
    s = K @ beta + b                  # [n]
    hinge = torch.clamp(1.0 - y * s, min=0.0)
    reg = 0.5 * (beta @ (K @ beta))   # 0.5 * β^T K β
    return reg + C * hinge.mean(), s

# -----------------------------------------
# 5) Training loop (backprop end-to-end)
# -----------------------------------------
for step in range(200):
    optim.zero_grad()

    K_tr = quantum_kernel_matrix(Xtr, theta, n_layers)  # backpropagates via parameter-shift
    loss, scores = svm_primal_loss(K_tr, ytr, beta, b, C)

    loss.backward()
    optim.step()

    if (step+1) % 40 == 0:
        with torch.no_grad():
            # Evaluate cross-kernel K(Xte, Xtr) to classify test points
            # (no gradient needed)
            # Build states once per set for efficiency
            Xtr_np = Xtr.cpu().numpy()
            Xte_np = Xte.cpu().numpy()
            th_np = theta.detach().cpu().numpy()
            S_tr = batch_states(Xtr_np, th_np, shape)
            S_te = batch_states(Xte_np, th_np, shape)
            G_te_tr = S_te @ S_tr.conj().T
            K_te_tr = (np.abs(G_te_tr)**2).astype(np.float32)

            preds = np.sign(K_te_tr @ beta.detach().cpu().numpy() + b.detach().cpu().numpy())
            acc = (preds.squeeze() == yte.cpu().numpy()).mean()
        print(f"step {step+1:3d} | loss {loss.item():.4f} | test acc {acc*100:.1f}%")

print("Done.")


KeyboardInterrupt: 