In [1]:
!pip install pennylane scikit-learn numpy pandas joblib




In [8]:
#!/usr/bin/env python3
"""
qsvm_spam_ham_fixed.py

QSVM pipeline (quantum-kernel SVM) for spam/ham classification with a robust,
non-differentiable randomized search for feature-map parameter tuning (fixes
autograd vdot VJP issues).

Save and run:
    pip install pennylane scikit-learn numpy pandas joblib
    python qsvm_spam_ham_fixed.py

This script:
 - Loads your uploaded datasets (/mnt/data/spam_ham_dataset.csv and /mnt/data/spam.csv if present)
 - TF-IDF -> PCA -> scale to [-pi, pi] for AngleEmbedding
 - Defines a parameterized quantum feature map that returns statevectors
 - Uses randomized search (and optional local Nelder-Mead refine) to maximize kernel-target alignment
 - Computes fidelity kernel |<psi(x)|psi(z)>|^2 and trains an SVM on the precomputed kernel
 - Saves artifacts under ./artifacts/
"""
import os
import time
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Pennylane
try:
    import pennylane as qml
    from pennylane import numpy as qnp
except Exception as e:
    raise ImportError("This script requires pennylane. Install with: pip install pennylane") from e

# ----------------------------
# USER CONFIG
# ----------------------------
DATA_PATHS = ["spam_ham_dataset.csv", "spam.csv"]
ARTIFACTS_DIR = "artifacts"
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

RND = 42
np.random.seed(RND)
try:
    qnp.random.seed(RND)
except Exception:
    pass

n_qubits = 6                # number of qubits (PCA components)
feature_map_layers = 3      # parameterized layers in the feature map
random_search_trials = 40   # number of random samples to try during search
random_search_subsample = 300  # # train samples used to evaluate alignment per trial
local_refine = True         # if True, attempts Nelder-Mead refine (requires scipy)
local_refine_iters = 300

tfidf_max_features = 2000
pca_components = n_qubits

device_name = "default.qubit"  # use statevector simulator for kernel fidelity
svm_C_grid = [0.1, 1, 10, 50]

# ----------------------------
# Data load & preprocessing
# ----------------------------
def load_and_merge(paths):
    frames = []
    for p in paths:
        if not os.path.exists(p):
            print(f"Info: {p} not found, skipping.")
            continue
        try:
            df = pd.read_csv(p, encoding="utf-8", low_memory=False)
        except Exception:
            df = pd.read_csv(p, encoding="latin-1", low_memory=False)

        if "text" in df.columns and ("label" in df.columns or "label_num" in df.columns):
            if "label_num" not in df.columns:
                df["label_num"] = df["label"].map(lambda s: 1 if str(s).strip().lower().startswith("spam") else 0)
            frames.append(df[["text", "label_num"]].rename(columns={"label_num":"label"}))
        elif "v2" in df.columns and "v1" in df.columns:
            frames.append(pd.DataFrame({
                "text": df["v2"].astype(str),
                "label": df["v1"].apply(lambda s: 1 if str(s).strip().lower()=="spam" else 0)
            }))
        else:
            cols = list(df.columns)
            if len(cols) >= 2:
                temp = df[[cols[0], cols[1]]].rename(columns={cols[0]:"text", cols[1]:"label"})
                try:
                    temp["label"] = temp["label"].astype(int)
                except Exception:
                    temp["label"] = temp["label"].apply(lambda s: 1 if str(s).strip().lower().startswith("spam") else 0)
                frames.append(temp)
    if not frames:
        raise FileNotFoundError(f"No suitable dataset found among {paths}")
    df_all = pd.concat(frames, ignore_index=True)
    df_all["text"] = df_all["text"].astype(str).str.strip()
    df_all = df_all.drop_duplicates(subset=["text"])
    df_all = df_all.dropna(subset=["text","label"])
    df_all["label"] = df_all["label"].astype(int)
    print("Merged dataset:", df_all.shape)
    return df_all

def vectorize_and_split(df):
    texts = df["text"].values
    labels = df["label"].values
    X_train_txt, X_test_txt, y_train, y_test = train_test_split(texts, labels, test_size=0.20, stratify=labels, random_state=RND)
    X_train_txt, X_val_txt, y_train, y_val = train_test_split(X_train_txt, y_train, test_size=0.1765, stratify=y_train, random_state=RND)
    tfv = TfidfVectorizer(max_features=tfidf_max_features, stop_words="english")
    X_train_tfidf = tfv.fit_transform(X_train_txt).toarray()
    X_val_tfidf = tfv.transform(X_val_txt).toarray()
    X_test_tfidf = tfv.transform(X_test_txt).toarray()
    joblib.dump(tfv, os.path.join(ARTIFACTS_DIR, "tfidf_vectorizer.joblib"))
    print("TF-IDF shapes:", X_train_tfidf.shape, X_val_tfidf.shape, X_test_tfidf.shape)
    return X_train_tfidf, X_val_tfidf, X_test_tfidf, y_train, y_val, y_test, X_train_txt, X_val_txt, X_test_txt

# ----------------------------
# Quantum feature map qnode (returns statevector)
# ----------------------------
dev = qml.device(device_name, wires=n_qubits)

@qml.qnode(dev, interface="autograd")
def state_circuit(params, x):
    """
    params: array shape (feature_map_layers, n_qubits)
    x: array shape (n_qubits,) scaled into [-pi, pi]
    returns: statevector (complex numpy array) via qml.state()
    """
    # embed classical features
    qml.templates.AngleEmbedding(x, wires=range(n_qubits), rotation="X")
    for l in range(params.shape[0]):
        # trainable single qubit rotations (RY)
        for w in range(n_qubits):
            qml.RY(params[l, w], wires=w)
        # entangling ring
        for w in range(n_qubits):
            qml.CNOT(wires=[w, (w + 1) % n_qubits])
    return qml.state()

# ----------------------------
# Kernel utilities (numpy-based; no autograd)
# ----------------------------
def compute_states(params, X):
    """Compute statevectors for all rows in X (X is numpy array shape (N, n_qubits))."""
    states = []
    for x in X:
        st = state_circuit(params, x)  # returns pennylane numpy array, convert to np.array
        states.append(np.array(st))
    return np.array(states)  # shape (N, dim) complex

def fidelity_kernel_from_states(states_A, states_B=None):
    """Compute fidelity kernel K_ij = |<a_i|b_j>|^2 using numpy (real float)."""
    if states_B is None:
        states_B = states_A
    N = states_A.shape[0]
    M = states_B.shape[0]
    K = np.zeros((N, M), dtype=float)
    for i in range(N):
        for j in range(M):
            ov = np.vdot(states_A[i], states_B[j])
            K[i, j] = np.abs(ov) ** 2
    return K

# ----------------------------
# Kernel-target alignment (numpy) and randomized search optimizer
# ----------------------------
def kernel_target_alignment_score_from_params(params, X_subset, y_subset):
    """
    Compute kernel-target alignment for given params on subset.
    y_subset: {0,1} -> converted to +/-1 internally.
    Returns alignment (float).
    """
    states = compute_states(params, X_subset)
    K = fidelity_kernel_from_states(states)
    y_pm1 = 2 * np.array(y_subset) - 1
    yyT = np.outer(y_pm1, y_pm1)
    numerator = np.sum(K * yyT)
    denom = np.linalg.norm(K) * (np.linalg.norm(y_pm1) ** 2 + 1e-12)
    alignment = numerator / (denom + 1e-12)
    return float(alignment)

def optimize_feature_map_random_search(X_train_scaled, y_train,
                                       init_params=None,
                                       n_trials=40,
                                       subsample=300,
                                       random_state=RND,
                                       local_refine=False):
    """
    Randomized search for best feature-map params by maximizing kernel-target alignment
    on a random subset (uses numpy-based evaluation; no autograd).
    If local_refine True, attempts Nelder-Mead refine around the best candidate (requires scipy).
    """
    rng = np.random.RandomState(random_state)
    n_train = X_train_scaled.shape[0]
    subs_idx = rng.choice(n_train, min(subsample, n_train), replace=False)
    X_sub = X_train_scaled[subs_idx]
    y_sub = np.array(y_train)[subs_idx]

    best_score = -np.inf
    best_params = None

    # scale for random sampling around init
    base_sigma = 0.5 if init_params is None else 0.2

    print(f"Random search: {n_trials} trials, subset size {len(subs_idx)}")
    for t in range(n_trials):
        if init_params is None:
            cand = 0.5 * rng.randn(feature_map_layers, n_qubits)
        else:
            cand = np.array(init_params) + base_sigma * rng.randn(*init_params.shape)
        try:
            score = kernel_target_alignment_score_from_params(cand, X_sub, y_sub)
        except Exception as e:
            print(f"Trial {t+1} evaluation failed: {e}")
            score = -np.inf
        print(f"  Trial {t+1}/{n_trials} alignment = {score:.6f}")
        if score > best_score:
            best_score = score
            best_params = cand.copy()

    print("Best subset alignment:", best_score)

    if local_refine and (best_params is not None):
        try:
            print("Attempting local Nelder-Mead refinement (scipy required)...")
            from scipy.optimize import minimize

            def obj_flat(xflat):
                x = xflat.reshape(feature_map_layers, n_qubits)
                return -kernel_target_alignment_score_from_params(x, X_sub, y_sub)  # minimize negative alignment

            x0 = best_params.flatten()
            res = minimize(obj_flat, x0, method="Nelder-Mead", options={"maxiter": local_refine_iters, "disp": True})
            best_params = res.x.reshape(feature_map_layers, n_qubits)
            print("Refinement done. new subset alignment:", kernel_target_alignment_score_from_params(best_params, X_sub, y_sub))
        except Exception as e:
            print("Local refinement failed (scipy missing or error):", e)

    return best_params

# ----------------------------
# Train QSVM with learned kernel
# ----------------------------
def train_qsvm_with_learned_kernel(params, X_train_scaled, y_train, X_test_scaled, y_test):
    print("Computing statevectors for full training set...")
    states_train = compute_states(params, X_train_scaled)
    K_train = fidelity_kernel_from_states(states_train)

    # Grid-search C using precomputed kernel cross-validation
    best_score = -1
    best_C = None
    best_clf = None
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=RND)
    for C in svm_C_grid:
        clf = SVC(kernel="precomputed", C=C)
        cv_scores = []
        for train_idx, val_idx in skf.split(K_train, y_train):
            K_tr = K_train[train_idx][:, train_idx]
            K_val = K_train[val_idx][:, train_idx]
            clf.fit(K_tr, y_train[train_idx])
            y_val_pred = clf.predict(K_val)
            cv_scores.append(accuracy_score(y_train[val_idx], y_val_pred))
        avg = np.mean(cv_scores)
        print(f"  C={C} CV acc = {avg:.4f}")
        if avg > best_score:
            best_score = avg
            best_C = C
            best_clf = clf

    print("Best C:", best_C)

    # Fit final classifier on full training kernel
    final_clf = SVC(kernel="precomputed", C=best_C, probability=True)
    final_clf.fit(K_train, y_train)

    # compute test kernel slice
    states_test = compute_states(params, X_test_scaled)
    K_test_train = fidelity_kernel_from_states(states_test, states_train)

    y_test_pred = final_clf.predict(K_test_train)
    test_acc = accuracy_score(y_test, y_test_pred)
    print("QSVM Test accuracy:", test_acc)
    print(classification_report(y_test, y_test_pred))

    # Save artifacts
    joblib.dump({"params": params, "states_train": states_train, "K_train": K_train}, os.path.join(ARTIFACTS_DIR, "qkernel_artifacts.joblib"))
    joblib.dump(final_clf, os.path.join(ARTIFACTS_DIR, "qsvm_clf.joblib"))
    return test_acc, final_clf, K_train, states_train

# ----------------------------
# Optional classical baseline & ensemble
# ----------------------------
def train_classical_svm(X_train, y_train, X_val, y_val, X_test, y_test):
    svc = SVC(kernel="linear", C=1.0, probability=True, random_state=RND)
    svc.fit(X_train, y_train)
    print("Classical SVM val acc:", accuracy_score(y_val, svc.predict(X_val)))
    print("Classical SVM test acc:", accuracy_score(y_test, svc.predict(X_test)))
    joblib.dump(svc, os.path.join(ARTIFACTS_DIR, "classical_svm.joblib"))
    return svc

# ----------------------------
# MAIN pipeline
# ----------------------------
def main():
    t0 = time.time()
    df = load_and_merge(DATA_PATHS)
    X_train_tfidf, X_val_tfidf, X_test_tfidf, y_train, y_val, y_test, \
        X_train_txt, X_val_txt, X_test_txt = vectorize_and_split(df)

    # classical baseline
    svc = train_classical_svm(X_train_tfidf, y_train, X_val_tfidf, y_val, X_test_tfidf, y_test)

    # PCA reduce & scale to [-pi, pi]
    pca = PCA(n_components=pca_components, random_state=RND)
    X_train_pca = pca.fit_transform(X_train_tfidf)
    X_val_pca = pca.transform(X_val_tfidf)
    X_test_pca = pca.transform(X_test_tfidf)
    joblib.dump(pca, os.path.join(ARTIFACTS_DIR, "pca.joblib"))

    scaler = MinMaxScaler(feature_range=(-np.pi, np.pi))
    X_train_pca_scaled = scaler.fit_transform(X_train_pca)
    X_val_pca_scaled = scaler.transform(X_val_pca)
    X_test_pca_scaled = scaler.transform(X_test_pca)
    joblib.dump(scaler, os.path.join(ARTIFACTS_DIR, "pca_scaler.joblib"))

    print("PCA reduced shapes:", X_train_pca_scaled.shape, X_val_pca_scaled.shape, X_test_pca_scaled.shape)

    # initialize random params (small)
    init_params = 0.1 * np.random.randn(feature_map_layers, n_qubits)

    # Randomized search (robust, avoids autograd issues)
    learned_params = optimize_feature_map_random_search(
        X_train_pca_scaled, y_train,
        init_params=init_params,
        n_trials=random_search_trials,
        subsample=random_search_subsample,
        random_state=RND,
        local_refine=local_refine
    )
    print("Learned params shape:", learned_params.shape)
    joblib.dump(learned_params, os.path.join(ARTIFACTS_DIR, "learned_featuremap_params.joblib"))

    # Train QSVM
    qsvm_test_acc, final_clf, K_train, states_train = train_qsvm_with_learned_kernel(
        learned_params, X_train_pca_scaled, y_train, X_test_pca_scaled, y_test
    )

    # Optional ensemble with classical SVM
    print("Computing ensemble (average probabilities)...")
    svm_probs = svc.predict_proba(X_test_tfidf)[:, 1]
    # get QSVM probabilities by fitting probability=True SVC on precomputed kernel (we already did)
    # final_clf was trained with probability=True, so we can use it directly
    states_test = compute_states(learned_params, X_test_pca_scaled)
    K_test_train = fidelity_kernel_from_states(states_test, states_train)
    qsvm_probs = final_clf.predict_proba(K_test_train)[:, 1]

    ensemble_probs = 0.5 * (svm_probs + qsvm_probs)
    ensemble_preds = (ensemble_probs >= 0.5).astype(int)
    ensemble_acc = accuracy_score(y_test, ensemble_preds)
    print(f"Classical SVM test acc: {accuracy_score(y_test, svc.predict(X_test_tfidf)):.4f}")
    print(f"QSVM test acc: {qsvm_test_acc:.4f}, Ensemble test acc: {ensemble_acc:.4f}")
    print("Elapsed time: %.1f s" % (time.time() - t0))

if __name__ == "__main__":
    main()


Merged dataset: (10151, 2)
TF-IDF shapes: (6686, 2000) (1434, 2000) (2031, 2000)
Classical SVM val acc: 0.9686192468619247
Classical SVM test acc: 0.9650418513047759
PCA reduced shapes: (6686, 6) (1434, 6) (2031, 6)
Random search: 40 trials, subset size 300
  Trial 1/40 alignment = 0.200130
  Trial 2/40 alignment = 0.200130
  Trial 3/40 alignment = 0.200130
  Trial 4/40 alignment = 0.200130
  Trial 5/40 alignment = 0.200130
  Trial 6/40 alignment = 0.200130
  Trial 7/40 alignment = 0.200130
  Trial 8/40 alignment = 0.200130
  Trial 9/40 alignment = 0.200130
  Trial 10/40 alignment = 0.200130
  Trial 11/40 alignment = 0.200130
  Trial 12/40 alignment = 0.200130
  Trial 13/40 alignment = 0.200130
  Trial 14/40 alignment = 0.200130
  Trial 15/40 alignment = 0.200130
  Trial 16/40 alignment = 0.200130
  Trial 17/40 alignment = 0.200130
  Trial 18/40 alignment = 0.200130
  Trial 19/40 alignment = 0.200130
  Trial 20/40 alignment = 0.200130
  Trial 21/40 alignment = 0.200130
  Trial 22/40 al