# In [None]:
import os, json, math
from copy import deepcopy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# -----------------------------------
# Utilities: quantum/open-system helpers
# -----------------------------------
def pure_density(vec):
    """Return the outer product ``v v^dagger`` of a state vector.

    ``vec`` is converted to a complex column vector before the product is
    formed. No normalisation is applied, so the trace of the result equals
    the squared norm of ``vec``.
    """
    column = np.array(vec, dtype=complex).reshape(-1, 1)
    bra = column.conj().T
    return column @ bra

def depolarizing_kraus(p):
    """Build the four single-qubit Kraus operators for depolarizing strength ``p``.

    The operators are ``sqrt(1 - 3p/4) * I`` followed by ``sqrt(p/4) * P`` for
    each Pauli matrix ``P`` in the order X, Y, Z. Any negative value under a
    square root is clamped to zero before the root is taken.

    Returns:
        A list ``[k0, k1, k2, k3]`` of 2x2 complex arrays.
    """
    identity = np.array([[1, 0], [0, 1]], dtype=complex)
    paulis = (
        np.array([[0, 1], [1, 0]], dtype=complex),      # X
        np.array([[0, -1j], [1j, 0]], dtype=complex),   # Y
        np.array([[1, 0], [0, -1]], dtype=complex),     # Z
    )
    keep_amplitude = np.sqrt(max(0.0, 1 - 3*p/4))
    error_amplitude = np.sqrt(max(0.0, p/4))
    operators = [keep_amplitude * identity]
    operators.extend(error_amplitude * pauli for pauli in paulis)
    return operators

def apply_kraus(rho, kraus_list):
    """Apply a channel given by Kraus operators to ``rho``.

    Computes ``sum_K K rho K^dagger``, averages the result with its conjugate
    transpose to remove any anti-Hermitian part, and then divides by the trace
    when the trace magnitude exceeds ``1e-12``. Otherwise the symmetrised
    matrix is returned without rescaling.
    """
    accumulated = np.zeros_like(rho, dtype=complex)
    for operator in kraus_list:
        accumulated += operator @ rho @ operator.conj().T
    hermitian = (accumulated + accumulated.conj().T) / 2.0
    trace = np.trace(hermitian)
    # Keep the "> threshold" form so a NaN trace falls through un-normalised.
    if np.abs(trace) > 1e-12:
        return hermitian / trace
    return hermitian

def purity_of(rho):
    """Return ``Tr(rho @ rho)`` as a Python float."""
    trace_of_square = np.trace(rho @ rho)
    # real_if_close drops a negligible imaginary part before the float cast.
    return float(np.real_if_close(trace_of_square))

def von_neumann_entropy(rho):
    """Return the von Neumann entropy ``-sum(l * log2(l))`` of ``rho`` in bits.

    The eigenvalues ``l`` are obtained with ``np.linalg.eigvalsh``, so ``rho``
    is treated as Hermitian. Eigenvalues at or below ``1e-12`` (including
    small negative values from round-off) are left out of the sum, following
    the convention ``0 * log(0) = 0``. A pure state therefore yields 0 rather
    than a small positive residue.
    """
    eigenvalues = np.linalg.eigvalsh(rho)
    # Dropping (instead of clipping up to 1e-12) avoids adding ~4e-11 bits per
    # vanishing eigenvalue.
    support = eigenvalues[eigenvalues > 1e-12]
    # "+ 0.0" normalises a possible -0.0 to 0.0.
    return float(-np.sum(support * np.log2(support))) + 0.0

# -----------------------------------
# Models: PFA, OQFA (very small)
# -----------------------------------
class PFA:
    """Small probabilistic automaton over a two-entry state distribution.

    ``initial`` is the starting distribution, ``transitions`` maps a symbol to
    the matrix applied when that symbol is read, and ``accepting`` is the set
    of state indices whose probabilities are summed into the acceptance score.
    """

    def __init__(self):
        self.initial = np.array([1.0, 0.0])
        self.transitions = {}
        self.accepting = {0}

    def add_transition(self, symbol, matrix):
        """Register ``matrix`` (stored as a float array) for ``symbol``."""
        self.transitions[symbol] = np.array(matrix, dtype=float)

    def recognize(self, s):
        """Return the acceptance probability after reading ``s``.

        Symbols without a registered matrix leave the distribution unchanged.
        After every symbol the distribution is clipped to be non-negative and,
        if its sum is positive, rescaled to sum to one.
        """
        distribution = self.initial.copy()
        for symbol in s:
            if symbol in self.transitions:
                distribution = self.transitions[symbol] @ distribution
            distribution = np.clip(distribution, 0, None)
            total = distribution.sum()
            if total > 0:
                distribution = distribution / total
        accepted_mass = [distribution[state] for state in self.accepting]
        return float(np.sum(accepted_mass))

class OQFA:
    """Single-qubit open quantum automaton: a unitary plus optional depolarizing noise per symbol.

    ``rho0`` is the initial density matrix, ``transitions`` maps a symbol to a
    ``(U, depolarizing_p)`` pair, and ``accepting`` is the set of basis-state
    indices whose populations are summed into the acceptance probability.
    """

    def __init__(self):
        self.rho0 = pure_density(np.array([1.0, 0.0], dtype=complex))
        self.transitions = {}  # symbol -> (U, depolarizing_p)
        self.accepting = {0}

    def add_transition(self, symbol, U, depolarizing_p=0.0):
        """Register unitary ``U`` and noise strength ``depolarizing_p`` for ``symbol``."""
        self.transitions[symbol] = (np.array(U, dtype=complex), float(depolarizing_p))

    def recognize(self, s, return_rho=False):
        """Process ``s`` symbol by symbol and return the acceptance probability.

        For each symbol with a registered transition the state is updated as
        ``U rho U^dagger`` and, when its noise strength is positive, passed
        through the depolarizing channel. Unregistered symbols are skipped.

        Args:
            s: Iterable of symbols (e.g. a string).
            return_rho: When true, also return the final density matrix.

        Returns:
            ``p_accept`` or, if ``return_rho`` is true, ``(p_accept, rho)``,
            where ``p_accept`` is the sum of the real diagonal entries of the
            final state over the indices in ``self.accepting``.
        """
        rho = self.rho0.copy()
        for ch in s:
            if ch not in self.transitions:
                continue
            U, p = self.transitions[ch]
            rho = U @ rho @ U.conj().T
            if p > 0:
                rho = apply_kraus(rho, depolarizing_kraus(p))
        # Honour the configured accepting set instead of hard-coding state 0;
        # with the default {0} this equals rho[0, 0].real.
        p_accept = float(sum(rho[i, i].real for i in self.accepting))
        if return_rho:
            return p_accept, rho
        return p_accept

# -----------------------------------
# Synthetic dataset generation
# -----------------------------------
# Event alphabet: L=login, A=add beneficiary, T=transfer, O=logout, V=view, F=failed_auth
SYMBOLS = list("LATOVF")

# Noise-free event patterns from which label-0 ("normal") sequences are derived.
NORMAL_TEMPLATES = [list(pattern) for pattern in ("LVO", "LTO", "LATO", "LVVO")]

# Noise-free event patterns from which label-1 ("fraud") sequences are derived.
# NOTE: "LATO" is also listed in NORMAL_TEMPLATES, so that pattern occurs under both labels.
FRAUD_TEMPLATES = [list(pattern) for pattern in ("LATO", "LFFTO", "LAATO", "LTTO")]

def generate_sequence_from_template(tmpl, insert_noise_prob=0.08, drop_prob=0.05, substitute_prob=0.03, *, rng=None):
    """Return a noisy copy of ``tmpl`` joined into a string.

    For every template symbol, in order: it is dropped with probability
    ``drop_prob``; otherwise it is replaced by a different symbol from
    ``SYMBOLS`` with probability ``substitute_prob`` or kept as is; then, with
    probability ``insert_noise_prob``, one extra random symbol from ``SYMBOLS``
    is appended after it.

    Args:
        tmpl: Iterable of symbols.
        insert_noise_prob: Probability of inserting a random symbol after a kept symbol.
        drop_prob: Probability of dropping a symbol.
        substitute_prob: Probability of substituting a kept symbol.
        rng: Optional random source exposing ``rand()`` and ``choice()``
            (e.g. ``np.random.RandomState``). When omitted, NumPy's global
            random state is used, matching the previous behaviour.
    """
    if rng is None:
        rng = np.random  # module-level functions draw from the global state
    seq = []
    for sym in tmpl:
        if rng.rand() < drop_prob:
            continue
        if rng.rand() < substitute_prob:
            choices = [s for s in SYMBOLS if s != sym]
            seq.append(rng.choice(choices))
        else:
            seq.append(sym)
        if rng.rand() < insert_noise_prob:
            seq.append(rng.choice(SYMBOLS))
    return ''.join(seq)

def create_synthetic_dataset(n_normal=400, n_fraud=200, noise_params=None, random_state=0):
    """Generate a shuffled DataFrame of labelled synthetic event sequences.

    ``n_normal`` sequences are derived from ``NORMAL_TEMPLATES`` (label 0) and
    ``n_fraud`` from ``FRAUD_TEMPLATES`` (label 1). Template choice and the
    per-symbol noise both draw from one ``RandomState(random_state)``, so the
    result is fully determined by ``random_state`` and does not depend on, or
    advance, NumPy's global random state.

    Args:
        n_normal: Number of label-0 rows.
        n_fraud: Number of label-1 rows.
        noise_params: Keyword arguments forwarded to
            ``generate_sequence_from_template``; defaults to that function's
            default probabilities.
        random_state: Seed for sequence generation and for the final shuffle.

    Returns:
        DataFrame with columns ``sequence`` and ``label`` and a fresh 0..n-1 index.
    """
    if noise_params is None:
        noise_params = {'insert_noise_prob':0.08, 'drop_prob':0.05, 'substitute_prob':0.03}
    rng = np.random.RandomState(random_state)
    rows = []
    plan = ((0, NORMAL_TEMPLATES, n_normal), (1, FRAUD_TEMPLATES, n_fraud))
    for label, templates, count in plan:
        for _ in range(count):
            tmpl = templates[rng.randint(len(templates))]
            # Pass the seeded generator through so the noise is reproducible too.
            seq = generate_sequence_from_template(tmpl, rng=rng, **noise_params)
            rows.append({'sequence': seq, 'label': label})
    df = pd.DataFrame(rows).sample(frac=1, random_state=random_state).reset_index(drop=True)
    return df

# -----------------------------------
# Model builders
# -----------------------------------
def build_baseline_pfa():
    """Return a ``PFA`` with a fixed 2x2 transition matrix for each of L, A, T, O, V and F."""
    matrices = {
        'L': [[0.9, 0.1], [0.1, 0.9]],
        'A': [[0.8, 0.2], [0.2, 0.8]],
        'T': [[0.2, 0.8], [0.8, 0.2]],
        'O': [[0.9, 0.1], [0.1, 0.9]],
        'V': [[0.85, 0.15], [0.15, 0.85]],
        'F': [[0.3, 0.7], [0.7, 0.3]],
    }
    automaton = PFA()
    for symbol, matrix in matrices.items():
        automaton.add_transition(symbol, matrix)
    return automaton

def build_param_oqfa(theta_a, theta_b, depolarizing_p):
    """Return an ``OQFA`` whose 'A' and 'T' symbols apply real 2x2 rotations.

    'A' rotates by ``theta_a`` and 'T' by ``theta_b``; 'L', 'O', 'V' and 'F'
    apply the identity. Every symbol is registered with the same
    ``depolarizing_p``.
    """
    def rotation(theta):
        cos_t, sin_t = math.cos(theta), math.sin(theta)
        return np.array([[cos_t, -sin_t], [sin_t, cos_t]], dtype=complex)

    rotation_a = rotation(theta_a)
    rotation_b = rotation(theta_b)
    automaton = OQFA()
    automaton.add_transition('A', rotation_a, depolarizing_p=depolarizing_p)
    automaton.add_transition('T', rotation_b, depolarizing_p=depolarizing_p)
    identity = np.eye(2, dtype=complex)
    for symbol in ('L', 'O', 'V', 'F'):
        automaton.add_transition(symbol, identity, depolarizing_p=depolarizing_p)
    return automaton

# -----------------------------------
# Evaluation & grid search
# -----------------------------------
def acceptances_for_model(model, sequences):
    """Return an array holding ``model.recognize(seq)`` for each item of ``sequences``, in order."""
    scores = list(map(model.recognize, sequences))
    return np.array(scores)

def evaluate_thresholded(acceptances, labels, thr):
    """Score thresholded predictions against ``labels``.

    A sample is predicted as class 1 when its acceptance is ``>= thr``.

    Args:
        acceptances: Array of acceptance scores (must support ``>=`` and ``.astype``).
        labels: Ground-truth 0/1 labels.
        thr: Decision threshold.

    Returns:
        Dict with keys ``acc``, ``prec``, ``rec``, ``f1``, ``auc`` and
        ``preds``. ``auc`` is computed from the raw scores and is ``None``
        when ``roc_auc_score`` rejects the input with ``ValueError``.
    """
    preds = (acceptances >= thr).astype(int)
    acc = accuracy_score(labels, preds)
    prec = precision_score(labels, preds, zero_division=0)
    rec = recall_score(labels, preds, zero_division=0)
    f1 = f1_score(labels, preds, zero_division=0)
    try:
        auc = roc_auc_score(labels, acceptances)
    except ValueError:
        # Only the metric's own "undefined" error is tolerated; anything else
        # is a real bug and should propagate.
        auc = None
    return {'acc':acc, 'prec':prec, 'rec':rec, 'f1':f1, 'auc':auc, 'preds':preds}

# -----------------------------------
# Main flow
# -----------------------------------
def main():
    """Run the full experiment: data generation, grid search, test evaluation, plots.

    All artefacts are written to ``./oqfa_fraud_outputs``:
    ``synthetic_transactions.csv``, ``train.csv``, ``val.csv``, ``test.csv``,
    ``grid_search_results.csv``, ``test_predictions.csv``,
    ``acceptance_distribution.png``, ``accuracy_vs_noise.png``,
    ``purity_vs_noise.png`` and ``summary.json``. A short summary is printed
    at the end.
    """
    outdir = "./oqfa_fraud_outputs"
    os.makedirs(outdir, exist_ok=True)
    # Seed NumPy's global RNG so anything drawing from np.random is repeatable.
    np.random.seed(0)

    # Create the synthetic dataset (800 normal, 400 fraud) and save it.
    df = create_synthetic_dataset(n_normal=800, n_fraud=400,
                                  noise_params={'insert_noise_prob':0.12, 'drop_prob':0.08, 'substitute_prob':0.06},
                                  random_state=0)
    df.to_csv(os.path.join(outdir, "synthetic_transactions.csv"), index=False)

    # Stratified 60/20/20 train/val/test split: 40% is held out, then halved.
    train_df, temp = train_test_split(df, test_size=0.4, random_state=0, stratify=df['label'])
    val_df, test_df = train_test_split(temp, test_size=0.5, random_state=0, stratify=temp['label'])
    train_df.to_csv(os.path.join(outdir, "train.csv"), index=False)
    val_df.to_csv(os.path.join(outdir, "val.csv"), index=False)
    test_df.to_csv(os.path.join(outdir, "test.csv"), index=False)

    # Coarse grid: 4 noise levels x 7 x 7 rotation angles x 5 thresholds.
    theta_candidates = np.linspace(-math.pi/3, math.pi/3, 7)
    noise_candidates = [0.0, 0.02, 0.05, 0.08]
    threshold_candidates = [0.35, 0.45, 0.5, 0.55, 0.65]

    train_seqs = train_df['sequence'].tolist()
    train_labels = train_df['label'].to_numpy()
    val_seqs = val_df['sequence'].tolist()
    val_labels = val_df['label'].to_numpy()

    results = []
    best = None

    for p in noise_candidates:
        for ta in theta_candidates:
            for tb in theta_candidates:
                oq = build_param_oqfa(ta, tb, p)
                # Acceptances do not depend on the threshold, so compute them
                # once per model and reuse them for every threshold.
                acc_train = acceptances_for_model(oq, train_seqs)
                acc_val = acceptances_for_model(oq, val_seqs)
                for thr in threshold_candidates:
                    tr_stats = evaluate_thresholded(acc_train, train_labels, thr)
                    val_stats = evaluate_thresholded(acc_val, val_labels, thr)
                    results.append({
                        'p': p, 'theta_a': ta, 'theta_b': tb, 'threshold': thr,
                        'train_acc': tr_stats['acc'], 'val_acc': val_stats['acc'],
                        'train_f1': tr_stats['f1'], 'val_f1': val_stats['f1'], 'val_auc': val_stats['auc'] if val_stats['auc'] is not None else float('nan')
                    })
                    # Model selection uses validation accuracy only; the strict
                    # ">" keeps the first configuration encountered on ties.
                    if best is None or val_stats['acc'] > best['score']:
                        best = {'score': val_stats['acc'], 'p': p, 'theta_a': ta, 'theta_b': tb, 'threshold': thr, 'model': deepcopy(oq)}

    results_df = pd.DataFrame(results).sort_values(['val_acc'], ascending=False)
    results_df.to_csv(os.path.join(outdir, "grid_search_results.csv"), index=False)

    # Evaluate the selected model on the held-out test split.
    best_model = best['model']
    # Everything except the model object (this still includes 'score').
    best_params = {k:v for k,v in best.items() if k!='model'}
    test_seqs = test_df['sequence'].tolist()
    test_labels = test_df['label'].to_numpy()
    test_accepts = acceptances_for_model(best_model, test_seqs)
    test_eval = evaluate_thresholded(test_accepts, test_labels, best_params['threshold'])

    test_out = test_df.copy()
    test_out['acceptance'] = test_accepts
    test_out['pred'] = test_eval['preds']
    test_out.to_csv(os.path.join(outdir, "test_predictions.csv"), index=False)

    # Plots (matplotlib only; each plot separate)
    # Plot 1: histogram of test acceptances per class.
    plt.figure(figsize=(6,4))
    plt.hist(test_out[test_out['label']==0]['acceptance'], bins=20, alpha=0.7, label='normal')
    plt.hist(test_out[test_out['label']==1]['acceptance'], bins=20, alpha=0.7, label='fraud')
    plt.xlabel("OQFA acceptance")
    plt.ylabel("Count")
    plt.title("Acceptance distribution (test set)")
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(outdir, "acceptance_distribution.png"), dpi=200, bbox_inches='tight')
    plt.close()

    # Plot 2: validation accuracy averaged over all grid rows sharing a noise level.
    acc_vs_noise = results_df.groupby('p')['val_acc'].mean().reset_index()
    plt.figure(figsize=(6,4))
    plt.plot(acc_vs_noise['p'], acc_vs_noise['val_acc'], marker='o')
    plt.xlabel("Depolarizing noise p")
    plt.ylabel("Mean validation accuracy")
    plt.title("Validation accuracy vs depolarizing noise")
    plt.tight_layout()
    plt.savefig(os.path.join(outdir, "accuracy_vs_noise.png"), dpi=200, bbox_inches='tight')
    plt.close()

    # Plot 3: rebuild the model with the best angles at each noise level and
    # average the purity (and entropy) of the final state over the test set.
    pur_rows = []
    for p in noise_candidates:
        oq_tmp = build_param_oqfa(best_params['theta_a'], best_params['theta_b'], p)
        purities = []
        entropies = []
        for s in test_seqs:
            pa, rho = oq_tmp.recognize(s, return_rho=True)
            purities.append(purity_of(rho))
            entropies.append(von_neumann_entropy(rho))
        pur_rows.append({'p': p, 'purity': float(np.mean(purities)), 'entropy': float(np.mean(entropies))})
    # Only the purity column is plotted below; the entropy column is not used further.
    df_pur = pd.DataFrame(pur_rows)
    plt.figure(figsize=(6,4))
    plt.plot(df_pur['p'], df_pur['purity'], marker='o')
    plt.xlabel("Depolarizing noise p")
    plt.ylabel("Mean purity")
    plt.title("OQFA purity vs depolarizing noise (test set)")
    plt.tight_layout()
    plt.savefig(os.path.join(outdir, "purity_vs_noise.png"), dpi=200, bbox_inches='tight')
    plt.close()

    # Cast to built-in floats so the summary is JSON-serialisable.
    summary = {
        'best_val_accuracy': float(best['score']),
        'best_params': {'p': float(best['p']), 'theta_a': float(best['theta_a']), 'theta_b': float(best['theta_b']), 'threshold': float(best['threshold'])},
        'test_metrics': {'accuracy': float(test_eval['acc']), 'precision': float(test_eval['prec']), 'recall': float(test_eval['rec']), 'f1': float(test_eval['f1']), 'auc': float(test_eval['auc']) if test_eval['auc'] is not None else None}
    }
    with open(os.path.join(outdir, "summary.json"), "w") as f:
        json.dump(summary, f, indent=2)

    print("Done. Outputs written to:", outdir)
    print("Best validation accuracy:", best['score'])
    print("Best params:", best_params)
    print("Test metrics:", summary['test_metrics'])

# Run the experiment only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
