# Quantum Fraud Detection - Quick Start Notebook

This notebook demonstrates how to run the quantum fraud detection pipeline interactively.

## Overview
- **Classical Models**: Logistic Regression, Isolation Forest, XGBoost
- **Quantum Models**: VQC, Quantum Kernel
- **Backends**: Simulator, Aer, IBM Quantum Hardware

## 1. Setup and Imports

In [None]:
# Standard library and third-party packages used throughout the notebook.
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
import time
from pathlib import Path

# Add parent directory to path.
# This has to run before the `src` imports below; presumably the notebook
# lives one level below the repository root that contains `src/` -- confirm.
sys.path.append('..')

# Project modules: data loading, preprocessing, model training, evaluation
# and result reporting helpers used by the cells that follow.
from src.data_loader import load_csvs, merge_on_transaction_id
from src.preprocessing import PreprocessConfig, preprocess_pipeline, split_data
from src.model_classical import (
    ClassicalConfig, train_logreg,
    IsolationForestConfig, train_isolation_forest,
    XGBoostConfig, train_xgboost
)
from src.model_quantum import (
    QuantumConfig, train_vqc,
    QuantumKernelConfig, train_quantum_kernel
)
from src.quantum_backend import BackendConfig
from src import evaluation as eval_mod
from src.results_comparison import save_all_results

# Reaching this line means every import above resolved.
print("✓ All imports successful!")

## 2. Load Configuration

In [None]:
# Load the YAML pipeline configuration.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform; without it, open() falls back to the locale's default encoding.
with open('../configs/config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Echo the settings that drive the rest of the notebook.
print("Configuration loaded:")
print(f"- Dataset: {config['data']['transaction_csv']}")
print(f"- Features to select: {config['preprocessing']['top_k_corr_features']}")
print(f"- Backend: {config['quantum_backend']['backend_type']}")

## 3. Load and Preprocess Data

In [None]:
# Read the transaction and identity CSVs named in the config and join them
# into a single DataFrame.
print("Loading data...")
data_paths = config['data']
df_txn, df_id = load_csvs(data_paths['transaction_csv'], data_paths['identity_csv'])
df = merge_on_transaction_id(df_txn, df_id)

# Quick sanity check: overall size and share of rows flagged as fraud.
dataset_shape = df.shape
print(f"Dataset shape: {dataset_shape}")
fraud_rate = df['isFraud'].mean()
print(f"Fraud rate: {fraud_rate:.2%}")

# Final expression of the cell, so the notebook renders the first rows.
df.head()

In [None]:
# Build the preprocessing configuration from the `preprocessing` section of
# the YAML config and run the pipeline on the merged frame.
print("Preprocessing data...")
pp_section = config['preprocessing']
pp_cfg = PreprocessConfig(
    **{
        field: pp_section[field]
        for field in (
            'missing_threshold',
            'target_col',
            'id_cols',
            'top_k_corr_features',
        )
    }
)

# The pipeline returns the processed frame together with the chosen features.
pipeline_output = preprocess_pipeline(df, pp_cfg)
df_processed, selected_features = pipeline_output

print(f"Selected features: {selected_features}")
print(f"Processed shape: {df_processed.shape}")

In [None]:
# Partition the processed frame into train/test sets using the split
# settings stored under the `preprocessing` section of the config.
split_opts = config['preprocessing']
X_train, X_test, y_train, y_test = split_data(
    df_processed,
    target=pp_cfg.target_col,
    test_size=split_opts['test_size'],
    random_state=split_opts['random_state'],
    stratify=split_opts['stratify'],
)

# Report the sizes of both partitions and their fraud rates.
n_train = len(X_train)
print(f"Training samples: {n_train}")
n_test = len(X_test)
print(f"Test samples: {n_test}")
n_features = X_train.shape[1]
print(f"Features: {n_features}")
train_rate = y_train.mean()
print(f"Train fraud rate: {train_rate:.2%}")
test_rate = y_test.mean()
print(f"Test fraud rate: {test_rate:.2%}")

## 4. Train Classical Models

### 4.1 Logistic Regression

In [None]:
# Fit the logistic-regression baseline and score it on the held-out set.
print("Training Logistic Regression...")
lr_params = config['logistic_regression']
lr_cfg = ClassicalConfig(
    **{
        key: lr_params[key]
        for key in ('penalty', 'C', 'max_iter', 'use_random_oversampler')
    }
)

# Wall-clock time for the fit only.
t_start = time.time()
lr_model = train_logreg(X_train.values, y_train.values, lr_cfg)
lr_time = time.time() - t_start

# Hard labels plus the probability of the positive class (column 1).
y_pred_lr = lr_model.predict(X_test.values)
lr_proba_matrix = lr_model.predict_proba(X_test.values)
y_proba_lr = lr_proba_matrix[:, 1]
lr_metrics = eval_mod.compute_metrics(y_test.values, y_pred_lr, y_proba_lr)

print(f"Training time: {lr_time:.2f}s")
print(f"Metrics: {lr_metrics}")

### 4.2 Isolation Forest

In [None]:
# Fit the Isolation Forest anomaly detector and score it on the held-out set.
print("Training Isolation Forest...")
if_cfg = IsolationForestConfig(
    n_estimators=config['isolation_forest']['n_estimators'],
    contamination=config['isolation_forest']['contamination'],
    random_state=config['isolation_forest']['random_state'],
)

# Wall-clock time for the fit only.
start = time.time()
if_model = train_isolation_forest(X_train.values, y_train.values, if_cfg)
if_time = time.time() - start

# Evaluate (convert -1/1 to 0/1): the model labels outliers as -1, which is
# mapped to the positive (fraud) class 1; everything else becomes 0.
y_pred_if = if_model.predict(X_test.values)
y_pred_if = np.where(y_pred_if == -1, 1, 0)

# The score goes into the same compute_metrics slot where the other models
# pass their positive-class probability, so "higher" must mean "more likely
# fraud". Assuming a scikit-learn style estimator (consistent with the -1/1
# labels above), decision_function is LOWER for more anomalous samples, so it
# is negated here to keep any ranking-based metric from being inverted.
y_score_if = -if_model.decision_function(X_test.values)
if_metrics = eval_mod.compute_metrics(y_test.values, y_pred_if, y_score_if)

print(f"Training time: {if_time:.2f}s")
print(f"Metrics: {if_metrics}")

### 4.3 XGBoost

In [None]:
# Fit the XGBoost baseline and score it on the held-out set.
print("Training XGBoost...")
xgb_params = config['xgboost']
xgb_cfg = XGBoostConfig(
    **{
        key: xgb_params[key]
        for key in ('n_estimators', 'max_depth', 'learning_rate', 'random_state')
    }
)

# Wall-clock time for the fit only.
t_start = time.time()
xgb_model = train_xgboost(X_train.values, y_train.values, xgb_cfg)
xgb_time = time.time() - t_start

# Hard labels plus the probability of the positive class (column 1).
y_pred_xgb = xgb_model.predict(X_test.values)
xgb_proba_matrix = xgb_model.predict_proba(X_test.values)
y_proba_xgb = xgb_proba_matrix[:, 1]
xgb_metrics = eval_mod.compute_metrics(y_test.values, y_pred_xgb, y_proba_xgb)

print(f"Training time: {xgb_time:.2f}s")
print(f"Metrics: {xgb_metrics}")

## 5. Train Quantum Models

### 5.1 Variational Quantum Classifier (VQC)

In [None]:
# Train the variational quantum classifier and score it on the held-out set.
print("Training Quantum VQC...")
print("⚠️ This may take several minutes...")

# Setup backend. `backend_cfg` is also reused by the Quantum Kernel cell.
backend_cfg = BackendConfig(
    backend_type=config['quantum_backend']['backend_type'],
    shots=config['quantum_backend']['shots'],
)

# The circuit is sized from the number of feature columns in X_train.
vqc_cfg = QuantumConfig(
    num_features=X_train.shape[1],
    reps_feature_map=config['quantum_vqc']['reps_feature_map'],
    reps_ansatz=config['quantum_vqc']['reps_ansatz'],
    optimizer_maxiter=config['quantum_vqc']['optimizer_maxiter'],
    backend_config=backend_cfg,
)

# Wall-clock time for the fit only.
start = time.time()
vqc_model = train_vqc(X_train.values, y_train.values, vqc_cfg)
vqc_time = time.time() - start

# Evaluate
y_pred_vqc = vqc_model.predict(X_test.values)
try:
    y_proba_vqc = vqc_model.predict_proba(X_test.values)[:, 1]
except Exception as exc:
    # Probabilities are best-effort: fall back to label-only metrics, but say
    # why. Catching Exception (not a bare except) lets KeyboardInterrupt and
    # SystemExit still stop a long-running cell.
    print(
        f"Note: VQC probabilities unavailable ({type(exc).__name__}: {exc}); "
        "continuing without them."
    )
    y_proba_vqc = None
vqc_metrics = eval_mod.compute_metrics(y_test.values, y_pred_vqc, y_proba_vqc)

print("✓ Training complete!")
print(f"Training time: {vqc_time:.2f}s")
print(f"Metrics: {vqc_metrics}")

### 5.2 Quantum Kernel

In [None]:
# Train the quantum-kernel classifier and score it on the held-out set.
# Relies on `backend_cfg` created in the VQC cell.
print("Training Quantum Kernel...")
print("⚠️ This may take several minutes...")

qk_params = config['quantum_kernel']
qk_cfg = QuantumKernelConfig(
    num_features=X_train.shape[1],
    reps_feature_map=qk_params['reps_feature_map'],
    C=qk_params['C'],
    backend_config=backend_cfg,
)

# Wall-clock time for the fit only.
t_start = time.time()
qk_model = train_quantum_kernel(X_train.values, y_train.values, qk_cfg)
qk_time = time.time() - t_start

# Hard labels plus the probability of the positive class (column 1).
y_pred_qk = qk_model.predict(X_test.values)
qk_proba_matrix = qk_model.predict_proba(X_test.values)
y_proba_qk = qk_proba_matrix[:, 1]
qk_metrics = eval_mod.compute_metrics(y_test.values, y_pred_qk, y_proba_qk)

print(f"✓ Training complete!")
print(f"Training time: {qk_time:.2f}s")
print(f"Metrics: {qk_metrics}")

## 6. Compare Results

In [None]:
# Gather every model's metrics and fit time under a shared display name.
model_names = (
    'Logistic Regression',
    'Isolation Forest',
    'XGBoost',
    'Quantum VQC',
    'Quantum Kernel',
)
all_metrics = dict(
    zip(model_names, (lr_metrics, if_metrics, xgb_metrics, vqc_metrics, qk_metrics))
)
all_times = dict(
    zip(model_names, (lr_time, if_time, xgb_time, vqc_time, qk_time))
)

# One row per model, one column per metric, plus the training time aligned on
# the model name; best F1 first.
results_df = (
    pd.DataFrame(all_metrics)
    .T
    .assign(training_time=pd.Series(all_times))
    .sort_values('f1', ascending=False)
)

banner = "=" * 80
print("\n" + banner)
print("RESULTS COMPARISON")
print(banner)
print(results_df.round(4))

In [None]:
# Side-by-side horizontal bar charts: F1 on the left, fit time on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# Quantum models are drawn in red, classical ones in teal.
colors = ['#FF6B6B' if 'Quantum' in name else '#4ECDC4' for name in results_df.index]

panels = (
    (ax1, 'f1', 'F1 Score', 'Model Performance Comparison'),
    (ax2, 'training_time', 'Training Time (seconds)', 'Training Time Comparison'),
)
for axis, column, x_label, title in panels:
    results_df[column].plot(kind='barh', ax=axis, color=colors)
    axis.set_xlabel(x_label)
    axis.set_title(title)
    axis.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Quantum Advantage Analysis

In [None]:
# Compare the best classical F1 against the best quantum F1 and report the
# relative difference as a percentage of the classical score.
classical_models = ['Logistic Regression', 'Isolation Forest', 'XGBoost']
quantum_models = ['Quantum VQC', 'Quantum Kernel']

classical_f1 = results_df.loc[classical_models, 'f1'].max()
quantum_f1 = results_df.loc[quantum_models, 'f1'].max()

# The relative improvement divides by the classical score, so a zero baseline
# needs explicit handling instead of an inf/NaN produced by the division
# (which would mislabel a 0-vs-0 tie as "classical outperform").
if classical_f1 > 0:
    improvement = ((quantum_f1 - classical_f1) / classical_f1) * 100
elif classical_f1 == 0:
    # Any positive quantum score is an unbounded relative gain; 0 vs 0 is a tie.
    improvement = float('inf') if quantum_f1 > 0 else 0.0
else:
    # Baseline is not a usable number (e.g. NaN); no meaningful ratio exists.
    improvement = float('nan')

print("\n" + "="*80)
print("QUANTUM ADVANTAGE ANALYSIS")
print("="*80)
print(f"Best Classical F1: {classical_f1:.4f}")
print(f"Best Quantum F1: {quantum_f1:.4f}")
print(f"Improvement: {improvement:+.2f}%")
print()

# Within 5% below the classical score is still reported as comparable.
if improvement > 0:
    print("✅ Quantum models show advantage over classical models!")
elif improvement > -5:
    print("⚖️ Quantum and classical models show comparable performance.")
else:
    print("❌ Classical models currently outperform quantum models.")
    print("   Consider: more training data, hyperparameter tuning, or different feature selection.")

## 8. Save Results

In [None]:
# Persist the metrics and timings via the project's reporting helper, then
# list the files this notebook expects it to have written.
output_dir = '../results'
save_all_results(all_metrics, all_times, output_dir)

print(f"\n✓ All results saved to: {output_dir}")
print("\nGenerated files:")
print(
    "\n".join(
        [
            "- metrics_comparison.png",
            "- metrics_table.csv",
            "- training_time_comparison.png",
            "- quantum_advantage_report.txt",
            "- results.json",
        ]
    )
)

## 9. Next Steps

### Experiment Ideas:

1. **Try different feature counts:**
   ```python
   # Modify preprocessing config, then re-run the preprocessing and split cells
   pp_cfg.top_k_corr_features = 6  # Try 4, 6, 8, 10
   ```

2. **Test on IBM Quantum Hardware:**
   ```python
   backend_cfg = BackendConfig(
       backend_type="ibm_quantum",
       ibm_token="YOUR_TOKEN",
       ibm_backend_name="ibm_brisbane",
       shots=1024
   )
   ```

3. **Tune quantum circuit depth:**
   ```python
   vqc_cfg.reps_feature_map = 3
   vqc_cfg.reps_ansatz = 3
   ```

4. **Optimize classical models:**
   ```python
   xgb_cfg.n_estimators = 200
   xgb_cfg.max_depth = 8
   ```

### Analysis:
- Compare confusion matrices
- Analyze feature importance
- Study quantum circuit properties
- Investigate failure cases