In [3]:
!pip install qiskit qi0skit-aer imbalanced-learn




In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import pickle
import warnings
from datetime import datetime

from qiskit import QuantumCircuit, transpile
from qiskit_aer import AerSimulator

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (roc_auc_score, accuracy_score, precision_score, 
                             recall_score, f1_score, confusion_matrix, 
                             roc_curve, classification_report)

from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek

warnings.filterwarnings('ignore')
np.random.seed(42)


In [2]:
os.makedirs('output', exist_ok=True)
os.makedirs('output/data', exist_ok=True)
os.makedirs('output/models', exist_ok=True)
os.makedirs('output/plots', exist_ok=True)
print(" Directory structure created")


 Directory structure created


In [3]:
print("STEP 1: DATA LOADING & EXPLORATION")

df_raw = pd.read_csv('dataset.csv')
df_raw.columns = df_raw.columns.str.strip().str.lower()

print(f"\nRaw Dataset Shape: {df_raw.shape}")
print(f"Columns: {list(df_raw.columns)}")
print(f"\nClass Distribution:")
print(df_raw['fraud'].value_counts())
print(f"Fraud Percentage: {df_raw['fraud'].mean() * 100:.2f}%")
print(f"\nMissing Values:\n{df_raw.isnull().sum()}")

# Save data info
data_info = {
    'raw_shape': df_raw.shape,
    'columns': list(df_raw.columns),
    'fraud_percentage': float(df_raw['fraud'].mean() * 100),
    'missing_values': df_raw.isnull().sum().to_dict(),
    'class_distribution': df_raw['fraud'].value_counts().to_dict()
}

with open('output/data/data_info.json', 'w') as f:
    json.dump(data_info, f, indent=2)

print("\n Data exploration complete")


STEP 1: DATA LOADING & EXPLORATION

Raw Dataset Shape: (100000, 8)
Columns: ['distance_from_home', 'distance_from_last_transaction', 'ratio_to_median_purchase_price', 'repeat_retailer', 'used_chip', 'used_pin_number', 'online_order', 'fraud']

Class Distribution:
fraud
0    91260
1     8740
Name: count, dtype: int64
Fraud Percentage: 8.74%

Missing Values:
distance_from_home                3
distance_from_last_transaction    2
ratio_to_median_purchase_price    3
repeat_retailer                   3
used_chip                         2
used_pin_number                   3
online_order                      5
fraud                             0
dtype: int64

 Data exploration complete


In [5]:
print("STEP 2: FEATURE ENGINEERING & SELECTION")

X = df_raw.drop('fraud', axis=1)
y = df_raw['fraud']

# Handle missing values
X = X.fillna(X.median())

# Outlier removal (IQR method)
print("\nRemoving outliers...")
Q1 = X.quantile(0.25)
Q3 = X.quantile(0.75)
IQR = Q3 - Q1
outlier_mask = ~((X < (Q1 - 3 * IQR)) | (X > (Q3 + 3 * IQR))).any(axis=1)
X_clean = X[outlier_mask]
y_clean = y[outlier_mask]
print(f"Removed {len(X) - len(X_clean)} outliers ({(1-len(X_clean)/len(X))*100:.1f}%)")

# Remove constant features
X_clean = X_clean.loc[:, X_clean.nunique() > 1]

# Feature correlation analysis
correlations = X_clean.corrwith(y_clean).abs().sort_values(ascending=False)
print(f"\nTop Features by Correlation:")
print(correlations.head())

# Feature Selection using Mutual Information
print("\nPerforming feature selection...")
selector = SelectKBest(mutual_info_classif, k=4)
X_selected = selector.fit_transform(X_clean, y_clean)
selected_features = X_clean.columns[selector.get_support()].tolist()
print(f"Selected Features: {selected_features}")

# Save processed data
df_processed = pd.DataFrame(X_selected, columns=selected_features)
df_processed['Class'] = y_clean.values
df_processed.to_csv('output/data/processed_dataset.csv', index=False)

feature_info = {
    'original_features': list(X.columns),
    'selected_features': selected_features,
    'selection_method': 'Mutual Information',
    'feature_scores': {feat: float(score) for feat, score in 
                       zip(selected_features, selector.scores_[selector.get_support()])}
}

with open('output/data/feature_selection.json', 'w') as f:
    json.dump(feature_info, f, indent=2)

print("\n Feature engineering complete")



STEP 2: FEATURE ENGINEERING & SELECTION

Removing outliers...
Removed 34470 outliers (34.5%)

Top Features by Correlation:
ratio_to_median_purchase_price    0.587869
online_order                      0.156824
distance_from_home                0.002360
used_chip                         0.001974
distance_from_last_transaction    0.000457
dtype: float64

Performing feature selection...
Selected Features: ['distance_from_home', 'ratio_to_median_purchase_price', 'used_chip', 'online_order']

 Feature engineering complete


In [6]:
print("STEP 3: DATA PARTITIONING & SCALING")
X_temp, X_test, y_temp, y_test = train_test_split(
    X_selected, y_clean, test_size=0.30, random_state=42, stratify=y_clean
)

X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.143, random_state=42, stratify=y_temp
)

print(f"\nData Partition Sizes:")
print(f"  Training:   {len(X_train):5d} samples ({len(X_train)/len(X_selected)*100:.1f}%)")
print(f"  Validation: {len(X_val):5d} samples ({len(X_val)/len(X_selected)*100:.1f}%)")
print(f"  Test:       {len(X_test):5d} samples ({len(X_test)/len(X_selected)*100:.1f}%)")

scaler_classical = RobustScaler()
X_train_scaled = scaler_classical.fit_transform(X_train)
X_val_scaled = scaler_classical.transform(X_val)
X_test_scaled = scaler_classical.transform(X_test)

scaler_quantum = MinMaxScaler(feature_range=(0, np.pi))
X_train_quantum = scaler_quantum.fit_transform(X_train)
X_val_quantum = scaler_quantum.transform(X_val)
X_test_quantum = scaler_quantum.transform(X_test)

with open('output/models/scaler_classical.pkl', 'wb') as f:
    pickle.dump(scaler_classical, f)
with open('output/models/scaler_quantum.pkl', 'wb') as f:
    pickle.dump(scaler_quantum, f)

print("\nApplying SMOTETomek to training data...")
resampler = SMOTETomek(random_state=42)
X_train_balanced, y_train_balanced = resampler.fit_resample(X_train_scaled, y_train)
print(f"Balanced Training: {len(X_train_balanced)} samples")
print(f"Class Distribution: {np.bincount(y_train_balanced.astype(int))}")

print("\n Data partitioning complete")


STEP 3: DATA PARTITIONING & SCALING

Data Partition Sizes:
  Training:   39311 samples (60.0%)
  Validation:  6560 samples (10.0%)
  Test:       19659 samples (30.0%)

Applying SMOTETomek to training data...
Balanced Training: 75146 samples
Class Distribution: [37573 37573]

 Data partitioning complete


In [7]:
print("STEP 4: TRAINING CLASSICAL BASELINES")

results = {}
predictions = {}
models_dict = {}

print("\n→ Training Logistic Regression...")
lr = LogisticRegression(C=0.1, max_iter=1000, solver='saga', 
                       class_weight='balanced', random_state=42)
lr.fit(X_train_balanced, y_train_balanced)

y_val_proba_lr = lr.predict_proba(X_val_scaled)[:, 1]
y_test_pred_lr = lr.predict(X_test_scaled)
y_test_proba_lr = lr.predict_proba(X_test_scaled)[:, 1]

results['Logistic Regression'] = {
    'Val_AUC': roc_auc_score(y_val, y_val_proba_lr),
    'Test_AUC': roc_auc_score(y_test, y_test_proba_lr),
    'Test_Accuracy': accuracy_score(y_test, y_test_pred_lr),
    'Test_Precision': precision_score(y_test, y_test_pred_lr),
    'Test_Recall': recall_score(y_test, y_test_pred_lr),
    'Test_F1': f1_score(y_test, y_test_pred_lr)
}
predictions['lr'] = y_test_proba_lr
models_dict['lr'] = lr
print(f"  Val AUC: {results['Logistic Regression']['Val_AUC']:.4f} | Test AUC: {results['Logistic Regression']['Test_AUC']:.4f}")

print("\n→ Training Random Forest...")
rf = RandomForestClassifier(n_estimators=200, max_depth=15, min_samples_split=5,
                           class_weight='balanced', random_state=42, n_jobs=-1)
rf.fit(X_train_balanced, y_train_balanced)

y_val_proba_rf = rf.predict_proba(X_val_scaled)[:, 1]
y_test_pred_rf = rf.predict(X_test_scaled)
y_test_proba_rf = rf.predict_proba(X_test_scaled)[:, 1]

results['Random Forest'] = {
    'Val_AUC': roc_auc_score(y_val, y_val_proba_rf),
    'Test_AUC': roc_auc_score(y_test, y_test_proba_rf),
    'Test_Accuracy': accuracy_score(y_test, y_test_pred_rf),
    'Test_Precision': precision_score(y_test, y_test_pred_rf),
    'Test_Recall': recall_score(y_test, y_test_pred_rf),
    'Test_F1': f1_score(y_test, y_test_pred_rf)
}
predictions['rf'] = y_test_proba_rf
models_dict['rf'] = rf
print(f"  Val AUC: {results['Random Forest']['Val_AUC']:.4f} | Test AUC: {results['Random Forest']['Test_AUC']:.4f}")

print("\n→ Training Gradient Boosting...")
gb = GradientBoostingClassifier(n_estimators=150, learning_rate=0.1, 
                               max_depth=5, subsample=0.8, random_state=42)
gb.fit(X_train_balanced, y_train_balanced)

y_val_proba_gb = gb.predict_proba(X_val_scaled)[:, 1]
y_test_pred_gb = gb.predict(X_test_scaled)
y_test_proba_gb = gb.predict_proba(X_test_scaled)[:, 1]

results['Gradient Boosting'] = {
    'Val_AUC': roc_auc_score(y_val, y_val_proba_gb),
    'Test_AUC': roc_auc_score(y_test, y_test_proba_gb),
    'Test_Accuracy': accuracy_score(y_test, y_test_pred_gb),
    'Test_Precision': precision_score(y_test, y_test_pred_gb),
    'Test_Recall': recall_score(y_test, y_test_pred_gb),
    'Test_F1': f1_score(y_test, y_test_pred_gb)
}
predictions['gb'] = y_test_proba_gb
models_dict['gb'] = gb
print(f"  Val AUC: {results['Gradient Boosting']['Val_AUC']:.4f} | Test AUC: {results['Gradient Boosting']['Test_AUC']:.4f}")

print("\n→ Training Neural Network...")
nn = MLPClassifier(hidden_layer_sizes=(64, 32, 16), activation='relu',
                  solver='adam', alpha=0.001, learning_rate='adaptive',
                  max_iter=300, random_state=42, early_stopping=True)
nn.fit(X_train_balanced, y_train_balanced)

y_val_proba_nn = nn.predict_proba(X_val_scaled)[:, 1]
y_test_pred_nn = nn.predict(X_test_scaled)
y_test_proba_nn = nn.predict_proba(X_test_scaled)[:, 1]

results['Neural Network'] = {
    'Val_AUC': roc_auc_score(y_val, y_val_proba_nn),
    'Test_AUC': roc_auc_score(y_test, y_test_proba_nn),
    'Test_Accuracy': accuracy_score(y_test, y_test_pred_nn),
    'Test_Precision': precision_score(y_test, y_test_pred_nn),
    'Test_Recall': recall_score(y_test, y_test_pred_nn),
    'Test_F1': f1_score(y_test, y_test_pred_nn)
}
predictions['nn'] = y_test_proba_nn
models_dict['nn'] = nn
print(f"  Val AUC: {results['Neural Network']['Val_AUC']:.4f} | Test AUC: {results['Neural Network']['Test_AUC']:.4f}")

for name, model in models_dict.items():
    with open(f'output/models/{name}_model.pkl', 'wb') as f:
        pickle.dump(model, f)

print("\n Classical baselines complete")


STEP 4: TRAINING CLASSICAL BASELINES

→ Training Logistic Regression...
  Val AUC: 0.9999 | Test AUC: 0.9999

→ Training Random Forest...
  Val AUC: 1.0000 | Test AUC: 1.0000

→ Training Gradient Boosting...
  Val AUC: 1.0000 | Test AUC: 1.0000

→ Training Neural Network...
  Val AUC: 1.0000 | Test AUC: 1.0000

 Classical baselines complete


In [8]:
print("STEP 5: QUANTUM MODEL SETUP")

n_qubits = 4  
simulator = AerSimulator()
NUM_LAYERS = 3 

print(f"\nQuantum Configuration:")
print(f"  Qubits: {n_qubits}")
print(f"  Backend: AerSimulator")

def feature_map(x):
    """Encode classical data using Angle Encoding"""
    qc = QuantumCircuit(n_qubits)
    for i in range(n_qubits):
        qc.ry(x[i], i)
    return qc

def variational_layer(params):
    """Trainable layers with Circular Entanglement"""
    qc = QuantumCircuit(n_qubits)
    p = params.reshape(NUM_LAYERS, n_qubits)
    
    for l in range(NUM_LAYERS):
        for i in range(n_qubits):
            qc.ry(p[l, i], i)
        
        for i in range(n_qubits):
            qc.cx(i, (i + 1) % n_qubits)
            
    for i in range(n_qubits):
        qc.ry(p[0, i], i)
        
    return qc

def quantum_model(x, params):
    """Complete circuit composition"""
    qc = QuantumCircuit(n_qubits)
    qc.compose(feature_map(x), inplace=True)
    qc.compose(variational_layer(params), inplace=True)
    return qc

def quantum_forward(x, params, shots=2048):
    """Forward pass using Parity measurement"""
    qc = quantum_model(x, params)
    qc.measure_all()
    result = simulator.run(qc, shots=shots).result()
    counts = result.get_counts()
    
    exp = 0
    for bitstring, count in counts.items():
        # Parity logic: captures interactions across ALL qubits
        parity = (-1)**bitstring.count('1')
        exp += parity * count
    return exp / shots

def quantum_predict_proba(x, params, shots=2048):
    """Convert expectation [-1, 1] to probability [0, 1]"""
    exp = quantum_forward(x, params, shots)
    return (exp + 1) / 2

print("\n Quantum circuit defined with 3 layers and Parity Measurement")

STEP 5: QUANTUM MODEL SETUP

Quantum Configuration:
  Qubits: 4
  Backend: AerSimulator

 Quantum circuit defined with 3 layers and Parity Measurement


In [11]:
print("STEP 6: QUANTUM TRAINING")

np.random.seed(42)
params = np.random.uniform(0, 2*np.pi, size=n_qubits * NUM_LAYERS)
train_size = 2000
fraud_idx = np.where(y_train == 1)[0]
non_fraud_idx = np.where(y_train == 0)[0]

n_fraud = min(train_size // 2, len(fraud_idx))
n_non_fraud = train_size - n_fraud

train_idx = np.concatenate([
    np.random.choice(fraud_idx, n_fraud, replace=False),
    np.random.choice(non_fraud_idx, n_non_fraud, replace=False)
])

X_train_qml = X_train_quantum[train_idx]
y_train_qml = y_train.values[train_idx] if hasattr(y_train, 'values') else y_train[train_idx]

print(f"\nTraining Configuration:")
print(f"  Training samples: {len(X_train_qml)}")
print(f"  Fraud: {y_train_qml.sum()} | Non-fraud: {len(y_train_qml) - y_train_qml.sum()}")

def binary_cross_entropy(y_true, y_pred):
    eps = 1e-9
    y_pred = np.clip(y_pred, eps, 1 - eps)
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

def compute_loss(params, X_subset, y_subset, shots=256):
    """Compute loss for optimizer"""
    preds = []
    for x in X_subset:
        p = quantum_predict_proba(x, params, shots)
        preds.append(p)
    preds = np.array(preds)
    return binary_cross_entropy(y_subset, preds)

epochs = 30
batch_size = 100
loss_history = []

print("\nStarting training with COBYLA optimizer...")

for epoch in range(epochs):
    epoch_start = datetime.now()
    
    batch_idx = np.random.choice(len(X_train_qml), batch_size, replace=False)
    X_batch = X_train_qml[batch_idx]
    y_batch = y_train_qml[batch_idx]
    
    result = minimize(
    compute_loss, 
    params, 
    args=(X_batch, y_batch, 512),
    method='COBYLA',
    options={
        'maxiter': 60, 
        'rhobeg': 0.1  
    }
    )
    params = result.x
    
    if (epoch + 1) % 5 == 0:
        train_loss = compute_loss(params, X_train_qml[:200], y_train_qml[:200], shots=512)
        loss_history.append(train_loss)
        
        epoch_time = (datetime.now() - epoch_start).total_seconds()
        print(f"Epoch {epoch+1:2d}/{epochs} | Loss: {train_loss:.4f} | Time: {epoch_time:.1f}s")
    else:
        epoch_time = (datetime.now() - epoch_start).total_seconds()
        print(f"Epoch {epoch+1:2d}/{epochs} | Optimizing... | Time: {epoch_time:.1f}s")

quantum_model_data = {
    'params': params.tolist(),
    'n_qubits': n_qubits,
    'selected_features': selected_features,
    'loss_history': loss_history
}

with open('output/models/quantum_model.json', 'w') as f:
    json.dump(quantum_model_data, f, indent=2)

print("\n Quantum training complete")


STEP 6: QUANTUM TRAINING

Training Configuration:
  Training samples: 2000
  Fraud: 1000 | Non-fraud: 1000

Starting training with COBYLA optimizer...
Epoch  1/30 | Optimizing... | Time: 15.9s
Epoch  2/30 | Optimizing... | Time: 26.0s
Epoch  3/30 | Optimizing... | Time: 18.0s
Epoch  4/30 | Optimizing... | Time: 19.2s
Epoch  5/30 | Loss: 0.1870 | Time: 21.5s
Epoch  6/30 | Optimizing... | Time: 19.8s
Epoch  7/30 | Optimizing... | Time: 18.1s
Epoch  8/30 | Optimizing... | Time: 19.7s
Epoch  9/30 | Optimizing... | Time: 22.1s
Epoch 10/30 | Loss: 0.1756 | Time: 20.8s
Epoch 11/30 | Optimizing... | Time: 22.7s
Epoch 12/30 | Optimizing... | Time: 23.3s
Epoch 13/30 | Optimizing... | Time: 22.6s
Epoch 14/30 | Optimizing... | Time: 18.9s
Epoch 15/30 | Loss: 0.1716 | Time: 19.5s
Epoch 16/30 | Optimizing... | Time: 19.0s
Epoch 17/30 | Optimizing... | Time: 20.6s
Epoch 18/30 | Optimizing... | Time: 15.6s
Epoch 19/30 | Optimizing... | Time: 18.0s
Epoch 20/30 | Loss: 0.1783 | Time: 19.8s
Epoch 21/30 |

In [10]:
from scipy.optimize import minimize

In [12]:
print("STEP 7: QUANTUM EVALUATION")

print("\nEvaluating on test set...")

y_test_proba_quantum = []
for i, x in enumerate(X_test_quantum):
    if i % 500 == 0:
        print(f"  Processed {i}/{len(X_test_quantum)} samples", end='\r')
    p = quantum_predict_proba(x, params, shots=2048)
    y_test_proba_quantum.append(p)

print() 
y_test_proba_quantum = np.array(y_test_proba_quantum)
y_test_pred_quantum = (y_test_proba_quantum > 0.5).astype(int)

results['Quantum VQC'] = {
    'Test_AUC': roc_auc_score(y_test, y_test_proba_quantum),
    'Test_Accuracy': accuracy_score(y_test, y_test_pred_quantum),
    'Test_Precision': precision_score(y_test, y_test_pred_quantum),
    'Test_Recall': recall_score(y_test, y_test_pred_quantum),
    'Test_F1': f1_score(y_test, y_test_pred_quantum)
}

predictions['quantum'] = y_test_proba_quantum

print(f"\nQuantum Model Performance:")
print(f"  Test AUC:       {results['Quantum VQC']['Test_AUC']:.4f}")
print(f"  Test Accuracy:  {results['Quantum VQC']['Test_Accuracy']:.4f}")

print("\n Quantum evaluation complete")


STEP 7: QUANTUM EVALUATION

Evaluating on test set...
  Processed 19500/19659 samples

Quantum Model Performance:
  Test AUC:       0.9994
  Test Accuracy:  0.7962

 Quantum evaluation complete
