In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import pickle
import warnings
from datetime import datetime

from qiskit import QuantumCircuit, transpile
from qiskit_aer import AerSimulator

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (roc_auc_score, accuracy_score, precision_score, 
                             recall_score, f1_score, confusion_matrix, 
                             roc_curve, classification_report)

from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek

warnings.filterwarnings('ignore')
np.random.seed(42)


In [2]:
os.makedirs('output', exist_ok=True)
os.makedirs('output/data', exist_ok=True)
os.makedirs('output/models', exist_ok=True)
os.makedirs('output/plots', exist_ok=True)
print(" Directory structure created")


✅ Directory structure created


In [3]:
print("\n" + "="*60)
print("STEP 1: DATA LOADING & EXPLORATION")
print("="*60)

df_raw = pd.read_csv('diabetes.csv')   
df_raw.columns = df_raw.columns.str.strip().str.lower()

print(f"\nRaw Dataset Shape: {df_raw.shape}")
print(f"Columns: {list(df_raw.columns)}")

print(f"\nClass Distribution (Outcome):")
print(df_raw['outcome'].value_counts())

print(f"Diabetes Percentage: {df_raw['outcome'].mean() * 100:.2f}%")

print(f"\nMissing Values:\n{df_raw.isnull().sum()}")

data_info = {
    'raw_shape': df_raw.shape,
    'columns': list(df_raw.columns),
    'diabetes_percentage': float(df_raw['outcome'].mean() * 100),
    'missing_values': df_raw.isnull().sum().to_dict(),
    'class_distribution': df_raw['outcome'].value_counts().to_dict()
}

with open('output/data/data_info.json', 'w') as f:
    json.dump(data_info, f, indent=2)

print("\n Data exploration complete")



STEP 1: DATA LOADING & EXPLORATION

Raw Dataset Shape: (768, 9)
Columns: ['pregnancies', 'glucose', 'bloodpressure', 'skinthickness', 'insulin', 'bmi', 'diabetespedigreefunction', 'age', 'outcome']

Class Distribution (Outcome):
outcome
0    500
1    268
Name: count, dtype: int64
Diabetes Percentage: 34.90%

Missing Values:
pregnancies                 0
glucose                     0
bloodpressure               0
skinthickness               0
insulin                     0
bmi                         0
diabetespedigreefunction    0
age                         0
outcome                     0
dtype: int64

✅ Data exploration complete


In [4]:
print("\n" + "="*60)
print("STEP 2: FEATURE ENGINEERING & SELECTION")
print("="*60)

X = df_raw.drop('outcome', axis=1)
y = df_raw['outcome']

zero_as_missing = ['glucose', 'bloodpressure', 'skinthickness', 'insulin', 'bmi']
X[zero_as_missing] = X[zero_as_missing].replace(0, np.nan)

X = X.fillna(X.median())

print("\nRemoving outliers...")
Q1 = X.quantile(0.25)
Q3 = X.quantile(0.75)
IQR = Q3 - Q1
outlier_mask = ~((X < (Q1 - 3 * IQR)) | (X > (Q3 + 3 * IQR))).any(axis=1)
X_clean = X[outlier_mask]
y_clean = y[outlier_mask]
print(f"Removed {len(X) - len(X_clean)} outliers ({(1-len(X_clean)/len(X))*100:.1f}%)")

X_clean = X_clean.loc[:, X_clean.nunique() > 1]

correlations = X_clean.corrwith(y_clean).abs().sort_values(ascending=False)
print(f"\nTop Features by Correlation:")
print(correlations.head())

print("\nPerforming feature selection...")
selector = SelectKBest(mutual_info_classif, k=4)
X_selected = selector.fit_transform(X_clean, y_clean)
selected_features = X_clean.columns[selector.get_support()].tolist()
print(f"Selected Features: {selected_features}")

df_processed = pd.DataFrame(X_selected, columns=selected_features)
df_processed['Outcome'] = y_clean.values   
df_processed.to_csv('output/data/processed_dataset.csv', index=False)

feature_info = {
    'original_features': list(X.columns),
    'selected_features': selected_features,
    'selection_method': 'Mutual Information',
    'feature_scores': {
        feat: float(score)
        for feat, score in zip(
            selected_features,
            selector.scores_[selector.get_support()]
        )
    }
}

with open('output/data/feature_selection.json', 'w') as f:
    json.dump(feature_info, f, indent=2)

print("\n Feature engineering complete")


STEP 2: FEATURE ENGINEERING & SELECTION

Removing outliers...
Removed 319 outliers (41.5%)

Top Features by Correlation:
glucose          0.469962
bmi              0.332163
pregnancies      0.209009
skinthickness    0.172630
age              0.164649
dtype: float64

Performing feature selection...
Selected Features: ['glucose', 'bmi', 'diabetespedigreefunction', 'age']

✅ Feature engineering complete


In [8]:
print("\n" + "="*60)
print("STEP 3: DATA PARTITIONING & SCALING")
print("="*60)

X_temp, X_test, y_temp, y_test = train_test_split(
    X_clean, y_clean, test_size=0.30, random_state=42, stratify=y_clean
)

X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.143, random_state=42, stratify=y_temp
)

print(f"\nData Partition Sizes:")
print(f"  Training:   {len(X_train):5d}")
print(f"  Validation: {len(X_val):5d}")
print(f"  Test:       {len(X_test):5d}")

print("\nClass Distribution (Outcome):")
print(f"  Train: {np.bincount(y_train.astype(int))}")
print(f"  Val:   {np.bincount(y_val.astype(int))}")
print(f"  Test:  {np.bincount(y_test.astype(int))}")

print("\nPerforming feature selection on training data...")
selector = SelectKBest(mutual_info_classif, k=4)

X_train_sel = selector.fit_transform(X_train, y_train)
X_val_sel = selector.transform(X_val)
X_test_sel = selector.transform(X_test)

selected_features = X_train.columns[selector.get_support()].tolist()
print(f"Selected Features: {selected_features}")

print("\nApplying SMOTETomek to training data...")
resampler = SMOTETomek(random_state=42)
X_train_balanced, y_train_balanced = resampler.fit_resample(X_train_sel, y_train)

print(f"Balanced Training Size: {len(X_train_balanced)}")
print(f"Balanced Distribution: {np.bincount(y_train_balanced.astype(int))}")
scaler_classical = RobustScaler()

X_train_balanced_scaled = scaler_classical.fit_transform(X_train_balanced)
X_val_scaled = scaler_classical.transform(X_val_sel)
X_test_scaled = scaler_classical.transform(X_test_sel)

scaler_quantum = MinMaxScaler(feature_range=(0, np.pi))

X_train_quantum = scaler_quantum.fit_transform(X_train_sel)
X_val_quantum = scaler_quantum.transform(X_val_sel)
X_test_quantum = scaler_quantum.transform(X_test_sel)

with open('output/models/scaler_classical.pkl', 'wb') as f:
    pickle.dump(scaler_classical, f)

with open('output/models/scaler_quantum.pkl', 'wb') as f:
    pickle.dump(scaler_quantum, f)

with open('output/models/feature_selector.pkl', 'wb') as f:
    pickle.dump(selector, f)

print("\n Data partitioning & preprocessing complete")



STEP 3: DATA PARTITIONING & SCALING

Data Partition Sizes:
  Training:     269
  Validation:    45
  Test:         135

Class Distribution (Outcome):
  Train: [172  97]
  Val:   [29 16]
  Test:  [86 49]

Performing feature selection on training data...
Selected Features: ['glucose', 'bmi', 'diabetespedigreefunction', 'age']

Applying SMOTETomek to training data...
Balanced Training Size: 308
Balanced Distribution: [154 154]

✅ Data partitioning & preprocessing complete


In [24]:
print("\n" + "="*60)
print("STEP 4: TRAINING CLASSICAL MODELS (4 CLASSIFIERS)")
print("="*60)

results = {}
predictions = {}
models_dict = {}

print("\n→ Training Logistic Regression...")
lr = LogisticRegression(
    C=0.5,
    max_iter=1000,
    solver='saga',
    class_weight='balanced',
    random_state=42
)

lr.fit(X_train_balanced_scaled, y_train_balanced)

y_val_proba = lr.predict_proba(X_val_scaled)[:, 1]
y_test_pred = lr.predict(X_test_scaled)
y_test_proba = lr.predict_proba(X_test_scaled)[:, 1]

results['Logistic Regression'] = {
    'Val_AUC': roc_auc_score(y_val, y_val_proba),
    'Test_AUC': roc_auc_score(y_test, y_test_proba),
    'Test_Accuracy': accuracy_score(y_test, y_test_pred),
    'Test_Precision': precision_score(y_test, y_test_pred),
    'Test_Recall': recall_score(y_test, y_test_pred),
    'Test_F1': f1_score(y_test, y_test_pred)
}

predictions['lr'] = y_test_proba
models_dict['lr'] = lr

print(f"  Val AUC: {results['Logistic Regression']['Val_AUC']:.4f} | "
      f"Test AUC: {results['Logistic Regression']['Test_AUC']:.4f}")

print("\n→ Training Random Forest...")
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=12,
    min_samples_split=6,
    min_samples_leaf=3,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train_balanced_scaled, y_train_balanced)

y_val_proba = rf.predict_proba(X_val_scaled)[:, 1]
y_test_pred = rf.predict(X_test_scaled)
y_test_proba = rf.predict_proba(X_test_scaled)[:, 1]

results['Random Forest'] = {
    'Val_AUC': roc_auc_score(y_val, y_val_proba),
    'Test_AUC': roc_auc_score(y_test, y_test_proba),
    'Test_Accuracy': accuracy_score(y_test, y_test_pred),
    'Test_Precision': precision_score(y_test, y_test_pred),
    'Test_Recall': recall_score(y_test, y_test_pred),
    'Test_F1': f1_score(y_test, y_test_pred)
}

predictions['rf'] = y_test_proba
models_dict['rf'] = rf

print(f"  Val AUC: {results['Random Forest']['Val_AUC']:.4f} | "
      f"Test AUC: {results['Random Forest']['Test_AUC']:.4f}")

print("\n→ Training Gradient Boosting...")
gb = GradientBoostingClassifier(
    n_estimators=250,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.8,
    random_state=42
)

gb.fit(X_train_balanced_scaled, y_train_balanced)

y_val_proba = gb.predict_proba(X_val_scaled)[:, 1]
y_test_pred = gb.predict(X_test_scaled)
y_test_proba = gb.predict_proba(X_test_scaled)[:, 1]

results['Gradient Boosting'] = {
    'Val_AUC': roc_auc_score(y_val, y_val_proba),
    'Test_AUC': roc_auc_score(y_test, y_test_proba),
    'Test_Accuracy': accuracy_score(y_test, y_test_pred),
    'Test_Precision': precision_score(y_test, y_test_pred),
    'Test_Recall': recall_score(y_test, y_test_pred),
    'Test_F1': f1_score(y_test, y_test_pred)
}

predictions['gb'] = y_test_proba
models_dict['gb'] = gb

print(f"  Val AUC: {results['Gradient Boosting']['Val_AUC']:.4f} | "
      f"Test AUC: {results['Gradient Boosting']['Test_AUC']:.4f}")

print("\n→ Training Neural Network...")
nn = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    activation='relu',
    solver='adam',
    alpha=0.001,
    learning_rate='adaptive',
    max_iter=500,
    early_stopping=True,
    random_state=42
)

nn.fit(X_train_balanced_scaled, y_train_balanced)

y_val_proba = nn.predict_proba(X_val_scaled)[:, 1]
y_test_pred = nn.predict(X_test_scaled)
y_test_proba = nn.predict_proba(X_test_scaled)[:, 1]

results['Neural Network'] = {
    'Val_AUC': roc_auc_score(y_val, y_val_proba),
    'Test_AUC': roc_auc_score(y_test, y_test_proba),
    'Test_Accuracy': accuracy_score(y_test, y_test_pred),
    'Test_Precision': precision_score(y_test, y_test_pred),
    'Test_Recall': recall_score(y_test, y_test_pred),
    'Test_F1': f1_score(y_test, y_test_pred)
}

predictions['nn'] = y_test_proba
models_dict['nn'] = nn

print(f"  Val AUC: {results['Neural Network']['Val_AUC']:.4f} | "
      f"Test AUC: {results['Neural Network']['Test_AUC']:.4f}")

for name, model in models_dict.items():
    with open(f'output/models/{name}_model.pkl', 'wb') as f:
        pickle.dump(model, f)

print("\n✅ Classical model training complete")



STEP 4: TRAINING CLASSICAL MODELS (4 CLASSIFIERS)

→ Training Logistic Regression...
  Val AUC: 0.9095 | Test AUC: 0.8308

→ Training Random Forest...
  Val AUC: 0.8750 | Test AUC: 0.7907

→ Training Gradient Boosting...
  Val AUC: 0.8578 | Test AUC: 0.7916

→ Training Neural Network...
  Val AUC: 0.8276 | Test AUC: 0.7425

✅ Classical model training complete


In [21]:
print("\n" + "="*60)
print("STEP 5: QUANTUM MODEL SETUP")
print("="*60)

from qiskit import QuantumCircuit
from qiskit_aer import AerSimulator
import numpy as np

n_qubits = 4              
NUM_LAYERS = 3           
shots = 2048

simulator = AerSimulator(method="statevector")

print(f"\nQuantum Configuration:")
print(f"  Qubits        : {n_qubits}")
print(f"  Layers        : {NUM_LAYERS}")
print(f"  Backend       : AerSimulator")
print(f"  Shots         : {shots}")

def feature_map(x):
    qc = QuantumCircuit(n_qubits)
    for i in range(n_qubits):
        qc.ry(x[i], i)
    return qc

def variational_layer(params):
    qc = QuantumCircuit(n_qubits)
    p = params.reshape(NUM_LAYERS, n_qubits)

    for layer in range(NUM_LAYERS):
        for q in range(n_qubits):
            qc.ry(p[layer, q], q)

        for q in range(n_qubits):
            qc.cx(q, (q + 1) % n_qubits)

    return qc

def quantum_model(x, params):
    qc = QuantumCircuit(n_qubits)
    qc.compose(feature_map(x), inplace=True)
    qc.compose(variational_layer(params), inplace=True)
    return qc

def quantum_forward(x, params, shots=shots):
    qc = quantum_model(x, params)
    qc.measure_all()

    result = simulator.run(qc, shots=shots).result()
    counts = result.get_counts()

    expectation = 0.0
    for bitstring, count in counts.items():
        parity = (-1) ** bitstring.count('1')
        expectation += parity * count

    return expectation / shots   # ∈ [-1, 1]

def quantum_predict_proba(x, params, shots=shots):
    exp_val = quantum_forward(x, params, shots)
    return (exp_val + 1.0) / 2.0

print("\n Quantum circuit defined (Angle Encoding + Variational Ansatz + Parity Measurement)")



STEP 5: QUANTUM MODEL SETUP

Quantum Configuration:
  Qubits        : 4
  Layers        : 3
  Backend       : AerSimulator
  Shots         : 2048

✅ Quantum circuit defined (Angle Encoding + Variational Ansatz + Parity Measurement)


In [22]:
print("\n" + "="*60)
print("STEP 6: QUANTUM TRAINING")
print("="*60)

import numpy as np
from datetime import datetime
from scipy.optimize import minimize
import json

np.random.seed(42)

params = np.random.uniform(0, 2 * np.pi, size=n_qubits * NUM_LAYERS)

train_size = min(200, len(X_train_quantum))  
pos_idx = np.where(y_train == 1)[0]
neg_idx = np.where(y_train == 0)[0]

n_pos = min(train_size // 2, len(pos_idx))
n_neg = min(train_size - n_pos, len(neg_idx))

train_idx = np.concatenate([
    np.random.choice(pos_idx, n_pos, replace=False),
    np.random.choice(neg_idx, n_neg, replace=False)
])

X_train_qml = X_train_quantum[train_idx]
y_train_qml = y_train.values[train_idx] if hasattr(y_train, 'values') else y_train[train_idx]

print(f"\nTraining Configuration:")
print(f"  Training samples: {len(X_train_qml)}")
print(f"  Positive: {int(y_train_qml.sum())} | Negative: {len(y_train_qml) - int(y_train_qml.sum())}")
def binary_cross_entropy(y_true, y_pred):
    eps = 1e-9
    y_pred = np.clip(y_pred, eps, 1 - eps)
    return -np.mean(
        y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred)
    )

def compute_loss(params, X_subset, y_subset, shots=256):
    preds = np.array([
        quantum_predict_proba(x, params, shots) for x in X_subset
    ])
    return binary_cross_entropy(y_subset, preds)

epochs = 25
batch_size = min(50, len(X_train_qml))
loss_history = []

print("\nStarting training with COBYLA optimizer...")
print("=" * 60)

for epoch in range(epochs):
    start_time = datetime.now()

    batch_idx = np.random.choice(len(X_train_qml), batch_size, replace=False)
    X_batch = X_train_qml[batch_idx]
    y_batch = y_train_qml[batch_idx]

    result = minimize(
        compute_loss,
        params,
        args=(X_batch, y_batch, 512),
        method='COBYLA',
        options={
            'maxiter': 40,
            'rhobeg': 0.1
        }
    )

    params = result.x

    if (epoch + 1) % 5 == 0:
        train_loss = compute_loss(
            params,
            X_train_qml[:50],
            y_train_qml[:50],
            shots=512
        )
        loss_history.append(train_loss)

        elapsed = (datetime.now() - start_time).total_seconds()
        print(f"Epoch {epoch+1:2d}/{epochs} | Loss: {train_loss:.4f} | Time: {elapsed:.1f}s")
    else:
        elapsed = (datetime.now() - start_time).total_seconds()
        print(f"Epoch {epoch+1:2d}/{epochs} | Optimizing... | Time: {elapsed:.1f}s")

quantum_model_data = {
    'params': params.tolist(),
    'n_qubits': n_qubits,
    'num_layers': NUM_LAYERS,
    'loss_history': loss_history
}

with open('output/models/quantum_model.json', 'w') as f:
    json.dump(quantum_model_data, f, indent=2)

print("\n Quantum training complete")



STEP 6: QUANTUM TRAINING

Training Configuration:
  Training samples: 200
  Positive: 97 | Negative: 103

Starting training with COBYLA optimizer...
Epoch  1/25 | Optimizing... | Time: 6.0s
Epoch  2/25 | Optimizing... | Time: 4.9s
Epoch  3/25 | Optimizing... | Time: 4.7s
Epoch  4/25 | Optimizing... | Time: 4.6s
Epoch  5/25 | Loss: 0.7344 | Time: 5.2s
Epoch  6/25 | Optimizing... | Time: 5.1s
Epoch  7/25 | Optimizing... | Time: 5.3s
Epoch  8/25 | Optimizing... | Time: 5.4s
Epoch  9/25 | Optimizing... | Time: 5.2s
Epoch 10/25 | Loss: 0.6635 | Time: 4.8s
Epoch 11/25 | Optimizing... | Time: 4.8s
Epoch 12/25 | Optimizing... | Time: 4.6s
Epoch 13/25 | Optimizing... | Time: 4.9s
Epoch 14/25 | Optimizing... | Time: 5.3s
Epoch 15/25 | Loss: 0.5665 | Time: 4.8s
Epoch 16/25 | Optimizing... | Time: 4.8s
Epoch 17/25 | Optimizing... | Time: 5.0s
Epoch 18/25 | Optimizing... | Time: 4.9s
Epoch 19/25 | Optimizing... | Time: 5.3s
Epoch 20/25 | Loss: 0.5739 | Time: 4.9s
Epoch 21/25 | Optimizing... | Time

In [23]:
print("\n" + "="*60)
print("STEP 7: QUANTUM EVALUATION")
print("="*60)

print("\nEvaluating Quantum Model on Test Set...")

y_test_proba_quantum = []

for i, x in enumerate(X_test_quantum):
    if i % 100 == 0:
        print(f"  Processed {i}/{len(X_test_quantum)} samples", end='\r')

    p = quantum_predict_proba(x, params, shots=2048)
    y_test_proba_quantum.append(p)

print() 
y_test_proba_quantum = np.array(y_test_proba_quantum)

y_test_pred_quantum = (y_test_proba_quantum >= 0.5).astype(int)

results['Quantum VQC'] = {
    'Test_AUC': roc_auc_score(y_test, y_test_proba_quantum),
    'Test_Accuracy': accuracy_score(y_test, y_test_pred_quantum),
    'Test_Precision': precision_score(y_test, y_test_pred_quantum, zero_division=0),
    'Test_Recall': recall_score(y_test, y_test_pred_quantum, zero_division=0),
    'Test_F1': f1_score(y_test, y_test_pred_quantum, zero_division=0)
}

predictions['quantum'] = y_test_proba_quantum

print("\nQuantum Model Performance:")
print(f"  Test AUC:       {results['Quantum VQC']['Test_AUC']:.4f}")
print(f"  Test Accuracy:  {results['Quantum VQC']['Test_Accuracy']:.4f}")
print(f"  Test Precision: {results['Quantum VQC']['Test_Precision']:.4f}")
print(f"  Test Recall:    {results['Quantum VQC']['Test_Recall']:.4f}")
print(f"  Test F1-score:  {results['Quantum VQC']['Test_F1']:.4f}")

print("\n Quantum evaluation complete")



STEP 7: QUANTUM EVALUATION

Evaluating Quantum Model on Test Set...
  Processed 100/135 samples

Quantum Model Performance:
  Test AUC:       0.7235
  Test Accuracy:  0.5333
  Test Precision: 0.4352
  Test Recall:    0.9592
  Test F1-score:  0.5987

✅ Quantum evaluation complete
