In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, roc_curve, roc_auc_score
import matplotlib.pyplot as plt 

# Import models
from traditional_ml import predict_heart_disease        
from llm_model import prediction_by_llm

In [None]:
# Load the evaluation dataset from a CSV file in the current working directory.
# NOTE(review): the prediction loop further down reads the columns age, gender,
# height, weight, ap_hi, ap_lo, cholesterol, gluc, smoke, alco, active and cardio —
# confirm heart.csv provides them and is comma-separated (the read_csv default).
df = pd.read_csv("heart.csv")

In [None]:
# result collectors
true_labels = []
ensemble_preds = []
llm_preds = []
majority_preds = []

In [None]:
for _, row in df.iterrows():
    # Extract input features from dataset
    age = row['age']              # Age in years
    gender = row['gender']        # 1=female, 2=male
    height = row['height']        # cm
    weight = row['weight']        # kg
    ap_hi = row['ap_hi']          # systolic
    ap_lo = row['ap_lo']          # diastolic
    cholesterol = row['cholesterol']
    gluc = row['gluc']
    smoke = row['smoke']
    alco = row['alco']
    active = row['active']

    # True label
    y_true = int(row['cardio'])
    true_labels.append(y_true)

    # Model 1: Ensemble (XGBoost + RF + stacking)
    ensemble_output = predict_heart_disease(age, gender, height, weight, ap_hi, ap_lo,
                                            cholesterol, gluc, smoke, alco, active)
    ensemble_prob = ensemble_output["probability"]
    ensemble_pred = ensemble_output["prediction"]
    ensemble_preds.append(ensemble_pred)

    # Model 2: LLM explanation-based model
    context = ""  # Optional: add medical context text if needed
    llm_output = prediction_by_llm(age, height, weight, gender, ap_hi, ap_lo,
                                   cholesterol, gluc, smoke, alco, active, context)

    # Extract probability from LLM (1.0 if positive, else 0.0)
    llm_prob = 1.0 if '"has_heart_disease": true' in llm_output.lower() else 0.0
    llm_preds.append(int(llm_prob))

    # Combine probabilities (average)
    final_prob = (ensemble_prob + llm_prob) / 2.0

    # Threshold = 0.8 → decide prediction
    final_pred = 1 if final_prob >= 0.8 else 0
    majority_preds.append(final_pred)

In [None]:
# Evaluate performance
print("\n=== Individual Model Accuracies ===")
print(f"Ensemble Model: {accuracy_score(true_labels, ensemble_preds):.3f}")
print(f"LLM Model: {accuracy_score(true_labels, llm_preds):.3f}")

print("\n=== Majority Vote Accuracy ===")
print(f"Combined (Majority) Model: {accuracy_score(true_labels, majority_preds):.3f}")

print("\n=== Classification Report (Majority Vote) ===")
print(classification_report(true_labels, majority_preds))

In [None]:
# --- Compute ROC curves ---
ensemble_fpr, ensemble_tpr, _ = roc_curve(true_labels, ensemble_prob)
llm_fpr, llm_tpr, _ = roc_curve(true_labels, llm_prob)
combined_fpr, combined_tpr, _ = roc_curve(true_labels, [(e + l) / 2 for e, l in zip(ensemble_prob, llm_prob)])

In [None]:
# --- Compute AUC values ---
ensemble_auc = roc_auc_score(true_labels, ensemble_prob)
llm_auc = roc_auc_score(true_labels, llm_prob)
combined_auc = roc_auc_score(true_labels, [(e + l) / 2 for e, l in zip(ensemble_prob, llm_prob)])

In [None]:
# --- Plot ROC curves ---
# One (fpr, tpr, legend label) triple per model, drawn in this order.
roc_series = [
    (ensemble_fpr, ensemble_tpr, f"Ensemble Model (AUC = {ensemble_auc:.3f})"),
    (llm_fpr, llm_tpr, f"LLM Model (AUC = {llm_auc:.3f})"),
    (combined_fpr, combined_tpr, f"Combined Model (AUC = {combined_auc:.3f})"),
]

fig, ax = plt.subplots(figsize=(8, 6))
for curve_fpr, curve_tpr, curve_label in roc_series:
    ax.plot(curve_fpr, curve_tpr, label=curve_label)

# Diagonal reference line: the expected ROC of a random classifier.
ax.plot([0, 1], [0, 1], 'k--', label="Random Classifier")

ax.set_title("ROC Curves — Heart Disease Models")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend(loc="lower right")
ax.grid(True)
fig.tight_layout()
plt.show()