In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score, recall_score, confusion_matrix, precision_score, f1_score, matthews_corrcoef
from sklearn.utils import resample
import numpy as np

# Read the held-out evaluation set from disk
test_data = pd.read_csv('test.csv')

# Select the feature columns in exactly the order of the training frame X,
# and take the 'heart' column as the target
feature_order = list(X.columns)
X_test = test_data[feature_order]
y_test = test_data['heart']

# Fitted estimators that will be compared on the test set
models = [
    lgr, svc, rf, mlp, gbdt,
    ada, xgb, lgbm, catboost,
]

# Accumulates one metrics record (dict) per evaluated model
results = list()

plt.figure(figsize=(10, 8))  # Create a new figure for plotting

# Bootstrap settings for the AUC confidence interval.
# A single seeded RandomState is advanced across all draws of a model, so the
# n_bootstraps resamples are distinct from each other and the interval is
# reproducible from run to run. (Seeding every draw with a fresh integer from a
# small range would repeat the same few resamples over and over.)
n_bootstraps = 1000
bootstrap_seed = 42

# Loop through each model to calculate metrics and plot ROC curve
for model in models:
    # Continuous score used for ranking: prefer decision_function when the
    # estimator exposes it, otherwise fall back to the second predict_proba column
    if hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        y_score = model.predict_proba(X_test)[:, 1]

    # ROC curve (FPR: false positive rate, TPR: true positive rate) and its area
    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    auc = roc_auc_score(y_test, y_score)

    # Threshold-dependent metrics are computed from the model's own hard predictions
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)  # Accuracy
    sensitivity = recall_score(y_test, y_pred)  # Sensitivity (Recall)
    # NOTE(review): unpacking four cells assumes a binary target with both
    # classes present in y_test / y_pred — confirm for this dataset.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    specificity = tn / (tn + fp)  # Specificity
    ppv = precision_score(y_test, y_pred)  # Positive Predictive Value (PPV)
    npv = tn / (tn + fn)  # Negative Predictive Value (NPV)
    f1 = f1_score(y_test, y_pred)  # F1 Score
    mcc = matthews_corrcoef(y_test, y_pred)  # Matthews Correlation Coefficient (MCC)
    youden_index = sensitivity + specificity - 1  # Youden's Index
    utility_score = (sensitivity + specificity) / 2  # Mean of sensitivity and specificity

    # Bootstrap the test set to estimate a 95% confidence interval for AUC.
    # The generator is re-created with the same seed for every model, so all
    # models are scored on the same sequence of resampled rows.
    rng = np.random.RandomState(bootstrap_seed)
    auc_scores = []
    for _ in range(n_bootstraps):
        y_test_resampled, y_score_resampled = resample(y_test, y_score, random_state=rng)
        # AUC is undefined when a resample contains a single class; skip that
        # draw instead of letting roc_auc_score raise and abort the evaluation
        if np.unique(y_test_resampled).size < 2:
            continue
        auc_scores.append(roc_auc_score(y_test_resampled, y_score_resampled))

    # Percentile interval: 2.5th and 97.5th percentiles of the bootstrap AUCs
    if auc_scores:
        lower_bound, upper_bound = np.percentile(auc_scores, [2.5, 97.5])
    else:
        # Every draw was degenerate (single class); no interval can be reported
        lower_bound = upper_bound = float('nan')

    # Plot ROC curve for the current model, with AUC and its CI in the legend label
    plt.plot(fpr, tpr, label=f'{model.__class__.__name__} (AUC = {auc:.3f} [{lower_bound:.3f}-{upper_bound:.3f}])')

    # Store results for each model in a list
    results.append({
        'Model': model.__class__.__name__,
        'AUC': auc,
        'Accuracy': accuracy,
        'Sensitivity': sensitivity,
        'Specificity': specificity,
        'PPV': ppv,
        'NPV': npv,
        'F1 Score': f1,
        'MCC': mcc,
        'Youden Index': youden_index,
        'Utility Score': utility_score
    })

# Work on the current axes, i.e. the one the ROC curves were drawn on
roc_ax = plt.gca()

# Dashed diagonal: the reference line for random guessing
roc_ax.plot([0, 1], [0, 1], 'k--', lw=2)
roc_ax.set_xlim(0.0, 1.0)
roc_ax.set_ylim(0.0, 1.01)

# Axis labels and title
roc_ax.set_xlabel('False Positive Rate', fontsize=12)
roc_ax.set_ylabel('True Positive Rate', fontsize=12)
roc_ax.set_title('ROC Curve', fontsize=15)

# Legend goes in the lower-right corner
roc_ax.legend(loc="lower right")

# Hide the right and top borders for a cleaner look
for side in ('right', 'top'):
    roc_ax.spines[side].set_visible(False)

# Write the figure to an SVG file, then display it
plt.savefig('test_ROC.svg', format='svg')
plt.show()

# Tabulate the per-model metric records (one row per model) and show the table
results_df = pd.DataFrame(data=results)
print(results_df)

# Persist the table to disk without the row index
results_df.to_csv(path_or_buf='test_results.csv', index=False)


In [None]:
import shap
import matplotlib.pyplot as plt
import pandas as pd

# Display names for the plot: map each test-set column through name_mapping,
# keeping the original column name when no mapping entry exists
cleaned_feature_names = [name_mapping.get(name, name) for name in X_test.columns]

# Create a SHAP explainer around the random forest's predict function,
# using the training features X as background data
# NOTE(review): rf.predict is passed rather than rf.predict_proba; if rf is a
# classifier this explains hard class labels — confirm that is intended.
explainer = shap.Explainer(rf.predict, X)

# Randomly select up to 800 samples from the test set. The size is capped at
# the number of available rows because DataFrame.sample raises ValueError when
# asked for more rows than exist (sampling is without replacement).
X_test_sampled = X_test.sample(n=min(800, len(X_test)), random_state=42)

# Calculate SHAP values for the selected samples
shap_values = explainer(X_test_sampled)

# Create a SHAP summary plot limited to 10 features, labelled with the cleaned
# names; show=False keeps the figure open so it can be saved below
plt.figure()
shap.summary_plot(shap_values, X_test_sampled, feature_names=cleaned_feature_names, max_display=10, show=False)

# Save the plot as an SVG file and release the figure
plt.savefig('shap.svg', format='svg')
plt.close()
