# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, roc_curve, precision_score, recall_score, f1_score, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.utils import resample

# Load the feature table and split it into predictors and target:
# the 'heart' column is the label, every remaining column is a feature.
all_features = pd.read_csv('all_features.csv')
X = all_features.drop(labels='heart', axis=1)
y = all_features.loc[:, 'heart']

# Custom scoring functions
def sensitivity_score(y_true, y_pred):
    """Return the sensitivity (true-positive rate) for 0/1 labels.

    Computed as TP / (TP + FN), where label ``1`` is the positive class.
    The counts are taken directly from the label arrays, so the result is
    well defined even when only one class occurs in ``y_true``/``y_pred``
    (a label-inferred confusion matrix would collapse to 1x1 there).

    Args:
        y_true: array-like of ground-truth labels (0 or 1).
        y_pred: array-like of predicted labels (0 or 1), same length.

    Returns:
        The fraction of positive samples predicted as positive, or NaN
        when ``y_true`` contains no positive sample (ratio undefined).
    """
    # np.asarray drops any pandas index so the comparison is positional.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    positives = y_true == 1
    n_positives = np.count_nonzero(positives)
    if n_positives == 0:
        return np.nan

    true_positives = np.count_nonzero(positives & (y_pred == 1))
    return true_positives / n_positives

def specificity_score(y_true, y_pred):
    """Return the specificity (true-negative rate) for 0/1 labels.

    Computed as TN / (TN + FP), where label ``0`` is the negative class.
    The counts are taken directly from the label arrays, so the result is
    well defined even when only one class occurs in ``y_true``/``y_pred``
    (a label-inferred confusion matrix would collapse to 1x1 there).

    Args:
        y_true: array-like of ground-truth labels (0 or 1).
        y_pred: array-like of predicted labels (0 or 1), same length.

    Returns:
        The fraction of negative samples predicted as negative, or NaN
        when ``y_true`` contains no negative sample (ratio undefined).
    """
    # np.asarray drops any pandas index so the comparison is positional.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    negatives = y_true == 0
    n_negatives = np.count_nonzero(negatives)
    if n_negatives == 0:
        return np.nan

    true_negatives = np.count_nonzero(negatives & (y_pred == 0))
    return true_negatives / n_negatives


# Cross-validation configuration
# Shuffled, stratified 5-fold split; the fixed seed makes every call to
# cv.split(X, y) yield the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize models
# Every estimator with a stochastic component gets the same fixed seed so
# that repeated runs (and repeated fits on the same fold) give identical
# results; previously only the last four models were seeded.
# NOTE(review): LogisticRegression (saga), SVC (rbf) and MLP are sensitive
# to feature scale and no scaling step is visible here -- confirm that
# all_features.csv is already standardized.
models = {
    'Logistic Regression': LogisticRegression(
        penalty='l1', max_iter=10000, C=1, solver='saga', random_state=42,
    ),
    # probability=True is required for predict_proba; its internal
    # calibration is randomized, hence the seed.
    'SVM': SVC(kernel='rbf', C=1, probability=True, random_state=42),
    'Random Forest': RandomForestClassifier(
        n_estimators=2500, max_depth=None, min_samples_split=2,
        min_samples_leaf=1, n_jobs=10, random_state=42,
    ),
    'MLP': MLPClassifier(
        hidden_layer_sizes=(10, 50), activation='relu', solver='adam',
        alpha=0.0001, batch_size='auto', learning_rate='constant',
        learning_rate_init=0.001, random_state=42,
    ),
    'GBDT': GradientBoostingClassifier(
        learning_rate=0.1, max_depth=10, n_estimators=200, random_state=42,
    ),
    'AdaBoost': AdaBoostClassifier(learning_rate=0.2, n_estimators=200, random_state=42),
    # NOTE(review): use_label_encoder is deprecated in recent XGBoost
    # releases -- confirm against the installed version before removing.
    'XGBoost': XGBClassifier(
        learning_rate=0.1, max_depth=10, n_estimators=200,
        use_label_encoder=False, eval_metric='logloss', random_state=42,
    ),
    'LightGBM': LGBMClassifier(learning_rate=0.1, max_depth=10, n_estimators=200, random_state=42),
    'CatBoost': CatBoostClassifier(depth=10, iterations=200, learning_rate=0.1, verbose=0, random_state=42),
}

# Store cross-validation results
results = []      # one row per (model, fold) holding every metric
roc_curves = {}   # model name -> (fpr, tpr, AUC of the pooled predictions)
# Pooled out-of-fold labels and positive-class probabilities per model.
# They are kept so the bootstrap below can reuse them: refitting every
# model a second time would double the training cost and, for any model
# that is not fully deterministic, would base the confidence interval on
# different predictions than the plotted ROC curve.
oof_predictions = {}

# Perform cross-validation for each model
for name, model in models.items():
    fold_metrics = []
    y_true_all = []
    y_pred_proba_all = []

    for fold, (train_index, test_index) in enumerate(cv.split(X, y), start=1):
        X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
        y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train_fold, y_train_fold)
        y_pred_fold = model.predict(X_test_fold)
        # Column 1 of predict_proba is the probability of the second class.
        y_pred_proba_fold = model.predict_proba(X_test_fold)[:, 1]

        y_true_all.extend(y_test_fold)
        y_pred_proba_all.extend(y_pred_proba_fold)

        metrics = {
            'AUC': roc_auc_score(y_test_fold, y_pred_proba_fold),
            'Accuracy': accuracy_score(y_test_fold, y_pred_fold),
            'Sensitivity': sensitivity_score(y_test_fold, y_pred_fold),
            'Specificity': specificity_score(y_test_fold, y_pred_fold),
            'PPV': precision_score(y_test_fold, y_pred_fold),
            # NPV is the precision of the negative class (label 0).
            'NPV': precision_score(y_test_fold, y_pred_fold, pos_label=0),
            'F1 Score': f1_score(y_test_fold, y_pred_fold),
            'MCC': matthews_corrcoef(y_test_fold, y_pred_fold),
        }
        fold_metrics.append(metrics)
        results.append({'Model': name, 'Fold': fold, **metrics})

    # ROC curve and AUC over the predictions pooled across all folds.
    y_true_all = np.array(y_true_all)
    y_pred_proba_all = np.array(y_pred_proba_all)
    oof_predictions[name] = (y_true_all, y_pred_proba_all)

    fpr, tpr, _ = roc_curve(y_true_all, y_pred_proba_all)
    roc_curves[name] = (fpr, tpr, roc_auc_score(y_true_all, y_pred_proba_all))

    # Per-model summary: mean and (population) std of each metric over folds.
    print(f"{name}:")
    for metric in fold_metrics[0]:
        values = [fold_result[metric] for fold_result in fold_metrics]
        print(f"  Mean {metric}: {np.mean(values):.4f} (+/- {np.std(values):.4f})")
    print()

# Compute confidence intervals
# Percentile bootstrap of the pooled out-of-fold AUC.
bootstrap_iterations = 1000
confidence_level = 95
bootstrap_results = {name: [] for name in models}
conf_intervals = {}

# A single seeded generator makes the intervals reproducible while still
# drawing a different resample on every iteration.
bootstrap_rng = np.random.RandomState(42)
tail = (100 - confidence_level) / 2

for name, (y_true_all, y_pred_proba_all) in oof_predictions.items():
    for _ in range(bootstrap_iterations):
        y_true_bootstrap, y_pred_proba_bootstrap = resample(
            y_true_all, y_pred_proba_all, random_state=bootstrap_rng
        )
        # AUC is undefined (roc_auc_score raises) when a resample happens
        # to contain a single class; skip such draws instead of crashing.
        if np.unique(y_true_bootstrap).size < 2:
            continue
        bootstrap_results[name].append(roc_auc_score(y_true_bootstrap, y_pred_proba_bootstrap))

    if bootstrap_results[name]:
        lower_bound, upper_bound = np.percentile(bootstrap_results[name], [tail, 100 - tail])
    else:
        # Every resample was degenerate; no interval can be estimated.
        lower_bound = upper_bound = np.nan

    conf_intervals[name] = (lower_bound, upper_bound)

    print(f"{name} AUC {confidence_level}% CI: {lower_bound:.4f} - {upper_bound:.4f}")

# Convert results to DataFrame and save as CSV file
output_path = 'cv_failure.csv'
df_results = pd.DataFrame(results)
df_results.to_csv(output_path, index=False)

# Report the path that was actually written (the message used to name a
# different file than the one passed to to_csv).
print(f"Cross-validation results saved to '{output_path}'")

# Plot ROC curves
# One curve per model, labelled with its pooled AUC and bootstrap interval,
# plus the diagonal of a chance-level classifier for reference.
fig, ax = plt.subplots(figsize=(10, 8))
for model_name, (false_pos_rate, true_pos_rate, pooled_auc) in roc_curves.items():
    ci_low, ci_high = conf_intervals[model_name]
    curve_label = f'{model_name} (AUC = {pooled_auc:.2f}, 95% CI: {ci_low:.2f} - {ci_high:.2f})'
    ax.plot(false_pos_rate, true_pos_rate, label=curve_label)

ax.plot([0, 1], [0, 1], 'k--', lw=2)
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves')
ax.legend(loc='lower right')
ax.grid(True)
plt.show()
