# In [None]:
import os
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
# from shutil import copyfile (only if you want to save predictions in google drive)

# Root folder for the input CSV and every output written by this script
base_dir = "/content/drive/MyDrive/ML protein biomarkers IGIB/"
data_path = os.path.join(base_dir, "lasso_110.csv")

# Read the table; one column is the label, every other column is a predictor
df = pd.read_csv(data_path)
target_col = "severe_single_episode"
y = df[target_col].values
features_df = df.drop(columns=[target_col])
X = features_df.values
feature_names = features_df.columns

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Score a model on both splits, report the scores, and persist everything as CSV
def save_predictions(model_name, y_train, y_test, train_preds, test_preds, train_probs, test_probs):
    """Print accuracy / ROC-AUC for one model and write its results to disk.

    All files go to ``<base_dir>/<model_name>/`` (the folder is created when
    missing; ``base_dir`` is the module-level output root):

    * ``train_results.csv`` / ``test_results.csv`` - two rows, Accuracy and ROC-AUC
    * ``train_predictions.csv`` / ``test_predictions.csv`` - per-sample
      True / Predicted / Probability columns

    Args:
        model_name: Sub-folder name; also printed upper-cased in the report header.
        y_train, y_test: True labels for each split.
        train_preds, test_preds: Predicted labels, scored with ``accuracy_score``.
        train_probs, test_probs: Predicted scores, scored with ``roc_auc_score``.

    Returns:
        None.
    """
    out_dir = os.path.join(base_dir, model_name)
    os.makedirs(out_dir, exist_ok=True)

    # All four scores are computed up front, so a scoring error leaves no partial files
    accuracy = {
        "train": accuracy_score(y_train, train_preds),
        "test": accuracy_score(y_test, test_preds),
    }
    auc = {
        "train": roc_auc_score(y_train, train_probs),
        "test": roc_auc_score(y_test, test_probs),
    }

    print(f"\n📊 {model_name.upper()} Metrics")
    print(f"Train Accuracy: {accuracy['train']:.4f}, ROC-AUC: {auc['train']:.4f}")
    print(f"Test  Accuracy: {accuracy['test']:.4f}, ROC-AUC: {auc['test']:.4f}")

    # Metric summaries, one small table per split
    metric_names = ["Accuracy", "ROC-AUC"]
    for split, filename in (("train", "train_results.csv"), ("test", "test_results.csv")):
        summary = pd.DataFrame({"Metric": metric_names, "Value": [accuracy[split], auc[split]]})
        summary.to_csv(os.path.join(out_dir, filename), index=False)

    # Per-sample predictions, one table per split
    per_sample = (
        ("train_predictions.csv", y_train, train_preds, train_probs),
        ("test_predictions.csv", y_test, test_preds, test_probs),
    )
    for filename, truth, preds, probs in per_sample:
        table = pd.DataFrame({"True": truth, "Predicted": preds, "Probability": probs})
        table.to_csv(os.path.join(out_dir, filename), index=False)

    # Optional extra copy at the Drive root (left disabled; needs shutil.copyfile)
    # copyfile(os.path.join(result_dir, "test_predictions.csv"), "/content/drive/MyDrive/test_predictions.csv")

# Test-set ROC curves, keyed by display name: name -> (fpr, tpr)
roc_data = dict()

# Logistic Regression
print("\n🔍 Logistic Regression")
logreg_dir = os.path.join(base_dir, "logistic_regression")
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

# Column 1 of predict_proba is the second class; a probability above 0.5 is labelled 1
train_probs, test_probs = (logreg.predict_proba(part)[:, 1] for part in (X_train, X_test))
train_preds, test_preds = ((probs > 0.5).astype(int) for probs in (train_probs, test_probs))

# save_predictions also creates logreg_dir, which the writes below rely on
save_predictions("logistic_regression", y_train, y_test, train_preds, test_preds, train_probs, test_probs)
fpr, tpr, _ = roc_curve(y_test, test_probs)
roc_data["Logistic Regression"] = (fpr, tpr)

# Feature importance: absolute coefficient size, largest first
coef_magnitude = np.abs(logreg.coef_[0])
logreg_importance = pd.DataFrame({"Feature": feature_names, "Importance": coef_magnitude})
logreg_importance = logreg_importance.sort_values(by="Importance", ascending=False)
logreg_importance.to_csv(os.path.join(logreg_dir, "feature_importance.csv"), index=False)

# Persist the fitted estimator
with open(os.path.join(logreg_dir, "logreg_model.pkl"), "wb") as f:
    pickle.dump(logreg, f)

# SVM
print("\n🔍 SVM (Linear Kernel)")
# probability=True adds an internal calibration step that shuffles the data
# randomly; seeding it makes predict_proba (and so the saved metrics and
# predictions) repeatable from run to run, matching the seeded train/test split.
svm = SVC(kernel="linear", probability=True, random_state=42)
svm.fit(X_train, y_train)

# Column 1 of predict_proba is the second class; a probability above 0.5 is labelled 1
train_probs = svm.predict_proba(X_train)[:, 1]
test_probs = svm.predict_proba(X_test)[:, 1]
train_preds = (train_probs > 0.5).astype(int)
test_preds = (test_probs > 0.5).astype(int)

# Writes metrics/predictions and creates the "svm" output folder used below
save_predictions("svm", y_train, y_test, train_preds, test_preds, train_probs, test_probs)
fpr, tpr, _ = roc_curve(y_test, test_probs)
roc_data["SVM"] = (fpr, tpr)

# Persist the fitted estimator
with open(os.path.join(base_dir, "svm", "svm_model.pkl"), "wb") as f:
    pickle.dump(svm, f)

# Feature importance: absolute weight per feature, largest first
# (coef_ is only exposed by SVC when the kernel is linear)
svm_importance = pd.DataFrame({
    "Feature": feature_names,
    "Importance": np.abs(svm.coef_[0])
}).sort_values(by="Importance", ascending=False)
svm_importance.to_csv(os.path.join(base_dir, "svm", "feature_importance.csv"), index=False)

# MLP
print("\n🔍 MLP (Keras)")
n_features = X_train.shape[1]

# Two dropout-regularised hidden layers feeding a single sigmoid output unit
mlp_layers = [
    Dense(128, activation="relu", input_dim=n_features),
    Dropout(0.3),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(1, activation="sigmoid"),
]
model = Sequential(mlp_layers)
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy", "AUC"],
)

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights
early_stop = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)
model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=32,
    callbacks=[early_stop],
    verbose=0,
)

# Flatten the (n, 1) sigmoid outputs to 1-D, then label as 1 above 0.5
train_probs, test_probs = (model.predict(part).ravel() for part in (X_train, X_test))
train_preds, test_preds = ((probs > 0.5).astype(int) for probs in (train_probs, test_probs))

# Writes metrics/predictions and creates the "deep_learning" output folder used below
save_predictions("deep_learning", y_train, y_test, train_preds, test_preds, train_probs, test_probs)
fpr, tpr, _ = roc_curve(y_test, test_probs)
roc_data["Deep Learning"] = (fpr, tpr)

dl_model_path = os.path.join(base_dir, "deep_learning", "dl_model.h5")
model.save(dl_model_path)

# Feature importance (sklearn MLP + permutation)
# NOTE: the importances are measured on a scikit-learn MLP with the same hidden
# layer sizes (128, 64), trained separately here. They are a stand-in for, not
# a direct readout of, the Keras network fitted above.
print("Computing permutation importance for MLP...")
from sklearn.neural_network import MLPClassifier
# Seed the network as well: permutation_importance is already seeded, but an
# unseeded MLP would still give a different importance ranking on every run.
mlp_sklearn = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=300, random_state=42)
mlp_sklearn.fit(X_train, y_train)
# Score drop on the held-out set when each feature is shuffled, averaged over 10 repeats
perm_importance = permutation_importance(mlp_sklearn, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1)
mlp_importance = pd.DataFrame({
    "Feature": feature_names,
    "Importance": perm_importance.importances_mean
}).sort_values(by="Importance", ascending=False)
mlp_importance.to_csv(os.path.join(base_dir, "deep_learning", "feature_importance.csv"), index=False)

# Overlay every collected test-set ROC curve on a single set of axes
print("\n📈 Plotting Combined ROC-AUC Curve")
fig, ax = plt.subplots(figsize=(8, 6))
for model_label, (curve_fpr, curve_tpr) in roc_data.items():
    ax.plot(curve_fpr, curve_tpr, label=model_label)

# Dashed diagonal marks the chance-level classifier
ax.plot([0, 1], [0, 1], "k--", label="Random")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC-AUC Curve Comparison")
ax.legend()
ax.grid(True)
fig.tight_layout()

# Save before show() so the written image is the fully drawn figure
fig.savefig(os.path.join(base_dir, "combined_roc_auc.png"))
# plt.savefig("/content/drive/MyDrive/combined_roc_auc.png")  # Save to Google Drive
plt.show()

print("\n✅ All models processed, predictions saved, and ROC curve plotted!")
