In [None]:
import pandas as pd
import numpy as np
import os
import time
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import (train_test_split, GridSearchCV,
                                   cross_validate, KFold, learning_curve)

from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_curve, auc, precision_recall_curve, confusion_matrix,
                             make_scorer)

import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import xgboost as xgb

In [None]:

# Base (untuned) estimators, keyed by the short name that is reused for the
# hyper-parameter grids, result rows and output file names.
# Every estimator that draws random numbers is seeded with the same value so a
# fresh "Restart & Run All" reproduces the same fitted models:
#   - LogisticRegression shuffles the data when the 'liblinear' solver is used
#     (the only solver in its grid),
#   - SVC uses randomness for the probability estimates enabled here.
classifiers = {
    "lr": LogisticRegression(max_iter=2000, random_state=42),
    "svm": SVC(probability=True, random_state=42),  # probability=True enables predict_proba
    "rf": RandomForestClassifier(random_state=42),
    "ann": MLPClassifier(max_iter=2000, random_state=42),
    "XGBoost": xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', random_state=42)
}

# Hyper-parameter search spaces, one per classifier key. Each inner mapping is
# handed unchanged to GridSearchCV as its parameter grid, so every value must
# be a list of candidates (single-element lists pin a setting).
param_grids = dict(
    # Logistic regression
    lr=dict(
        C=[0.001, 0.01, 0.1],
        solver=['liblinear'],
        penalty=['l1', 'l2'],
    ),
    # Support vector machine
    svm=dict(
        C=[0.01, 0.1],
        kernel=['linear'],
        gamma=['scale'],
    ),
    # Random forest
    rf=dict(
        n_estimators=[50, 100],
        max_depth=[3, 5],
        min_samples_split=[10, 20],
        min_samples_leaf=[5, 10],
        max_features=['sqrt', 0.3],
    ),
    # Multi-layer perceptron
    ann=dict(
        hidden_layer_sizes=[(10,), (20,)],
        activation=['relu'],
        alpha=[0.1, 1.0],
        learning_rate_init=[0.001],
        early_stopping=[True],
    ),
    # Gradient-boosted trees
    XGBoost=dict(
        max_depth=[3, 5],
        learning_rate=[0.01, 0.05],
        n_estimators=[100, 200],
        subsample=[0.7],
        colsample_bytree=[0.7],
        reg_alpha=[1.0, 10.0],
        reg_lambda=[1.0, 10.0],
        min_child_weight=[5, 10],
    ),
)


# Feature-selection variant -> CSV file with the corresponding feature table.
datasets = dict(
    no_fs='data/4-upsample/upsampled_nofs_features.csv',
    pca_fs='data/4-upsample/upsampled_pca_features.csv',
    rfe_fs='data/4-upsample/upsampled_rfe_features.csv',
    kbest_fs='data/4-upsample/upsampled_kbest_features.csv',
)

# Create one results folder per dataset, with a sub-folder for each plot type,
# so that later plt.savefig calls never hit a missing directory.
for dataset_name in datasets:
    dataset_folder = f"Results/Model_Results_{dataset_name}"
    for subfolder in ("confusion_matrices", "roc_auc_curves",
                      "precision_recall_curves", "learning_curves"):
        plot_dir = os.path.join(dataset_folder, subfolder)
        os.makedirs(plot_dir, exist_ok=True)

# Per-dataset accumulators for result rows: 'train_test' holds hold-out
# metrics, 'cv' holds cross-validation metrics. Each dataset gets its own
# independent pair of lists.
all_results = {}
for dataset_name in datasets:
    all_results[dataset_name] = {'train_test': [], 'cv': []}

# Number of cross-validation folds used wherever `cv=fold` is passed.
fold = 5


def specificity_score(y_true, y_pred, pos_label=1):
    """Return the specificity (true-negative rate) of binary predictions.

    Specificity is TN / (TN + FP): the fraction of truly negative samples that
    were also predicted negative. Every label different from ``pos_label`` is
    treated as negative.

    The counts are taken directly from the label arrays, so the function also
    works when only one class is present in the inputs (a case in which
    unpacking a raveled confusion matrix into four values fails).

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.
    pos_label : object, default=1
        Label of the positive class.

    Returns
    -------
    float
        Specificity in [0, 1]. When ``y_true`` contains no negative sample the
        metric is undefined and 0.0 is returned instead of dividing by zero.
    """
    # np.asarray drops any pandas index so the comparison is purely positional.
    true_labels = np.asarray(y_true).ravel()
    pred_labels = np.asarray(y_pred).ravel()

    is_negative = true_labels != pos_label
    n_negative = int(np.count_nonzero(is_negative))  # TN + FP
    if n_negative == 0:
        return 0.0

    tn = int(np.count_nonzero(is_negative & (pred_labels != pos_label)))
    return tn / n_negative

# Metrics evaluated by cross_validate, keyed by the label that appears in its
# output as 'test_<label>'. The first four labels are identical to the
# scikit-learn scorer names they map to.
scorers = {metric: metric for metric in ('accuracy', 'precision', 'recall', 'f1')}
# Specificity has no built-in scorer name here, so the local function is wrapped.
scorers['specificity'] = make_scorer(specificity_score)
scorers['roc_auc'] = 'roc_auc'
# 'pr_auc' is reported using scikit-learn's average-precision scorer.
scorers['pr_auc'] = 'average_precision'

In [None]:
# Module-level containers. Nothing in the visible part of this notebook reads
# or fills them; they are kept in case other cells depend on the names.
results_train_test, results_cv = [], []
best_classifiers = dict()

# For every feature-selection variant: load the data, split and scale it, then
# tune, evaluate, plot and persist each classifier.
for dataset_name, dataset_path in datasets.items():
    print(f"\n{'='*70}")
    print(f"Processing dataset: {dataset_name}")
    print(f"{'='*70}")

    df = pd.read_csv(dataset_path)

    # 'label' is the target. A 'SMILES' column, if the file carries one, is not
    # a numeric feature (presumably the molecule string -- confirm) and must not
    # reach the scaler, so it is dropped together with the target.
    non_feature_columns = ['label', 'SMILES'] if 'SMILES' in df.columns else ['label']
    X = df.drop(non_feature_columns, axis=1)
    y = df['label']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # The scaler is fitted on the training split only and then applied to both splits.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Where the fitted models / scalers / best parameters of this dataset are stored.
    model_save_path = os.path.join("Predict", dataset_name)
    os.makedirs(model_save_path, exist_ok=True)

    # Where the plots of this dataset are stored.
    dataset_results_folder = f"Results/Model_Results_{dataset_name}"

    # (name, fpr, tpr, auc) per successfully evaluated model, for the combined ROC plot.
    roc_data = []

    for name, clf in classifiers.items():
        try:
            print(f"\n{'-'*50}")
            print(f"Training {name.upper()} model on {dataset_name} dataset...")

            # Hyper-parameter search uses the training split only. The held-out
            # test split is deliberately NOT passed to fit (e.g. as an XGBoost
            # eval_set): model selection must never see it, and all estimators
            # share the same code path.
            grid_search = GridSearchCV(clf, param_grids[name], cv=fold, scoring='accuracy', n_jobs=-1)
            grid_search.fit(X_train_scaled, y_train)
            best_clf = grid_search.best_estimator_
            print(f"Best parameters: {grid_search.best_params_}")
            print(f"Best CV accuracy: {grid_search.best_score_:.4f}")

            # Refit the selected configuration once more purely to time a single training run.
            training_start = time.perf_counter()
            best_clf.fit(X_train_scaled, y_train)
            training_end = time.perf_counter()
            train_time = training_end - training_start

            # ---- Hold-out evaluation ----
            y_pred = best_clf.predict(X_test_scaled)
            y_pred_proba = best_clf.predict_proba(X_test_scaled)[:, 1]

            acc = accuracy_score(y_test, y_pred)
            prec = precision_score(y_test, y_pred)
            rec = recall_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)

            test_cm = confusion_matrix(y_test, y_pred)
            tn, fp, fn, tp = test_cm.ravel()
            spec = tn / (tn + fp)

            fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
            roc_auc = auc(fpr, tpr)
            precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
            # Trapezoidal area under the PR curve; the cross-validation table
            # below reports the 'average_precision' scorer under the same heading.
            pr_auc = auc(recall, precision)

            roc_data.append((name, fpr, tpr, roc_auc))

            all_results[dataset_name]['train_test'].append([
                f"{name}_{dataset_name}",
                f"{train_time:.4f}",
                f"{acc * 100:.2f}",
                f"{prec * 100:.2f}",
                f"{rec * 100:.2f}",
                f"{f1 * 100:.2f}",
                f"{spec * 100:.2f}",
                f"{roc_auc * 100:.2f}",
                f"{pr_auc * 100:.2f}"
            ])

            # ---- Cross-validated evaluation on the training split ----
            cv_scores = cross_validate(best_clf, X_train_scaled, y_train,
                                    cv=fold, scoring=scorers, return_train_score=False)

            all_results[dataset_name]['cv'].append([
                f"{name}_{dataset_name}",
                f"{train_time:.4f}",
                f"{cv_scores['test_accuracy'].mean()*100:.2f} ± {cv_scores['test_accuracy'].std()*100:.2f}",
                f"{cv_scores['test_precision'].mean()*100:.2f} ± {cv_scores['test_precision'].std()*100:.2f}",
                f"{cv_scores['test_recall'].mean()*100:.2f} ± {cv_scores['test_recall'].std()*100:.2f}",
                f"{cv_scores['test_f1'].mean()*100:.2f} ± {cv_scores['test_f1'].std()*100:.2f}",
                f"{cv_scores['test_specificity'].mean()*100:.2f} ± {cv_scores['test_specificity'].std()*100:.2f}",
                f"{cv_scores['test_roc_auc'].mean()*100:.2f} ± {cv_scores['test_roc_auc'].std()*100:.2f}",
                f"{cv_scores['test_pr_auc'].mean()*100:.2f} ± {cv_scores['test_pr_auc'].std()*100:.2f}"
            ])

            # ---- Plots (each figure is closed right after saving to free memory) ----
            plt.figure(figsize=(8, 6), dpi=600)
            sns.heatmap(test_cm, annot=True, fmt='d', cmap='Blues')
            plt.title(f'Confusion Matrix - {name} ({dataset_name})')
            plt.savefig(os.path.join(dataset_results_folder, "confusion_matrices", f'confusion_matrix_{name}.png'))
            plt.close()

            plt.figure(figsize=(8, 6), dpi=600)
            plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.3f})')
            plt.plot([0, 1], [0, 1], 'k--')
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title(f'ROC Curve - {name} ({dataset_name})')
            plt.legend()
            plt.savefig(os.path.join(dataset_results_folder, "roc_auc_curves", f'roc_curve_{name}.png'))
            plt.close()

            plt.figure(figsize=(8, 6), dpi=600)
            plt.plot(recall, precision, label=f'PR curve (AUC = {pr_auc:.3f})')
            plt.xlabel('Recall')
            plt.ylabel('Precision')
            plt.title(f'Precision-Recall Curve - {name} ({dataset_name})')
            plt.legend()
            plt.savefig(os.path.join(dataset_results_folder, "precision_recall_curves", f'pr_curve_{name}.png'))
            plt.close()

            # Learning curve: accuracy vs. training-set size, 10 sizes from 10% to 100%.
            train_sizes = np.linspace(0.1, 1.0, 10)
            train_sizes, train_scores, test_scores = learning_curve(
                best_clf, X_train_scaled, y_train,
                train_sizes=train_sizes,
                cv=fold,
                n_jobs=-1,
                scoring='accuracy'
            )

            # Mean and spread across the CV folds for every training size.
            train_mean = np.mean(train_scores, axis=1)
            train_std = np.std(train_scores, axis=1)
            test_mean = np.mean(test_scores, axis=1)
            test_std = np.std(test_scores, axis=1)

            plt.figure(figsize=(8, 6), dpi=600)
            plt.plot(train_sizes, train_mean, 'o-', color='r', label='Training score')
            plt.plot(train_sizes, test_mean, 'o-', color='g', label='Cross-validation score')

            plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color='r')
            plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1, color='g')

            plt.xlabel('Training Examples')
            plt.ylabel('Score')
            plt.title(f'Learning Curve - {name} ({dataset_name})')
            plt.grid(True)
            plt.legend(loc='best')

            plt.savefig(os.path.join(dataset_results_folder, "learning_curves", f'learning_curve_{name}.png'))
            plt.close()

            # ---- Persist model, scaler and chosen hyper-parameters ----
            model_file = os.path.join(model_save_path, f'{name}_model.joblib')
            joblib.dump(best_clf, model_file)

            # The same fitted scaler is stored next to every model of this dataset.
            scaler_file = os.path.join(model_save_path, f'{name}_scaler.joblib')
            joblib.dump(scaler, scaler_file)

            params_file = os.path.join(model_save_path, f'{name}_best_params.txt')
            with open(params_file, 'w') as f:
                f.write(str(grid_search.best_params_))

            print(f"Successfully completed {name} on {dataset_name}")

        except Exception as e:
            # Best effort: report the failure and move on to the next model.
            print(f"\nError occurred while processing {name} on {dataset_name}:")
            print(f"Error type: {type(e).__name__}")
            print(f"Error message: {str(e)}")
            # A failure between plt.figure() and plt.close() would otherwise
            # leave a 600-dpi figure open for the rest of the run.
            plt.close('all')
            continue

    # Combined ROC plot with one curve per successfully evaluated model.
    plt.figure(figsize=(8, 6), dpi=600)
    for name, fpr, tpr, roc_auc in roc_data:
        plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.3f})')

    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - All Models ({dataset_name})')
    plt.legend(loc='lower right')
    plt.grid(True)

    plt.savefig(os.path.join(dataset_results_folder, "roc_auc_curves", 'roc_curve_all_models.png'))
    plt.close()

# Column headers shared by the hold-out table and the cross-validation table;
# the order matches the row lists stored in all_results.
columns = [
    'Model',
    'Time (s)',
    'Accuracy (%)',
    'Precision (%)',
    'Recall (%)',
    'F1-Score (%)',
    'specificity (%)',
    'ROC-AUC (%)',
    'PR AUC (%)',
]

# Flatten the per-dataset row lists into two tables, keeping dataset order.
all_train_test_results = []
all_cv_results = []
for dataset_name in datasets:
    all_train_test_results += all_results[dataset_name]['train_test']
    all_cv_results += all_results[dataset_name]['cv']

df_results = pd.DataFrame(all_train_test_results, columns=columns)
df_cv = pd.DataFrame(all_cv_results, columns=columns)

# One CSV per table, written without the DataFrame index.
df_results.to_csv('Results/Train_Test_Results_All.csv', index=False)
df_cv.to_csv('Results/Cross_Validation_Results_All.csv', index=False)

print("\nAll results saved successfully!")