In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from tqdm import tqdm
import matplotlib.pyplot as plt
import warnings
from imblearn.over_sampling import SMOTE
# Install a blanket "ignore" filter so no Python warning is shown for the rest of the session.
# NOTE(review): this may also hide estimator warnings (e.g. non-convergence) raised while the
# models below are fitted — confirm that is intended.
warnings.filterwarnings('ignore')

In [None]:
# Read the tab-separated table and shuffle its rows with a fixed seed.
data = pd.read_csv("paras.txt", sep="\t").sample(frac=1, random_state=42)

# Integer encodings of the "Disease" column for the two binary tasks;
# "ASD_Cancer" is label 1 in both.
alphabet_1 = {"ASD": 0, "ASD_Cancer": 1}
alphabet_2 = {"Cancer": 0, "ASD_Cancer": 1}

# One subset per task: keep only the rows whose "Disease" value is one of that task's classes.
data_1 = data[data["Disease"].isin(list(alphabet_1))]
data_2 = data[data["Disease"].isin(list(alphabet_2))]

In [None]:
# Task 1 (ASD vs. ASD_Cancer): feature matrix, integer labels and a stratified 80/20 split.
# NOTE(review): `iloc[:, 3:]` assumes the first three columns are not features — confirm
# against the layout of paras.txt.
X_1 = data_1.iloc[:, 3:]
# `map` produces the integer labels directly. `replace` with a dict on a string column only
# yields integers through implicit object->int downcasting, which pandas has deprecated;
# without it the labels would stay `object` dtype. The cast also fails loudly if a class
# is missing from the mapping instead of letting an unmapped value through.
y_1 = data_1["Disease"].map(alphabet_1).astype("int64")
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    X_1, y_1, test_size=0.2, random_state=42, stratify=y_1
)

# Task 2 (Cancer vs. ASD_Cancer): same construction.
X_2 = data_2.iloc[:, 3:]
y_2 = data_2["Disease"].map(alphabet_2).astype("int64")
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    X_2, y_2, test_size=0.2, random_state=42, stratify=y_2
)


In [None]:
# Shared 5-fold stratified splitter; shuffling is seeded so every model sees the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Candidate classifiers keyed by the display name used in the result tables and plots.
# Every estimator that accepts a seed is pinned to 42 so reruns are comparable.
# NOTE(review): features are passed to the models as-is; StandardScaler is imported but not
# applied in the visible cells — confirm whether scale-sensitive models (SVM, KNN, MLP, LR)
# are meant to run on unscaled inputs.
models = {
    'LDA': LinearDiscriminantAnalysis(),
    'QDA': QuadraticDiscriminantAnalysis(),
    'Ridge': RidgeClassifier(random_state=42),
    'LR': LogisticRegression(max_iter=1000, random_state=42),
    'SVM': SVC(kernel='linear', probability=True, random_state=42, max_iter=100000),
    'Random Forest': RandomForestClassifier(random_state=42, n_jobs=-1),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Gradient Boost': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1),
    # Seeded like the other boosters; it was the only seedable model left unpinned.
    'LightGBM': LGBMClassifier(verbose=-1, random_state=42, n_jobs=-1),
    'CatBoost': CatBoostClassifier(verbose=False, random_state=42),
    'KNN': KNeighborsClassifier(n_jobs=-1),
    'Naive Bayes': GaussianNB(),
    'Neural Network': MLPClassifier(max_iter=1000, random_state=42)
}

def evaluate_models(models, X, y, cv):
    """Cross-validate every model and return a leaderboard sorted by accuracy.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator; each one is passed to ``cross_validate``.
    X, y
        Features and labels, forwarded to ``cross_validate`` unchanged.
    cv
        Forwarded as the ``cv`` argument of ``cross_validate``.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``Model``, ``CV Accuracy``, ``CV Precision``,
        ``CV Recall`` and ``CV F1-Score``. Each score is the mean over the folds; precision,
        recall and F1 use the macro-averaged scorers. Rows are sorted by ``CV Accuracy`` in
        descending order. An empty ``models`` mapping yields an empty frame with the same
        columns instead of raising.
    """
    # Output column -> scorer name. Keeping both in one place guarantees the scoring list
    # and the result columns cannot drift apart.
    metric_columns = {
        'CV Accuracy': 'accuracy',
        'CV Precision': 'precision_macro',
        'CV Recall': 'recall_macro',
        'CV F1-Score': 'f1_macro',
    }
    scoring = list(metric_columns.values())

    results = []
    for name, model in tqdm(models.items()):
        scores = cross_validate(model, X, y, cv=cv, scoring=scoring, n_jobs=-1)
        row = {'Model': name}
        for column, scorer in metric_columns.items():
            row[column] = scores[f'test_{scorer}'].mean()
        results.append(row)

    # Explicit columns keep the frame well-formed (and sortable) even when no model was given.
    df_results = pd.DataFrame(results, columns=['Model', *metric_columns])
    return df_results.sort_values(by='CV Accuracy', ascending=False)


In [None]:
def plot_model_performance(df_results, metric='CV Accuracy', save = None, title = None):
    """Draw a horizontal bar chart of one metric per model.

    Parameters
    ----------
    df_results : pandas.DataFrame
        Must contain a ``Model`` column and the column named by ``metric``.
    metric : str
        Column to plot; also used as the x-axis label and as the sort key (descending).
    save : optional
        When not ``None``, the figure is written to this target before it is shown.
    title : optional
        Plot title; a falsy value falls back to ``'Model Comparison'``.
    """
    ranked = df_results.sort_values(by=metric, ascending=False)
    values = ranked[metric]

    # Colour each bar by where its value sits between the smallest and largest score.
    palette = plt.get_cmap('viridis')
    scale = plt.Normalize(values.min(), values.max())
    bar_colors = [palette(scale(score)) for score in values]

    fig, ax = plt.subplots(figsize=(6, 4))
    bars = ax.barh(ranked['Model'], values, color=bar_colors)
    ax.set_xlabel(metric)
    ax.set_title(title if title else 'Model Comparison')

    # Print the numeric value at the end of every bar, vertically centred on it.
    for rect, score in zip(bars, values):
        y_mid = rect.get_y() + rect.get_height()/2
        ax.text(score, y_mid, f'{score:.3f}',
                va='center', ha='left', color='black', fontsize=5)

    ax.grid(axis='x', linestyle='--', alpha=0.7)
    fig.tight_layout()

    # Save before showing so the file is written while the figure is still open.
    if save is not None:
        fig.savefig(save)
    plt.show()


In [None]:
# Task 1 (ASD vs. ASD_Cancer): benchmark every candidate model, print the table and save the chart.
# NOTE(review): cross-validation runs on the full X_1 / y_1 rather than X_train_1 / y_train_1 —
# confirm the hold-out split is not supposed to stay unseen during model comparison.
df_results_1 = evaluate_models(models, X_1, y_1, cv=cv)
print(df_results_1)
plot_model_performance(
    df_results_1,
    metric='CV Accuracy',
    save="model_evaluation_1.pdf",
    title="Model Comparison of ASD & Cancer/ASD",
)

In [None]:
# Task 2 (Cancer vs. ASD_Cancer): benchmark every candidate model, print the table and save the chart.
# NOTE(review): cross-validation runs on the full X_2 / y_2 rather than X_train_2 / y_train_2 —
# confirm the hold-out split is not supposed to stay unseen during model comparison.
df_results_2 = evaluate_models(models, X_2, y_2, cv=cv)
print(df_results_2)
plot_model_performance(
    df_results_2,
    metric='CV Accuracy',
    save="model_evaluation_2.pdf",
    title="Model Comparison of Cancer & Cancer/ASD",
)