In [None]:
%%capture
!pip install --user pycaret -full
!pip install numba==0.53

In [None]:
import pandas as pd
import numpy as np

import lightgbm as lgbm
import xgboost as xgb

from sklearn.model_selection import StratifiedGroupKFold, GroupKFold, KFold
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import average_precision_score, accuracy_score

import os
import pickle
import pathlib
from collections import defaultdict

from sklearn.ensemble import RandomForestClassifier
import sklearn

# Project root directory (the path looks like a Colab Google Drive mount — TODO confirm).
base_dir = pathlib.Path('/content/drive/MyDrive/Ikoma Paper')

# Read the dataset table that sits directly under the project root.
dataset_path = base_dir.joinpath('dataset.csv')
dataset_df = pd.read_csv(dataset_path)

# Create the '5fold' directory under the project root; no error if it already exists.
save_dir = base_dir.joinpath('5fold')
save_dir.mkdir(parents=True, exist_ok=True)


In [None]:
# Feature columns: every column except the first two and the last nine.
# NOTE(review): presumably the excluded columns hold ids / labels / fold info — confirm against the CSV.
feature_gene = dataset_df.columns[slice(2, -9)]
feature_gene

In [None]:
# Number of rows assigned to each value of the 'fold' column.
display(dataset_df.loc[:, 'fold'].value_counts())
# Number of selected feature columns.
print(feature_gene.size)

# Model Training and Evaluation

We trained and compared the following machine learning models:

*   RandomForest
*   LightGBM
*   XGBoost
*   Logistic Regression
*   SVM
*   KNN

Each model was trained using a 5-fold cross-validation approach. The performance of each model was evaluated using the Receiver Operating Characteristic (ROC) curve and the Area Under the Curve (AUC) metric.


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

# Cast the column labels to str so that label-based column selection is uniform.
dataset_df.columns = dataset_df.columns.astype(str)

# Classifiers to compare, keyed by the name shown in the legend.
# LightGBM and XGBoost are configured to request a GPU.
models = {
    'RandomForest': RandomForestClassifier(),
    'LightGBM': LGBMClassifier(device='gpu'),
    'XGBoost': XGBClassifier(tree_method='gpu_hist'),
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(probability=True),
    'KNN': KNeighborsClassifier()
}

# One record per model: raw per-fold curves, per-fold AUCs and a 100-point
# FPR grid onto which every fold's curve is interpolated.
results = {}
for model_name in models:
    results[model_name] = {"fpr": [], "tpr": [], "aucs": [], "mean_fpr": np.linspace(0, 1, 100)}

# Open the figure that will collect one mean ROC curve per model.
plt.figure(figsize=(10, 8))

for model_name, model in models.items():
    mean_fpr = results[model_name]["mean_fpr"]
    tprs = []   # interpolated TPR curve of every fold
    aucs = []   # AUC of every fold

    for fold in range(5):
        print(f'{model_name}, {fold}')

        # Rows whose 'fold' equals the current index are held out; the rest train.
        train_df = dataset_df.loc[dataset_df['fold'] != fold]
        valid_df = dataset_df.loc[dataset_df['fold'] == fold]

        train_X = train_df[feature_gene]
        train_y = train_df['label']
        valid_X = valid_df[feature_gene]
        valid_y = valid_df['label']

        # Fit on the training part and take the second predict_proba column as the score.
        trained_model = model.fit(train_X, train_y)
        pred = trained_model.predict_proba(valid_X)[:, 1]

        # ROC of this fold, resampled on the common grid and forced to start at 0.
        fpr, tpr, _ = roc_curve(valid_y, pred)
        roc_auc = auc(fpr, tpr)
        interp_tpr = np.interp(mean_fpr, fpr, tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        aucs.append(roc_auc)

        # Keep the raw per-fold values for later use.
        for key, value in (("fpr", fpr), ("tpr", tpr), ("aucs", roc_auc)):
            results[model_name][key].append(value)

    # Average the interpolated curves over the folds; the curve is forced to end at 1.
    mean_tpr = np.asarray(tprs).mean(axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.asarray(aucs).std()
    results[model_name].update(mean_tpr=mean_tpr, mean_auc=mean_auc, std_auc=std_auc)

    # Draw this model's mean curve on the current axes.
    plt.gca().plot(mean_fpr, mean_tpr, label=f'{model_name} (AUC = {mean_auc:.2f} ± {std_auc:.2f})')

# Add the chance diagonal, then set limits, labels, title and legend.
ax = plt.gca()
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8)
ax.set_xlim([-0.05, 1.05])
ax.set_ylim([-0.05, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()

# Persist the results dict; it is stored as a pickled object array, so reading
# it back needs np.load(..., allow_pickle=True)["results"].item().
np.savez("roc_data.npz", results=results)


RandomForest, 0
RandomForest, 1
RandomForest, 2
RandomForest, 3
RandomForest, 4
LightGBM, 0
LightGBM, 1
LightGBM, 2
LightGBM, 3
LightGBM, 4
XGBoost, 0
XGBoost, 1
XGBoost, 2
XGBoost, 3
XGBoost, 4
Logistic Regression, 0
Logistic Regression, 1
Logistic Regression, 2
Logistic Regression, 3
Logistic Regression, 4
SVM, 0


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import seaborn as sns

from collections import defaultdict

# Convert column names to strings
dataset_df.columns = dataset_df.columns.astype(str)

# Probability cut-off used to turn predicted scores into 0/1 labels for the
# label-based metrics (a score strictly above the cut-off becomes 1).
# NOTE(review): the origin of 0.1005 is not shown in this notebook, and the same
# cut-off is applied to every model — confirm that this is intended.
DECISION_THRESHOLD = 0.1005

# Evaluation metrics. Defined once because they do not depend on the fold.
# ROC_AUC is computed on the predicted probabilities; every other metric is
# computed on the thresholded labels.
metrics = {
    'Accuracy': accuracy_score,
    'Balanced Accuracy': balanced_accuracy_score,
    'ROC_AUC': roc_auc_score,
    'F1_Score': f1_score,
    'Precision': precision_score,
    'Recall': recall_score
}

# Long-format result store: one entry per (fold, model, metric)
result_dict = defaultdict(list)

# Perform 5-fold cross-validation using the precomputed 'fold' column
for fold in range(5):
    train_df = dataset_df[dataset_df['fold'] != fold]
    valid_df = dataset_df[dataset_df['fold'] == fold]

    train_X, train_y = train_df[feature_gene], train_df['label']
    valid_X, valid_y = valid_df[feature_gene], valid_df['label']

    # Fresh model instances for every fold (LightGBM and XGBoost are set to use GPU)
    models = {
        'RandomForest': RandomForestClassifier(),
        'LightGBM': LGBMClassifier(device='gpu'),
        'XGBoost': XGBClassifier(tree_method='gpu_hist'),
        'Logistic Regression': LogisticRegression(),
        'SVM': SVC(probability=True),
        'KNN': KNeighborsClassifier()
    }

    # Train and evaluate each model
    for model_name, model in models.items():
        print(fold, model_name)
        trained_model = model.fit(train_X, train_y)
        pred = trained_model.predict_proba(valid_X)[:, 1]
        # Threshold once per model instead of once per metric
        pred_label = (pred > DECISION_THRESHOLD).astype(int)

        # Compute evaluation metrics
        for metric_name, metric_func in metrics.items():
            if metric_name == 'ROC_AUC':
                metric_value = metric_func(valid_y, pred)
            else:
                metric_value = metric_func(valid_y, pred_label)

            result_dict['model_name'].append(model_name)
            result_dict['metric_name'].append(metric_name)
            result_dict['score'].append(metric_value)
            result_dict['fold'].append(fold)

# Convert results to a DataFrame
result_df = pd.DataFrame(result_dict)

# Visualization of results: one bar group per metric, one bar per model
plt.figure(figsize=(12, 10))
sns.barplot(data=result_df, x='metric_name', y='score', hue='model_name')
plt.title('Model Performance Comparison')
plt.xlabel('Metric Name')
plt.ylabel('Score')
plt.show()
