# TCGA-GBMLGG Analysis Demo

- **Cohort**: Focuses on the TCGA-GBMLGG dataset, a vital resource merging Glioblastoma Multiforme (GBM) and Lower-Grade Glioma (LGG).
- **Goal**: Perform histological subtype classification.
- **Prediction Target**: Predict whether a tumor is an `astrocytoma`, `oligodendroglioma`, or `oligoastrocytoma` based on its multi-omics profile.

**Data Source:** Broad Institute FireHose (http://firebrowse.org/?cohort=GBMLGG)

In [1]:
import pandas as pd
from pathlib import Path
root = Path("/home/vicente/Github/BioNeuralNet/GBMLGG")

mirna_raw = pd.read_csv(root/"GBMLGG.miRseq_RPKM_log2.txt", sep="\t",index_col=0,low_memory=False)                            
rna_raw = pd.read_csv(root / "GBMLGG.uncv2.mRNAseq_RSEM_normalized_log2.txt", sep="\t",index_col=0,low_memory=False)
meth_raw = pd.read_csv(root/"GBMLGG.meth.by_mean.data.txt", sep='\t',index_col=0,low_memory=False)
clinical_raw = pd.read_csv(root / "GBMLGG.clin.merged.picked.txt",sep="\t", index_col=0, low_memory=False)

# display shapes and first few rows-columns of each file
display(mirna_raw.iloc[:3,:5])
display(mirna_raw.shape)

display(rna_raw.iloc[:3,:5])
display(meth_raw.shape)

display(meth_raw.iloc[:3,:5])
display(meth_raw.shape)

display(clinical_raw.iloc[:3,:5])
display(clinical_raw.shape)

Unnamed: 0_level_0,TCGA-06-0675-11,TCGA-06-0678-11,TCGA-06-0680-11,TCGA-06-0681-11,TCGA-06-AABW-11
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
hsa-let-7a-1,12.847399,13.789578,13.603454,13.346797,13.545128
hsa-let-7a-2,13.850719,14.79297,14.597877,14.34426,14.554888
hsa-let-7a-3,12.873946,13.810832,13.611074,13.364372,13.583039


(548, 531)

Unnamed: 0_level_0,TCGA-02-0047-01,TCGA-02-0055-01,TCGA-02-2483-01,TCGA-02-2485-01,TCGA-02-2486-01
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
?|100133144,1.619742,,1.5591,3.999567,2.475344
?|100134869,2.757258,3.972445,3.801138,3.902759,2.264506
?|10357,5.773564,4.97244,5.915141,6.520796,5.966629


(20115, 685)

Unnamed: 0_level_0,TCGA-06-0125-01,TCGA-06-0125-02,TCGA-06-0152-01,TCGA-06-0152-02,TCGA-06-0171-01
Hybridization REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Composite Element REF,Beta_Value,Beta_Value,Beta_Value,Beta_Value,Beta_Value
A1BG,0.438986043005,0.565094788162,0.461699906718,0.534127262606,0.455267108058
A1CF,0.681141812896,0.724487443757,0.601439733092,0.632221318323,0.691054589549


(20115, 685)

Unnamed: 0_level_0,tcga-06-6391,tcga-19-a6j4,tcga-cs-6665,tcga-cs-6670,tcga-db-a4xc
Hybridization REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Composite Element REF,value,value,value,value,value
years_to_birth,44,68,51,43,26
vital_status,1,1,0,0,0


(14, 1110)

## Data Processing Summary

1. **Transpose Data:** All raw data (miRNA, RNA, etc.) is flipped so rows represent patients and columns represent features.
2. **Standardize Patient IDs:** Patient IDs in all tables are cleaned to the 12-character TCGA format (e.g., `TCGA-AB-1234`) for matching.
3. **Handle Duplicates:** Duplicate patient rows are averaged in the omics data. The first entry is kept for duplicate patients in the clinical data.
4. **Find Common Patients:** The script identifies the list of patients that exist in *all* datasets.
5. **Subset Data:** All data tables are filtered down to *only* this common list of patients, ensuring alignment.
6. **Extract Target:** The `histological_type` column is pulled from the processed clinical data to be used as the prediction target (y-variable).

In [None]:
mirna = mirna_raw.T
rna = rna_raw.T
meth = meth_raw.T
clinical = clinical_raw.T

print(f"miRNA (samples, features): {mirna.shape}")
print(f"RNA (samples, features): {rna.shape}")
print(f"Methylation (samples, features): {meth.shape}")
print(f"Clinical (samples, features): {clinical.shape}")

def trim_barcode(idx):
    return idx.to_series().str.slice(0, 12)

# standarized patient IDs across all files
meth.index = trim_barcode(meth.index)
rna.index = trim_barcode(rna.index)
mirna.index = trim_barcode(mirna.index)
clinical.index = clinical.index.str.upper()
clinical.index.name = "Patient_ID"

# convert all data to numeric, coercing errors to NaN
meth = meth.apply(pd.to_numeric, errors='coerce')
rna = rna.apply(pd.to_numeric, errors='coerce')
mirna = mirna.apply(pd.to_numeric, errors='coerce')

# for any duplicate columns in the omics data, we average their values
meth = meth.groupby(meth.index).mean()
rna = rna.groupby(rna.index).mean()
mirna = mirna.groupby(mirna.index).mean()

# for any duplicate rows in the clinical data, we keep the first occurrence
clinical = clinical[~clinical.index.duplicated(keep='first')]

print(f"\nMethylation shape: {meth.shape}")
print(f"RNA shape: {rna.shape}")
print(f"miRNA shape: {mirna.shape}")
print(f"Clinical shape: {clinical.shape}")

for df in [meth, rna, mirna]:
    df.columns = df.columns.str.replace(r"\?", "unknown_", regex=True)
    df.columns = df.columns.str.replace(r"\|", "_", regex=True)
    df.columns = df.columns.str.replace("-", "_", regex=False)
    df.columns = df.columns.str.replace(r"_+", "_", regex=True)
    df.columns = df.columns.str.strip("_")
    
    df.fillna(df.mean(), inplace=True)

# to see which pateints are common across all data files
common_patients = sorted(list(set(meth.index)&set(rna.index)&set(mirna.index)&set(clinical.index)))

print(f"\nFound: {len(common_patients)} patients across all data types.")

# subset to only common patients
meth_processed = meth.loc[common_patients]
rna_processed= rna.loc[common_patients]
mirna_processed = mirna.loc[common_patients]
clinical_processed = clinical.loc[common_patients]

# extract target labels from clinical data
targets = clinical_processed['histological_type']

miRNA (samples, features): (531, 548)
RNA (samples, features): (701, 18328)
Methylation (samples, features): (685, 20115)
Clinical (samples, features): (1110, 14)

Methylation shape: (658, 20115)
RNA shape: (681, 18328)
miRNA shape: (517, 548)
Clinical shape: (1110, 14)

Found: 511 patients across all data types.


In [4]:
display(mirna_processed.iloc[:3,:5])
display(mirna_processed.shape)

display(rna_processed.iloc[:3,:5])
display(rna_processed.shape)

display(meth_processed.iloc[:3,:5])
display(meth_processed.shape)

display(clinical_processed.iloc[:3,:5])
display(clinical_processed.shape)

display(targets.value_counts())

gene,hsa_let_7a_1,hsa_let_7a_2,hsa_let_7a_3,hsa_let_7b,hsa_let_7c
TCGA-CS-4938,12.622353,13.632728,12.651613,14.20893,14.376942
TCGA-CS-4941,11.809808,12.815815,11.820061,13.047853,11.955006
TCGA-CS-4942,11.113995,12.128618,11.165523,12.48179,11.858545


(511, 548)

gene,unknown_100133144,unknown_100134869,unknown_10357,unknown_10431,unknown_155060
TCGA-CS-4938,3.123352,4.50794,8.069184,9.724198,7.51179
TCGA-CS-4941,5.187819,4.404406,7.291745,8.608326,8.344526
TCGA-CS-4942,3.562316,3.462602,7.53246,9.279502,7.034985


(511, 18328)

Hybridization REF,Composite Element REF,A1BG,A1CF,A2BP1,A2LD1
TCGA-CS-4938,,0.683179,0.776869,0.652055,0.919739
TCGA-CS-4941,,0.521934,0.784401,0.563447,0.865717
TCGA-CS-4942,,0.610067,0.828194,0.607771,0.875369


(511, 20115)

Hybridization REF,Composite Element REF,years_to_birth,vital_status,days_to_death,days_to_last_followup
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TCGA-CS-4938,value,31,0,,3574.0
TCGA-CS-4941,value,67,1,234.0,
TCGA-CS-4942,value,44,1,1335.0,


(511, 14)

histological_type
astrocytoma          193
oligodendroglioma    191
oligoastrocytoma     127
Name: count, dtype: int64

In [None]:
import bioneuralnet as bnn

# drop unwanted columns from clinical data
clinical_processed.drop(columns=["Composite Element REF"], errors="ignore", inplace=True)

# we transform the methylation beta values to M-values and drop unwanted columns
meth_m = meth_processed.drop(columns=["Composite Element REF"], errors="ignore")

# convert beta values to M-values using bioneuralnet utility with small epsilon to avoid log(0)
meth_m = bnn.utils.beta_to_m(meth_m, eps=1e-6) 

# lastly we turn the target labels into numerical classes
mapping = {"astrocytoma": 0, "oligodendroglioma": 1, "oligoastrocytoma": 2}
target_labels = targets.map(mapping).to_frame(name="target")

# as a safety check we align the indices once more
X_meth = meth_m.loc[common_patients]
X_rna = rna_processed.loc[common_patients]
X_mirna = mirna_processed.loc[common_patients]
Y_labels = target_labels.loc[common_patients]
clinical_final = clinical_processed.loc[common_patients]

print(f"\nDNA_Methylation shape: {X_meth.shape}")
print(f"RNA shape: {X_rna.shape}")
print(f"miRNA shape: {X_mirna.shape}")
print(f"Clinical shape: {clinical_final.shape}")
print(Y_labels.value_counts())

2025-11-08 13:06:12,202 - bioneuralnet.utils.data - INFO - Starting Beta-to-M value conversion (shape: (511, 20114)). Epsilon: 1e-06
2025-11-08 13:06:13,301 - bioneuralnet.utils.data - INFO - Beta-to-M conversion complete.



Methylation shape: (511, 20114)
RNA shape: (511, 18328)
miRNA shape: (511, 548)
Clinical shape: (511, 13)
target
0         193
1         191
2         127
Name: count, dtype: int64


## Feature Selection Methodology

### Supported Methods and Interpretation

**BioNeuralNet** provides three techniques for feature selection, allowing for different views of the data's statistical profile:

- **Variance Thresholding:** Identifies features with the **highest overall variance** across all samples.

- **ANOVA F-test:** Pinpoints features that best **distinguish between the target classes** (KIRC, KIRP, and KICH).

- **Random Forest Importance:** Assesses **feature utility** based on its contribution to a predictive non-linear model.

### GBMLGG Cohort Selection Strategy

A dimensionality reduction step was essential for managing the high-feature-count omics data:

- **High-Feature Datasets:** Both DNA Methylation (20,114) and RNA (18,328) required significant feature reduction.

- **Filtering Process:** The **top 6,000 features** were initially extracted from the Methylation and RNA datasets using all three methods.

- **Final Set:** A consensus set was built by finding the intersection of features selected by the ANOVA F-test and Random Forest Importance, ensuring both statistical relevance and model-based utility.

- **Low-Feature Datasets:** The miRNA data (548 features) was passed through **without selection**, as its feature count was already manageable.

In [9]:
import bioneuralnet as bnn

# feature selection
meth_highvar = bnn.utils.select_top_k_variance(X_meth, k=6000)
rna_highvar = bnn.utils.select_top_k_variance(X_rna, k=6000)

meth_af = bnn.utils.top_anova_f_features(X_meth, Y_labels, max_features=6000)
rna_af = bnn.utils.top_anova_f_features(X_rna, Y_labels, max_features=6000)

meth_rf = bnn.utils.select_top_randomforest(X_meth, Y_labels, top_k=6000)
rna_rf = bnn.utils.select_top_randomforest(X_rna, Y_labels, top_k=6000)

meth_var_set = set(meth_highvar.columns)
meth_anova_set = set(meth_af.columns)
meth_rf_set = set(meth_rf.columns)

rna_var_set = set(rna_highvar.columns)
rna_anova_set = set(rna_af.columns)
rna_rf_set = set(rna_rf.columns)

meth_inter1 = list(meth_anova_set & meth_var_set)
meth_inter2 = list(meth_rf_set & meth_var_set)
meth_inter3 = list(meth_anova_set & meth_rf_set)
meth_all_three = list(meth_anova_set & meth_var_set & meth_rf_set)

rna_inter4 = list(rna_anova_set & rna_var_set)
rna_inter5 = list(rna_rf_set & rna_var_set)
rna_inter6 = list(rna_anova_set & rna_rf_set)
rna_all_three = list(rna_anova_set & rna_var_set & rna_rf_set)

2025-11-08 13:15:33,012 - bioneuralnet.utils.preprocess - INFO - [Inf]: Replaced 0 infinite values
2025-11-08 13:15:33,012 - bioneuralnet.utils.preprocess - INFO - [NaN]: Replaced 0 NaNs after median imputation
2025-11-08 13:15:33,012 - bioneuralnet.utils.preprocess - INFO - [Zero-Var]: 0 columns dropped due to zero variance
2025-11-08 13:15:33,085 - bioneuralnet.utils.preprocess - INFO - Selected top 6000 features by variance
2025-11-08 13:15:35,778 - bioneuralnet.utils.preprocess - INFO - [Inf]: Replaced 0 infinite values
2025-11-08 13:15:35,779 - bioneuralnet.utils.preprocess - INFO - [NaN]: Replaced 0 NaNs after median imputation
2025-11-08 13:15:35,779 - bioneuralnet.utils.preprocess - INFO - [Zero-Var]: 0 columns dropped due to zero variance
2025-11-08 13:15:35,843 - bioneuralnet.utils.preprocess - INFO - Selected top 6000 features by variance
2025-11-08 13:15:38,824 - bioneuralnet.utils.preprocess - INFO - [Inf]: Replaced 0 infinite values
2025-11-08 13:15:38,824 - bioneuralnet.

In [11]:
print("FROM THE 6000 Methylation feature selection:\n")
print(f"Anova-F & variance selection share: {len(meth_inter1)} features")
print(f"Random Forest & variance selection share: {len(meth_inter2)} features")
print(f"Anova-F & Random Forest share: {len(meth_inter3)} features")
print(f"All three methods agree on: {len(meth_all_three)} features")

FROM THE 6000 Methylation feature selection:

Anova-F & variance selection share: 2704 features
Random Forest & variance selection share: 1768 features
Anova-F & Random Forest share: 1823 features
All three methods agree on: 809 features


In [12]:
print("\nFROM THE 6000 RNA feature selection:\n")
print(f"Anova-F & variance selection share: {len(rna_inter4)} features")
print(f"Random Forest & variance selection share: {len(rna_inter5)} features")
print(f"Anova-F & Random Forest share: {len(rna_inter6)} features")
print(f"All three methods agree on: {len(rna_all_three)} features")


FROM THE 6000 RNA feature selection:

Anova-F & variance selection share: 2183 features
Random Forest & variance selection share: 1977 features
Anova-F & Random Forest share: 2127 features
All three methods agree on: 763 features


## Feature Selection Summary: ANOVA-RF Intersection

The chosen strategy for feature selection is based on the **overlap** between features identified by the **ANOVA F-test** and **Random Forest Importance**. This approach offers comprehensive filtering by balancing class-based relevance (ANOVA) with non-linear model importance (Random Forest). The resulting feature sets are considered the most robust for downstream analysis.

### Feature Overlap Results

The following table details the number of features resulting from the intersection of different selection methods for each omics data type.

| Omics Data Type | ANOVA-F & Variance | RF & Variance | ANOVA-F & Random Forest (Selected) | All Three Agree |
| :--- | :--- | :--- | :--- | :--- |
| **Methylation** | 2,704 features | 1,768 features | **1,823 features** | 809 features |
| **RNA** | 2,183 features | 1,977 features | **2,127 features** | 763 features |

In [13]:
X_meth_selected = X_meth[meth_inter3]
X_rna_selected = X_rna[rna_inter6]

print("\nFinal Shapes for Modeling")
print(f"Methylation (X1): {X_meth_selected.shape}")
print(f"RNA-Seq (X2): {X_rna_selected.shape}")
print(f"miRNA-Seq (X3): {X_mirna.shape}")
print(f"Labels (Y): {Y_labels.shape}")


Final Shapes for Modeling
Methylation (X1): (511, 1823)
RNA-Seq (X2): (511, 2127)
miRNA-Seq (X3): (511, 548)
Labels (Y): (511, 1)


## Data Availability

To facilitate rapid experimentation and reproduction of our results, the fully processed and feature-selected dataset used in this analysis has been made available directly within the package.

Users can load this dataset, bypassing all preceding data acquisition, preprocessing, and feature selection steps. This allows users to proceed immediately from this step.

In [1]:
import bioneuralnet as bnn

tgca_gbmlgg = bnn.datasets.DatasetLoader("gbmlgg")
display(tgca_gbmlgg.shape)

# The dataset is returned as a dictionary. We extract each file independetly based on the name( Key).
dna_meth = tgca_gbmlgg.data["meth"]
rna = tgca_gbmlgg.data["rna"]
mirna = tgca_gbmlgg.data["mirna"]
clinical = tgca_gbmlgg.data["clinical"]
target = tgca_gbmlgg.data["target"]

{'mirna': (511, 548),
 'target': (511, 1),
 'clinical': (511, 13),
 'rna': (511, 2127),
 'meth': (511, 1823)}

In [16]:
clinical = tgca_gbmlgg.data["clinical"]
samples_before = clinical.shape[1]

clinical_half_len = clinical.shape[1] // 2
clinical.dropna(inplace=True, axis=1, thresh=clinical_half_len)
samples_after = clinical.shape[1]
print(f"Samples dropped by dropna: {samples_before - samples_after}")
print(f"Final shape of clinical data: {clinical.shape}")
clinical.drop(columns=['histological_type'])

Samples dropped by dropna: 0
Final shape of clinical data: (511, 12)


Unnamed: 0_level_0,years_to_birth,vital_status,days_to_death,days_to_last_followup,tumor_tissue_site,gender,date_of_initial_pathologic_diagnosis,radiation_therapy,karnofsky_performance_score,race,ethnicity
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
TCGA-CS-4938,31.0,0,,3574.0,central nervous system,female,2005,no,90.0,white,not hispanic or latino
TCGA-CS-4941,67.0,1,234.0,,central nervous system,male,2005,yes,90.0,white,not hispanic or latino
TCGA-CS-4942,44.0,1,1335.0,,central nervous system,female,2006,yes,70.0,black or african american,
TCGA-CS-4943,37.0,1,1106.0,,central nervous system,male,2009,no,50.0,white,
TCGA-CS-4944,50.0,0,,1828.0,central nervous system,male,2010,yes,100.0,white,
...,...,...,...,...,...,...,...,...,...,...,...
TCGA-WY-A85A,20.0,0,,1320.0,central nervous system,male,2010,no,,white,
TCGA-WY-A85B,24.0,0,,1393.0,central nervous system,male,2010,no,,white,
TCGA-WY-A85C,36.0,0,,1426.0,central nervous system,male,2010,yes,,white,
TCGA-WY-A85D,60.0,0,,1147.0,central nervous system,male,2010,no,,white,


In [17]:
# BioNeuralNet provides a preprocessing function to handle clinical data
clinical = tgca_gbmlgg.data["clinical"]
#clinical = bnn.utils.data.impute_omics_knn(clinical)
# For more details on the preprocessing functions, see `bioneuralnet.utils.preprocess``
clinical_preprocessed = bnn.utils.preprocess_clinical(
    clinical, 
    target, 
    top_k=4, 
    scale=False, 
    ignore_columns=[ "vital_status","histological_type","days_to_last_followup",  "years_to_birth", "days_to_death", "date_of_initial_pathologic_diagnosis"])

clinical_preprocessed.columns
display(clinical_preprocessed.iloc[:3,:5])

2025-11-13 01:46:19,984 - bioneuralnet.utils.preprocess - INFO - [Inf]: Replaced 0 infinite values
2025-11-13 01:46:19,984 - bioneuralnet.utils.preprocess - INFO - [NaN]: Replaced 206 NaNs after median imputation
2025-11-13 01:46:19,984 - bioneuralnet.utils.preprocess - INFO - [Zero-Var]: 0 columns dropped due to zero variance
2025-11-13 01:46:20,066 - bioneuralnet.utils.preprocess - INFO - Selected top 4 features by RandomForest importance


Unnamed: 0_level_0,karnofsky_performance_score,gender_male,radiation_therapy_no,ethnicity_hispanic or latino
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TCGA-CS-4938,90.0,False,True,False
TCGA-CS-4941,90.0,True,False,False
TCGA-CS-4942,70.0,False,False,False


In [18]:
import pandas as pd

X_train_full = pd.concat([dna_meth, rna, mirna], axis=1)

print(f"Nan values in X_train_full: {X_train_full.isna().sum().sum()}")
X_train_full = X_train_full.dropna()
print(f"Nan value in X_train_full after dropping: {X_train_full.isna().sum().sum()}")

print(f"X_train_full shape: {X_train_full.shape}")
# building the graph using the similarity graph function with k=15
A_train = bnn.utils.gen_similarity_graph(X_train_full, k=15)

print(f"\nNetwork shape: {A_train.shape}")

Nan values in X_train_full: 0
Nan value in X_train_full after dropping: 0
X_train_full shape: (511, 4498)

Network shape: (4498, 4498)


In [19]:
# import os
# import random
# import logging 
# import warnings 
# import numpy as np
# import torch
# import ray
# import bioneuralnet as bnn
# from bioneuralnet.utils import logger 

# os.environ["CUDA_VISIBLE_DEVICES"] = "0"


# if not ray.is_initialized():

#     # 3. Set the ray init logging level to INFO
#     ray.init(logging_level=logging.INFO) 
    
#     # Ignore common warnings
#     warnings.filterwarnings("ignore", category=UserWarning)
#     warnings.filterwarnings("ignore", category=DeprecationWarning)


2025-11-13 01:46:38,771	INFO worker.py:1888 -- Started a local Ray instance.




[36m(tune_train_n pid=237589)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T4650b_00000/checkpoint_000000)
[36m(tune_train_n pid=237589)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T4650b_00000/checkpoint_000001)
[36m(tune_train_n pid=237589)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T4650b_00000/checkpoint_000002)
[36m(tune_train_n pid=237589)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T4650b_00000/checkpoint_000003)
[36m(tune_train_n pid=237589)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T4650b_00000/checkpoint_000004)
[36m(tune_train_n pid=237589)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T4650b



[36m(tune_train_n pid=238459)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T649bc_00000/checkpoint_000004)[32m [repeated 105x across cluster][0m
[36m(tune_train_n pid=238547)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T649bc_00001/checkpoint_000000)[32m [repeated 96x across cluster][0m
[36m(tune_train_n pid=238712)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T649bc_00003/checkpoint_000001)[32m [repeated 6x across cluster][0m
[36m(tune_train_n pid=238872)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T649bc_00005/checkpoint_000000)[32m [repeated 2x across cluster][0m
[36m(tune_train_n pid=239028)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T649bc_00007/checkpoin



[36m(tune_train_n pid=239280)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T8c07c_00000/checkpoint_000001)[32m [repeated 2x across cluster][0m
[36m(tune_train_n pid=239443)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T8c07c_00002/checkpoint_000000)[32m [repeated 115x across cluster][0m
[36m(tune_train_n pid=239600)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T8c07c_00004/checkpoint_000000)[32m [repeated 3x across cluster][0m
[36m(tune_train_n pid=239762)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T8c07c_00006/checkpoint_000005)[32m [repeated 38x across cluster][0m
[36m(tune_train_n pid=239919)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T8c07c_00008/checkpoin



[36m(tune_train_n pid=240086)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/Ta7a57_00000/checkpoint_000002)[32m [repeated 5x across cluster][0m
[36m(tune_train_n pid=240169)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/Ta7a57_00001/checkpoint_000001)[32m [repeated 99x across cluster][0m
[36m(tune_train_n pid=240326)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/Ta7a57_00003/checkpoint_000006)[32m [repeated 8x across cluster][0m
[36m(tune_train_n pid=240482)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/Ta7a57_00005/checkpoint_000001)[32m [repeated 96x across cluster][0m
[36m(tune_train_n pid=240645)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/Ta7a57_00007/checkpoint



[36m(tune_train_n pid=240909)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T119a5_00000/checkpoint_000004)[32m [repeated 7x across cluster][0m
[36m(tune_train_n pid=241071)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T119a5_00002/checkpoint_000000)[32m [repeated 97x across cluster][0m
[36m(tune_train_n pid=241228)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T119a5_00004/checkpoint_000000)[32m [repeated 18x across cluster][0m
[36m(tune_train_n pid=241385)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T119a5_00006/checkpoint_000001)[32m [repeated 4x across cluster][0m
[36m(tune_train_n pid=241544)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T119a5_00008/checkpoint



[36m(tune_train_n pid=241714)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T2d826_00000/checkpoint_000000)
[36m(tune_train_n pid=241714)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T2d826_00000/checkpoint_000001)
[36m(tune_train_n pid=241714)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T2d826_00000/checkpoint_000002)
[36m(tune_train_n pid=241714)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T2d826_00000/checkpoint_000003)
[36m(tune_train_n pid=241714)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T2d826_00000/checkpoint_000004)
[36m(tune_train_n pid=241714)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T2d826



[36m(tune_train_n pid=242791)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T97a2f_00000/checkpoint_000001)[32m [repeated 3x across cluster][0m
[36m(tune_train_n pid=242952)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T97a2f_00002/checkpoint_000000)[32m [repeated 100x across cluster][0m
[36m(tune_train_n pid=243110)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T97a2f_00004/checkpoint_000000)[32m [repeated 3x across cluster][0m
[36m(tune_train_n pid=243271)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T97a2f_00006/checkpoint_000007)[32m [repeated 9x across cluster][0m
[36m(tune_train_n pid=243428)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/T97a2f_00008/checkpoint



[36m(tune_train_n pid=243608)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/Tc3f1a_00000/checkpoint_000007)[32m [repeated 12x across cluster][0m
[36m(tune_train_n pid=243691)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/Tc3f1a_00001/checkpoint_000031)[32m [repeated 124x across cluster][0m
[36m(tune_train_n pid=243848)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/Tc3f1a_00003/checkpoint_000000)[32m [repeated 2x across cluster][0m
[36m(tune_train_n pid=244006)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/Tc3f1a_00005/checkpoint_000001)[32m [repeated 19x across cluster][0m
[36m(tune_train_n pid=244177)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/Tc3f1a_00007/checkpoi



[36m(tune_train_n pid=245041)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/Te11cc_00000/checkpoint_000000)[32m [repeated 2x across cluster][0m
[36m(tune_train_n pid=245138)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/Te11cc_00001/checkpoint_000001)[32m [repeated 101x across cluster][0m
[36m(tune_train_n pid=245298)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/Te11cc_00003/checkpoint_000000)[32m [repeated 18x across cluster][0m
[36m(tune_train_n pid=245454)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/Te11cc_00005/checkpoint_000000)[32m [repeated 2x across cluster][0m
[36m(tune_train_n pid=245611)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/vicente/ray_results/tune_dp/Te11cc_00007/checkpoin



## Reproducibility and Seeding

To ensure our experimental results are fully reproducible, a single global seed is set at the beginning of the analysis.

This utility function propagates the seed to all sources of randomness, including `random`, `numpy`, and `torch` (for both CPU and GPU). Critically, it also configures the PyTorch cuDNN backend to use deterministic algorithms.

**for each DPMON outer iteration, the seed is incremented to generate a differnt internal test/train split.**

In [20]:
import bioneuralnet as bnn

SEED = 118
bnn.utils.set_seed(SEED)

2025-11-13 01:46:39,215 - bioneuralnet.utils.data - INFO - Setting global seed for reproducibility to: 118
2025-11-13 01:46:39,216 - bioneuralnet.utils.data - INFO - CUDA available. Applying seed to all GPU operations
2025-11-13 01:46:39,217 - bioneuralnet.utils.data - INFO - Seed setting complete


---

## Classification using DPMON: Training and Evaluation

* Run 3 outer iterations, each with a different seed.
* Each iteration performs hyperparameter tuning.
* After tuning, train `repeat_num = 3` models with the best hyperparameters.
* Collect predictions from the best model of each iteration.
* Compute **Accuracy**, **F1 Weighted**, and **F1 Macro +/- standard deviation** across iterations.

This demonstrates the **end-to-end BioNeuralNet pipeline** in action.

### Analysis of Hyperparameter Optimization

The hyperparameter tuning results below showcase the best configuration found across three distinct GNN model runs.

| Parameter | SAGE (GraphSAGE) | GCN (Graph Convolutional) | GAT (Graph Attention) |
| :--- | :--- | :--- | :--- |
| **gnn_layer_num** | 8 | 4 | 8 |
| **gnn_hidden_dim** | 32 | 16 | 32 |
| **lr (Learning Rate)** | 0.000401 | 0.001687 | 0.000401 |
| **weight_decay** | 0.007823 | 0.000269 | 0.007823 |
| **nn_hidden_dim1** | 32 | 64 | 32 |
| **nn_hidden_dim2** | 32 | 64 | 32 |
| **num_epochs** | 2048 | 2048 | 2048 |


### Results

| Metric | SAGE | GCN | GAT |
| :--- | :--- | :--- | :--- |
| **Accuracy** | 0.9472 +/- 0.0747 | 0.9954 +/- 0.0065 | 0.9993 +/- 0.0009 |
| **F1 Weighted** | 0.9464 +/- 0.0758 | 0.9954 +/- 0.0064 | 0.9993 +/- 0.0009 |
| **F1 Macro** | 0.9510 +/- 0.0693 | 0.9957 +/- 0.0061 | 0.9993 +/- 0.0010 |


In [21]:
import numpy as np
from pathlib import Path
from sklearn.metrics import f1_score, accuracy_score
from bioneuralnet.downstream_task import DPMON

output_dir_base_sage = Path("/home/vicente/Github/BioNeuralNet/dpmon_results_SAGE_FINAL")
target = target.rename(columns={"target": "phenotype"})

n_repeats = 5
all_preds = []

for r in range(n_repeats):
    bnn.utils.set_seed(SEED+r)
    dpmon_repeat = DPMON(
        adjacency_matrix=A_train,
        omics_list=[dna_meth, rna, mirna],
        phenotype_data=target,
        clinical_data=clinical_preprocessed,
        repeat_num=3,
        model='SAGE',
        tune=True,
        gpu=True,
        cuda=0,
        output_dir=output_dir_base_sage,
    )
    
    predictions_df, _ = dpmon_repeat.run()
    all_preds.append(predictions_df["Predicted"].values)

all_preds = np.array(all_preds)

f1_macro_list = [f1_score(target, pred, average='macro') for pred in all_preds]
f1_weighted_list = [f1_score(target, pred, average='weighted') for pred in all_preds]
accuracy_list = [accuracy_score(target, pred) for pred in all_preds]

avg_f1_macro = np.mean(f1_macro_list)
std_f1_macro = np.std(f1_macro_list)

avg_f1_weighted = np.mean(f1_weighted_list)
std_f1_weighted = np.std(f1_weighted_list)

avg_acc = np.mean(accuracy_list)
std_acc = np.std(accuracy_list)

print(f"Accuracy: {avg_acc:.4f} +/- {std_acc:.4f}")
print(f"F1 Weighted: {avg_f1_weighted:.4f} +/- {std_f1_weighted:.4f}")
print(f"F1 Macro: {avg_f1_macro:.4f} +/- {std_f1_macro:.4f}")

2025-11-13 01:46:39,241 - bioneuralnet.utils.data - INFO - Setting global seed for reproducibility to: 118
2025-11-13 01:46:39,242 - bioneuralnet.utils.data - INFO - CUDA available. Applying seed to all GPU operations
2025-11-13 01:46:39,242 - bioneuralnet.utils.data - INFO - Seed setting complete
2025-11-13 01:46:39,243 - bioneuralnet.downstream_task.dpmon - INFO - Output directory set to: /home/vicente/Github/BioNeuralNet/dpmon_results_SAGE_FINAL
2025-11-13 01:46:39,243 - bioneuralnet.downstream_task.dpmon - INFO - Initialized DPMON with the provided parameters.
2025-11-13 01:46:39,243 - bioneuralnet.downstream_task.dpmon - INFO - Starting DPMON run.
2025-11-13 01:46:39,264 - bioneuralnet.downstream_task.dpmon - INFO - Running hyperparameter tuning for DPMON.
2025-11-13 01:46:39,265 - bioneuralnet.downstream_task.dpmon - INFO - Using GPU 0
2025-11-13 01:46:39,442 - bioneuralnet.downstream_task.dpmon - INFO - Number of nodes in network: 4498
2025-11-13 01:46:41,154 - bioneuralnet.down

Accuracy: 0.9233 +/- 0.1534
F1 Weighted: 0.9023 +/- 0.1953
F1 Macro: 0.9035 +/- 0.1931


In [None]:
output_dir_base_gcn = Path("/home/vicente/Github/BioNeuralNet/dpmon_results_GCN_FINAL")
n_repeats = 5
all_preds = []

for r in range(n_repeats):
    bnn.utils.set_seed(SEED+r)
    dpmon_repeat = DPMON(
        adjacency_matrix=A_train,
        omics_list=[dna_meth, rna, mirna],
        phenotype_data=target,
        clinical_data=clinical_preprocessed,
        repeat_num=3,
        model='GCN',
        tune=True,
        gpu=True,
        cuda=0,
        output_dir=output_dir_base_gcn,
    )
    
    predictions_df, _ = dpmon_repeat.run()
    all_preds.append(predictions_df["Predicted"].values)

all_preds = np.array(all_preds)

f1_macro_list = [f1_score(target, pred, average='macro') for pred in all_preds]
f1_weighted_list = [f1_score(target, pred, average='weighted') for pred in all_preds]
accuracy_list = [accuracy_score(target, pred) for pred in all_preds]

avg_f1_macro = np.mean(f1_macro_list)
std_f1_macro = np.std(f1_macro_list)

avg_f1_weighted = np.mean(f1_weighted_list)
std_f1_weighted = np.std(f1_weighted_list)

avg_acc = np.mean(accuracy_list)
std_acc = np.std(accuracy_list)

print(f"Accuracy: {avg_acc:.4f} +/- {std_acc:.4f}")
print(f"F1 Weighted: {avg_f1_weighted:.4f} +/- {std_f1_weighted:.4f}")
print(f"F1 Macro: {avg_f1_macro:.4f} +/- {std_f1_macro:.4f}")

2025-11-13 01:53:06,854 - bioneuralnet.utils.data - INFO - Setting global seed for reproducibility to: 118
2025-11-13 01:53:06,855 - bioneuralnet.utils.data - INFO - CUDA available. Applying seed to all GPU operations
2025-11-13 01:53:06,855 - bioneuralnet.utils.data - INFO - Seed setting complete
2025-11-13 01:53:06,856 - bioneuralnet.downstream_task.dpmon - INFO - Output directory set to: /home/vicente/Github/BioNeuralNet/dpmon_results_GCN_FINAL
2025-11-13 01:53:06,856 - bioneuralnet.downstream_task.dpmon - INFO - Initialized DPMON with the provided parameters.
2025-11-13 01:53:06,856 - bioneuralnet.downstream_task.dpmon - INFO - Starting DPMON run.
2025-11-13 01:53:06,870 - bioneuralnet.downstream_task.dpmon - INFO - Running hyperparameter tuning for DPMON.
2025-11-13 01:53:06,871 - bioneuralnet.downstream_task.dpmon - INFO - Using GPU 0
2025-11-13 01:53:07,213 - bioneuralnet.downstream_task.dpmon - INFO - Number of nodes in network: 4498
2025-11-13 01:53:09,034 - bioneuralnet.downs

In [None]:
output_dir_base_gat = Path("/home/vicente/Github/BioNeuralNet/dpmon_results_GAT_FINAL")

n_repeats = 5
all_preds = []

for r in range(n_repeats):
    bnn.utils.set_seed(SEED+r)
    dpmon_repeat = DPMON(
        adjacency_matrix=A_train,
        omics_list=[dna_meth, rna, mirna],
        phenotype_data=target,
        clinical_data=clinical_preprocessed,
        repeat_num=3,
        model='GAT',
        tune=True,
        gpu=True,
        cuda=0,
        output_dir=output_dir_base_gat,
    )
    
    predictions_df, _ = dpmon_repeat.run()
    all_preds.append(predictions_df["Predicted"].values)

all_preds = np.array(all_preds)

f1_macro_list = [f1_score(target, pred, average='macro') for pred in all_preds]
f1_weighted_list = [f1_score(target, pred, average='weighted') for pred in all_preds]
accuracy_list = [accuracy_score(target, pred) for pred in all_preds]

avg_f1_macro = np.mean(f1_macro_list)
std_f1_macro = np.std(f1_macro_list)

avg_f1_weighted = np.mean(f1_weighted_list)
std_f1_weighted = np.std(f1_weighted_list)

avg_acc = np.mean(accuracy_list)
std_acc = np.std(accuracy_list)

print(f"Accuracy: {avg_acc:.4f} +/- {std_acc:.4f}")
print(f"F1 Weighted: {avg_f1_weighted:.4f} +/- {std_f1_weighted:.4f}")
print(f"F1 Macro: {avg_f1_macro:.4f} +/- {std_f1_macro:.4f}")

2025-11-09 20:00:27,319 - bioneuralnet.utils.data - INFO - Setting global seed for reproducibility to: 118
2025-11-09 20:00:27,319 - bioneuralnet.utils.data - INFO - CUDA available. Applying seed to all GPU operations
2025-11-09 20:00:27,319 - bioneuralnet.utils.data - INFO - Seed setting complete
2025-11-09 20:00:27,320 - bioneuralnet.downstream_task.dpmon - INFO - Output directory set to: /home/vicente/Github/BioNeuralNet/dpmon_cv_results_GAT_FINAL
2025-11-09 20:00:27,320 - bioneuralnet.downstream_task.dpmon - INFO - Initialized DPMON with the provided parameters.
2025-11-09 20:00:27,320 - bioneuralnet.downstream_task.dpmon - INFO - Starting DPMON run.
2025-11-09 20:00:27,330 - bioneuralnet.downstream_task.dpmon - INFO - Running hyperparameter tuning for DPMON.
2025-11-09 20:00:27,331 - bioneuralnet.downstream_task.dpmon - INFO - Using GPU 0
2025-11-09 20:00:27,498 - bioneuralnet.downstream_task.dpmon - INFO - Number of nodes in network: 4498
2025-11-09 20:00:30,311 - bioneuralnet.do

Accuracy: 0.9993 +/- 0.0009
F1 Weighted: 0.9993 +/- 0.0009
F1 Macro: 0.9993 +/- 0.0010


In [None]:
import warnings 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.exceptions import ConvergenceWarning
from scipy.stats import loguniform, randint

X = pd.concat([dna_meth, rna, mirna, clinical_preprocessed], axis=1)
y = target['phenotype']
print(f"Successfully created X matrix with shape: {X.shape}")
print(f"Successfully created y vector with shape: {y.shape}")

all_results = {
    "LogisticRegression": {"acc": [], "f1_w": [], "f1_m": []},
    "MLP": {"acc": [], "f1_w": [], "f1_m": []},
    "XGBoost": {"acc": [], "f1_w": [], "f1_m": []},
}

all_results = {
    "LogisticRegression": {"acc": [], "f1_w": [], "f1_m": []},
    "MLP": {"acc": [], "f1_w": [], "f1_m": []},
    "XGBoost": {"acc": [], "f1_w": [], "f1_m": []},
}

N_REPEATS = 5
TEST_SPLIT_SIZE = .7
CV_FOLDS = 3
N_ITER_SEARCH = 10

pipe_lr = Pipeline([('scaler', StandardScaler()),
    ('model', LogisticRegression(
        solver='lbfgs',
        max_iter=1000,
        penalty=None 
    ))
])

pipe_mlp = Pipeline([('scaler', StandardScaler()),
    ('model', MLPClassifier(
        max_iter=500,
        early_stopping=True,
        n_iter_no_change=10
    ))
])

pipe_xgb = Pipeline([('scaler', StandardScaler()),
    ('model', XGBClassifier(
        eval_metric='logloss'
    ))
])

params_lr = {
    'model__penalty': ['l2'], 
    'model__C': loguniform(1e-4, 1e2)
}

params_mlp = {
    'model__hidden_layer_sizes': [(100,), (100, 50), (50, 50)],
    'model__activation': ['relu', 'tanh'],
    'model__alpha': loguniform(1e-5, 1e-1),
    'model__learning_rate_init': loguniform(1e-4, 1e-2)
}

params_xgb = {
    'model__n_estimators': randint(100, 500),
    'model__learning_rate': loguniform(0.01, 0.3),
    'model__max_depth': randint(3, 10),
    'model__subsample': [0.7, 0.8, 0.9, 1.0],
    'model__colsample_bytree': [0.7, 0.8, 0.9, 1.0]
}

models_to_tune = {
    "LogisticRegression": (pipe_lr, params_lr),
    "MLP": (pipe_mlp, params_mlp),
    "XGBoost": (pipe_xgb, params_xgb)
}

for r in range(N_REPEATS):
    seed = SEED + r
    print(f"\nRunning Repeat {r+1}/{N_REPEATS} (Seed: {seed})")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, 
        test_size=TEST_SPLIT_SIZE, 
        random_state=seed, 
        stratify=y)
    
    cv_splitter = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=seed)
    for name, (pipeline, params) in models_to_tune.items():
        print(f"Tuning {name}")
        search = RandomizedSearchCV(
            estimator=pipeline,
            param_distributions=params,
            n_iter=N_ITER_SEARCH,
            cv=cv_splitter,
            scoring='f1_weighted',
            n_jobs=-1,
            random_state=seed
        )
        
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=ConvergenceWarning)
            warnings.filterwarnings("ignore", category=UserWarning)
            search.fit(X_train, y_train)

        print(f"Best params for {name}: {search.best_params_}")
        
        best_model = search.best_estimator_
        preds = best_model.predict(X_test)

        acc = accuracy_score(y_test, preds)
        f1_w = f1_score(y_test, preds, average='weighted', zero_division=0)
        f1_m = f1_score(y_test, preds, average='macro', zero_division=0)

        all_results[name]["acc"].append(acc)
        all_results[name]["f1_w"].append(f1_w)
        all_results[name]["f1_m"].append(f1_m)

print(f"Tuned Model Results (Averaged over {N_REPEATS} runs)")
print(f"(Tuning was {N_ITER_SEARCH} iterations with {CV_FOLDS}fold CV)")

for model_name, metrics in all_results.items():
    avg_acc = np.mean(metrics["acc"])
    std_acc = np.std(metrics["acc"])
    
    avg_f1_w = np.mean(metrics["f1_w"])
    std_f1_w = np.std(metrics["f1_w"])
    
    avg_f1_m = np.mean(metrics["f1_m"])
    std_f1_m = np.std(metrics["f1_m"])

    print(f"Results for {model_name}:")
    print(f"Accuracy: {avg_acc:.4f} +/- {std_acc:.4f}")
    print(f"F1 Weighted: {avg_f1_w:.4f} +/- {std_f1_w:.4f}")
    print(f"F1 Macro: {avg_f1_m:.4f} +/- {std_f1_m:.4f}")


## values below are just place holders.

In [None]:
import bioneuralnet as bnn

gnn_plot_data = {
    "Accuracy": {
        "SAGE": (0.9903, 0.0173), 
        "GCN": (0.9720, 0.0379), 
        "GAT": (0.9626, 0.0575)
    },
    "F1 Weighted": {
        "SAGE": (0.9903, 0.0173), 
        "GCN": (0.9631, 0.0557), 
        "GAT": (0.9546, 0.0732)
    },
    "F1 Macro": {
        "SAGE": (0.9876, 0.0203), 
        "GCN": (0.9150, 0.1410), 
        "GAT": (0.9107, 0.1569)
    }
}

baseline_plot_data = {
    "Accuracy": {
        "SAGE": (0.9903, 0.0173), 
        "LogReg": (0.9553, 0.0090), 
        "XGBoost": (0.9527, 0.0059), 
        "MLP": (0.9362, 0.0133)
    },
    "F1 Weighted": {
        "SAGE": (0.9903, 0.0173), 
        "LogReg": (0.9557, 0.0088), 
        "XGBoost": (0.9529, 0.0058), 
        "MLP": (0.9379, 0.0125)
    },
    "F1 Macro": {
        "SAGE": (0.9876, 0.0203), 
        "LogReg": (0.9413, 0.0124), 
        "XGBoost": (0.9451, 0.0106), 
        "MLP": (0.9138, 0.0170)
    }
}


bnn.metrics.plot_multiple_metrics(
    gnn_plot_data,
    title_map={
        "Accuracy": "GNNs Comparison: Accuracy",
        "F1 Weighted": "GNNs Comparison: F1 Weighted",
        "F1 Macro": "GNNs Comparison: F1 Macro"
    }
)

bnn.metrics.plot_multiple_metrics(
    baseline_plot_data,
    title_map={
        "Accuracy": "SAGE vs. Baselines: Accuracy",
        "F1 Weighted": "SAGE vs. Baselines: F1 Weighted",
        "F1 Macro": "SAGE vs. Baselines: F1 Macro"
    }
)

