In [1]:
import os
import sys
import logging

# Setup paths
# PROJECT_ROOT is two directory levels above the current working directory;
# it is appended to sys.path so the project-local imports below
# (models.*, utils.*) can be resolved.
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.getcwd()))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Setup directories
# Notebook-local inputs live under <cwd>/results.
RESULTS_DIR = os.path.join(os.getcwd(), 'results')

RESULTS_PATH = os.path.join(RESULTS_DIR, 'results')
MODELS_PATH = os.path.join(RESULTS_DIR, 'model')

# Project-level output folders that later cells write CSV files into.
FIN_RESULTS_OV_PATH = os.path.join(PROJECT_ROOT, 'results_modelling_ovs')
FIN_RESULTS_SPLIT_PATH = os.path.join(PROJECT_ROOT, 'results_modelling_splits')
FEAT_IMP_PATH = os.path.join(PROJECT_ROOT, 'results_modelling_feat_imp')

# DataFrame.to_csv does not create missing parent directories, so every
# folder this notebook writes to is created up front (no-op if it exists).
for _out_dir in (RESULTS_DIR, FIN_RESULTS_OV_PATH, FIN_RESULTS_SPLIT_PATH, FEAT_IMP_PATH):
    os.makedirs(_out_dir, exist_ok=True)

MODEL_NAME = 'GBoost'

# Imports
# The project-local imports must stay below the sys.path setup above.
from models.modelling_process import ModellingProcess
# NOTE(review): the wildcard import presumably provides load_model,
# feat_imp_all_models, load_all_results, load_split_results and
# combine_results used further down -- consider importing them explicitly.
from utils.analysis import *
# The explicit imports below run after the wildcard import, so these bindings
# take precedence over any same-named objects exported by utils.analysis.
import os
import pandas as pd
import re
from sklearn.pipeline import Pipeline
import pickle
from sklearn.compose import ColumnTransformer
from utils.feature_selection import FoldAwareAE


# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Notebook-wide data configuration (passed to feat_imp_all_models below).
DATA_CONFIG = {
    'use_pca': False,
    'pca_threshold': 0.85,
    'use_imputed': True,
    'select_random': False,
    'use_cohorts': False,
    'requires_ohenc': False,
}

In [None]:

def test_perf_all_models(model_path):
    """
    Evaluates the performance of all models.

    Args:
        model_path (str): Path to trained model files (.pkl).
        
    Returns: 
        pd.DataFrame: Test performance scores on the test cohorts (group B) per model-dataset combination.
    """
    files = os.listdir(model_path)
    test_perf = []
    for file in files:
        print(file)

        contains_pData = bool(re.search(r"pData", file, re.IGNORECASE))
        contains_intersection = bool(re.search(r"inter|intersection", file, re.IGNORECASE))
        contains_imputed = bool(re.search(r"imp|imputed|common", file, re.IGNORECASE))
        contains_aenc = bool(re.search(r"aenc|auto|autoenc|autoencoder", file, re.IGNORECASE))
        contains_scores = bool(re.search(r"score|scores", file, re.IGNORECASE))
        
        components = [
            "pData" if contains_pData else "",
            "Intersection" if contains_intersection else "",
            "Imputed" if contains_imputed else "",
            "AutoEncoder" if contains_aenc else "",
            "Scores" if contains_scores else ""
        ]

        dataset = "_".join(filter(None, components)) 
        
        DATA_CONFIG = {
            'use_pca': False,
            'pca_threshold': 0.85,
            'use_imputed': True,
            'select_random' : False, 
            'use_cohorts': False, 
            'requires_ohenc' : False, 
        }

        # Load data based on file type
        if contains_intersection:
            DATA_CONFIG['gene_type'] = 'intersection'
        elif contains_imputed:
            DATA_CONFIG['gene_type'] = 'common_genes'
        elif contains_aenc:
            DATA_CONFIG['gene_type'] = 'intersection'
        if contains_pData:
            DATA_CONFIG['clinical_covs'] = ["AGE", "TISSUE", "GLEASON_SCORE", 'PRE_OPERATIVE_PSA']
        if contains_pData and not contains_intersection and not contains_imputed and not contains_aenc and not contains_scores: 
            DATA_CONFIG['only_pData'] = True
            DATA_CONFIG['gene_type'] = None
            
        model = load_model(os.path.join(model_path, file))  
        mp = ModellingProcess()
        mp.prepare_test_data(DATA_CONFIG, PROJECT_ROOT)
        groups = set(mp.test_groups)
        groups = sorted(groups)
        X_cos, y_cos = mp.prepare_test_cohort_data(DATA_CONFIG, PROJECT_ROOT, groups)
        
        if contains_aenc: 
            if contains_pData:                
                pdata_cols = ['TISSUE', 'AGE',
                    'GLEASON_SCORE', 'PRE_OPERATIVE_PSA']
                exprs_cols =  list(set(X_cos[0].columns) - set(pdata_cols))
                exprs_cols = sorted(exprs_cols)
            else: 
                pdata_cols = []
                exprs_cols =  list(set(X_cos[0].columns) - set(pdata_cols))
                exprs_cols = sorted(exprs_cols)
                

            ae = FoldAwareAE(testing = True)            
            preprocessor = ColumnTransformer(
                transformers=[
                    ('feature_selection', ae, exprs_cols),  # Apply feature selection
                    ('other_features', 'passthrough', pdata_cols)         # Pass through other columns
                ]
            )
            preprocessor.fit(X=X_cos[0])
            pipe_steps = [
                ('preprocessor', preprocessor),
                ('model', model)] 
            
            model = Pipeline(pipe_steps)
           
        ci1 = model.score(X_cos[0], y_cos[0])
        ci2 = model.score(X_cos[1], y_cos[1])
        
        result = {
            'model' : file.replace(".pkl", ""), 
            'model_class' : 'GBoost', 
            'dataset' : dataset, 
            'ci_coh1' : ci1, 
            'ci_coh2' : ci2
        }
        test_perf.append(result)
    return pd.DataFrame(test_perf)

In [3]:
# Compute feature importances via feat_imp_all_models (using the global
# MODELS_PATH, MODEL_NAME and DATA_CONFIG) and write them as CSV to FEAT_IMP_PATH.
# NOTE(review): the file name says 'cBoost' while MODEL_NAME and the other
# exports in this notebook use 'GBoost' -- confirm which name is intended.
imp = feat_imp_all_models(MODELS_PATH, MODEL_NAME, DATA_CONFIG)
imp.to_csv(os.path.join(FEAT_IMP_PATH, 'feat_imp_cBoost.csv'))

In [8]:
# Load the stored results from RESULTS_PATH, then score every saved model in
# MODELS_PATH on the test cohorts (slow: data is re-loaded for each model file).
results = load_all_results(RESULTS_PATH)
test_perf = test_perf_all_models(MODELS_PATH)

cboost_autoencoder_paper.pkl


2025-02-11 20:29:50,525 - INFO - Loaded data: 496 samples, 13214 features


['test_cohort_1', 'test_cohort_2']


2025-02-11 20:31:17,767 - INFO - Loaded data: 332 samples, 13214 features
2025-02-11 20:31:50,924 - INFO - Loaded data: 164 samples, 13214 features
  self.model.load_state_dict(torch.load(model_path + '.pth', map_location=torch.device('cpu')))


fit_transform
<class 'pandas.core.frame.DataFrame'>
Index: 332 entries, test_cohort_1_patient_1 to test_cohort_1_patient_332
Columns: 13214 entries, ENSG00000000003 to ENSG00000282608
dtypes: float64(13214)
memory usage: 33.5+ MB
None
cboost_autoencoder_pData_paper.pkl


2025-02-11 20:32:23,325 - INFO - Found clinical data specification
2025-02-11 20:32:23,457 - INFO - Loaded data: 496 samples, 13218 features


['test_cohort_1', 'test_cohort_2']


2025-02-11 20:32:55,306 - INFO - Found clinical data specification
2025-02-11 20:32:55,329 - INFO - Loaded data: 332 samples, 13218 features
2025-02-11 20:33:26,191 - INFO - Found clinical data specification
2025-02-11 20:33:26,203 - INFO - Loaded data: 164 samples, 13218 features
  self.model.load_state_dict(torch.load(model_path + '.pth', map_location=torch.device('cpu')))


fit_transform
<class 'pandas.core.frame.DataFrame'>
Index: 332 entries, test_cohort_1_patient_1 to test_cohort_1_patient_332
Columns: 13214 entries, ENSG00000000003 to ENSG00000282608
dtypes: float64(13214)
memory usage: 33.5+ MB
None
cboost_common_genes_only.pkl


2025-02-11 20:34:28,200 - INFO - Loaded data: 496 samples, 15495 features


['test_cohort_1', 'test_cohort_2']


2025-02-11 20:35:21,555 - INFO - Loaded data: 332 samples, 15495 features
2025-02-11 20:36:11,915 - INFO - Loaded data: 164 samples, 15495 features


cboost_imp_pData.pkl


2025-02-11 20:37:06,438 - INFO - Found clinical data specification
2025-02-11 20:37:06,642 - INFO - Loaded data: 496 samples, 15499 features


['test_cohort_1', 'test_cohort_2']


2025-02-11 20:38:04,391 - INFO - Found clinical data specification
2025-02-11 20:38:04,430 - INFO - Loaded data: 332 samples, 15499 features
2025-02-11 20:38:57,009 - INFO - Found clinical data specification
2025-02-11 20:38:57,022 - INFO - Loaded data: 164 samples, 15499 features


cboost_inter_only.pkl


2025-02-11 20:39:33,563 - INFO - Loaded data: 496 samples, 13214 features


['test_cohort_1', 'test_cohort_2']


2025-02-11 20:40:04,799 - INFO - Loaded data: 332 samples, 13214 features
2025-02-11 20:40:39,353 - INFO - Loaded data: 164 samples, 13214 features


cboost_pData.pkl


2025-02-11 20:41:13,393 - INFO - Found clinical data specification
2025-02-11 20:41:13,529 - INFO - Only uses pData
2025-02-11 20:41:13,535 - INFO - Loaded data: 496 samples, 4 features


['test_cohort_1', 'test_cohort_2']


2025-02-11 20:41:45,738 - INFO - Found clinical data specification
2025-02-11 20:41:45,747 - INFO - Only uses pData
2025-02-11 20:41:45,753 - INFO - Loaded data: 332 samples, 4 features
2025-02-11 20:42:18,079 - INFO - Found clinical data specification
2025-02-11 20:42:18,084 - INFO - Only uses pData
2025-02-11 20:42:18,087 - INFO - Loaded data: 164 samples, 4 features


cboost_really_intersect_pData.pkl


2025-02-11 20:42:50,589 - INFO - Found clinical data specification
2025-02-11 20:42:50,745 - INFO - Loaded data: 496 samples, 13218 features


['test_cohort_1', 'test_cohort_2']


2025-02-11 20:43:23,797 - INFO - Found clinical data specification
2025-02-11 20:43:23,816 - INFO - Loaded data: 332 samples, 13218 features
2025-02-11 20:43:56,017 - INFO - Found clinical data specification
2025-02-11 20:43:56,031 - INFO - Loaded data: 164 samples, 13218 features


In [9]:
# Display the loaded results table (one row per model with mean and sd).
results

Unnamed: 0,model,mean,sd
0,cboost_autoencoder_pData_paper,0.671056,0.045585
1,cboost_autoencoder_paper,0.649736,0.085648
2,cboost_common_genes_only,0.655619,0.05826
3,cboost_imp_pData,0.673758,0.066102
4,cboost_inter_only,0.692797,0.061404
5,cboost_pData,0.68691,0.086168
6,cboost_really_intersect_pData,0.66733,0.06327


In [10]:
# Display the per-model scores on the two test cohorts (ci_coh1, ci_coh2).
test_perf

Unnamed: 0,model,model_class,dataset,ci_coh1,ci_coh2
0,cboost_autoencoder_paper,GBoost,AutoEncoder,0.704491,0.798592
1,cboost_autoencoder_pData_paper,GBoost,pData_AutoEncoder,0.662955,0.680135
2,cboost_common_genes_only,GBoost,Imputed,0.734893,0.803489
3,cboost_imp_pData,GBoost,pData_Imputed,0.741649,0.803489
4,cboost_inter_only,GBoost,Intersection,0.706243,0.81359
5,cboost_pData,GBoost,pData,0.695859,0.774564
6,cboost_really_intersect_pData,GBoost,pData_Intersection,0.736645,0.821549


In [11]:
# Load the per-cohort split results for MODEL_NAME from RESULTS_PATH and
# write them as CSV to FIN_RESULTS_SPLIT_PATH.
split_results = load_split_results(RESULTS_PATH, MODEL_NAME)
split_results.to_csv(os.path.join(FIN_RESULTS_SPLIT_PATH, 'splits_GBoost.csv'))

In [12]:
# Display the split results (one row per model and test cohort).
split_results

Unnamed: 0,model_class,model,test_cohort,ci,dataset
0,GBoost,cboost_autoencoder_paper,Atlanta_2014_Long,0.586994,AutoEncoder
1,GBoost,cboost_autoencoder_paper,Belfast_2018_Jain,0.591925,AutoEncoder
2,GBoost,cboost_autoencoder_paper,CPC_GENE_2017_Fraser,0.525988,AutoEncoder
3,GBoost,cboost_autoencoder_paper,CPGEA_2020_Li,0.639076,AutoEncoder
4,GBoost,cboost_autoencoder_paper,CamCap_2016_Ross_Adams,0.805142,AutoEncoder
...,...,...,...,...,...
4,GBoost,cboost_really_intersect_pData,CamCap_2016_Ross_Adams,0.734100,pData_Intersection
5,GBoost,cboost_really_intersect_pData,CancerMap_2017_Luca,0.616902,pData_Intersection
6,GBoost,cboost_really_intersect_pData,DKFZ_2018_Gerhauser,0.788686,pData_Intersection
7,GBoost,cboost_really_intersect_pData,MSKCC_2010_Taylor,0.690217,pData_Intersection


In [13]:
# Combine the loaded results with the test-cohort scores via combine_results
# and write the overview as CSV to FIN_RESULTS_OV_PATH.
results_combined = combine_results(results, test_perf)
results_combined.to_csv(os.path.join(FIN_RESULTS_OV_PATH, 'ov_GBoost.csv'))

In [14]:
# Display the combined overview (mean/sd plus the two test-cohort scores per model).
results_combined

Unnamed: 0,model,mean,sd,model_class,dataset,ci_coh1,ci_coh2
0,cboost_autoencoder_pData_paper,0.671056,0.045585,GBoost,pData_AutoEncoder,0.662955,0.680135
1,cboost_autoencoder_paper,0.649736,0.085648,GBoost,AutoEncoder,0.704491,0.798592
2,cboost_common_genes_only,0.655619,0.05826,GBoost,Imputed,0.734893,0.803489
3,cboost_imp_pData,0.673758,0.066102,GBoost,pData_Imputed,0.741649,0.803489
4,cboost_inter_only,0.692797,0.061404,GBoost,Intersection,0.706243,0.81359
5,cboost_pData,0.68691,0.086168,GBoost,pData,0.695859,0.774564
6,cboost_really_intersect_pData,0.66733,0.06327,GBoost,pData_Intersection,0.736645,0.821549
