In [1]:
import os
import sys
import logging

# Project root is two directory levels above the notebook's working directory;
# it goes on sys.path so the project-local packages imported below can be found.
PROJECT_ROOT = os.path.normpath(os.path.join(os.getcwd(), os.pardir, os.pardir))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Notebook-local results folder (created if missing) and the paths inside it
RESULTS_DIR = os.path.join(os.getcwd(), 'results')
os.makedirs(RESULTS_DIR, exist_ok=True)
RESULTS_PATH, MODELS_PATH = (
    os.path.join(RESULTS_DIR, sub) for sub in ('results', 'model')
)

# Project-level locations the CSV tables of this notebook are written to
FIN_RESULTS_OV_PATH, FIN_RESULTS_SPLIT_PATH, FEAT_IMP_PATH = (
    os.path.join(PROJECT_ROOT, folder)
    for folder in (
        'results_modelling_ovs',
        'results_modelling_splits',
        'results_modelling_feat_imp',
    )
)

# Model label passed to the result-loading helpers further down
MODEL_NAME = 'GBoost'

# Imports
from models.modelling_process import ModellingProcess
from utils.analysis import * 
import os
import pandas as pd
import re
from sklearn.pipeline import Pipeline
import pickle
from sklearn.compose import ColumnTransformer
from utils.feature_selection import FoldAwareAE


# Setup logging: INFO level, "<timestamp> - <level> - <message>" lines
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Notebook-level data configuration
DATA_CONFIG = {
    'use_pca': False,
    'pca_threshold': 0.85,
    'use_imputed': True,
    'select_random': False,
    'use_cohorts': False,
    # Must be adjusted per algorithm: CatBoost is essentially the only one
    # that does not need it; set to True for the others.
    'requires_ohenc': False,
}

In [2]:
# def load_feat_imp(model):
#     imps = model.model.get_feature_importance()
    
#     df = pd.DataFrame({
#     'feature': model.model.feature_names_,
#     'value': imps
#     })
    
#     df = df.sort_values(by = "value", ascending=False)
#     df = df[df.loc[: , 'value'] > 0]
#     return df

# def feat_imp_all_models(model_path, model_name, DATA_CONFIG): 
#     files = os.listdir(model_path)
#     imps_list = []
    
#     for file in files:
#         print(file)
#         contains_pData = bool(re.search(r"pData", file, re.IGNORECASE))
#         contains_intersection = bool(re.search(r"inter|int|intersection", file, re.IGNORECASE))
#         contains_imputed = bool(re.search(r"imp|imputed|common", file, re.IGNORECASE))
#         contains_aenc = bool(re.search(r"aenc|auto|autoenc|autoencoder", file, re.IGNORECASE))
#         contains_scores = bool(re.search(r"score|scores", file, re.IGNORECASE))
        
#         DATA_CONFIG = {
#             'use_pca': False,
#             'pca_threshold': 0.85,
#             'use_imputed': True,
#             'select_random' : False, 
#             'use_cohorts': False, 
#             # Must be adjusted per algorithm; CatBoost is essentially the only one that does not need it, set to True for the others
#             'requires_ohenc' : False, 
#         }

#         # Load data based on file type
#         if contains_intersection:
#             DATA_CONFIG['gene_type'] = 'intersection'
#         elif contains_imputed:
#             DATA_CONFIG['gene_type'] = 'common_genes'
#         elif contains_aenc:
#             DATA_CONFIG['gene_type'] = 'intersection'
#         elif contains_scores: 
#             DATA_CONFIG['gene_type'] = 'scores'
#         if contains_pData:
#             DATA_CONFIG['clinical_covs'] = ["AGE", "TISSUE", "GLEASON_SCORE", 'PRE_OPERATIVE_PSA']
#         if contains_pData and not contains_intersection and not contains_imputed and not contains_aenc and not contains_scores: 
#             DATA_CONFIG['only_pData'] = True
#             DATA_CONFIG['gene_type'] = None
            
#         model = load_model(os.path.join(model_path, file))  
        
#         if contains_aenc: 
#             pass
#         else: 
#             components = [
#                 "pData" if contains_pData else "",
#                 "Intersection" if contains_intersection else "",
#                 "Imputed" if contains_imputed else "",
#                 "AutoEncoder" if contains_aenc else "",
#                 "Scores" if contains_scores else ""
#             ]

#             # Join non-empty components into a single string with a separator (e.g., "_")
#             dataset = "_".join(filter(None, components))         
#             imps = load_feat_imp(model)
#             imps.loc[:, 'model'] =  model_name
#             imps.loc[:, 'dataset'] = dataset
#             imps_list.append(imps)
        
#     df = pd.concat(imps_list, axis = 0)
#     return df
    

def test_perf_all_models(model_path):
    """
    Evaluates the performance of all models stored in a specified directory.

    Args:
        model_path (str): Path to the directory containing trained model files (.pkl).
        
    Returns:
        pd.DataFrame: A DataFrame containing model names and their test performance scores on two test cohorts.
    """
    files = os.listdir(model_path)
    test_perf = []
    for file in files:
        print(file)

        contains_pData = bool(re.search(r"pData", file, re.IGNORECASE))
        contains_intersection = bool(re.search(r"inter|intersection", file, re.IGNORECASE))
        contains_imputed = bool(re.search(r"imp|imputed|common", file, re.IGNORECASE))
        contains_aenc = bool(re.search(r"aenc|auto|autoenc|autoencoder", file, re.IGNORECASE))
        contains_scores = bool(re.search(r"score|scores", file, re.IGNORECASE))
        
        components = [
            "pData" if contains_pData else "",
            "Intersection" if contains_intersection else "",
            "Imputed" if contains_imputed else "",
            "AutoEncoder" if contains_aenc else "",
            "Scores" if contains_scores else ""
        ]

        dataset = "_".join(filter(None, components)) 
        
        DATA_CONFIG = {
            'use_pca': False,
            'pca_threshold': 0.85,
            'use_imputed': True,
            'select_random' : False, 
            'use_cohorts': False, 
            'requires_ohenc' : False, 
        }

        # Load data based on file type
        if contains_intersection:
            DATA_CONFIG['gene_type'] = 'intersection'
        elif contains_imputed:
            DATA_CONFIG['gene_type'] = 'common_genes'
        elif contains_aenc:
            DATA_CONFIG['gene_type'] = 'intersection'
        if contains_pData:
            DATA_CONFIG['clinical_covs'] = ["AGE", "TISSUE", "GLEASON_SCORE", 'PRE_OPERATIVE_PSA']
        if contains_pData and not contains_intersection and not contains_imputed and not contains_aenc and not contains_scores: 
            DATA_CONFIG['only_pData'] = True
            DATA_CONFIG['gene_type'] = None
            
        model = load_model(os.path.join(model_path, file))  
        mp = ModellingProcess()
        mp.prepare_test_data(DATA_CONFIG, PROJECT_ROOT)
        test_groups = ['test_cohort_2', 'test_cohort_1']
        X_cos, y_cos = mp.prepare_test_cohort_data(DATA_CONFIG, PROJECT_ROOT, test_groups)
        
        if contains_aenc: 
            if contains_pData:                
                pdata_cols = ['TISSUE', 'AGE',
                    'GLEASON_SCORE', 'PRE_OPERATIVE_PSA']
                exprs_cols =  list(set(X_cos[0].columns) - set(pdata_cols))
                exprs_cols = sorted(exprs_cols)
            else: 
                pdata_cols = []
                exprs_cols =  list(set(X_cos[0].columns) - set(pdata_cols))
                exprs_cols = sorted(exprs_cols)
                

            ae = FoldAwareAE(testing = True)            
            preprocessor = ColumnTransformer(
                transformers=[
                    ('feature_selection', ae, exprs_cols),  # Apply feature selection
                    ('other_features', 'passthrough', pdata_cols)         # Pass through other columns
                ]
            )
            preprocessor.fit(X=X_cos[0])
            pipe_steps = [
                ('preprocessor', preprocessor),
                ('model', model)] 
            
            model = Pipeline(pipe_steps)
           
        ci1 = model.score(X_cos[0], y_cos[0])
        ci2 = model.score(X_cos[1], y_cos[1])
        
        result = {
            'model' : file.replace(".pkl", ""), 
            'model_class' : 'GBoost', 
            'dataset' : dataset, 
            'ci_coh1' : ci2, 
            'ci_coh2' : ci1
        }
        test_perf.append(result)
    return pd.DataFrame(test_perf)

In [3]:
# Call feat_imp_all_models for the models under MODELS_PATH and write the
# returned table to a CSV file in FEAT_IMP_PATH.
# NOTE(review): the only definition of feat_imp_all_models visible in this
# notebook is commented out, so this cell runs only if the name is provided
# elsewhere (e.g. by `from utils.analysis import *`) — confirm.
# NOTE(review): the file name says "cBoost" while MODEL_NAME is 'GBoost' and
# the other outputs are named "*_GBoost.csv" — confirm this is intended.
imp = feat_imp_all_models(MODELS_PATH, MODEL_NAME, DATA_CONFIG)
imp.to_csv(os.path.join(FEAT_IMP_PATH, 'feat_imp_cBoost.csv'))

In [12]:
# Load the stored results from RESULTS_PATH and score every saved model in
# MODELS_PATH on the two test cohorts.
# load_all_results is not defined in this notebook; presumably it comes from
# `from utils.analysis import *` — confirm.
results = load_all_results(RESULTS_PATH)
test_perf = test_perf_all_models(MODELS_PATH)

In [13]:
# Show the table returned by load_all_results(RESULTS_PATH)
results

Unnamed: 0,model,mean,sd
0,cboost_autoencoder_pData_paper,0.671056,0.045585
1,cboost_autoencoder_paper,0.649736,0.085648
2,cboost_common_genes_only,0.655619,0.05826
3,cboost_imp_pData,0.673758,0.066102
4,cboost_inter_only,0.692797,0.061404
5,cboost_pData,0.68691,0.086168
6,cboost_really_intersect_pData,0.66733,0.06327


In [14]:
# Show the per-model test-cohort scores returned by test_perf_all_models
test_perf

Unnamed: 0,model,model_class,dataset,ci_coh1,ci_coh2
0,cboost_autoencoder_paper,GBoost,AutoEncoder,0.704491,0.798592
1,cboost_autoencoder_pData_paper,GBoost,pData_AutoEncoder,0.662955,0.680135
2,cboost_common_genes_only,GBoost,Imputed,0.734893,0.803489
3,cboost_imp_pData,GBoost,pData_Imputed,0.741649,0.803489
4,cboost_inter_only,GBoost,Intersection,0.706243,0.81359
5,cboost_pData,GBoost,pData,0.695859,0.774564
6,cboost_really_intersect_pData,GBoost,pData_Intersection,0.736645,0.821549


In [15]:
# Load the per-split results for MODEL_NAME from RESULTS_PATH and save them as
# CSV in FIN_RESULTS_SPLIT_PATH.
# load_split_results is not defined in this notebook; presumably it comes from
# `from utils.analysis import *` — confirm.
split_results = load_split_results(RESULTS_PATH, MODEL_NAME)
split_results.to_csv(os.path.join(FIN_RESULTS_SPLIT_PATH, 'splits_GBoost.csv'))

In [16]:
# Show the table returned by load_split_results
split_results

Unnamed: 0,model_class,model,test_cohort,ci,dataset
0,GBoost,cboost_autoencoder_paper,Atlanta_2014_Long,0.586994,AutoEncoder
1,GBoost,cboost_autoencoder_paper,Belfast_2018_Jain,0.591925,AutoEncoder
2,GBoost,cboost_autoencoder_paper,CPC_GENE_2017_Fraser,0.525988,AutoEncoder
3,GBoost,cboost_autoencoder_paper,CPGEA_2020_Li,0.639076,AutoEncoder
4,GBoost,cboost_autoencoder_paper,CamCap_2016_Ross_Adams,0.805142,AutoEncoder
...,...,...,...,...,...
4,GBoost,cboost_really_intersect_pData,CamCap_2016_Ross_Adams,0.734100,pData_Intersection
5,GBoost,cboost_really_intersect_pData,CancerMap_2017_Luca,0.616902,pData_Intersection
6,GBoost,cboost_really_intersect_pData,DKFZ_2018_Gerhauser,0.788686,pData_Intersection
7,GBoost,cboost_really_intersect_pData,MSKCC_2010_Taylor,0.690217,pData_Intersection


In [17]:
# Combine the loaded results with the test-cohort scores and save the combined
# table as CSV in FIN_RESULTS_OV_PATH.
# combine_results is not defined in this notebook; presumably it comes from
# `from utils.analysis import *` — confirm.
results_combined = combine_results(results, test_perf)
results_combined.to_csv(os.path.join(FIN_RESULTS_OV_PATH, 'ov_GBoost.csv'))

In [18]:
# Show the combined table returned by combine_results(results, test_perf)
results_combined

Unnamed: 0,model,mean,sd,model_class,dataset,ci_coh1,ci_coh2
0,cboost_autoencoder_pData_paper,0.671056,0.045585,GBoost,pData_AutoEncoder,0.662955,0.680135
1,cboost_autoencoder_paper,0.649736,0.085648,GBoost,AutoEncoder,0.704491,0.798592
2,cboost_common_genes_only,0.655619,0.05826,GBoost,Imputed,0.734893,0.803489
3,cboost_imp_pData,0.673758,0.066102,GBoost,pData_Imputed,0.741649,0.803489
4,cboost_inter_only,0.692797,0.061404,GBoost,Intersection,0.706243,0.81359
5,cboost_pData,0.68691,0.086168,GBoost,pData,0.695859,0.774564
6,cboost_really_intersect_pData,0.66733,0.06327,GBoost,pData_Intersection,0.736645,0.821549


In [11]:
#model_path = os.path.join(RESULTS_DIR, 'model', 'cboost_inter_only.pkl')
#feat_imp = load_feat_imp(model_path)
#feat_imp.to_csv(os.path.join(FEAT_IMP_PATH, 'feat_imp_cBoost.csv'))