In [1]:
import os
import sys
import logging

# Project root: two directory levels above the notebook's working directory.
# It is appended to sys.path (once) so the project packages can be imported.
_notebook_dir = os.getcwd()
PROJECT_ROOT = os.path.dirname(os.path.dirname(_notebook_dir))
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path.append(PROJECT_ROOT)

# Notebook-local results directory (created on demand) and its sub-paths.
RESULTS_DIR = os.path.join(_notebook_dir, 'results')
os.makedirs(RESULTS_DIR, exist_ok=True)
RESULTS_PATH, MODELS_PATH = (
    os.path.join(RESULTS_DIR, sub_dir) for sub_dir in ('results', 'model')
)

# Project-level export directories for overview, per-split and feature-importance tables.
FIN_RESULTS_OV_PATH, FIN_RESULTS_SPLIT_PATH, FEAT_IMP_PATH = (
    os.path.join(PROJECT_ROOT, export_dir)
    for export_dir in (
        'results_modelling_ovs',
        'results_modelling_splits',
        'results_modelling_feat_imp',
    )
)

# Imports
from models.modelling_process import ModellingProcess
from utils.analysis import * 

# Configure root logging at INFO level with timestamped messages and create
# the logger used by this notebook.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)

In [2]:
import os
import pandas as pd
import pickle
import ast
import json
from sklearn.compose import ColumnTransformer
from utils.feature_selection import FoldAwareSelectFromModel, FoldAwareAE

# TODO: build a DataFrame: column 1: feature name, column 2: value
# -------------------- functions to load feat. imp from model
def load_feat_imp(model):
    """Return the strictly positive feature importances of a fitted model.

    Args:
        model: Wrapper object exposing the fitted estimator as ``model.model``.
            The estimator must provide ``get_feature_importance()`` and a
            ``feature_names_`` attribute (the CatBoost-style API; models
            without a project wrapper class need their library-specific call).

    Returns:
        pandas.DataFrame with the columns ``feature`` and ``value``, sorted by
        ``value`` in descending order and restricted to rows with ``value > 0``.
        The original positional index of each feature is kept.
    """
    estimator = model.model
    scores = estimator.get_feature_importance()

    importance_table = pd.DataFrame({
        'feature': estimator.feature_names_,
        'value': scores,
    })

    ranked = importance_table.sort_values(by="value", ascending=False)
    return ranked[ranked['value'] > 0]


def feat_imp_all_models(model_path):
    """Collect feature importances for every pickled model in a directory.

    Each file name is matched case-insensitively against keyword patterns to
    derive a dataset label (e.g. ``pData_Intersection``). Files whose name
    marks them as autoencoder models are skipped without being unpickled,
    because no feature importances are extracted for them.

    Args:
        model_path: Directory containing the pickled models.

    Returns:
        pandas.DataFrame with the columns ``feature``, ``value``, ``model``
        (always ``'cBoost'``) and ``dataset``. If no file yields importances
        (empty directory or only autoencoder models) an empty frame with
        these columns is returned instead of raising.
    """
    imps_list = []

    # Sorted so the row order of the result does not depend on the
    # arbitrary order returned by os.listdir.
    for file in sorted(os.listdir(model_path)):
        print(file)
        contains_pData = bool(re.search(r"pData", file, re.IGNORECASE))
        contains_intersection = bool(re.search(r"inter|intersection", file, re.IGNORECASE))
        contains_imputed = bool(re.search(r"imp|imputed|common", file, re.IGNORECASE))
        contains_aenc = bool(re.search(r"aenc|auto|autoenc|autoencoder", file, re.IGNORECASE))
        contains_scores = bool(re.search(r"score|scores", file, re.IGNORECASE))

        if contains_aenc:
            # Autoencoder-based models are not handled here; skip them before
            # paying the cost of loading the pickle.
            continue

        components = [
            "pData" if contains_pData else "",
            "Intersection" if contains_intersection else "",
            "Imputed" if contains_imputed else "",
            "Scores" if contains_scores else "",
        ]
        # Join non-empty components into a single string with "_" as separator
        dataset = "_".join(filter(None, components))

        model = load_model(os.path.join(model_path, file))
        # assign() returns a new frame, which avoids SettingWithCopyWarning on
        # the filtered frame returned by load_feat_imp.
        imps = load_feat_imp(model).assign(model='cBoost', dataset=dataset)
        imps_list.append(imps)

    if not imps_list:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame(columns=['feature', 'value', 'model', 'dataset'])

    return pd.concat(imps_list, axis = 0)


def load_model(model_path):
    """Unpickle and return the object stored at ``model_path``.

    Note: ``pickle.load`` can execute arbitrary code, so only use this on
    model files from a trusted source.
    """
    with open(model_path, 'rb') as handle:
        return pickle.load(handle)
    

# --------------------- get test perf 
import os
import pandas as pd
import numpy as np
import re
from sklearn.pipeline import Pipeline

# Function to test performance of all models
def test_perf_all_models(model_path, RESULTS_DIR):
    files = os.listdir(model_path)
    test_perf = []
    for file in files:
        print(file)

        contains_pData = bool(re.search(r"pData", file, re.IGNORECASE))
        contains_intersection = bool(re.search(r"inter|intersection", file, re.IGNORECASE))
        contains_imputed = bool(re.search(r"imp|imputed|common", file, re.IGNORECASE))
        contains_aenc = bool(re.search(r"aenc|auto|autoenc|autoencoder", file, re.IGNORECASE))
        contains_scores = bool(re.search(r"score|scores", file, re.IGNORECASE))
        
        DATA_CONFIG = {
            'use_pca': False,
            'pca_threshold': 0.85,
            'use_imputed': True,
            'select_random' : False, 
            'use_cohorts': False, 
            # Muss je nach algo angepasst werden; CatBoost eig der einzige der keines braucht, bei den anderen auf True setzen
            'requires_ohenc' : False, 
        }

        # Load data based on file type
        if contains_intersection:
            DATA_CONFIG['gene_type'] = 'intersection'
        elif contains_imputed:
            DATA_CONFIG['gene_type'] = 'common_genes'
        elif contains_aenc:
            # DATA_CONFIG['gene_type'] = 'autoencoder'
            DATA_CONFIG['gene_type'] = 'intersection'
        elif contains_scores: 
            DATA_CONFIG['gene_type'] = 'scores'
        if contains_pData:
            DATA_CONFIG['clinical_covs'] = ["AGE", "TISSUE", "GLEASON_SCORE", 'PRE_OPERATIVE_PSA']
        if contains_pData and not contains_intersection and not contains_imputed and not contains_aenc and not contains_scores: 
            DATA_CONFIG['only_pData'] = True
            DATA_CONFIG['gene_type'] = None
            
        model = load_model(os.path.join(model_path, file))  
        
        mp = ModellingProcess()
        mp.prepare_test_data(DATA_CONFIG, PROJECT_ROOT)
        X_cos, y_cos = mp.prepare_test_cohort_data(DATA_CONFIG, PROJECT_ROOT, set(mp.test_groups))
        
        if contains_aenc: 
            if contains_pData:
                ae_path = os.path.join(RESULTS_DIR, 'pipe')
                
                pdata_cols = ['TISSUE', 'AGE',
                    'GLEASON_SCORE', 'PRE_OPERATIVE_PSA']
                exprs_cols =  list(set(X_cos[0].columns) - set(pdata_cols))
            else: 
                pdata_cols = []
                exprs_cols =  list(set(X_cos[0].columns) - set(pdata_cols))

            ae = FoldAwareAE(testing = True)
            #ae.estimator = load_model(os.path.join(ae, file))  
            preprocessor = ColumnTransformer(
                transformers=[
                    ('feature_selection', ae, exprs_cols),  # Apply feature selection
                    ('other_features', 'passthrough', pdata_cols)         # Pass through other columns
                ]
            )
            
            preprocessor.fit(X=X_cos[0])
            # Define the pipeline
            pipe_steps = [
                ('preprocessor', preprocessor),
                ('model', model)] 
            
            model = Pipeline(pipe_steps)
        
        # print(model)
        # print(X_cos[0].info())
        # print(model.cat_features)    
        ci1 = model.score(X_cos[0], y_cos[0])
        ci2 = model.score(X_cos[1], y_cos[1])
        print(ci1)         
        print(ci2)
        
        result = {
            'model' : file.replace(".pkl", ""), 
            'ci_coh1' : ci2, 
            'ci_coh2' : ci1
        }
        test_perf.append(result)

    return pd.DataFrame(test_perf)

In [3]:
# Collect the feature importances of all models and export them as CSV.
imp = feat_imp_all_models(MODELS_PATH)
# DataFrame.to_csv does not create missing parent directories, so make sure
# the export directory exists before writing.
os.makedirs(FEAT_IMP_PATH, exist_ok=True)
imp.to_csv(os.path.join(FEAT_IMP_PATH, 'feat_imp_cBoost.csv'))

cboost_autoencoder_paper.pkl
cboost_autoencoder_pData_paper.pkl
cboost_common_genes_only.pkl
cboost_imp_pData.pkl
cboost_inter_only.pkl
cboost_pData.pkl
cboost_really_intersect_pData.pkl


In [4]:
# Load the stored results from RESULTS_PATH (load_all_results is not defined in
# this notebook; presumably it comes from the `utils.analysis` star import).
results = load_all_results(RESULTS_PATH)
# Score every pickled model found in MODELS_PATH on the test cohorts.
test_perf = test_perf_all_models(MODELS_PATH, RESULTS_DIR)

cboost_autoencoder_paper.pkl


2025-01-20 17:30:27,197 - INFO - Loaded data: 496 samples, 13214 features


{'test_cohort_1', 'test_cohort_2'}


2025-01-20 17:31:16,282 - INFO - Loaded data: 332 samples, 13214 features
2025-01-20 17:32:00,157 - INFO - Loaded data: 164 samples, 13214 features
  self.model.load_state_dict(torch.load(model_path + '.pth', map_location=torch.device('cpu')))


fit_transform
<class 'pandas.core.frame.DataFrame'>
Index: 332 entries, test_cohort_1_patient_1 to test_cohort_1_patient_332
Columns: 13214 entries, ENSG00000101558 to ENSG00000001461
dtypes: float64(13214)
memory usage: 33.5+ MB
None
0.46809708494933067
0.6147842056932966
cboost_autoencoder_pData_paper.pkl


2025-01-20 17:32:41,837 - INFO - Found clinical data specification
2025-01-20 17:32:42,014 - INFO - Loaded data: 496 samples, 13218 features


{'test_cohort_1', 'test_cohort_2'}


2025-01-20 17:33:27,424 - INFO - Found clinical data specification
2025-01-20 17:33:27,443 - INFO - Loaded data: 332 samples, 13218 features
2025-01-20 17:33:57,110 - INFO - Found clinical data specification
2025-01-20 17:33:57,122 - INFO - Loaded data: 164 samples, 13218 features
  self.model.load_state_dict(torch.load(model_path + '.pth', map_location=torch.device('cpu')))


fit_transform
<class 'pandas.core.frame.DataFrame'>
Index: 332 entries, test_cohort_1_patient_1 to test_cohort_1_patient_332
Columns: 13214 entries, ENSG00000101558 to ENSG00000001461
dtypes: float64(13214)
memory usage: 33.5+ MB
None
0.6470036281746528
0.7915518824609734
cboost_common_genes_only.pkl


2025-01-20 17:34:36,658 - INFO - Loaded data: 496 samples, 15495 features


{'test_cohort_1', 'test_cohort_2'}


2025-01-20 17:35:20,962 - INFO - Loaded data: 332 samples, 15495 features
2025-01-20 17:35:56,636 - INFO - Loaded data: 164 samples, 15495 features


0.7348930314024772
0.8034894398530762
cboost_imp_pData.pkl


2025-01-20 17:36:35,374 - INFO - Found clinical data specification
2025-01-20 17:36:35,529 - INFO - Loaded data: 496 samples, 15499 features


{'test_cohort_1', 'test_cohort_2'}


2025-01-20 17:37:11,524 - INFO - Found clinical data specification
2025-01-20 17:37:11,547 - INFO - Loaded data: 332 samples, 15499 features
2025-01-20 17:37:44,633 - INFO - Found clinical data specification
2025-01-20 17:37:44,661 - INFO - Loaded data: 164 samples, 15499 features


0.7416489428249718
0.8034894398530762
cboost_inter_only.pkl


2025-01-20 17:38:09,781 - INFO - Loaded data: 496 samples, 13214 features


{'test_cohort_1', 'test_cohort_2'}


2025-01-20 17:38:34,809 - INFO - Loaded data: 332 samples, 13214 features
2025-01-20 17:39:00,996 - INFO - Loaded data: 164 samples, 13214 features


0.7062429625922683
0.8135904499540864
cboost_pData.pkl


2025-01-20 17:39:26,068 - INFO - Found clinical data specification
2025-01-20 17:39:26,156 - INFO - Only uses pData
2025-01-20 17:39:26,162 - INFO - Loaded data: 496 samples, 4 features


{'test_cohort_1', 'test_cohort_2'}


2025-01-20 17:39:46,856 - INFO - Found clinical data specification
2025-01-20 17:39:46,859 - INFO - Only uses pData
2025-01-20 17:39:46,862 - INFO - Loaded data: 332 samples, 4 features
2025-01-20 17:40:07,126 - INFO - Found clinical data specification
2025-01-20 17:40:07,129 - INFO - Only uses pData
2025-01-20 17:40:07,131 - INFO - Loaded data: 164 samples, 4 features


0.6958588765169523
0.7745638200183654
cboost_really_intersect_pData.pkl


2025-01-20 17:40:29,506 - INFO - Found clinical data specification
2025-01-20 17:40:29,621 - INFO - Loaded data: 496 samples, 13218 features


{'test_cohort_1', 'test_cohort_2'}


2025-01-20 17:41:01,419 - INFO - Found clinical data specification
2025-01-20 17:41:01,442 - INFO - Loaded data: 332 samples, 13218 features
2025-01-20 17:41:30,579 - INFO - Found clinical data specification
2025-01-20 17:41:30,590 - INFO - Loaded data: 164 samples, 13218 features


0.7366445639934943
0.8215488215488216


In [5]:
# Load the per-split results and export them as CSV.
split_results = load_split_results(RESULTS_PATH)
# DataFrame.to_csv does not create missing parent directories, so make sure
# the export directory exists before writing.
os.makedirs(FIN_RESULTS_SPLIT_PATH, exist_ok=True)
split_results.to_csv(os.path.join(FIN_RESULTS_SPLIT_PATH, 'splits_cBoost.csv'))

In [6]:
# Display the per-split results table loaded in the previous cell.
split_results

Unnamed: 0,model,test_cohort,ci,dataset
0,cboost_autoencoder_paper,Atlanta_2014_Long,0.613728,AutoEncoder
1,cboost_autoencoder_paper,Belfast_2018_Jain,0.559907,AutoEncoder
2,cboost_autoencoder_paper,CPC_GENE_2017_Fraser,0.579002,AutoEncoder
3,cboost_autoencoder_paper,CPGEA_2020_Li,0.576516,AutoEncoder
4,cboost_autoencoder_paper,CamCap_2016_Ross_Adams,0.742896,AutoEncoder
...,...,...,...,...
4,cboost_really_intersect_pData,CamCap_2016_Ross_Adams,0.734100,pData_Intersection
5,cboost_really_intersect_pData,CancerMap_2017_Luca,0.616902,pData_Intersection
6,cboost_really_intersect_pData,DKFZ_2018_Gerhauser,0.788686,pData_Intersection
7,cboost_really_intersect_pData,MSKCC_2010_Taylor,0.690217,pData_Intersection


In [7]:
# Combine the stored results with the test-cohort scores and export as CSV.
results_combined = combine_results(results, test_perf)
# DataFrame.to_csv does not create missing parent directories, so make sure
# the export directory exists before writing.
os.makedirs(FIN_RESULTS_OV_PATH, exist_ok=True)
results_combined.to_csv(os.path.join(FIN_RESULTS_OV_PATH, 'ov_cBoost.csv'))

In [8]:
# Display the combined overview table built in the previous cell.
results_combined

Unnamed: 0,model,mean,sd,ci_coh1,ci_coh2
0,cboost_autoencoder_pData_paper,0.671056,0.045585,0.791552,0.647004
1,cboost_autoencoder_paper,0.603645,0.072405,0.614784,0.468097
2,cboost_common_genes_only,0.655619,0.05826,0.803489,0.734893
3,cboost_imp_pData,0.673758,0.066102,0.803489,0.741649
4,cboost_inter_only,0.692797,0.061404,0.81359,0.706243
5,cboost_pData,0.68691,0.086168,0.774564,0.695859
6,cboost_really_intersect_pData,0.66733,0.06327,0.821549,0.736645


In [9]:
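# Legacy single-model export, superseded by feat_imp_all_models (cell In [3]).
# NOTE: load_feat_imp expects a loaded model object, not a path, so the pickle
# has to be loaded first. Writing here overwrites the CSV produced in cell In [3].
#model_path = os.path.join(RESULTS_DIR, 'model', 'cboost_inter_only.pkl')
#feat_imp = load_feat_imp(load_model(model_path))
#feat_imp.to_csv(os.path.join(FEAT_IMP_PATH, 'feat_imp_cBoost.csv'))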