In [1]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import logging
from catboost import CatBoostRegressor, Pool

# Setup paths
# PROJECT_ROOT is resolved as two directory levels above the current working
# directory, so the result depends on where the kernel was started.
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.getcwd()))
# Make the project-local packages imported below (preprocessing, utils, models)
# importable; the membership check keeps re-runs of this cell from adding duplicates.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Setup directories
# Results live in 'results_cox' inside the current working directory; the folder
# is created on first run and left untouched if it already exists.
RESULTS_DIR = os.path.join(os.getcwd(), 'results_cox')
os.makedirs(RESULTS_DIR, exist_ok=True)

# Imports
from preprocessing.data_container import DataContainer
from utils.evaluation import cindex_score
from models.modelling_process import ModellingProcess
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
from utils.evaluation import EarlyStoppingMonitor
from sklearn.model_selection import train_test_split
from lifelines.utils import concordance_index
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.base import BaseEstimator, RegressorMixin


# Setup logging
# Configure the root logger at INFO level with "timestamp - level - message" lines.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Logger for this notebook, named after the current module.
logger = logging.getLogger(__name__)

In [22]:
import os
import pandas as pd
import pickle

def load_all_results(results_path):
    """Summarise every result CSV found in a directory.

    Each ``*.csv`` file directly inside ``results_path`` is read and tagged with
    its file name in a ``model`` column. The ``mean_score`` and ``std_score``
    columns are then averaged per file.

    Args:
        results_path: Directory containing the result CSV files. Every file
            must provide ``mean_score`` and ``std_score`` columns.

    Returns:
        pandas.DataFrame with one row per CSV file and the columns ``model``
        (file name), ``mean`` (mean of ``mean_score``) and ``sd`` (mean of
        ``std_score``), ordered by ``model``.

    Raises:
        ValueError: If ``results_path`` contains no CSV files.
    """
    # Sort so the read order does not depend on the arbitrary os.listdir order.
    csv_files = sorted(
        os.path.join(results_path, name)
        for name in os.listdir(results_path)
        if name.endswith('.csv')
    )
    if not csv_files:
        # pd.concat would fail with an opaque "No objects to concatenate";
        # raise the same exception type with a message that names the directory.
        raise ValueError(f"No CSV result files found in {results_path!r}")

    # Read each CSV and record its file name in the "model" column.
    frames = [
        pd.read_csv(path).assign(model=os.path.basename(path))
        for path in csv_files
    ]
    combined_data = pd.concat(frames, ignore_index=True)

    # Keep only the columns needed for the summary.
    combined_data = combined_data.loc[:, ['model', 'mean_score', 'std_score']]

    # One row per file: sd is the mean of the stored std_score values,
    # not a standard deviation recomputed from mean_score.
    return combined_data.groupby('model', as_index=False).agg(
        mean=('mean_score', 'mean'),
        sd=('std_score', 'mean'),
    )

# Not necessary due to different sd structure
# def aggregate_results(results):
#     results_aggr = results.groupby('model', as_index=False).agg(mean=('ci', 'mean'), sd=('ci', 'std'))
#     return results_aggr


# TODO: combine the results from the test set and from nested resampling
def combine_results(results_nstd, results_test_1, results_test_2):
    """Placeholder for merging nested-resampling and test results.

    Not implemented yet: all three arguments are ignored and ``None`` is returned.
    """
    return None


# TODO: create a DataFrame: column 1: feature name, column 2: value
# -------------------- functions to load feat. imp from model
def load_feat_imp(model_path):
    """Load a pickled model and return its strictly positive feature importances.

    Args:
        model_path: Path to a pickle file. The unpickled object must expose a
            ``model`` attribute offering ``get_feature_importance()`` and
            ``feature_names_`` (the CatBoost-style accessors used here).

    Returns:
        pandas.DataFrame with the columns ``feature`` and ``value``, sorted by
        ``value`` in descending order and restricted to rows with ``value > 0``.
        The index still refers to the original feature positions.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(model_path, 'rb') as handle:
        wrapper = pickle.load(handle)

    # CatBoost-specific. For models that do not have a model class of our own,
    # check what the library-internal call looks like.
    importances = wrapper.model.get_feature_importance()
    names = wrapper.model.feature_names_

    table = pd.DataFrame({'feature': names, 'value': importances})

    # Rank first, then drop features that did not contribute.
    ranked = table.sort_values(by="value", ascending=False)
    positive_mask = ranked['value'] > 0
    return ranked[positive_mask]

def load_model(model_path):
    """Unpickle and return the object stored at ``model_path``.

    Args:
        model_path: Path to a pickle file.

    Returns:
        Whatever object was pickled into the file.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(model_path, 'rb') as handle:
        return pickle.load(handle)
    

# --------------------- get test perf 
import os
import pandas as pd
import numpy as np
import re

# Function to test performance of all models
def test_perf_all_models(model_path):
    files = os.listdir(model_path)

    for file in files:
        print(file)

        contains_pData = bool(re.search(r"pData", file, re.IGNORECASE))
        contains_intersection = bool(re.search(r"inter|intersection", file, re.IGNORECASE))
        contains_imputed = bool(re.search(r"imp|imputed|common", file, re.IGNORECASE))
        contains_aenc = bool(re.search(r"aenc|auto|autoenc", file, re.IGNORECASE))
        contains_scores = bool(re.search(r"score|scores", file, re.IGNORECASE))

        
        DATA_CONFIG = {
            'use_pca': False,
            'pca_threshold': 0.85,
            'use_imputed': True,
            'select_random' : False, 
            'use_cohorts': False, 
            # Muss je nach algo angepasst werden; CatBoost eig der einzige der keines braucht, bei den anderen auf True setzen
            'requires_ohenc' : False, 
        }

        # Load data based on file type
        if contains_intersection:
            DATA_CONFIG['gene_type'] = 'intersection'
        elif contains_imputed:
            DATA_CONFIG['gene_type'] = 'common_genes'
        elif contains_aenc:
            DATA_CONFIG['gene_type'] = 'autoencoder'
        elif contains_scores: 
            DATA_CONFIG['gene_type'] = 'scores'
        if contains_pData:
            DATA_CONFIG['clinical_covs'] = ["AGE", "TISSUE", "GLEASON_SCORE", 'PRE_OPERATIVE_PSA']
        if contains_pData and not contains_intersection and not contains_imputed and not contains_aenc and not contains_scores: 
            DATA_CONFIG['only_pData'] = True
            DATA_CONFIG['gene_type'] = None
            
        model = load_model(os.path.join(model_path, file))  
        
        mp = ModellingProcess()
        mp.prepare_test_data(DATA_CONFIG, PROJECT_ROOT)
                 
        ci_cmplt = model.score(mp.X_test, mp.y_test)
        print(ci_cmplt)

In [24]:
# Aggregate the per-model result CSVs and persist the summary table.
results_path = os.path.join(RESULTS_DIR, 'results')
results = load_all_results(results_path)
final_results_path = os.path.join(PROJECT_ROOT, 'results_modelling')
# Create the output folder if needed (same handling as RESULTS_DIR); otherwise
# to_csv fails on a fresh checkout where 'results_modelling' does not exist yet.
os.makedirs(final_results_path, exist_ok=True)
# The DataFrame index is written as an unnamed first column (pandas default).
results.to_csv(os.path.join(final_results_path, 'cBoost.csv'))

# ACTUALLY: SAVE COMBINED RESULTS TO CSV

In [25]:
# Display the aggregated summary: one row per result CSV with its mean and sd.
results

Unnamed: 0,model,mean,sd
0,cboost_autoencoder_pData_cv.csv,0.695124,0.073754
1,cboost_common_genes_only_cv.csv,0.655619,0.05826
2,cboost_inter_only_cv.csv,0.692797,0.061404
3,cboost_inter_pData_cv.csv,0.673758,0.066102
4,cboost_only_pData_cv.csv,0.654967,0.100747
5,cboost_pData_cv.csv,0.68691,0.086168
6,cboost_really_intersect_pData_cv.csv,0.66733,0.06327
7,cboost_scores_pData_cv.csv,0.721846,0.0652


In [7]:
# Score every pickled model stored in the 'model' sub-folder of RESULTS_DIR;
# test_perf_all_models prints each file name followed by its score.
models_path = os.path.join(RESULTS_DIR, 'model')
test_perf_all_models(models_path)

cboost_common_genes_only.pkl


2025-01-14 16:59:43,400 - INFO - Loaded data: 496 samples, 15495 features


0.6484516656593309
cboost_imp_pData.pkl


2025-01-14 17:00:49,464 - INFO - Found clinical data specification
2025-01-14 17:00:49,787 - INFO - Loaded data: 496 samples, 15499 features


0.653073559213567
cboost_inter_only.pkl


2025-01-14 17:01:29,663 - INFO - Loaded data: 496 samples, 13214 features


0.6391367724961781
cboost_only_pData.pkl


2025-01-14 17:02:09,840 - INFO - Found clinical data specification
2025-01-14 17:02:10,000 - INFO - Only uses pData
2025-01-14 17:02:10,014 - INFO - Loaded data: 496 samples, 4 features


0.5146656237778647
cboost_really_intersect_pData.pkl


2025-01-14 17:03:22,152 - INFO - Found clinical data specification
2025-01-14 17:03:22,508 - INFO - Loaded data: 496 samples, 13218 features


0.6439008781597753
cboost_scores_pData.pkl


2025-01-14 17:03:24,809 - INFO - Found clinical data specification
2025-01-14 17:03:24,817 - INFO - Loaded data: 496 samples, 5 features


0.6626906531091122


In [23]:
# Show the feature importances (values > 0, highest first) of the
# 'cboost_inter_only' model.
model_path = os.path.join(RESULTS_DIR, 'model', 'cboost_inter_only.pkl')
load_feat_imp(model_path)


Unnamed: 0,feature,value
4275,ENSG00000122641,8.360797
10519,ENSG00000175063,5.179230
1185,ENSG00000078098,4.072356
1469,ENSG00000087586,3.562597
13181,ENSG00000273706,2.995245
...,...,...
9217,ENSG00000166501,0.168650
10982,ENSG00000179593,0.146239
9785,ENSG00000169876,0.131965
4151,ENSG00000120937,0.121057
