In [1]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import logging
from catboost import CatBoostRegressor, Pool

# Setup paths
# The project root is taken to be two levels above the notebook's working
# directory; it is put on sys.path so the project packages imported below
# can be resolved. Both constants stay plain strings.
PROJECT_ROOT = str(Path.cwd().parent.parent)
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Setup directories
# Everything this notebook reads/writes lives in ./results (created on demand).
RESULTS_DIR = str(Path.cwd() / 'results')
Path(RESULTS_DIR).mkdir(parents=True, exist_ok=True)

# Imports
from preprocessing.data_container import DataContainer
from utils.evaluation import cindex_score
from models.modelling_process import ModellingProcess
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
from utils.evaluation import EarlyStoppingMonitor
from sklearn.model_selection import train_test_split
from lifelines.utils import concordance_index
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.base import BaseEstimator, RegressorMixin


# Setup logging
# Root logger: INFO and above, every record prefixed with timestamp and level.
# NOTE: logging.basicConfig does nothing when the root logger already has
# handlers (e.g. when this cell is re-run in the same kernel), so changes to
# these arguments only take effect after a kernel restart.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Logger for messages emitted by this notebook itself.
logger = logging.getLogger(__name__)

In [2]:
import os
import pandas as pd
import pickle
from models.cox_pas_net_model import Cox_PASNet_Model, Cox_PASNet
import torch

def load_all_results(results_path):
    """Combine every result CSV in a folder into one per-file summary.

    Each ``*.csv`` file directly inside ``results_path`` is read and tagged
    with its file name in a ``model`` column; the rows are then averaged per
    file.

    Args:
        results_path: Directory containing the result CSV files. Every file
            must provide the columns ``mean_score`` and ``std_score``.

    Returns:
        pandas.DataFrame with one row per CSV file and the columns ``model``
        (file name including the ``.csv`` extension), ``mean`` (mean of
        ``mean_score``) and ``sd`` (mean of ``std_score``), ordered by
        ``model``.

    Raises:
        ValueError: If ``results_path`` contains no CSV file.
    """
    # os.listdir returns entries in arbitrary order; sort for reproducible reads.
    csv_files = sorted(
        os.path.join(results_path, file)
        for file in os.listdir(results_path)
        if file.endswith('.csv')
    )

    # pd.concat would fail with an opaque "No objects to concatenate";
    # raise the same exception type with a message that names the folder.
    if not csv_files:
        raise ValueError(f"No CSV files found in {results_path!r}")

    # Read each CSV, tag its rows with the file name in the "model" column,
    # and stack everything into one frame with a fresh index.
    combined_data = pd.concat(
        [pd.read_csv(file).assign(model=os.path.basename(file)) for file in csv_files],
        ignore_index=True,
    )

    # Keep only the columns needed for the summary, then average per file.
    combined_data = combined_data.loc[:, ['model', 'mean_score', 'std_score']]
    combined_data = combined_data.groupby('model', as_index=False).agg(
        mean=('mean_score', 'mean'),
        sd=('std_score', 'mean'),
    )
    return combined_data

# Not necessary due to different sd structure
# def aggregate_results(results):
#     results_aggr = results.groupby('model', as_index=False).agg(mean=('ci', 'mean'), sd=('ci', 'std'))
#     return results_aggr


# TODO: combine the results from the test set and from nested resampling
def combine_results(results_nstd, results_test_1, results_test_2):
    """Placeholder for merging nested-resampling and test results.

    Not implemented yet: all three arguments are ignored and ``None`` is
    returned.
    """
    return None


# TODO: build a dataframe: column 1: feature name, column 2: value
# -------------------- functions to load feat. imp from model
def load_feat_imp(model_path):
    """Load a pickled model wrapper and return its non-zero feature importances.

    The unpickled object must expose the fitted estimator as ``.model``; that
    estimator must provide ``get_feature_importance()`` and ``feature_names_``
    (the CatBoost-specific interface). For models without one of our own
    wrapper classes, check what the library's own call looks like.

    Args:
        model_path: Path to the pickle file.

    Returns:
        pandas.DataFrame with the columns ``feature`` and ``value``, sorted by
        ``value`` in descending order and restricted to rows with ``value > 0``.
    """
    # NOTE: pickle.load can execute arbitrary code - only open trusted files.
    with open(model_path, 'rb') as handle:
        wrapper = pickle.load(handle)

    # CatBoost specific
    importances = wrapper.model.get_feature_importance()
    importance_table = pd.DataFrame({
        'feature': wrapper.model.feature_names_,
        'value': importances,
    })

    ranked = importance_table.sort_values(by="value", ascending=False)
    return ranked[ranked['value'] > 0]

def load_model(model_path):
    """Return the object stored in the pickle file at ``model_path``.

    NOTE: unpickling can execute arbitrary code - only open trusted files.
    """
    with open(model_path, 'rb') as handle:
        return pickle.load(handle)
    

# --------------------- get test perf 
import os
import pandas as pd
import numpy as np
import re

def get_weights(model_path):
    """Dump layer weights and intermediate activations of a saved network to CSV.

    The network is loaded from ``model_path`` onto the CPU, the data is
    prepared through ``ModellingProcess.prepare_data`` and
    ``Cox_PASNet_Model._prepare_data``, and a manual forward pass through the
    layers ``sc1`` .. ``sc4`` is run. All arrays are written as
    comma-separated text into a ``weights`` folder below the current working
    directory (created if missing):

    * ``w_sc1.csv`` .. ``w_sc4.csv``: ``weight`` matrices of the four layers
    * ``pathway_node.csv`` / ``hidden_node.csv``: tanh activations after
      ``sc1`` / ``sc2``
    * ``hidden_2_node.csv``: tanh activation after ``sc3`` concatenated with
      the ``pData`` tensor, i.e. the input of ``sc4``
    * ``lin_pred.csv``: output of ``sc4``

    Besides ``model_path`` the function reads ``pathway_mask.csv`` from the
    current working directory and the module-level ``PROJECT_ROOT``.

    Args:
        model_path: Path to the network file saved with ``torch.save``.

    Returns:
        None. The results are only written to disk.
    """
    DATA_CONFIG = {
        'use_pca': False,
        'pca_threshold': 0.85,
        'use_imputed': True,
        'select_random': False,
        'use_cohorts': False,
        # Must be adapted per algorithm; CatBoost is essentially the only one
        # that does not need one-hot encoding, set to True for all others.
        'requires_ohenc': True,
        'gene_type': 'intersection',
        'clinical_covs': ["AGE", "TISSUE", "GLEASON_SCORE", 'PRE_OPERATIVE_PSA'],
        # Key was misspelled 'only_pDta'; aligned with the 'only_pData' key
        # that test_perf_all_models uses.
        'only_pData': False,
    }
    output_dir = "weights"

    # NOTE(review): torch.load unpickles the file - only load trusted files.
    # Newer PyTorch releases presumably default to weights_only=True, which
    # would reject a fully pickled module; confirm against the installed version.
    net = torch.load(model_path, map_location=torch.device('cpu'))
    pathway_mask = pd.read_csv("pathway_mask.csv", index_col=0)

    mp = ModellingProcess()
    mp.prepare_data(DATA_CONFIG, PROJECT_ROOT)

    # Wrap the raw network so the wrapper's data preparation can be reused.
    model_hull = Cox_PASNet_Model(
        pathway_mask=pathway_mask,
        clin_covs=['AGE', 'TISSUE_FFPE', 'TISSUE_Fresh_frozen',
                   'TISSUE_Snap_frozen', 'GLEASON_SCORE',
                   'PRE_OPERATIVE_PSA'],
    )
    model_hull.model = net
    model_hull.is_fitted_ = True

    # Only `genes` and `pData` are used below; the other six values are
    # unpacked to keep the expected 8-tuple shape of the result explicit.
    # NOTE(review): 0.1 is presumably the validation split fraction - confirm.
    genes, pData, ytime, yevent, \
        eval_x, eval_age, eval_ytime, eval_yevent = model_hull._prepare_data(mp.X, mp.y, 0.1)

    # np.savetxt does not create missing folders; without this the export
    # fails with FileNotFoundError on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)

    # Layer weights, one CSV per layer.
    for layer_name in ("sc1", "sc2", "sc3", "sc4"):
        layer_weights = getattr(net, layer_name).weight.data.cpu().detach().numpy()
        np.savetxt(os.path.join(output_dir, f"w_{layer_name}.csv"), layer_weights, delimiter=",")

    # Manual forward pass; no gradients are needed for an export.
    with torch.no_grad():
        pathway_node = net.tanh(net.sc1(genes))
        hidden_node = net.tanh(net.sc2(pathway_node))
        hidden_2_node = net.tanh(net.sc3(hidden_node))
        x_cat = torch.cat((hidden_2_node, pData), 1)
        lin_pred = net.sc4(x_cat)

    # NOTE(review): "hidden_2_node.csv" stores x_cat (hidden_2_node plus pData),
    # not hidden_2_node alone - kept as in the original; confirm this is intended.
    activations = {
        "pathway_node.csv": pathway_node,
        "hidden_node.csv": hidden_node,
        "hidden_2_node.csv": x_cat,
        "lin_pred.csv": lin_pred,
    }
    for file_name, tensor in activations.items():
        np.savetxt(os.path.join(output_dir, file_name), tensor.cpu().detach().numpy(), delimiter=",")


# Function to test performance of all models
def _data_config_for_model_file(file):
    """Derive the data-loading configuration from a saved model's file name.

    Keywords are searched case-insensitively anywhere in ``file``. The first
    matching gene-set keyword (intersection, imputed/common, autoencoder,
    scores - in that order) decides ``gene_type``. ``pData`` adds the clinical
    covariates; ``pData`` without any gene-set keyword selects a
    clinical-data-only configuration. If nothing matches, neither
    ``gene_type`` nor ``clinical_covs`` is set.
    """
    def mentions(pattern):
        return bool(re.search(pattern, file, re.IGNORECASE))

    has_pdata = mentions(r"pData")
    has_intersection = mentions(r"inter|intersection")
    has_imputed = mentions(r"imp|imputed|common")
    has_autoencoder = mentions(r"aenc|auto|autoenc")
    has_scores = mentions(r"score|scores")

    config = {
        'use_pca': False,
        'pca_threshold': 0.85,
        'use_imputed': True,
        'select_random': False,
        'use_cohorts': False,
        # Must be adapted per algorithm; CatBoost is essentially the only one
        # that does not need one-hot encoding, set to True for all others.
        'requires_ohenc': True,
    }

    # Gene set to load: first matching keyword wins.
    if has_intersection:
        config['gene_type'] = 'intersection'
    elif has_imputed:
        config['gene_type'] = 'common_genes'
    elif has_autoencoder:
        config['gene_type'] = 'autoencoder'
    elif has_scores:
        config['gene_type'] = 'scores'

    if has_pdata:
        config['clinical_covs'] = ["AGE", "TISSUE", "GLEASON_SCORE", 'PRE_OPERATIVE_PSA']
        # Clinical data only: no gene-set keyword in the file name.
        if not (has_intersection or has_imputed or has_autoencoder or has_scores):
            config['only_pData'] = True
            config['gene_type'] = None

    return config


def test_perf_all_models(model_path):
    """Prepare the test data for every file found in ``model_path``.

    For each directory entry the file name is printed, a data configuration
    is derived from it, and the test data plus the per-cohort test data are
    prepared through ``ModellingProcess``. Reads the module-level
    ``PROJECT_ROOT``.

    TODO: scoring is not implemented yet - the saved model is never loaded
    and the prepared cohort data is discarded. The earlier draft loaded the
    network with ``torch.load``, wrapped it in ``Cox_PASNet_Model`` (with
    ``pathway_mask.csv``) and called ``score`` on the first cohort.

    NOTE(review): the output recorded in this notebook for a call of this
    function ends with ``ValueError: Length mismatch: Expected axis has 111
    elements, new values have 0 elements``; no traceback is shown, so the
    failing call could not be identified from here.

    Args:
        model_path: Directory whose entries are the saved model files.

    Returns:
        None.
    """
    for file in os.listdir(model_path):
        print(file)

        DATA_CONFIG = _data_config_for_model_file(file)

        mp = ModellingProcess()
        mp.prepare_test_data(DATA_CONFIG, PROJECT_ROOT)
        # Currently unused, see the TODO in the docstring.
        X_cos, y_cos = mp.prepare_test_cohort_data(DATA_CONFIG, PROJECT_ROOT, mp.test_groups)

In [3]:
# Run test_perf_all_models over every file in <RESULTS_DIR>/model.
# NOTE(review): the output recorded below for this cell ends in a ValueError.
models_path = os.path.join(RESULTS_DIR, 'model')
test_perf_all_models(model_path=models_path)
# Disabled alternative: export weights/activations of one saved network.
#model_path = os.path.join(RESULTS_DIR, 'model', 'results_intersect_pdata_model.pth')
#get_weights(model_path)

results_intersect_pdata_model.pth


2025-01-15 12:20:50,768 - INFO - Found clinical data specification
2025-01-15 12:20:50,905 - INFO - Loaded data: 496 samples, 13221 features


test_cohort_1_patient_1


2025-01-15 12:21:15,454 - INFO - Found clinical data specification


test_cohort_1_patient_1
<bound method DataFrame.info of Empty DataFrame
Columns: [SAMPLE_ID, GSM_SAMPLE_ID, SRR_SAMPLE_ID, PAPER_SAMPLE_ID, SAMPLE_COUNT, AGE, STUDY, PLATFORM, TISSUE, SAMPLE_CLASS, SAMPLE_TYPE, SURGICAL_PROCEDURE, CLIN_TNM_STAGE, CLIN_T_STAGE, CLIN_T_STAGE_GROUP, CLIN_N_STAGE, CLIN_M_STAGE, PATH_TNM_STAGE, PATH_T_STAGE, PATH_T_STAGE_GROUP, PATH_N_STAGE, PATH_M_STAGE, GLEASON_SCORE, GLEASON_SCORE_1, GLEASON_SCORE_2, PRE_OPERATIVE_PSA, MONTH_TO_BCR, BCR_STATUS, MONTH_TO_LAST_FOLLOW_UP, OS_STATUS, MONTH_TO_DOD, DOD_STATUS, MONTH_TO_CEP, CEP_STATUS, PATIENT_ID]
Index: []

[0 rows x 35 columns]>


ValueError: Length mismatch: Expected axis has 111 elements, new values have 0 elements

In [None]:
# results_path = os.path.join(RESULTS_DIR, 'results')
# results = load_all_results(results_path)
# final_results_path = os.path.join(PROJECT_ROOT, 'results_modelling')
# results.to_csv(os.path.join(final_results_path, 'CoxPasNet.csv'))

# TODO: actually save the combined results to CSV (the draft above is still commented out)