In [1]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import logging

# Setup paths
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.getcwd()))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Setup directories
RESULTS_DIR = os.path.join(os.getcwd(), 'results')
os.makedirs(RESULTS_DIR, exist_ok=True)

# Imports
from preprocessing.data_container import DataContainer
from utils.evaluation import cindex_score
from models.modelling_process import ModellingProcess
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
from utils.evaluation import EarlyStoppingMonitor

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

In [11]:
import pickle

with open('../../results/pipe/test.pkl', 'rb') as f:
    data = pickle.load(f)

## Intersection and imputed data 

### 1.1 Model with only exprs. data; Intersection + imputed

In [18]:
# set early stopping monitor 
monitor = EarlyStoppingMonitor(10, 5)

# Data configuration
DATA_CONFIG = {
    'use_pca': False,
    'pca_threshold': 0.85,
    'gene_type': 'common_genes',
    'use_imputed': True,
    'select_random' : False, 
    'use_cohorts': False, 
    'requires_ohenc' : True, 
    'only_pData': True, 
    'clinical_covs' : ["AGE", "TISSUE", "GLEASON_SCORE", 'PRE_OPERATIVE_PSA']
}

# Model configuration
# MODEL_CONFIG = {
#     'params_cv': {
#         'model__n_estimators': [500],
#         'model__learning_rate': [0.1],
#         'model__max_depth': [3, 5],
#         'model__min_samples_split': [5, 10],
#         'model__min_samples_leaf': [3, 5],
#         'model__subsample': [0.9],
#         'model__max_features': ['sqrt', None]
#     },
#     'refit': True, 
#     'do_nested_resampling': False, 
#     'monitor' : monitor, 
#     'path' : RESULTS_DIR, 
#     'fname_cv' : 'results_PCA'}

# Model configuration
MODEL_CONFIG = {
    'params_cv': {
        'model__n_estimators': [200],
        'model__learning_rate': [0.1],
        #'model__max_depth': [3, 5],
        'model__min_samples_split': [10],
        #'model__min_samples_leaf': [3, 5],
        'model__subsample': [0.8],
        'model__max_features': ['log2'], 
        'model__n_iter_no_change' : [10], 
        'model__validation_fraction' : [0.1]
    },
    'refit': True, 
    'do_nested_resampling': True, 
    #'monitor' : monitor, 
    'path' : RESULTS_DIR, 
    'fname_cv' : 'results_PCA'}

# validation_fraction=0.1 as a mean to inclued early stopping
gb_pipeline_steps = [('model', GradientBoostingSurvivalAnalysis())]


In [9]:
mp = ModellingProcess()

In [19]:
mp.prepare_data(DATA_CONFIG, PROJECT_ROOT)

2025-01-05 16:24:37,706 - INFO - Loading data...
2025-01-05 16:25:16,448 - INFO - Found clinical data specification
2025-01-05 16:25:16,505 - INFO - Only uses pData
2025-01-05 16:25:16,531 - INFO - Loaded data: 1091 samples, 6 features


<class 'pandas.core.frame.DataFrame'>
Index: 1091 entries, Atlanta_2014_Long.PT081 to Stockholm_2016_Ross_Adams.STKHLM9246
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   AGE                1091 non-null   float64
 1   TISSUE             1091 non-null   object 
 2   CLIN_T_STAGE       1091 non-null   object 
 3   PATH_T_STAGE       1091 non-null   object 
 4   GLEASON_SCORE      1091 non-null   float64
 5   PRE_OPERATIVE_PSA  1091 non-null   float64
 6   MONTH_TO_BCR       1091 non-null   float64
 7   BCR_STATUS         1091 non-null   int64  
dtypes: float64(4), int64(1), object(3)
memory usage: 76.7+ KB
None


In [20]:
mp.X

Unnamed: 0,TISSUE_FFPE,TISSUE_Fresh_frozen,TISSUE_Snap_frozen,AGE,GLEASON_SCORE,PRE_OPERATIVE_PSA
Atlanta_2014_Long.PT081,1.0,0,0,2.540970,-0.094820,-0.341467
Atlanta_2014_Long.PT127,1.0,0,0,-0.018317,-1.675161,-0.361595
Atlanta_2014_Long.PT168,1.0,0,0,0.282775,-1.675161,-0.837332
Atlanta_2014_Long.PT184,1.0,0,0,-0.469956,-0.094820,-0.251809
Atlanta_2014_Long.PT199,1.0,0,0,-0.620502,-0.094820,-0.542741
...,...,...,...,...,...,...
Stockholm_2016_Ross_Adams.STKHLM8462,0,1.0,0,0.073871,-0.048152,-0.488924
Stockholm_2016_Ross_Adams.STKHLM8659,0,1.0,0,0.073871,-0.048152,-0.003539
Stockholm_2016_Ross_Adams.STKHLM9157,0,1.0,0,0.073871,-0.048152,0.072302
Stockholm_2016_Ross_Adams.STKHLM9161,0,1.0,0,0.073871,-0.048152,-0.215895


In [19]:
len(data.predict(mp.X))

1091

In [12]:
nstd_res_result = mp.do_modelling(gb_pipeline_steps, MODEL_CONFIG)

2024-12-29 10:26:12,155 - INFO - No additional monitoring detected
2024-12-29 10:26:12,156 - INFO - Start model training...
2024-12-29 10:26:12,157 - INFO - Input data shape: X=(1091, 13219)
2024-12-29 10:26:12,158 - INFO - Nested resampling...
2024-12-29 10:26:12,159 - INFO - Starting nested resampling...
2024-12-29 10:26:12,161 - INFO - Data shape: X=(1091, 13219), groups=9 unique
2024-12-29 10:26:12,165 - INFO - 
Outer fold 1
2024-12-29 10:26:12,257 - INFO - Test cohort: Atlanta_2014_Long


Fitting 8 folds for each of 1 candidates, totalling 8 fits


2024-12-29 10:26:26,335 - INFO - Best parameters: {'model__learning_rate': 0.1, 'model__max_features': 'log2', 'model__min_samples_split': 10, 'model__n_estimators': 200, 'model__n_iter_no_change': 10, 'model__subsample': 0.8, 'model__validation_fraction': 0.1}
2024-12-29 10:26:26,336 - INFO - Test score: 0.676
2024-12-29 10:26:26,337 - INFO - 
Outer fold 2
2024-12-29 10:26:26,426 - INFO - Test cohort: Belfast_2018_Jain


Fitting 8 folds for each of 1 candidates, totalling 8 fits


2024-12-29 10:26:37,100 - INFO - Best parameters: {'model__learning_rate': 0.1, 'model__max_features': 'log2', 'model__min_samples_split': 10, 'model__n_estimators': 200, 'model__n_iter_no_change': 10, 'model__subsample': 0.8, 'model__validation_fraction': 0.1}
2024-12-29 10:26:37,102 - INFO - Test score: 0.622
2024-12-29 10:26:37,104 - INFO - 
Outer fold 3
2024-12-29 10:26:37,202 - INFO - Test cohort: CPC_GENE_2017_Fraser


Fitting 8 folds for each of 1 candidates, totalling 8 fits


2024-12-29 10:26:52,550 - INFO - Best parameters: {'model__learning_rate': 0.1, 'model__max_features': 'log2', 'model__min_samples_split': 10, 'model__n_estimators': 200, 'model__n_iter_no_change': 10, 'model__subsample': 0.8, 'model__validation_fraction': 0.1}
2024-12-29 10:26:52,550 - INFO - Test score: 0.654
2024-12-29 10:26:52,553 - INFO - 
Outer fold 4
2024-12-29 10:26:52,646 - INFO - Test cohort: CPGEA_2020_Li


Fitting 8 folds for each of 1 candidates, totalling 8 fits


2024-12-29 10:27:05,575 - INFO - Best parameters: {'model__learning_rate': 0.1, 'model__max_features': 'log2', 'model__min_samples_split': 10, 'model__n_estimators': 200, 'model__n_iter_no_change': 10, 'model__subsample': 0.8, 'model__validation_fraction': 0.1}
2024-12-29 10:27:05,575 - INFO - Test score: 0.682
2024-12-29 10:27:05,575 - INFO - 
Outer fold 5
2024-12-29 10:27:05,670 - INFO - Test cohort: CamCap_2016_Ross_Adams


Fitting 8 folds for each of 1 candidates, totalling 8 fits


2024-12-29 10:27:16,586 - INFO - Best parameters: {'model__learning_rate': 0.1, 'model__max_features': 'log2', 'model__min_samples_split': 10, 'model__n_estimators': 200, 'model__n_iter_no_change': 10, 'model__subsample': 0.8, 'model__validation_fraction': 0.1}
2024-12-29 10:27:16,586 - INFO - Test score: 0.821
2024-12-29 10:27:16,586 - INFO - 
Outer fold 6
2024-12-29 10:27:16,702 - INFO - Test cohort: CancerMap_2017_Luca


Fitting 8 folds for each of 1 candidates, totalling 8 fits


2024-12-29 10:27:25,618 - INFO - Best parameters: {'model__learning_rate': 0.1, 'model__max_features': 'log2', 'model__min_samples_split': 10, 'model__n_estimators': 200, 'model__n_iter_no_change': 10, 'model__subsample': 0.8, 'model__validation_fraction': 0.1}
2024-12-29 10:27:25,618 - INFO - Test score: 0.701
2024-12-29 10:27:25,618 - INFO - 
Outer fold 7
2024-12-29 10:27:25,700 - INFO - Test cohort: DKFZ_2018_Gerhauser


Fitting 8 folds for each of 1 candidates, totalling 8 fits


2024-12-29 10:27:35,328 - INFO - Best parameters: {'model__learning_rate': 0.1, 'model__max_features': 'log2', 'model__min_samples_split': 10, 'model__n_estimators': 200, 'model__n_iter_no_change': 10, 'model__subsample': 0.8, 'model__validation_fraction': 0.1}
2024-12-29 10:27:35,328 - INFO - Test score: 0.814
2024-12-29 10:27:35,333 - INFO - 
Outer fold 8
2024-12-29 10:27:35,432 - INFO - Test cohort: MSKCC_2010_Taylor


Fitting 8 folds for each of 1 candidates, totalling 8 fits


2024-12-29 10:27:46,129 - INFO - Best parameters: {'model__learning_rate': 0.1, 'model__max_features': 'log2', 'model__min_samples_split': 10, 'model__n_estimators': 200, 'model__n_iter_no_change': 10, 'model__subsample': 0.8, 'model__validation_fraction': 0.1}
2024-12-29 10:27:46,129 - INFO - Test score: 0.692
2024-12-29 10:27:46,139 - INFO - 
Outer fold 9
2024-12-29 10:27:46,223 - INFO - Test cohort: Stockholm_2016_Ross_Adams


Fitting 8 folds for each of 1 candidates, totalling 8 fits


2024-12-29 10:27:56,501 - INFO - Best parameters: {'model__learning_rate': 0.1, 'model__max_features': 'log2', 'model__min_samples_split': 10, 'model__n_estimators': 200, 'model__n_iter_no_change': 10, 'model__subsample': 0.8, 'model__validation_fraction': 0.1}
2024-12-29 10:27:56,511 - INFO - Test score: 0.713
2024-12-29 10:27:56,511 - INFO - Aggregated results:
2024-12-29 10:27:56,511 - INFO - Mean score: 0.708 ± 0.064
2024-12-29 10:27:56,511 - INFO - Individual scores: [np.float64(0.6763005780346821), np.float64(0.6221304873137334), np.float64(0.6538461538461539), np.float64(0.6820660891883221), np.float64(0.8207036535859269), np.float64(0.7008797653958945), np.float64(0.8144758735440932), np.float64(0.6917701863354038), np.float64(0.7127910901113736)]
2024-12-29 10:27:56,589 - INFO - Saved CV results to c:\Users\laeti\PCaPrognostics\models\grad_boost\results\results\results_PCA_cv.csv
2024-12-29 10:27:56,589 - INFO - Do HP Tuning for complete model; refit + set complete model
2024-

Fitting 9 folds for each of 1 candidates, totalling 9 fits


2024-12-29 10:28:12,286 - INFO - Saved model to c:\Users\laeti\PCaPrognostics\models\grad_boost\results\model
2024-12-29 10:28:12,301 - INFO - Saved pipe to c:\Users\laeti\PCaPrognostics\models\grad_boost\results\pipe


In [None]:
mp.save_results(RESULTS_DIR, 'gb_intersect_imp', model = mp.cmplt_model, cv_results = mp.resampling_cmplt, pipe = mp.cmplt_pipeline)

In [24]:
df_features = pd.DataFrame()
df_features['feat'] = mp.X.columns
df_features['imps'] = mp.cmplt_model.feature_importances_

df_features.sort_values(by = 'imps', ascending=False)[df_features['imps'] > 0]

  df_features.sort_values(by = 'imps', ascending=False)[df_features['imps'] > 0]


Unnamed: 0,feat,imps
2387,ENSG00000104313,0.006392
4361,ENSG00000123485,0.005313
5208,ENSG00000132122,0.004998
3186,ENSG00000111670,0.004415
2639,ENSG00000106025,0.003916
...,...,...
4564,ENSG00000125508,0.000064
9371,ENSG00000167487,0.000059
1493,ENSG00000088543,0.000050
8409,ENSG00000162620,0.000028


### 1.2 Model with only pData

In [None]:
# set early stopping monitor 
monitor = EarlyStoppingMonitor(10, 5)

# Data configuration
DATA_CONFIG = {
    'use_pca': False,
    'pca_threshold': 0.85,
    'gene_type': 'intersection',
    'use_imputed': True,
    'select_random' : False, 
    'use_cohorts': False,
    'only_pData' : True
}

# Model configuration
MODEL_CONFIG = {
    'params_cv': {
        'model__n_estimators': [500],
        'model__learning_rate': [0.1],
        'model__max_depth': [3, 5],
        'model__min_samples_split': [5, 10],
        'model__min_samples_leaf': [3, 5],
        'model__subsample': [0.9],
        'model__max_features': ['sqrt', None]
    },
    'refit': True, 
    'do_nested_resampling': True, 
    'monitor' : monitor, 
    'path' : RESULTS_DIR, 
    'fname_cv' : 'results_PCA'}

# validation_fraction=0.1 as a mean to inclued early stopping
gb_pipeline_steps = [('model', GradientBoostingSurvivalAnalysis())]


### 1.3 Model with both exprs. and pData