In [1]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import logging

# Setup paths: the project root is taken to be two directory levels above the
# notebook's working directory; it is appended to sys.path (once) so that the
# project-local imports further down in this cell can be resolved.
notebook_dir = os.getcwd()
PROJECT_ROOT = os.path.dirname(os.path.dirname(notebook_dir))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Setup directories: 'model' and 'results' sit next to the notebook and are
# created on demand (exist_ok=True keeps the cell safe to re-run).
MODEL_DIR = os.path.join(notebook_dir, 'model')
RESULTS_DIR = os.path.join(notebook_dir, 'results')
for out_dir in (MODEL_DIR, RESULTS_DIR):
    os.makedirs(out_dir, exist_ok=True)

# Imports
from preprocessing.data_container import DataContainer
from utils.evaluation import cindex_score
from models.modelling_process import ModellingProcess
from sksurv.ensemble import RandomSurvivalForest

# Setup logging: emit INFO and above, each record prefixed with timestamp and level
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
# Data configuration (handed to ModellingProcess.prepare_data further down)
DATA_CONFIG = dict(
    use_pca=False,
    pca_threshold=0.95,
    gene_type='common_genes',
    use_imputed=True,
    use_cohorts=False,
)

# Model configuration
# 'params_cv' is the hyper-parameter grid for the pipeline step named 'model'
# (hence the 'model__' prefix). Every entry lists exactly one value, so the
# grid holds a single candidate.
# NOTE(review): 'path' already points at <notebook dir>/results; a later run in
# this notebook that used the same 'path' logged its CV file under
# ...\results\results\ -- the modelling code appears to add its own 'results'
# folder. Confirm whether 'path' should be the notebook directory instead.
MODEL_CONFIG = dict(
    params_cv=dict(
        model__n_estimators=[100],
        model__min_samples_split=[6],
        model__max_features=['sqrt'],
        model__bootstrap=[False],
        model__n_jobs=[-1],
        model__random_state=[1234],
        model__low_memory=[True],
    ),
    refit=True,
    do_nested_resampling=False,
    path=RESULTS_DIR,
    fname_cv='results_intersect',
)

# Pipeline definition as (step name, estimator) pairs; the forest is created
# with default arguments and receives its settings through 'params_cv'.
rsf_pipeline_steps = [('model', RandomSurvivalForest())]


In [12]:
# Modelling-process instance used for the run on the full feature set.
mp = ModellingProcess()

In [13]:
# Load the data described by DATA_CONFIG (project root passed along, presumably
# so the loader can locate the data files). The recorded run took about
# 2.5 minutes and reported 1091 samples x 15495 features.
mp.prepare_data(DATA_CONFIG, PROJECT_ROOT)

2024-11-29 10:14:08,043 - INFO - Loading data...
2024-11-29 10:16:44,317 - INFO - Loaded data: 1091 samples, 15495 features


In [14]:
# Hyper-parameter tuning + refit on the complete data. MODEL_CONFIG sets
# 'do_nested_resampling': False, so despite the variable name no nested
# resampling is performed in this call.
# The recorded run reports a mean fit time of roughly 2300 s per fold (9 folds).
nstd_res_result = mp.do_modelling(rsf_pipeline_steps, MODEL_CONFIG)

2024-11-29 10:17:11,316 - INFO - No additional monitoring detected
2024-11-29 10:17:11,319 - INFO - Start model training...
2024-11-29 10:17:11,319 - INFO - Input data shape: X=(1091, 15495)
2024-11-29 10:17:11,322 - INFO - Do HP Tuning for complete model; refit + set complete model
2024-11-29 10:17:11,322 - INFO - Do HP Tuning for complete model


Fitting 9 folds for each of 1 candidates, totalling 9 fits


  _data = np.array(data, dtype=dtype, copy=copy,


In [None]:
# Save nested resampling result and retrained model (TODO: Add this to modelling process)

In [15]:
# Show the raw cross-validation results of the complete-data tuning run.
# The output has the layout of a scikit-learn search `cv_results_` dict, so
# pd.DataFrame(mp.resampling_cmplt.cv_results_) should render it as a table
# (TODO confirm the type of `resampling_cmplt`).
mp.resampling_cmplt.cv_results_

{'mean_fit_time': array([2296.95820708]),
 'std_fit_time': array([596.37276507]),
 'mean_score_time': array([3.08911371]),
 'std_score_time': array([1.99421501]),
 'param_model__bootstrap': masked_array(data=[False],
              mask=[False],
        fill_value=True),
 'param_model__low_memory': masked_array(data=[True],
              mask=[False],
        fill_value=True),
 'param_model__max_features': masked_array(data=['sqrt'],
              mask=[False],
        fill_value=np.str_('?'),
             dtype=object),
 'param_model__min_samples_split': masked_array(data=[6],
              mask=[False],
        fill_value=999999),
 'param_model__n_estimators': masked_array(data=[100],
              mask=[False],
        fill_value=999999),
 'param_model__n_jobs': masked_array(data=[-1],
              mask=[False],
        fill_value=999999),
 'param_model__random_state': masked_array(data=[1234],
              mask=[False],
        fill_value=999999),
 'params': [{'model__bootstrap': 

# RSF with selected features from lasso

In [2]:
# Data configuration for the run on lasso-selected features
# (same entries as DATA_CONFIG plus 'select_random')
DATA_CONFIG_sel = dict(
    use_pca=False,
    pca_threshold=0.95,
    gene_type='common_genes',
    use_imputed=True,
    use_cohorts=False,
    select_random=False,
)

# Model configuration for the run on lasso-selected features
# The grid for the 'model' pipeline step has two candidates
# (max_features = 'sqrt' or None); nested resampling is switched on.
# NOTE(review): with bootstrap=False the max_features=None candidate presumably
# gives every tree the full sample and all features, so the 50 trees can differ
# only through random tie-breaking -- confirm this candidate is intended.
# NOTE(review): 'path' already points at <notebook dir>/results and the run
# logged its CV file under ...\results\results\ -- the modelling code appears
# to add its own 'results' folder; confirm the intended output location.
MODEL_CONFIG_sel = dict(
    params_cv=dict(
        model__n_estimators=[50],
        model__min_samples_split=[6],
        model__max_features=['sqrt', None],
        model__bootstrap=[False],
        model__n_jobs=[-1],
        model__random_state=[1234],
        model__low_memory=[True],
    ),
    refit=True,
    do_nested_resampling=True,
    path=RESULTS_DIR,
    fname_cv='results_imputed_lasso_sel',
)

# Pipeline definition as (step name, estimator) pairs; the forest is created
# with default arguments and receives its settings through 'params_cv'.
rsf_pipeline_steps = [('model', RandomSurvivalForest())]


In [3]:
# Separate modelling-process instance for the feature-selected run, so its data
# can be column-filtered independently. The load reports the same
# 1091 x 15495 matrix as the full-feature run (about 3 minutes in the recorded run).
mp_sel = ModellingProcess()
mp_sel.prepare_data(DATA_CONFIG_sel, PROJECT_ROOT)

2024-12-03 12:42:31,721 - INFO - Loading data...
2024-12-03 12:45:36,268 - INFO - Loaded data: 1091 samples, 15495 features


In [4]:
# Read the lasso coefficient table and keep the features with a non-zero
# coefficient. The first CSV column becomes the index; its labels are used as
# column names of mp_sel.X in the next cell (presumably gene identifiers).
# The path is built from PROJECT_ROOT (same anchor as prepare_data) instead of
# a '../../' string that silently depends on the current working directory.
# NOTE(review): if these coefficients were fitted on all cohorts, the nested
# resampling scores below are optimistically biased, because each held-out
# cohort already influenced the feature selection -- confirm how
# lasso_coefs_inter.csv was produced.
lasso_coef_path = os.path.join(PROJECT_ROOT, 'data', 'feature_sel', 'lasso_coefs_inter.csv')
lasso_coefs = pd.read_csv(lasso_coef_path, index_col=0)

# Keep the table and the resulting list under different names so that
# `sel_genes` is always a list of labels, never a DataFrame.
sel_genes = lasso_coefs.loc[lasso_coefs['coefficient'] != 0].index.tolist()

# An empty selection would only fail much later, inside model fitting.
if not sel_genes:
    raise ValueError(f"No features with a non-zero lasso coefficient found in {lasso_coef_path}")

In [5]:
# Keep only the lasso-selected columns of the feature matrix (all rows retained);
# the following modelling run logs the reduced shape.
mp_sel.X = mp_sel.X.loc[:, sel_genes]

In [6]:
# Nested resampling on the selected features (one outer fold per cohort, 9 in
# the recorded run), followed by tuning/refit on the complete data.
# In the recorded run the per-fold CV scores were saved to disk, but the final
# complete-data tuning stage was stopped with a KeyboardInterrupt, so
# nstd_res_result_sel was presumably never assigned in that session.
nstd_res_result_sel = mp_sel.do_modelling(rsf_pipeline_steps, MODEL_CONFIG_sel)

2024-12-03 12:46:05,295 - INFO - No additional monitoring detected
2024-12-03 12:46:05,297 - INFO - Start model training...
2024-12-03 12:46:05,299 - INFO - Input data shape: X=(1091, 76)
2024-12-03 12:46:05,300 - INFO - Nested resampling...
2024-12-03 12:46:05,302 - INFO - Starting nested resampling...
2024-12-03 12:46:05,304 - INFO - Data shape: X=(1091, 76), groups=9 unique
2024-12-03 12:46:05,329 - INFO - 
Outer fold 1
2024-12-03 12:46:05,335 - INFO - Test cohort: Atlanta_2014_Long


Fitting 8 folds for each of 2 candidates, totalling 16 fits


  _data = np.array(data, dtype=dtype, copy=copy,
2024-12-03 12:52:44,014 - INFO - Best parameters: {'model__bootstrap': False, 'model__low_memory': True, 'model__max_features': 'sqrt', 'model__min_samples_split': 6, 'model__n_estimators': 50, 'model__n_jobs': -1, 'model__random_state': 1234}
2024-12-03 12:52:44,015 - INFO - Test score: 0.665
2024-12-03 12:52:44,017 - INFO - 
Outer fold 2
2024-12-03 12:52:44,021 - INFO - Test cohort: Belfast_2018_Jain


Fitting 8 folds for each of 2 candidates, totalling 16 fits


  _data = np.array(data, dtype=dtype, copy=copy,
2024-12-03 12:57:45,059 - INFO - Best parameters: {'model__bootstrap': False, 'model__low_memory': True, 'model__max_features': 'sqrt', 'model__min_samples_split': 6, 'model__n_estimators': 50, 'model__n_jobs': -1, 'model__random_state': 1234}
2024-12-03 12:57:45,060 - INFO - Test score: 0.719
2024-12-03 12:57:45,061 - INFO - 
Outer fold 3
2024-12-03 12:57:45,064 - INFO - Test cohort: CPC_GENE_2017_Fraser


Fitting 8 folds for each of 2 candidates, totalling 16 fits


  _data = np.array(data, dtype=dtype, copy=copy,
2024-12-03 13:03:55,911 - INFO - Best parameters: {'model__bootstrap': False, 'model__low_memory': True, 'model__max_features': 'sqrt', 'model__min_samples_split': 6, 'model__n_estimators': 50, 'model__n_jobs': -1, 'model__random_state': 1234}
2024-12-03 13:03:55,912 - INFO - Test score: 0.654
2024-12-03 13:03:55,914 - INFO - 
Outer fold 4
2024-12-03 13:03:55,918 - INFO - Test cohort: CPGEA_2020_Li


Fitting 8 folds for each of 2 candidates, totalling 16 fits


  _data = np.array(data, dtype=dtype, copy=copy,
2024-12-03 13:11:23,791 - INFO - Best parameters: {'model__bootstrap': False, 'model__low_memory': True, 'model__max_features': 'sqrt', 'model__min_samples_split': 6, 'model__n_estimators': 50, 'model__n_jobs': -1, 'model__random_state': 1234}
2024-12-03 13:11:23,792 - INFO - Test score: 0.712
2024-12-03 13:11:23,793 - INFO - 
Outer fold 5
2024-12-03 13:11:23,796 - INFO - Test cohort: CamCap_2016_Ross_Adams


Fitting 8 folds for each of 2 candidates, totalling 16 fits


  _data = np.array(data, dtype=dtype, copy=copy,
2024-12-03 13:17:25,318 - INFO - Best parameters: {'model__bootstrap': False, 'model__low_memory': True, 'model__max_features': 'sqrt', 'model__min_samples_split': 6, 'model__n_estimators': 50, 'model__n_jobs': -1, 'model__random_state': 1234}
2024-12-03 13:17:25,320 - INFO - Test score: 0.861
2024-12-03 13:17:25,321 - INFO - 
Outer fold 6
2024-12-03 13:17:25,326 - INFO - Test cohort: CancerMap_2017_Luca


Fitting 8 folds for each of 2 candidates, totalling 16 fits


  _data = np.array(data, dtype=dtype, copy=copy,
2024-12-03 13:24:06,974 - INFO - Best parameters: {'model__bootstrap': False, 'model__low_memory': True, 'model__max_features': 'sqrt', 'model__min_samples_split': 6, 'model__n_estimators': 50, 'model__n_jobs': -1, 'model__random_state': 1234}
2024-12-03 13:24:06,976 - INFO - Test score: 0.729
2024-12-03 13:24:06,978 - INFO - 
Outer fold 7
2024-12-03 13:24:06,982 - INFO - Test cohort: DKFZ_2018_Gerhauser


Fitting 8 folds for each of 2 candidates, totalling 16 fits


  _data = np.array(data, dtype=dtype, copy=copy,
2024-12-03 13:30:22,657 - INFO - Best parameters: {'model__bootstrap': False, 'model__low_memory': True, 'model__max_features': 'sqrt', 'model__min_samples_split': 6, 'model__n_estimators': 50, 'model__n_jobs': -1, 'model__random_state': 1234}
2024-12-03 13:30:22,658 - INFO - Test score: 0.867
2024-12-03 13:30:22,659 - INFO - 
Outer fold 8
2024-12-03 13:30:22,664 - INFO - Test cohort: MSKCC_2010_Taylor


Fitting 8 folds for each of 2 candidates, totalling 16 fits


  _data = np.array(data, dtype=dtype, copy=copy,
2024-12-03 13:36:30,163 - INFO - Best parameters: {'model__bootstrap': False, 'model__low_memory': True, 'model__max_features': 'sqrt', 'model__min_samples_split': 6, 'model__n_estimators': 50, 'model__n_jobs': -1, 'model__random_state': 1234}
2024-12-03 13:36:30,165 - INFO - Test score: 0.736
2024-12-03 13:36:30,166 - INFO - 
Outer fold 9
2024-12-03 13:36:30,171 - INFO - Test cohort: Stockholm_2016_Ross_Adams


Fitting 8 folds for each of 2 candidates, totalling 16 fits


  _data = np.array(data, dtype=dtype, copy=copy,
2024-12-03 13:43:01,341 - INFO - Best parameters: {'model__bootstrap': False, 'model__low_memory': True, 'model__max_features': 'sqrt', 'model__min_samples_split': 6, 'model__n_estimators': 50, 'model__n_jobs': -1, 'model__random_state': 1234}
2024-12-03 13:43:01,342 - INFO - Test score: 0.740
2024-12-03 13:43:01,347 - INFO - Aggregated results:
2024-12-03 13:43:01,348 - INFO - Mean score: 0.743 ± 0.071
2024-12-03 13:43:01,350 - INFO - Individual scores: [np.float64(0.6653179190751445), np.float64(0.7194925493354812), np.float64(0.6538461538461539), np.float64(0.7122232916265641), np.float64(0.8606224627875507), np.float64(0.7286057051452945), np.float64(0.8668885191347754), np.float64(0.7364130434782609), np.float64(0.7401282483968951)]
2024-12-03 13:43:01,567 - INFO - Saved CV results to c:\Users\laeti\PCaPrognostics\models\rsf\results\results\results_imputed_lasso_sel_cv.csv
2024-12-03 13:43:01,574 - INFO - Do HP Tuning for complete m

Fitting 9 folds for each of 2 candidates, totalling 18 fits


KeyboardInterrupt: 