In [37]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import logging

# Setup paths
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.getcwd()))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Setup directories
MODEL_DIR = os.path.join(os.getcwd(), 'model')
RESULTS_DIR = os.path.join(os.getcwd(), 'results')
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)

# Imports
from preprocessing.data_container import DataContainer
from utils.evaluation import cindex_score
from models.modelling_process import ModellingProcess
from sksurv.ensemble import RandomSurvivalForest
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
from utils.evaluation import EarlyStoppingMonitor

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

In [None]:
monitor = EarlyStoppingMonitor(10, 5)

# Data configuration
DATA_CONFIG = {
    'use_pca': True,
    'pca_threshold': 0.95,
    'gene_type': 'intersection',
    'use_imputed': False,
}

MODEL_CONFIG = {
    'params_cv': {
        'model__n_estimators': [500],
        'model__learning_rate': [0.1],
        'model__max_depth': [3, 5],
        'model__min_samples_split': [5, 10],
        'model__min_samples_leaf': [3, 5],
        'model__subsample': [0.9],
        'model__max_features': ['sqrt', None]
    },
    'refit': True, 
    'do_nested_resampling': True, 
    'monitor' : monitor, 
    'path' : RESULTS_DIR, 
    'fname_cv' : 'results_imputed'}

# validation_fraction=0.1 as a mean to inclued early stopping
rsf_pipeline_steps = [('model', GradientBoostingSurvivalAnalysis())]



In [4]:
mp = ModellingProcess()
mp.prepare_data(DATA_CONFIG, PROJECT_ROOT)

2024-11-29 12:30:21,996 - INFO - Loading data...
2024-11-29 12:32:59,737 - INFO - Loaded data: 1091 samples, 15495 features


In [39]:
enc = pd.read_csv('../../encoded_exprs_data_5 (1).csv', index_col=0).set_index(mp.X.index)

In [41]:
mp.X = enc 

In [43]:
nstd_res_result = mp.do_modelling(rsf_pipeline_steps, MODEL_CONFIG)

2024-11-29 13:27:41,142 - INFO - Start model training...
2024-11-29 13:27:41,143 - INFO - Input data shape: X=(1091, 350)
2024-11-29 13:27:41,143 - INFO - Nested resampling...
2024-11-29 13:27:41,144 - INFO - Starting nested resampling...
2024-11-29 13:27:41,146 - INFO - Data shape: X=(1091, 350), groups=9 unique
2024-11-29 13:27:41,148 - INFO - 
Outer fold 1
2024-11-29 13:27:41,152 - INFO - Test cohort: Atlanta_2014_Long


Fitting 8 folds for each of 16 candidates, totalling 128 fits


2024-11-29 13:28:10,080 - INFO - number of iterations early stopping: 11
2024-11-29 13:28:10,080 - INFO - Best parameters: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 5, 'model__min_samples_split': 10, 'model__n_estimators': 500, 'model__subsample': 0.9}
2024-11-29 13:28:10,080 - INFO - Test score: 0.655
2024-11-29 13:28:10,080 - INFO - 
Outer fold 2
2024-11-29 13:28:10,080 - INFO - Test cohort: Belfast_2018_Jain


Fitting 8 folds for each of 16 candidates, totalling 128 fits


2024-11-29 13:28:31,448 - INFO - number of iterations early stopping: 105
2024-11-29 13:28:31,448 - INFO - Best parameters: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 5, 'model__min_samples_split': 10, 'model__n_estimators': 500, 'model__subsample': 0.9}
2024-11-29 13:28:31,448 - INFO - Test score: 0.642
2024-11-29 13:28:31,448 - INFO - 
Outer fold 3
2024-11-29 13:28:31,448 - INFO - Test cohort: CPC_GENE_2017_Fraser


Fitting 8 folds for each of 16 candidates, totalling 128 fits


2024-11-29 13:29:15,086 - INFO - number of iterations early stopping: 40
2024-11-29 13:29:15,086 - INFO - Best parameters: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 5, 'model__min_samples_split': 10, 'model__n_estimators': 500, 'model__subsample': 0.9}
2024-11-29 13:29:15,086 - INFO - Test score: 0.540
2024-11-29 13:29:15,086 - INFO - 
Outer fold 4
2024-11-29 13:29:15,102 - INFO - Test cohort: CPGEA_2020_Li


Fitting 8 folds for each of 16 candidates, totalling 128 fits


2024-11-29 13:29:57,660 - INFO - number of iterations early stopping: 284
2024-11-29 13:29:57,666 - INFO - Best parameters: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 3, 'model__min_samples_split': 10, 'model__n_estimators': 500, 'model__subsample': 0.9}
2024-11-29 13:29:57,666 - INFO - Test score: 0.666
2024-11-29 13:29:57,667 - INFO - 
Outer fold 5
2024-11-29 13:29:57,669 - INFO - Test cohort: CamCap_2016_Ross_Adams


Fitting 8 folds for each of 16 candidates, totalling 128 fits


2024-11-29 13:30:41,344 - INFO - number of iterations early stopping: 23
2024-11-29 13:30:41,359 - INFO - Best parameters: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 3, 'model__min_samples_split': 10, 'model__n_estimators': 500, 'model__subsample': 0.9}
2024-11-29 13:30:41,360 - INFO - Test score: 0.809
2024-11-29 13:30:41,360 - INFO - 
Outer fold 6
2024-11-29 13:30:41,360 - INFO - Test cohort: CancerMap_2017_Luca


Fitting 8 folds for each of 16 candidates, totalling 128 fits


2024-11-29 13:31:31,522 - INFO - number of iterations early stopping: 86
2024-11-29 13:31:31,527 - INFO - Best parameters: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 5, 'model__min_samples_split': 10, 'model__n_estimators': 500, 'model__subsample': 0.9}
2024-11-29 13:31:31,528 - INFO - Test score: 0.682
2024-11-29 13:31:31,529 - INFO - 
Outer fold 7
2024-11-29 13:31:31,532 - INFO - Test cohort: DKFZ_2018_Gerhauser


Fitting 8 folds for each of 16 candidates, totalling 128 fits


2024-11-29 13:32:17,567 - INFO - number of iterations early stopping: 83
2024-11-29 13:32:17,567 - INFO - Best parameters: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 3, 'model__min_samples_split': 5, 'model__n_estimators': 500, 'model__subsample': 0.9}
2024-11-29 13:32:17,582 - INFO - Test score: 0.790
2024-11-29 13:32:17,583 - INFO - 
Outer fold 8
2024-11-29 13:32:17,584 - INFO - Test cohort: MSKCC_2010_Taylor


Fitting 8 folds for each of 16 candidates, totalling 128 fits


2024-11-29 13:33:02,237 - INFO - number of iterations early stopping: 42
2024-11-29 13:33:02,237 - INFO - Best parameters: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 3, 'model__min_samples_split': 10, 'model__n_estimators': 500, 'model__subsample': 0.9}
2024-11-29 13:33:02,237 - INFO - Test score: 0.634
2024-11-29 13:33:02,237 - INFO - 
Outer fold 9
2024-11-29 13:33:02,237 - INFO - Test cohort: Stockholm_2016_Ross_Adams


Fitting 8 folds for each of 16 candidates, totalling 128 fits


2024-11-29 13:33:52,862 - INFO - number of iterations early stopping: 71
2024-11-29 13:33:52,878 - INFO - Best parameters: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__max_features': None, 'model__min_samples_leaf': 3, 'model__min_samples_split': 10, 'model__n_estimators': 500, 'model__subsample': 0.9}
2024-11-29 13:33:52,878 - INFO - Test score: 0.719
2024-11-29 13:33:52,878 - INFO - Aggregated results:
2024-11-29 13:33:52,878 - INFO - Mean score: 0.682 ± 0.078
2024-11-29 13:33:52,878 - INFO - Individual scores: [np.float64(0.6547687861271676), np.float64(0.6423177607732582), np.float64(0.5395010395010394), np.float64(0.6660250240615977), np.float64(0.8092016238159675), np.float64(0.6819514796054386), np.float64(0.7903494176372712), np.float64(0.6339285714285714), np.float64(0.7188660141748228)]
2024-11-29 13:33:52,930 - INFO - Saved CV results to c:\Users\laeti\PCaPrognostics\models\playground_tbd\results\results\results_imputed_cv.csv
2024-11-29 13:33:52,933 - INFO -

Fitting 9 folds for each of 16 candidates, totalling 144 fits


KeyboardInterrupt: 

In [34]:
mp.resampling_cmplt.cv_results_

{'mean_fit_time': array([176.497577]),
 'std_fit_time': array([48.86732736]),
 'mean_score_time': array([1.49711892]),
 'std_score_time': array([1.38424093]),
 'param_model__bootstrap': masked_array(data=[False],
              mask=[False],
        fill_value=True),
 'param_model__low_memory': masked_array(data=[True],
              mask=[False],
        fill_value=True),
 'param_model__max_features': masked_array(data=[None],
              mask=[False],
        fill_value=np.str_('?'),
             dtype=object),
 'param_model__min_samples_split': masked_array(data=[52],
              mask=[False],
        fill_value=999999),
 'param_model__n_estimators': masked_array(data=[50],
              mask=[False],
        fill_value=999999),
 'param_model__n_jobs': masked_array(data=[-1],
              mask=[False],
        fill_value=999999),
 'param_model__random_state': masked_array(data=[1234],
              mask=[False],
        fill_value=999999),
 'params': [{'model__bootstrap': False,