In [1]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import logging

# Make the repository importable: the project root is taken to be two directory
# levels above the current working directory.
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.getcwd()))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Output folders for fitted models and results, created inside the working directory
MODEL_DIR, RESULTS_DIR = (
    os.path.join(os.getcwd(), subfolder) for subfolder in ('model', 'results')
)
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)

# Imports
from preprocessing.data_container import DataContainer
from utils.evaluation import cindex_score
from models.modelling_process import ModellingProcess
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
from utils.evaluation import EarlyStoppingMonitor

# Configure the root logger: INFO and above, each record prefixed with
# timestamp and level name.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

# Logger used by this notebook
logger = logging.getLogger(__name__)

In [None]:
# Early-stopping monitor; it reaches the modelling process via MODEL_CONFIG['monitor'].
# NOTE(review): the positional arguments (10, 5) are presumably a window size and
# the maximum number of iterations without improvement — confirm against
# utils.evaluation.EarlyStoppingMonitor.
monitor = EarlyStoppingMonitor(10, 5)

# Seed for the stochastic parts of gradient boosting (row subsampling through
# `subsample` < 1 and feature subsampling through `max_features='sqrt'`), so that
# Restart & Run All reproduces the same fits.
RANDOM_STATE = 42

# Data configuration
DATA_CONFIG = {
    'use_pca': False,
    'pca_threshold': 0.95,
    'gene_type': 'intersection',
    'use_imputed': True,
    'use_cohorts': False
}

# Model configuration
# Keys in 'params_cv' are prefixed with 'model__' to address the pipeline step
# named 'model' defined at the bottom of this cell.
# NOTE(review): this cell carries no execution count, and the stored outputs
# further down report 'model__subsample': 0.8 with 2 candidates per search.
# This grid (subsample 0.9, 3 * 2 * 2 * 2 = 24 combinations) cannot have produced
# them — re-run the notebook from a fresh kernel before trusting those scores.
MODEL_CONFIG = {
    'params_cv': {
        'model__n_estimators': [1000],
        'model__learning_rate': [0.1, 0.01, 0.005],
        'model__max_depth': [3, 5],
        'model__min_samples_split': [5, 10],
        'model__min_samples_leaf': [3, 5],
        'model__subsample': [0.9],
        'model__max_features': ['sqrt']
    },
    'refit': True,
    'do_nested_resampling': True,
    'monitor': monitor,
    'path': RESULTS_DIR,
    'fname_cv': 'results'
}

# Pipeline with a single gradient-boosting survival model. Early stopping is
# provided by `monitor` above; no validation_fraction / n_iter_no_change is
# passed to the estimator, so its built-in early-stopping options are not used.
gb_pipeline_steps = [
    ('model', GradientBoostingSurvivalAnalysis(random_state=RANDOM_STATE))
]


In [3]:
# Create the project's ModellingProcess object; the following cells call
# prepare_data and do_modelling on it.
mp = ModellingProcess()

In [4]:
# Load the dataset as specified by DATA_CONFIG. PROJECT_ROOT is passed as the
# second argument, presumably so the loader can locate the data files —
# confirm in ModellingProcess.prepare_data.
mp.prepare_data(DATA_CONFIG, PROJECT_ROOT)

2024-11-23 10:23:11,090 - INFO - Loading data...
2024-11-23 10:25:36,462 - INFO - Loaded data: 1091 samples, 13214 features


In [5]:
# Run the modelling step with the gradient-boosting pipeline steps and MODEL_CONFIG.
# The return value is kept in nstd_res_result; presumably the nested-resampling
# results, given 'do_nested_resampling': True — confirm in
# ModellingProcess.do_modelling.
nstd_res_result = mp.do_modelling(gb_pipeline_steps, MODEL_CONFIG)

2024-11-23 10:25:49,907 - INFO - Start model training...
2024-11-23 10:25:49,907 - INFO - Input data shape: X=(1091, 13214)
2024-11-23 10:25:49,922 - INFO - Nested resampling...
2024-11-23 10:25:49,922 - INFO - Starting nested resampling...
2024-11-23 10:25:49,922 - INFO - Data shape: X=(1091, 13214), groups=9 unique
2024-11-23 10:25:49,956 - INFO - 
Outer fold 1
2024-11-23 10:25:50,017 - INFO - Test cohort: Atlanta_2014_Long


Fitting 8 folds for each of 2 candidates, totalling 16 fits


2024-11-23 10:26:20,181 - INFO - number of iterations early stopping: 79
2024-11-23 10:26:20,266 - INFO - Best parameters: {'model__learning_rate': 0.005, 'model__max_depth': 5, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 5, 'model__min_samples_split': 10, 'model__n_estimators': 1000, 'model__subsample': 0.8}
2024-11-23 10:26:20,267 - INFO - Test score: 0.664
2024-11-23 10:26:20,268 - INFO - 
Outer fold 2
2024-11-23 10:26:20,302 - INFO - Test cohort: Belfast_2018_Jain


Fitting 8 folds for each of 2 candidates, totalling 16 fits


2024-11-23 10:26:43,053 - INFO - number of iterations early stopping: 138
2024-11-23 10:26:43,134 - INFO - Best parameters: {'model__learning_rate': 0.005, 'model__max_depth': 5, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 5, 'model__min_samples_split': 10, 'model__n_estimators': 1000, 'model__subsample': 0.8}
2024-11-23 10:26:43,134 - INFO - Test score: 0.654
2024-11-23 10:26:43,134 - INFO - 
Outer fold 3
2024-11-23 10:26:43,187 - INFO - Test cohort: CPC_GENE_2017_Fraser


Fitting 8 folds for each of 2 candidates, totalling 16 fits


2024-11-23 10:26:58,067 - INFO - number of iterations early stopping: 17
2024-11-23 10:26:58,133 - INFO - Best parameters: {'model__learning_rate': 0.005, 'model__max_depth': 5, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 5, 'model__min_samples_split': 10, 'model__n_estimators': 1000, 'model__subsample': 0.8}
2024-11-23 10:26:58,134 - INFO - Test score: 0.709
2024-11-23 10:26:58,136 - INFO - 
Outer fold 4
2024-11-23 10:26:58,180 - INFO - Test cohort: CPGEA_2020_Li


Fitting 8 folds for each of 2 candidates, totalling 16 fits


2024-11-23 10:27:34,255 - INFO - number of iterations early stopping: 207
2024-11-23 10:27:34,322 - INFO - Best parameters: {'model__learning_rate': 0.005, 'model__max_depth': 5, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 5, 'model__min_samples_split': 10, 'model__n_estimators': 1000, 'model__subsample': 0.8}
2024-11-23 10:27:34,322 - INFO - Test score: 0.681
2024-11-23 10:27:34,322 - INFO - 
Outer fold 5
2024-11-23 10:27:34,365 - INFO - Test cohort: CamCap_2016_Ross_Adams


Fitting 8 folds for each of 2 candidates, totalling 16 fits


2024-11-23 10:28:04,137 - INFO - number of iterations early stopping: 209
2024-11-23 10:28:04,211 - INFO - Best parameters: {'model__learning_rate': 0.005, 'model__max_depth': 5, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 5, 'model__min_samples_split': 10, 'model__n_estimators': 1000, 'model__subsample': 0.8}
2024-11-23 10:28:04,212 - INFO - Test score: 0.832
2024-11-23 10:28:04,212 - INFO - 
Outer fold 6
2024-11-23 10:28:04,259 - INFO - Test cohort: CancerMap_2017_Luca


Fitting 8 folds for each of 2 candidates, totalling 16 fits


2024-11-23 10:28:28,089 - INFO - number of iterations early stopping: 114
2024-11-23 10:28:28,149 - INFO - Best parameters: {'model__learning_rate': 0.005, 'model__max_depth': 5, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 5, 'model__min_samples_split': 10, 'model__n_estimators': 1000, 'model__subsample': 0.8}
2024-11-23 10:28:28,149 - INFO - Test score: 0.692
2024-11-23 10:28:28,149 - INFO - 
Outer fold 7
2024-11-23 10:28:28,186 - INFO - Test cohort: DKFZ_2018_Gerhauser


Fitting 8 folds for each of 2 candidates, totalling 16 fits


2024-11-23 10:28:56,444 - INFO - number of iterations early stopping: 72
2024-11-23 10:28:56,511 - INFO - Best parameters: {'model__learning_rate': 0.005, 'model__max_depth': 5, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 5, 'model__min_samples_split': 10, 'model__n_estimators': 1000, 'model__subsample': 0.8}
2024-11-23 10:28:56,511 - INFO - Test score: 0.837
2024-11-23 10:28:56,511 - INFO - 
Outer fold 8
2024-11-23 10:28:56,546 - INFO - Test cohort: MSKCC_2010_Taylor


Fitting 8 folds for each of 2 candidates, totalling 16 fits


2024-11-23 10:29:32,749 - INFO - number of iterations early stopping: 36
2024-11-23 10:29:32,813 - INFO - Best parameters: {'model__learning_rate': 0.01, 'model__max_depth': 5, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 5, 'model__min_samples_split': 10, 'model__n_estimators': 1000, 'model__subsample': 0.8}
2024-11-23 10:29:32,813 - INFO - Test score: 0.731
2024-11-23 10:29:32,813 - INFO - 
Outer fold 9
2024-11-23 10:29:32,859 - INFO - Test cohort: Stockholm_2016_Ross_Adams


Fitting 8 folds for each of 2 candidates, totalling 16 fits


2024-11-23 10:30:17,836 - INFO - number of iterations early stopping: 195
2024-11-23 10:30:17,909 - INFO - Best parameters: {'model__learning_rate': 0.005, 'model__max_depth': 5, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 5, 'model__min_samples_split': 10, 'model__n_estimators': 1000, 'model__subsample': 0.8}
2024-11-23 10:30:17,912 - INFO - Test score: 0.723
2024-11-23 10:30:17,914 - INFO - Aggregated results:
2024-11-23 10:30:17,915 - INFO - Mean score: 0.725 ± 0.063
2024-11-23 10:30:17,916 - INFO - Individual scores: [np.float64(0.6644508670520232), np.float64(0.6537454691904954), np.float64(0.7089397089397089), np.float64(0.6811036252807187), np.float64(0.8322056833558863), np.float64(0.6918155158624367), np.float64(0.8369384359400999), np.float64(0.7309782608695652), np.float64(0.7232534593317583)]
2024-11-23 10:30:17,995 - INFO - Saved CV results to c:\Users\laeti\PCaPrognostics\models\grad_boost\results\results\results_cv.csv
2024-11-23 10:30:17,998 - INFO - Do HP

Fitting 9 folds for each of 2 candidates, totalling 18 fits
