In [1]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import logging

# Setup paths
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.getcwd()))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Setup directories
MODEL_DIR = os.path.join(os.getcwd(), 'model')
RESULTS_DIR = os.path.join(os.getcwd(), 'results')
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)

# Imports
from preprocessing.data_container import DataContainer
from utils.evaluation import cindex_score
from models.modelling_process import ModellingProcess
from sksurv.ensemble import RandomSurvivalForest

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

In [None]:
# Data configuration
DATA_CONFIG = {
    'use_pca': False,
    'pca_threshold': 0.95,
    'gene_type': 'intersection',
    'use_imputed': True,
    'use_cohorts': False
}

# Model configuration
MODEL_CONFIG = {
    'params_cv': {
        'model__n_estimators': [1],
        'model__min_samples_split': [10]
    },
    'refit': True, 
    'do_nested_resampling': True, 
    'path' : RESULTS_DIR, 
    'fname_cv' : 'results'}

rsf_pipeline_steps = [('model', RandomSurvivalForest())]


In [3]:
mp = ModellingProcess()

In [4]:
mp.prepare_data(DATA_CONFIG, PROJECT_ROOT)

2024-11-22 12:40:12,394 - INFO - Loading data...
2024-11-22 12:42:38,126 - INFO - Loaded data: 1091 samples, 13214 features


In [5]:
nstd_res_result = mp.do_modelling(rsf_pipeline_steps, MODEL_CONFIG)

2024-11-22 12:42:51,771 - INFO - Start model training...
2024-11-22 12:42:51,773 - INFO - Input data shape: X=(1091, 13214)
2024-11-22 12:42:51,774 - INFO - Nested resampling...
2024-11-22 12:42:51,779 - INFO - Starting nested resampling...
2024-11-22 12:42:51,783 - INFO - Data shape: X=(1091, 13214), groups=9 unique
2024-11-22 12:42:51,835 - INFO - 
Outer fold 1
2024-11-22 12:42:51,916 - INFO - Test cohort: Atlanta_2014_Long


Fitting 8 folds for each of 1 candidates, totalling 8 fits


2024-11-22 12:43:20,640 - INFO - Best parameters: {'model__min_samples_split': 10, 'model__n_estimators': 1}
2024-11-22 12:43:20,642 - INFO - Test score: 0.585
2024-11-22 12:43:20,643 - INFO - 
Outer fold 2
2024-11-22 12:43:20,695 - INFO - Test cohort: Belfast_2018_Jain


Fitting 8 folds for each of 1 candidates, totalling 8 fits


2024-11-22 12:43:35,006 - INFO - Best parameters: {'model__min_samples_split': 10, 'model__n_estimators': 1}
2024-11-22 12:43:35,007 - INFO - Test score: 0.517
2024-11-22 12:43:35,008 - INFO - 
Outer fold 3
2024-11-22 12:43:35,058 - INFO - Test cohort: CPC_GENE_2017_Fraser


Fitting 8 folds for each of 1 candidates, totalling 8 fits


2024-11-22 12:43:54,097 - INFO - Best parameters: {'model__min_samples_split': 10, 'model__n_estimators': 1}
2024-11-22 12:43:54,098 - INFO - Test score: 0.567
2024-11-22 12:43:54,100 - INFO - 
Outer fold 4
2024-11-22 12:43:54,157 - INFO - Test cohort: CPGEA_2020_Li


Fitting 8 folds for each of 1 candidates, totalling 8 fits


2024-11-22 12:44:10,512 - INFO - Best parameters: {'model__min_samples_split': 10, 'model__n_estimators': 1}
2024-11-22 12:44:10,512 - INFO - Test score: 0.584
2024-11-22 12:44:10,512 - INFO - 
Outer fold 5
2024-11-22 12:44:10,567 - INFO - Test cohort: CamCap_2016_Ross_Adams


Fitting 8 folds for each of 1 candidates, totalling 8 fits


2024-11-22 12:44:25,797 - INFO - Best parameters: {'model__min_samples_split': 10, 'model__n_estimators': 1}
2024-11-22 12:44:25,799 - INFO - Test score: 0.604
2024-11-22 12:44:25,800 - INFO - 
Outer fold 6
2024-11-22 12:44:25,862 - INFO - Test cohort: CancerMap_2017_Luca


Fitting 8 folds for each of 1 candidates, totalling 8 fits


2024-11-22 12:44:44,494 - INFO - Best parameters: {'model__min_samples_split': 10, 'model__n_estimators': 1}
2024-11-22 12:44:44,494 - INFO - Test score: 0.562
2024-11-22 12:44:44,494 - INFO - 
Outer fold 7
2024-11-22 12:44:44,530 - INFO - Test cohort: DKFZ_2018_Gerhauser


Fitting 8 folds for each of 1 candidates, totalling 8 fits


2024-11-22 12:45:01,938 - INFO - Best parameters: {'model__min_samples_split': 10, 'model__n_estimators': 1}
2024-11-22 12:45:01,939 - INFO - Test score: 0.703
2024-11-22 12:45:01,939 - INFO - 
Outer fold 8
2024-11-22 12:45:01,982 - INFO - Test cohort: MSKCC_2010_Taylor


Fitting 8 folds for each of 1 candidates, totalling 8 fits


2024-11-22 12:45:16,863 - INFO - Best parameters: {'model__min_samples_split': 10, 'model__n_estimators': 1}
2024-11-22 12:45:16,863 - INFO - Test score: 0.630
2024-11-22 12:45:16,863 - INFO - 
Outer fold 9
2024-11-22 12:45:16,916 - INFO - Test cohort: Stockholm_2016_Ross_Adams


Fitting 8 folds for each of 1 candidates, totalling 8 fits


2024-11-22 12:45:30,803 - INFO - Best parameters: {'model__min_samples_split': 10, 'model__n_estimators': 1}
2024-11-22 12:45:30,803 - INFO - Test score: 0.568
2024-11-22 12:45:30,803 - INFO - Aggregated results:
2024-11-22 12:45:30,803 - INFO - Mean score: 0.591 ± 0.049
2024-11-22 12:45:30,803 - INFO - Individual scores: [np.float64(0.5852601156069365), np.float64(0.516864679822795), np.float64(0.5665280665280665), np.float64(0.5837343599615015), np.float64(0.6035182679296346), np.float64(0.5618501732871234), np.float64(0.7034109816971714), np.float64(0.6298524844720497), np.float64(0.5678366520418495)]
2024-11-22 12:45:30,866 - INFO - Saved CV results to c:\Users\laeti\PCaPrognostics\models\rsf\results\results\results_cv_cv_results.csv
2024-11-22 12:45:30,882 - INFO - Do HP Tuning for complete model; refit + set complete model
2024-11-22 12:45:30,882 - INFO - Do HP Tuning for complete model


Fitting 9 folds for each of 1 candidates, totalling 9 fits


In [None]:
# Save nested resampling result and retrained model (TODO: Add this to modelling process)