In [6]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import logging

# Setup paths
# Read the working directory once; the project root is its grandparent directory.
_cwd = os.getcwd()
PROJECT_ROOT = os.path.dirname(os.path.dirname(_cwd))
# Register the project root on sys.path only when it is not already listed,
# so re-running this cell does not add duplicate entries.
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path.append(PROJECT_ROOT)

# Setup directories
# Both output folders sit directly under the current working directory and
# are created on demand (an already existing folder is not an error).
MODEL_DIR, RESULTS_DIR = (os.path.join(_cwd, _name) for _name in ('model', 'results'))
for _out_dir in (MODEL_DIR, RESULTS_DIR):
    os.makedirs(_out_dir, exist_ok=True)

# Imports
from preprocessing.data_container import DataContainer
from utils.evaluation import cindex_score
from models.modelling_process import ModellingProcess
from models.deep_surv_model import DeepSurvModel

# Setup logging
# Record layout: timestamp - level name - message.
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
# Configure the root logger at INFO level with the layout above.
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
# Logger named after the current module, for messages emitted from this notebook.
logger = logging.getLogger(__name__)

In [8]:
# Data configuration
# Options passed to mp.prepare_data() together with PROJECT_ROOT.
DATA_CONFIG = {
    'use_pca': True,              # the recorded log shows an "Applying PCA..." step when this is on
    'gene_type': 'intersection',  # presumably selects which gene set is loaded - TODO confirm
    'use_imputed': True,          # presumably switches to imputed expression data - TODO confirm
    'use_cohorts': False,         # NOTE(review): exact meaning not visible here - confirm in ModellingProcess
}

# Model configuration
# Settings passed to mp.do_modelling() together with the pipeline steps.
MODEL_CONFIG = {
    # Hyperparameter grid for the cross-validated search. Only the batch size
    # has more than one value, which gives the 2 candidates per search
    # reported in the recorded log.
    'params_cv': {
        'model__hidden_layers': [[16, 16]],  # a single candidate; the inner list is one setting
        'model__learning_rate': [0.01],
        'model__batch_size': [64, 256],
        'model__num_epochs': [10],
    },
    # The recorded log shows hyperparameter tuning and a refit on the complete
    # data after the nested resampling when this is enabled.
    'refit': True,
    # The recorded log shows one outer fold per test cohort when this is enabled.
    'do_nested_resampling': True,
}

# Pipeline steps for the DeepSurv run: a single step named 'model'.
# The keys in MODEL_CONFIG['params_cv'] all start with 'model__'; presumably this
# follows the scikit-learn '<step>__<param>' convention, so renaming the step
# would require renaming those keys too - confirm against ModellingProcess.
# NOTE(review): DeepSurvModel() is built with its default arguments here.
ds_pipeline_steps = [('model', DeepSurvModel())]

In [9]:
# Create the modelling-process object; the following cells call its
# prepare_data() and do_modelling() methods.
mp = ModellingProcess()

In [10]:
# Load and prepare the data according to DATA_CONFIG, passing PROJECT_ROOT along.
# In the recorded run below this took just under two minutes (most of it in the
# "Loading data..." phase) and ended with 1091 samples x 756 PCA components.
mp.prepare_data(DATA_CONFIG, PROJECT_ROOT)

2024-11-15 14:40:59,095 - INFO - Loading data...
2024-11-15 14:42:31,496 - INFO - Applying PCA...
2024-11-15 14:42:36,118 - INFO - Selected 756 components explaining 95.0% of variance
2024-11-15 14:42:46,550 - INFO - Loaded data: 1091 samples, 756 features


In [11]:
# Run the modelling process for the DeepSurv pipeline using MODEL_CONFIG.
# The recorded log below shows nested resampling with one outer fold per test
# cohort (9 cohorts), a 2-candidate grid search inside each fold, and a final
# hyperparameter search on the complete data.
# nstd_res_result presumably holds the nested-resampling results - confirm
# against ModellingProcess.do_modelling.
# NOTE(review): no random seed is set in the visible cells, so the scores may
# differ between runs.
nstd_res_result = mp.do_modelling(ds_pipeline_steps, MODEL_CONFIG)

2024-11-15 14:42:53,428 - INFO - Start model training...
2024-11-15 14:42:53,430 - INFO - Input data shape: X=(1091, 756)
2024-11-15 14:42:53,431 - INFO - Nested resampling...
2024-11-15 14:42:53,432 - INFO - Starting nested resampling...
2024-11-15 14:42:53,434 - INFO - Data shape: X=(1091, 756), groups=9 unique
2024-11-15 14:42:53,438 - INFO - 
Outer fold 1
2024-11-15 14:42:53,456 - INFO - Test cohort: Atlanta_2014_Long


Fitting 8 folds for each of 2 candidates, totalling 16 fits


  _data = np.array(data, dtype=dtype, copy=copy,
2024-11-15 14:43:06,896 - INFO - Best parameters: {'model__batch_size': 64, 'model__hidden_layers': [16, 16], 'model__learning_rate': 0.01, 'model__num_epochs': 10}
2024-11-15 14:43:06,900 - INFO - Test score: 0.487
2024-11-15 14:43:06,905 - INFO - 
Outer fold 2
2024-11-15 14:43:06,951 - INFO - Test cohort: Belfast_2018_Jain


Fitting 8 folds for each of 2 candidates, totalling 16 fits


2024-11-15 14:43:08,246 - INFO - Best parameters: {'model__batch_size': 64, 'model__hidden_layers': [16, 16], 'model__learning_rate': 0.01, 'model__num_epochs': 10}
2024-11-15 14:43:08,246 - INFO - Test score: 0.617
2024-11-15 14:43:08,247 - INFO - 
Outer fold 3
2024-11-15 14:43:08,250 - INFO - Test cohort: CPC_GENE_2017_Fraser


Fitting 8 folds for each of 2 candidates, totalling 16 fits


2024-11-15 14:43:09,505 - INFO - Best parameters: {'model__batch_size': 64, 'model__hidden_layers': [16, 16], 'model__learning_rate': 0.01, 'model__num_epochs': 10}
2024-11-15 14:43:09,506 - INFO - Test score: 0.526
2024-11-15 14:43:09,507 - INFO - 
Outer fold 4
2024-11-15 14:43:09,511 - INFO - Test cohort: CPGEA_2020_Li


Fitting 8 folds for each of 2 candidates, totalling 16 fits


2024-11-15 14:43:10,633 - INFO - Best parameters: {'model__batch_size': 256, 'model__hidden_layers': [16, 16], 'model__learning_rate': 0.01, 'model__num_epochs': 10}
2024-11-15 14:43:10,633 - INFO - Test score: 0.459
2024-11-15 14:43:10,634 - INFO - 
Outer fold 5
2024-11-15 14:43:10,637 - INFO - Test cohort: CamCap_2016_Ross_Adams


Fitting 8 folds for each of 2 candidates, totalling 16 fits


2024-11-15 14:43:11,731 - INFO - Best parameters: {'model__batch_size': 256, 'model__hidden_layers': [16, 16], 'model__learning_rate': 0.01, 'model__num_epochs': 10}
2024-11-15 14:43:11,732 - INFO - Test score: 0.410
2024-11-15 14:43:11,733 - INFO - 
Outer fold 6
2024-11-15 14:43:11,737 - INFO - Test cohort: CancerMap_2017_Luca


Fitting 8 folds for each of 2 candidates, totalling 16 fits


2024-11-15 14:43:12,860 - INFO - Best parameters: {'model__batch_size': 64, 'model__hidden_layers': [16, 16], 'model__learning_rate': 0.01, 'model__num_epochs': 10}
2024-11-15 14:43:12,860 - INFO - Test score: 0.532
2024-11-15 14:43:12,861 - INFO - 
Outer fold 7
2024-11-15 14:43:12,864 - INFO - Test cohort: DKFZ_2018_Gerhauser


Fitting 8 folds for each of 2 candidates, totalling 16 fits


2024-11-15 14:43:13,994 - INFO - Best parameters: {'model__batch_size': 256, 'model__hidden_layers': [16, 16], 'model__learning_rate': 0.01, 'model__num_epochs': 10}
2024-11-15 14:43:13,995 - INFO - Test score: 0.659
2024-11-15 14:43:13,995 - INFO - 
Outer fold 8
2024-11-15 14:43:13,998 - INFO - Test cohort: MSKCC_2010_Taylor


Fitting 8 folds for each of 2 candidates, totalling 16 fits


2024-11-15 14:43:15,616 - INFO - Best parameters: {'model__batch_size': 256, 'model__hidden_layers': [16, 16], 'model__learning_rate': 0.01, 'model__num_epochs': 10}
2024-11-15 14:43:15,617 - INFO - Test score: 0.564
2024-11-15 14:43:15,618 - INFO - 
Outer fold 9
2024-11-15 14:43:15,624 - INFO - Test cohort: Stockholm_2016_Ross_Adams


Fitting 8 folds for each of 2 candidates, totalling 16 fits


2024-11-15 14:43:17,173 - INFO - Best parameters: {'model__batch_size': 256, 'model__hidden_layers': [16, 16], 'model__learning_rate': 0.01, 'model__num_epochs': 10}
2024-11-15 14:43:17,174 - INFO - Test score: 0.532
2024-11-15 14:43:17,175 - INFO - Aggregated results:
2024-11-15 14:43:17,176 - INFO - Mean score: 0.532 ± 0.072
2024-11-15 14:43:17,178 - INFO - Individual scores: [0.4867052023121387, 0.616794200563834, 0.525987525987526, 0.45877446262431826, 0.4100135317997294, 0.5323913623033858, 0.6589018302828619, 0.5644409937888198, 0.5318933513331083]
2024-11-15 14:43:17,180 - INFO - Do HP Tuning for complete model; refit + set complete model
2024-11-15 14:43:17,182 - INFO - Do HP Tuning for complete model


Fitting 9 folds for each of 2 candidates, totalling 18 fits


