In [1]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import logging

# Resolve every location relative to the directory the kernel was started in.
_working_dir = Path.cwd()

# The project root is taken to be two levels above the working directory.
# It is appended to sys.path (only once), presumably so that the
# project-local packages imported further down can be found.
PROJECT_ROOT = str(_working_dir.parent.parent)
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Output directories next to the notebook; created if they do not exist yet.
MODEL_DIR = str(_working_dir / 'model')
RESULTS_DIR = str(_working_dir / 'results')
for _output_dir in (MODEL_DIR, RESULTS_DIR):
    os.makedirs(_output_dir, exist_ok=True)

# Imports
from preprocessing.data_container import DataContainer
from utils.evaluation import cindex_score
from models.modelling_process import ModellingProcess
from models.deep_surv_model import DeepSurvModel

# Configure the root logger: INFO and above, each record rendered as
# "<timestamp> - <level name> - <message>".
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Logger for messages emitted by this notebook itself.
logger = logging.getLogger(__name__)

In [2]:
# Data configuration: handed to mp.prepare_data() in a later cell.
DATA_CONFIG = {
    'use_pca': False,
    'gene_type': 'intersection',
    'use_imputed': True,
    'use_cohorts': False,
}

# Hyperparameter candidates for the search. Each key carries the 'model__'
# prefix, matching the pipeline step named 'model' (presumably scikit-learn's
# <step>__<parameter> convention -- confirm in ModellingProcess).
# Only the batch size has more than one candidate, giving 2 combinations.
_DEEPSURV_PARAM_GRID = {
    'model__hidden_layers': [[16, 16]],
    'model__learning_rate': [0.01],
    'model__batch_size': [64, 256],
    'model__num_epochs': [10],
}

# Model configuration: handed to mp.do_modelling() in a later cell.
MODEL_CONFIG = {
    'params_cv': _DEEPSURV_PARAM_GRID,
    'refit': True,
    'do_nested_resampling': True,
}

# Pipeline definition passed to mp.do_modelling() in a later cell: a single
# step named 'model' holding a DeepSurvModel built with its default arguments.
# The step name matches the 'model__' prefix of the keys in
# MODEL_CONFIG['params_cv'].
ds_pipeline_steps = [('model', DeepSurvModel())]


In [3]:
# Driver object for this notebook; the following cells call prepare_data()
# and do_modelling() on it.
mp = ModellingProcess()

In [4]:
# Load the data selected by DATA_CONFIG. PROJECT_ROOT is passed along as well,
# presumably so the data files can be located -- confirm in
# ModellingProcess.prepare_data.
# Slow step: in the recorded run below, loading took about three minutes.
mp.prepare_data(DATA_CONFIG, PROJECT_ROOT)

2024-11-14 23:47:58,284 - INFO - Loading data...
2024-11-14 23:51:12,203 - INFO - Loaded data: 1091 samples, 13214 features


In [5]:
# Run the modelling for the DeepSurv pipeline steps with MODEL_CONFIG.
# In the recorded run (log below) this performs nested resampling with one
# outer fold per test cohort, logs the aggregated score, and then starts
# hyperparameter tuning for the complete model.
# NOTE(review): the structure of the returned object is not visible in this
# notebook -- see ModellingProcess.do_modelling.
nstd_res_result = mp.do_modelling(ds_pipeline_steps, MODEL_CONFIG)

2024-11-14 23:51:34,223 - INFO - Start model training...
2024-11-14 23:51:34,226 - INFO - Input data shape: X=(1091, 13214)
2024-11-14 23:51:34,227 - INFO - Nested resampling...
2024-11-14 23:51:34,232 - INFO - Starting nested resampling...
2024-11-14 23:51:34,235 - INFO - Data shape: X=(1091, 13214), groups=9 unique
2024-11-14 23:51:34,263 - INFO - 
Outer fold 1
2024-11-14 23:51:34,360 - INFO - Test cohort: Atlanta_2014_Long


Fitting 8 folds for each of 2 candidates, totalling 16 fits


2024-11-14 23:52:09,103 - INFO - Best parameters: {'model__batch_size': 64, 'model__hidden_layers': [16, 16], 'model__learning_rate': 0.01, 'model__num_epochs': 10}
2024-11-14 23:52:09,104 - INFO - Test score: 0.538
2024-11-14 23:52:09,105 - INFO - 
Outer fold 2
2024-11-14 23:52:09,198 - INFO - Test cohort: Belfast_2018_Jain


Fitting 8 folds for each of 2 candidates, totalling 16 fits


2024-11-14 23:52:20,270 - INFO - Best parameters: {'model__batch_size': 256, 'model__hidden_layers': [16, 16], 'model__learning_rate': 0.01, 'model__num_epochs': 10}
2024-11-14 23:52:20,272 - INFO - Test score: 0.523
2024-11-14 23:52:20,273 - INFO - 
Outer fold 3
2024-11-14 23:52:20,344 - INFO - Test cohort: CPC_GENE_2017_Fraser


Fitting 8 folds for each of 2 candidates, totalling 16 fits


2024-11-14 23:52:34,490 - INFO - Best parameters: {'model__batch_size': 64, 'model__hidden_layers': [16, 16], 'model__learning_rate': 0.01, 'model__num_epochs': 10}
2024-11-14 23:52:34,492 - INFO - Test score: 0.644
2024-11-14 23:52:34,493 - INFO - 
Outer fold 4
2024-11-14 23:52:34,551 - INFO - Test cohort: CPGEA_2020_Li


Fitting 8 folds for each of 2 candidates, totalling 16 fits


2024-11-14 23:52:47,126 - INFO - Best parameters: {'model__batch_size': 64, 'model__hidden_layers': [16, 16], 'model__learning_rate': 0.01, 'model__num_epochs': 10}
2024-11-14 23:52:47,127 - INFO - Test score: 0.535
2024-11-14 23:52:47,127 - INFO - 
Outer fold 5
2024-11-14 23:52:47,175 - INFO - Test cohort: CamCap_2016_Ross_Adams


Fitting 8 folds for each of 2 candidates, totalling 16 fits


2024-11-14 23:53:00,709 - INFO - Best parameters: {'model__batch_size': 256, 'model__hidden_layers': [16, 16], 'model__learning_rate': 0.01, 'model__num_epochs': 10}
2024-11-14 23:53:00,710 - INFO - Test score: 0.475
2024-11-14 23:53:00,711 - INFO - 
Outer fold 6
2024-11-14 23:53:00,827 - INFO - Test cohort: CancerMap_2017_Luca


Fitting 8 folds for each of 2 candidates, totalling 16 fits


2024-11-14 23:53:12,857 - INFO - Best parameters: {'model__batch_size': 256, 'model__hidden_layers': [16, 16], 'model__learning_rate': 0.01, 'model__num_epochs': 10}
2024-11-14 23:53:12,857 - INFO - Test score: 0.500
2024-11-14 23:53:12,858 - INFO - 
Outer fold 7
2024-11-14 23:53:12,913 - INFO - Test cohort: DKFZ_2018_Gerhauser


Fitting 8 folds for each of 2 candidates, totalling 16 fits


2024-11-14 23:53:26,750 - INFO - Best parameters: {'model__batch_size': 256, 'model__hidden_layers': [16, 16], 'model__learning_rate': 0.01, 'model__num_epochs': 10}
2024-11-14 23:53:26,751 - INFO - Test score: 0.631
2024-11-14 23:53:26,753 - INFO - 
Outer fold 8
2024-11-14 23:53:26,821 - INFO - Test cohort: MSKCC_2010_Taylor


Fitting 8 folds for each of 2 candidates, totalling 16 fits


2024-11-14 23:53:39,839 - INFO - Best parameters: {'model__batch_size': 64, 'model__hidden_layers': [16, 16], 'model__learning_rate': 0.01, 'model__num_epochs': 10}
2024-11-14 23:53:39,840 - INFO - Test score: 0.538
2024-11-14 23:53:39,841 - INFO - 
Outer fold 9
2024-11-14 23:53:39,914 - INFO - Test cohort: Stockholm_2016_Ross_Adams


Fitting 8 folds for each of 2 candidates, totalling 16 fits


2024-11-14 23:53:53,260 - INFO - Best parameters: {'model__batch_size': 64, 'model__hidden_layers': [16, 16], 'model__learning_rate': 0.01, 'model__num_epochs': 10}
2024-11-14 23:53:53,261 - INFO - Test score: 0.583
2024-11-14 23:53:53,261 - INFO - Aggregated results:
2024-11-14 23:53:53,262 - INFO - Mean score: 0.552 ± 0.053
2024-11-14 23:53:53,263 - INFO - Individual scores: [np.float64(0.5378612716763006), np.float64(0.5230567861457914), np.float64(0.643970893970894), np.float64(0.5354507539300609), np.float64(0.4749661705006766), np.float64(0.5003998933617702), np.float64(0.6306156405990017), np.float64(0.5384316770186336), np.float64(0.5825177185285184)]
2024-11-14 23:53:53,273 - INFO - Do HP Tuning for complete model; refit + set complete model
2024-11-14 23:53:53,274 - INFO - Do HP Tuning for complete model


Fitting 9 folds for each of 2 candidates, totalling 18 fits
