In [1]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import logging

# Setup paths
# The project root is the directory two levels above the notebook's working
# directory. It is appended to sys.path (once) so that the project-local
# packages imported further down in this cell can be resolved.
PROJECT_ROOT = os.path.normpath(os.path.join(os.getcwd(), os.pardir, os.pardir))
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path.append(PROJECT_ROOT)

# Setup directories
# Everything this notebook saves goes to ./results next to the notebook;
# exist_ok=True makes re-running the cell harmless.
RESULTS_DIR = os.path.join(os.getcwd(), 'results')
os.makedirs(RESULTS_DIR, exist_ok=True)

# Imports
from preprocessing.data_container import DataContainer
from utils.evaluation import cindex_score
from models.modelling_process import ModellingProcess
from sksurv.ensemble import GradientBoostingSurvivalAnalysis
from utils.evaluation import EarlyStoppingMonitor

# Setup logging
# INFO-level records are emitted as "<timestamp> - <level> - <message>".
# basicConfig only takes effect if the root logger has no handlers yet.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Logger for messages emitted directly from this notebook.
logger = logging.getLogger(__name__)

## 1. Intersection and imputed data

### 1.1 Model with only exprs. data; Intersection + imputed

In [2]:
# Early-stopping monitor; it is handed to the modelling process through
# MODEL_CONFIG['monitor'].
# NOTE(review): the meaning of the positional arguments (10, 5) is defined by
# utils.evaluation.EarlyStoppingMonitor -- confirm, and consider passing them
# by keyword so the cell is self-explanatory.
monitor = EarlyStoppingMonitor(10, 5)

# Data configuration: intersection gene set, imputed values, no PCA, no random
# selection, no cohort handling.
# NOTE(review): 'pca_threshold' is presumably ignored while 'use_pca' is False -- confirm.
DATA_CONFIG = dict(
    use_pca=False,
    pca_threshold=0.85,
    gene_type='intersection',
    use_imputed=True,
    select_random=False,
    use_cohorts=False,
)

# Model configuration.
# 'params_cv' is the hyper-parameter grid: four parameters with two values each
# and three fixed ones, i.e. 2 * 2 * 2 * 2 = 16 candidates.
# NOTE(review): 'fname_cv' is 'results_PCA' although 'use_pca' is False in
# DATA_CONFIG -- confirm that this file name is intended.
MODEL_CONFIG = dict(
    params_cv=dict(
        model__n_estimators=[500],
        model__learning_rate=[0.1],
        model__max_depth=[3, 5],
        model__min_samples_split=[5, 10],
        model__min_samples_leaf=[3, 5],
        model__subsample=[0.9],
        model__max_features=['sqrt', None],
    ),
    refit=True,
    do_nested_resampling=False,
    monitor=monitor,
    path=RESULTS_DIR,
    fname_cv='results_PCA',
)

# Single pipeline step named 'model'; presumably the 'model__' prefix of the
# grid keys above addresses this step.
# NOTE(review): an earlier comment mentioned validation_fraction=0.1 as a means
# to include early stopping, but the estimator is created with default
# arguments here; early stopping is presumably driven by `monitor` -- confirm.
gb_pipeline_steps = [
    ('model', GradientBoostingSurvivalAnalysis()),
]


In [3]:
# Create the modelling driver; the cells below call its prepare_data,
# do_modelling and save_results methods.
mp = ModellingProcess()

In [4]:
# Prepare the data described by DATA_CONFIG, passing the project root along.
# In the recorded run this step took about 3.5 minutes and reported
# 1091 samples with 13214 features (see the log output below).
mp.prepare_data(DATA_CONFIG, PROJECT_ROOT)

2024-12-27 20:48:22,725 - INFO - Loading data...
2024-12-27 20:51:58,027 - INFO - Loaded data: 1091 samples, 13214 features


In [None]:
# Run the modelling step for the gradient-boosting pipeline with MODEL_CONFIG.
# The recorded log shows hyper-parameter tuning for the complete model
# (9 folds x 16 candidates = 144 fits).
# NOTE(review): the variable name suggests nested-resampling results, but
# 'do_nested_resampling' is False in this section -- confirm what
# do_modelling returns in that mode.
nstd_res_result = mp.do_modelling(gb_pipeline_steps, MODEL_CONFIG)

2024-12-27 20:52:24,054 - INFO - Start model training...
2024-12-27 20:52:24,054 - INFO - Input data shape: X=(1091, 13214)
2024-12-27 20:52:24,054 - INFO - Do HP Tuning for complete model; refit + set complete model
2024-12-27 20:52:24,067 - INFO - Do HP Tuning for complete model


Fitting 9 folds for each of 16 candidates, totalling 144 fits


In [None]:
# Save the complete model, its CV results and the fitted pipeline
# to RESULTS_DIR under the name 'gb_intersect_imp'.
mp.save_results(
    RESULTS_DIR,
    'gb_intersect_imp',
    model=mp.cmplt_model,
    cv_results=mp.resampling_cmplt,
    pipe=mp.cmplt_pipeline,
)

### 1.2 Model with only pData

In [None]:
# Fresh early-stopping monitor for this section; it is handed to the modelling
# process through MODEL_CONFIG['monitor'].
# NOTE(review): the meaning of the positional arguments (10, 5) is defined by
# utils.evaluation.EarlyStoppingMonitor -- confirm.
monitor = EarlyStoppingMonitor(10, 5)

# Data configuration: same settings as section 1.1 plus 'only_pData': True.
DATA_CONFIG = dict(
    use_pca=False,
    pca_threshold=0.85,
    gene_type='intersection',
    use_imputed=True,
    select_random=False,
    use_cohorts=False,
    only_pData=True,
)

# Model configuration: same hyper-parameter grid as section 1.1
# (2 * 2 * 2 * 2 = 16 candidates), but with nested resampling switched on.
# NOTE(review): 'path' and 'fname_cv' are identical to section 1.1; if CV
# results are written to that location, the two runs may overwrite each
# other -- confirm.
MODEL_CONFIG = dict(
    params_cv=dict(
        model__n_estimators=[500],
        model__learning_rate=[0.1],
        model__max_depth=[3, 5],
        model__min_samples_split=[5, 10],
        model__min_samples_leaf=[3, 5],
        model__subsample=[0.9],
        model__max_features=['sqrt', None],
    ),
    refit=True,
    do_nested_resampling=True,
    monitor=monitor,
    path=RESULTS_DIR,
    fname_cv='results_PCA',
)

# Single pipeline step named 'model'; presumably the 'model__' prefix of the
# grid keys above addresses this step.
# NOTE(review): an earlier comment mentioned validation_fraction=0.1 as a means
# to include early stopping, but the estimator is created with default
# arguments here; early stopping is presumably driven by `monitor` -- confirm.
# NOTE(review): this section only defines configuration; no prepare_data /
# do_modelling cell follows before section 1.3 -- confirm whether one is missing.
gb_pipeline_steps = [
    ('model', GradientBoostingSurvivalAnalysis()),
]


### 1.3 Model with both exprs. and pData