In [2]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import logging

# All locations are derived from the directory the kernel is running in.
_cwd = os.getcwd()

# The project root is taken to be two directory levels above the working
# directory; it is put on sys.path (once) so later imports can resolve from it.
_parent_dir = os.path.dirname(_cwd)
PROJECT_ROOT = os.path.dirname(_parent_dir)
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Output folders sit inside the working directory and are created if missing.
MODEL_DIR = os.path.join(_cwd, 'model')
RESULTS_DIR = os.path.join(_cwd, 'results')
for _out_dir in (MODEL_DIR, RESULTS_DIR):
    os.makedirs(_out_dir, exist_ok=True)

# Imports that presumably resolve via the PROJECT_ROOT entry added to sys.path above, plus scikit-survival
from preprocessing.data_container import DataContainer
from utils.evaluation import cindex_score
from models.modelling_process import ModellingProcess
from sksurv.ensemble import RandomSurvivalForest

# Root logging configuration: INFO and above, formatted as
# "<timestamp> - <level> - <message>". Note that basicConfig does nothing
# when the root logger already has handlers attached.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

# Logger used for this notebook's own messages.
logger = logging.getLogger(__name__)

In [2]:
# Data-loading options; this dict is handed to mp.prepare_data further down.
# NOTE(review): the meaning of each key is defined by prepare_data, which is
# not visible here — confirm there before changing values.
DATA_CONFIG = dict(
    use_pca=False,
    pca_threshold=0.95,
    gene_type='intersect',
    use_imputed=True,
    use_cohorts=False,
    select_random=False,
)

# Modelling options; this dict is passed to mp.do_modelling together with
# rsf_pipeline_steps.
#
# The keys of params_cv follow the "<step>__<parameter>" pattern, where
# "model" is the step name used in rsf_pipeline_steps.
#
# NOTE(review): only model__max_features lists more than one value, so this
# grid has 3 combinations. The recorded do_modelling output reports 5
# candidates, so that output was presumably produced with a different grid —
# confirm and re-run.
MODEL_CONFIG = dict(
    params_cv=dict(
        model__n_estimators=[100],
        model__min_samples_split=[6],
        model__max_features=['sqrt', 'log2', 0.05],
        model__bootstrap=[False],
        model__n_jobs=[-1],
        model__random_state=[1234],
        model__low_memory=[True],
    ),
    refit=False,
    do_nested_resampling=True,
    path=RESULTS_DIR,  # results are written under the notebook's results folder (presumably by do_modelling)
    fname_cv='results_intersect',
)

# Pipeline definition handed to mp.do_modelling: a single step named 'model'
# holding a RandomSurvivalForest built with default arguments. Its
# hyper-parameters are presumably set from MODEL_CONFIG['params_cv'].
rsf_pipeline_steps = [
    ('model', RandomSurvivalForest()),
]


In [3]:
# Create the modelling driver; the cells below call prepare_data and
# do_modelling on this object.
mp = ModellingProcess()

In [5]:
# Load the data according to DATA_CONFIG; PROJECT_ROOT is passed along,
# presumably so data paths can be resolved from it (confirm in prepare_data).
# In the recorded run this step took roughly 55 seconds and reported
# 1091 samples with 15495 features.
mp.prepare_data(DATA_CONFIG, PROJECT_ROOT)

2024-12-28 01:11:13,137 - INFO - Loading data...
2024-12-28 01:12:08,421 - INFO - Loaded data: 1091 samples, 15495 features


In [6]:
# Run the modelling step with the RSF pipeline and MODEL_CONFIG
# (do_nested_resampling is enabled there).
# NOTE(review): the recorded run was stopped by a KeyboardInterrupt during the
# first outer fold, so nstd_res_result was never assigned in the saved state —
# re-run this cell to completion before relying on it.
nstd_res_result = mp.do_modelling(rsf_pipeline_steps, MODEL_CONFIG)

2024-12-28 01:12:12,728 - INFO - No additional monitoring detected
2024-12-28 01:12:12,729 - INFO - Start model training...
2024-12-28 01:12:12,730 - INFO - Input data shape: X=(1091, 15495)
2024-12-28 01:12:12,730 - INFO - Nested resampling...
2024-12-28 01:12:12,731 - INFO - Starting nested resampling...
2024-12-28 01:12:12,733 - INFO - Data shape: X=(1091, 15495), groups=9 unique
2024-12-28 01:12:12,736 - INFO - 
Outer fold 1
2024-12-28 01:12:12,807 - INFO - Test cohort: Atlanta_2014_Long


Fitting 8 folds for each of 5 candidates, totalling 40 fits


KeyboardInterrupt: 