In [1]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import logging

# Everything below is resolved relative to the kernel's working directory.
_CWD = os.getcwd()

# The project root is taken to be two directory levels above the working
# directory; it is appended to sys.path (once) so that project-local packages
# can be imported.
PROJECT_ROOT = os.path.dirname(os.path.dirname(_CWD))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Output folders sit inside the working directory and are created on demand.
MODEL_DIR = os.path.join(_CWD, 'model')
RESULTS_DIR = os.path.join(_CWD, 'results')
for _out_dir in (MODEL_DIR, RESULTS_DIR):
    os.makedirs(_out_dir, exist_ok=True)

# Imports
from sksurv.ensemble import RandomSurvivalForest, GradientBoostingSurvivalAnalysis

# Configure the root logger to emit INFO-and-above records laid out as
# "timestamp - level - message", then obtain the logger used by this notebook.
# Note: logging.basicConfig does nothing if the root logger already has
# handlers, so this only takes effect the first time it runs in a kernel.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.feature_selection import SelectFromModel
from sklearn.base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone, is_classifier
from utils.feature_selection import FoldAwareSelectFromModel
from models.modelling_process import ModellingProcess
import joblib  # Assuming pretrained models are saved as .pkl files

In [3]:
# Data-loading options; this dict is passed as the first argument to
# mp.prepare_data(). How each key is interpreted is defined by
# ModellingProcess, not in this notebook.
DATA_CONFIG = {
    "use_pca": False,
    "pca_threshold": 0.95,
    "gene_type": "common_genes",
    "use_imputed": True,
    "use_cohorts": False,
    "select_random": False,
    "random_frac": 0.1,
}

# Modelling options; this dict is passed as the second argument to
# mp.do_modelling().
MODEL_CONFIG = {
    # Hyperparameter grid. The "model__" prefix targets the pipeline step that
    # is named 'model'. Only n_estimators has more than one candidate value, so
    # the grid holds two combinations in total.
    "params_cv": {
        "model__n_estimators": [50, 100],
        "model__min_samples_split": [6],
        "model__max_features": ["sqrt"],
        "model__bootstrap": [False],
        "model__n_jobs": [-1],
        "model__random_state": [1234],
        "model__low_memory": [True],
    },
    "refit": True,
    "do_nested_resampling": True,
    # NOTE(review): the recorded run saved its CV results under
    # ...\results\results\, so ModellingProcess presumably appends its own
    # 'results' folder to this path — confirm whether that nesting is intended.
    "path": RESULTS_DIR,
    "fname_cv": "results_intersect",
}

In [4]:
# Create the project's modelling helper and load the data set described by
# DATA_CONFIG. PROJECT_ROOT is passed along as well (presumably so the helper
# can locate the data files — confirm in ModellingProcess).
mp = ModellingProcess()
# Slow step: in the recorded run loading took close to three minutes and
# produced 1091 samples with 15495 features.
mp.prepare_data(DATA_CONFIG, PROJECT_ROOT)

2024-12-10 15:25:57,892 - INFO - Loading data...
2024-12-10 15:28:44,731 - INFO - Loaded data: 1091 samples, 15495 features


In [5]:
# Development shortcut: keep only the first N_DEBUG_FEATURES columns of the
# feature matrix so the downstream pipeline runs quickly. Columns are taken by
# position, not by any relevance criterion, and mp.X is overwritten, so the
# full matrix is only recoverable by re-running the data-loading cell.
# Set N_DEBUG_FEATURES = None to keep every feature for a real run.
N_DEBUG_FEATURES = 100
if N_DEBUG_FEATURES is not None:
    mp.X = mp.X.iloc[:, :N_DEBUG_FEATURES]

In [6]:
# Fit a gradient-boosting survival model with default hyperparameters on the
# whole of mp.X / mp.y.
# NOTE(review): no active code in this notebook uses `pretrained_gb`; its only
# reference is the commented-out `SelectFromModel(pretrained_gb)` alternative
# in the pipeline cell. Because it is fit on all rows, using it for feature
# selection inside cross-validation would presumably leak the held-out cohort
# into the selection step — confirm before enabling it.
pretrained_gb = GradientBoostingSurvivalAnalysis()
pretrained_gb.fit(mp.X, mp.y)


In [7]:
# Feature-selection step: the project's FoldAwareSelectFromModel wrapped around
# a default gradient-boosting survival estimator, with "mean" as the threshold.
# NOTE(review): the recorded run prints paths under models\pretrnd_models\<cohort>
# for every outer fold, presumably from this selector — confirm whether the
# estimator passed here is actually fit or replaced by a stored per-cohort model.
dynamic_selector = FoldAwareSelectFromModel(
    threshold="mean",
    estimator=GradientBoostingSurvivalAnalysis(),
)
# Alternative kept for reference (plain scikit-learn selector on the pre-fit model):
# dynamic_selector = SelectFromModel(pretrained_gb)

# Pipeline steps as (name, estimator) pairs. The name 'model' is what the
# "model__..." keys of the hyperparameter grid refer to.
pipe_steps = [
    ("dynamic_model_selector", dynamic_selector),
    ("model", RandomSurvivalForest()),
]

In [8]:
# Run the modelling workflow on the pipeline steps using MODEL_CONFIG.
# In the recorded run this performed nested resampling with one outer fold per
# cohort (9 folds, about 11 minutes in total), saved the CV results to CSV and
# then started a final hyperparameter search on the complete data, which was
# stopped manually (KeyboardInterrupt) before it finished.
mp.do_modelling(pipe_steps, MODEL_CONFIG)

2024-12-10 15:29:06,138 - INFO - No additional monitoring detected
2024-12-10 15:29:06,138 - INFO - Start model training...
2024-12-10 15:29:06,138 - INFO - Input data shape: X=(1091, 100)
2024-12-10 15:29:06,138 - INFO - Nested resampling...
2024-12-10 15:29:06,154 - INFO - Starting nested resampling...
2024-12-10 15:29:06,154 - INFO - Data shape: X=(1091, 100), groups=9 unique
2024-12-10 15:29:06,154 - INFO - 
Outer fold 1
2024-12-10 15:29:06,170 - INFO - Test cohort: Atlanta_2014_Long


Fitting 8 folds for each of 2 candidates, totalling 16 fits


  _data = np.array(data, dtype=dtype, copy=copy,


c:\Users\laeti\PCaPrognostics\models\pretrnd_models
c:\Users\laeti\PCaPrognostics\models\pretrnd_models\Atlanta_2014_Long


2024-12-10 15:30:54,439 - INFO - Best parameters: {'model__bootstrap': False, 'model__low_memory': True, 'model__max_features': 'sqrt', 'model__min_samples_split': 6, 'model__n_estimators': 100, 'model__n_jobs': -1, 'model__random_state': 1234}
2024-12-10 15:30:54,440 - INFO - Test score: 0.523
2024-12-10 15:30:54,441 - INFO - 
Outer fold 2
2024-12-10 15:30:54,444 - INFO - Test cohort: Belfast_2018_Jain


Fitting 8 folds for each of 2 candidates, totalling 16 fits


  _data = np.array(data, dtype=dtype, copy=copy,


c:\Users\laeti\PCaPrognostics\models\pretrnd_models
c:\Users\laeti\PCaPrognostics\models\pretrnd_models\Belfast_2018_Jain


2024-12-10 15:31:48,644 - INFO - Best parameters: {'model__bootstrap': False, 'model__low_memory': True, 'model__max_features': 'sqrt', 'model__min_samples_split': 6, 'model__n_estimators': 50, 'model__n_jobs': -1, 'model__random_state': 1234}
2024-12-10 15:31:48,644 - INFO - Test score: 0.562
2024-12-10 15:31:48,644 - INFO - 
Outer fold 3
2024-12-10 15:31:48,644 - INFO - Test cohort: CPC_GENE_2017_Fraser


Fitting 8 folds for each of 2 candidates, totalling 16 fits


  _data = np.array(data, dtype=dtype, copy=copy,


c:\Users\laeti\PCaPrognostics\models\pretrnd_models
c:\Users\laeti\PCaPrognostics\models\pretrnd_models\CPC_GENE_2017_Fraser


2024-12-10 15:33:11,369 - INFO - Best parameters: {'model__bootstrap': False, 'model__low_memory': True, 'model__max_features': 'sqrt', 'model__min_samples_split': 6, 'model__n_estimators': 100, 'model__n_jobs': -1, 'model__random_state': 1234}
2024-12-10 15:33:11,369 - INFO - Test score: 0.536
2024-12-10 15:33:11,369 - INFO - 
Outer fold 4
2024-12-10 15:33:11,369 - INFO - Test cohort: CPGEA_2020_Li


Fitting 8 folds for each of 2 candidates, totalling 16 fits


  _data = np.array(data, dtype=dtype, copy=copy,


c:\Users\laeti\PCaPrognostics\models\pretrnd_models
c:\Users\laeti\PCaPrognostics\models\pretrnd_models\CPGEA_2020_Li


2024-12-10 15:34:27,265 - INFO - Best parameters: {'model__bootstrap': False, 'model__low_memory': True, 'model__max_features': 'sqrt', 'model__min_samples_split': 6, 'model__n_estimators': 100, 'model__n_jobs': -1, 'model__random_state': 1234}
2024-12-10 15:34:27,265 - INFO - Test score: 0.683
2024-12-10 15:34:27,265 - INFO - 
Outer fold 5
2024-12-10 15:34:27,265 - INFO - Test cohort: CamCap_2016_Ross_Adams


Fitting 8 folds for each of 2 candidates, totalling 16 fits


  _data = np.array(data, dtype=dtype, copy=copy,


c:\Users\laeti\PCaPrognostics\models\pretrnd_models
c:\Users\laeti\PCaPrognostics\models\pretrnd_models\CamCap_2016_Ross_Adams


2024-12-10 15:35:30,729 - INFO - Best parameters: {'model__bootstrap': False, 'model__low_memory': True, 'model__max_features': 'sqrt', 'model__min_samples_split': 6, 'model__n_estimators': 100, 'model__n_jobs': -1, 'model__random_state': 1234}
2024-12-10 15:35:30,729 - INFO - Test score: 0.732
2024-12-10 15:35:30,729 - INFO - 
Outer fold 6
2024-12-10 15:35:30,729 - INFO - Test cohort: CancerMap_2017_Luca


Fitting 8 folds for each of 2 candidates, totalling 16 fits


  _data = np.array(data, dtype=dtype, copy=copy,


c:\Users\laeti\PCaPrognostics\models\pretrnd_models
c:\Users\laeti\PCaPrognostics\models\pretrnd_models\CancerMap_2017_Luca


2024-12-10 15:36:41,031 - INFO - Best parameters: {'model__bootstrap': False, 'model__low_memory': True, 'model__max_features': 'sqrt', 'model__min_samples_split': 6, 'model__n_estimators': 100, 'model__n_jobs': -1, 'model__random_state': 1234}
2024-12-10 15:36:41,031 - INFO - Test score: 0.645
2024-12-10 15:36:41,031 - INFO - 
Outer fold 7
2024-12-10 15:36:41,031 - INFO - Test cohort: DKFZ_2018_Gerhauser


Fitting 8 folds for each of 2 candidates, totalling 16 fits


  _data = np.array(data, dtype=dtype, copy=copy,


c:\Users\laeti\PCaPrognostics\models\pretrnd_models
c:\Users\laeti\PCaPrognostics\models\pretrnd_models\DKFZ_2018_Gerhauser


2024-12-10 15:38:06,512 - INFO - Best parameters: {'model__bootstrap': False, 'model__low_memory': True, 'model__max_features': 'sqrt', 'model__min_samples_split': 6, 'model__n_estimators': 100, 'model__n_jobs': -1, 'model__random_state': 1234}
2024-12-10 15:38:06,512 - INFO - Test score: 0.794
2024-12-10 15:38:06,512 - INFO - 
Outer fold 8
2024-12-10 15:38:06,520 - INFO - Test cohort: MSKCC_2010_Taylor


Fitting 8 folds for each of 2 candidates, totalling 16 fits


  _data = np.array(data, dtype=dtype, copy=copy,


c:\Users\laeti\PCaPrognostics\models\pretrnd_models
c:\Users\laeti\PCaPrognostics\models\pretrnd_models\MSKCC_2010_Taylor


2024-12-10 15:39:13,726 - INFO - Best parameters: {'model__bootstrap': False, 'model__low_memory': True, 'model__max_features': 'sqrt', 'model__min_samples_split': 6, 'model__n_estimators': 50, 'model__n_jobs': -1, 'model__random_state': 1234}
2024-12-10 15:39:13,726 - INFO - Test score: 0.567
2024-12-10 15:39:13,726 - INFO - 
Outer fold 9
2024-12-10 15:39:13,731 - INFO - Test cohort: Stockholm_2016_Ross_Adams


Fitting 8 folds for each of 2 candidates, totalling 16 fits


  _data = np.array(data, dtype=dtype, copy=copy,


c:\Users\laeti\PCaPrognostics\models\pretrnd_models
c:\Users\laeti\PCaPrognostics\models\pretrnd_models\Stockholm_2016_Ross_Adams


2024-12-10 15:40:19,387 - INFO - Best parameters: {'model__bootstrap': False, 'model__low_memory': True, 'model__max_features': 'sqrt', 'model__min_samples_split': 6, 'model__n_estimators': 50, 'model__n_jobs': -1, 'model__random_state': 1234}
2024-12-10 15:40:19,387 - INFO - Test score: 0.599
2024-12-10 15:40:19,387 - INFO - Aggregated results:
2024-12-10 15:40:19,387 - INFO - Mean score: 0.627 ± 0.088
2024-12-10 15:40:19,396 - INFO - Individual scores: [np.float64(0.523121387283237), np.float64(0.5617196939186468), np.float64(0.5363825363825364), np.float64(0.6830285530959256), np.float64(0.7320703653585927), np.float64(0.6454278858970941), np.float64(0.7936772046589018), np.float64(0.5667701863354038), np.float64(0.5987175160310496)]
2024-12-10 15:40:19,470 - INFO - Saved CV results to c:\Users\laeti\PCaPrognostics\models\playground_tbd\results\results\results_intersect_cv.csv
2024-12-10 15:40:19,474 - INFO - Do HP Tuning for complete model; refit + set complete model
2024-12-10 15:

Fitting 9 folds for each of 2 candidates, totalling 18 fits


KeyboardInterrupt: 

In [None]:
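# NOTE: inactive draft kept for reference — every line of this cell is commented out.
# Sketch of a wrapper that loads a stored per-fold model with joblib.load (only
# use on trusted files) and whose fit() is a no-op.
# class PretrainedRSFWrapper(RandomSurvivalForest):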
#     def __init__(self, fold_model_path_template):
#         """
#         Parameters:
#         fold_model_path_template (str): Template for the model paths, e.g., "models/fold_{fold}.pkl"
#         """
#         self.fold_model_path_template = fold_model_path_template
#         self.model = None

#     def set_model(self, fold):
#         """Load the model for the current fold."""
#         model_path = self.fold_model_path_template.format(fold=fold)
#         self.model = joblib.load(model_path)
    
#     def get_model(self): 
#         return self.model

#     def fit(self, X, y=None):
#         return self

#     def predict(self, X):
#         if self.model is None:
#             raise ValueError("Model is not loaded. Call set_model() before predict.")
#         return self.model.predict(X)

#     def predict_proba(self, X):
#         if self.model is None:
#             raise ValueError("Model is not loaded. Call set_model() before predict_proba.")
#         return self.model.predict_proba(X)

In [5]:
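# NOTE: inactive draft kept for reference — every line of this cell is commented out
# and it would not run as written: FoldAwareGridSearchCV.__init__ takes no arguments
# but is instantiated with four, the `else:` branch in fit() has no body, and the
# final call uses an undefined `X`.
# from sklearn.pipeline import Pipeline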
# from sklearn.model_selection import GridSearchCV, StratifiedKFold
# from sklearn.preprocessing import StandardScaler
# from sklearn.feature_selection import SelectFromModel

# # Define your pipeline
# pipeline = Pipeline([
#     #('scaler', StandardScaler()),  # Example preprocessing step
#     ('selector', SelectFromModel(PretrainedRSFWrapper(fold_model_path_template="models/fold_{fold}.pkl"), prefit = True)), 
#     ('model', RandomSurvivalForest())
# ])

# # Custom Cross-Validation
# cv = StratifiedKFold(n_splits=3)
# param_grid = {
#    'model__n_estimators' : [10, 15]
# }
# # Custom GridSearchCV wrapper
# class FoldAwareGridSearchCV(GridSearchCV):
#     def __init__(self):
#         self.iteration_step = 0
#         self.og_estimator = None
#         super().__init__()
        
        
#     def fit(self, X, y=None, **params):
#         if isinstance(self.estimator, Pipeline) and self.iteration_step == 0: 
#             self.og_estimator = clone(self.estimator)
            
#         else: 
#         self.estimator = clone(self.og_estimator)
#         self.estimator.steps.insert(0,['selector',SelectFromModel(RandomSurvivalForest(), prefit = True)])
#         super().fit(X, y, **params)

# # Instantiate and use FoldAwareGridSearchCV
# grid_search = FoldAwareGridSearchCV(
#     pipeline,
#     param_grid=param_grid,
#     cv=cv,
#     verbose=3
# )

# grid_search.fit(X, mp.y)  # X, y are your data
