In [6]:
import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import logging

# Setup paths
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.getcwd()))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Setup directories
MODEL_DIR = os.path.join(os.getcwd(), 'model')
RESULTS_DIR = os.path.join(os.getcwd(), 'results')
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)

# Imports
from sksurv.ensemble import RandomSurvivalForest, GradientBoostingSurvivalAnalysis

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.feature_selection import SelectFromModel
from sklearn.base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone, is_classifier
#from utils.feature_selection import FoldAwareSelectFromModel, FoldAwareAE
from models.modelling_process import ModellingProcess
import joblib  # Assuming pretrained models are saved as .pkl files

In [2]:
# Data configuration
DATA_CONFIG = {
    'use_pca': False,
    'pca_threshold': 0.95,
    'gene_type': 'intersection',
    'use_imputed': True,
    'use_cohorts': False, 
    'select_random' : False, 
    'requires_ohenc' : True, 
    'only_pData': False, 
    'clinical_covs' : ["AGE", "TISSUE", "GLEASON_SCORE", 'PRE_OPERATIVE_PSA']

}

# Model configuration
MODEL_CONFIG = {
    'params_cv': {
        'model__n_estimators': [1],
        'model__min_samples_split': [6], 
        'model__max_features': ['sqrt'],
        'model__bootstrap' : [False], 
        'model__n_jobs': [-1], 
        'model__random_state': [1234], 
        'model__low_memory' : [True] 
    },
    'refit': True, 
    'do_nested_resampling': True, 
    'path' : RESULTS_DIR, 
    'fname_cv' : 'test'}

In [3]:
mp = ModellingProcess()
mp.prepare_data(DATA_CONFIG, PROJECT_ROOT)

2025-01-09 18:50:29,570 - INFO - Loading data...
2025-01-09 18:51:21,859 - INFO - Found clinical data specification
2025-01-09 18:51:21,971 - INFO - Loaded data: 1091 samples, 13220 features


<class 'pandas.core.frame.DataFrame'>
Index: 1091 entries, Atlanta_2014_Long.PT081 to Stockholm_2016_Ross_Adams.STKHLM9246
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   AGE                1091 non-null   float64
 1   TISSUE             1091 non-null   object 
 2   CLIN_T_STAGE       1091 non-null   object 
 3   PATH_T_STAGE       1091 non-null   object 
 4   GLEASON_SCORE      1091 non-null   float64
 5   PRE_OPERATIVE_PSA  1091 non-null   float64
 6   MONTH_TO_BCR       1091 non-null   float64
 7   BCR_STATUS         1091 non-null   int64  
dtypes: float64(4), int64(1), object(3)
memory usage: 76.7+ KB
None


In [15]:
from sklearn.feature_selection import SelectFromModel, SelectorMixin
from sklearn.base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone, is_classifier
import joblib  
import os
import sys
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd


class FoldAwareSelectFromModel(SelectFromModel, SelectorMixin):
    def __init__(self, estimator, threshold = "median"):
        self.all_cohorts = ['Atlanta_2014_Long', 'Belfast_2018_Jain', 'CamCap_2016_Ross_Adams',
 'CancerMap_2017_Luca', 'CPC_GENE_2017_Fraser', 'CPGEA_2020_Li',
 'DKFZ_2018_Gerhauser', 'MSKCC_2010_Taylor', 'Stockholm_2016_Ross_Adams']
        super().__init__(estimator=estimator, threshold=threshold)
        self.estimator = estimator
        #self.threshold_ = threshold

    
    @_fit_context(
    # SelectFromModel.estimator is not validated yet
    prefer_skip_nested_validation=False
    )
    def fit(self, X, y=None, **fit_params):
        # Example logic to choose a model based on the data split
        root = os.path.dirname(os.path.dirname(__file__))
        root = os.path.join(root, 'pretrnd_models')
        #if root not in sys.path:
        #    sys.path.append(root)
        print(root)
        cohort_names = X.index.to_series().str.split('.').str[0]
        # Get unique cohort names
        unique_cohort_names = cohort_names.unique()
        model_path = ''
        for c in self.all_cohorts: 
            if c not in unique_cohort_names: 
                if len(model_path) > 0: 
                    model_path +=  "_"
                model_path += c  
        if model_path == '': 
            model_path = 'pretrnd_cmplt'
        model_path = os.path.join(root, model_path)
        print(model_path)
        self.estimator= joblib.load(model_path + '.pkl')  
        #super().fit(X, y, **fit_params)  # No need to fit as models are pretrained
        return self
    

class FoldAwareAE(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.all_cohorts = ['Atlanta_2014_Long', 'Belfast_2018_Jain', 'CamCap_2016_Ross_Adams',
                            'CancerMap_2017_Luca', 'CPC_GENE_2017_Fraser', 'CPGEA_2020_Li','DKFZ_2018_Gerhauser', 'MSKCC_2010_Taylor', 'Stockholm_2016_Ross_Adams']

    def fit(self, X, y=None, **fit_params):
        # Example logic to choose a model based on the data split
        root = os.path.dirname(os.path.dirname(__file__))
        root = os.path.join(root, 'pretrnd_models_ae')
        #if root not in sys.path:
        #    sys.path.append(root)
        print(root)
        cohort_names = X.index.to_series().str.split('.').str[0]
        # Get unique cohort names
        unique_cohort_names = cohort_names.unique()
        model_path = ''
        for c in self.all_cohorts: 
            if c not in unique_cohort_names: 
                if len(model_path) > 0: 
                    model_path +=  "_"
                model_path += c  
        if model_path == '': 
            model_path = 'pretrnd_cmplt'
        model_path = os.path.join(root, model_path)
        print(model_path)
        self.X_rdcd = pd.read_csv(model_path + '.csv')
        self.X_rdcd = pd.DataFrame(self.X_rdcd, index= X.index)
        #self.estimator= joblib.load(model_path + '.pkl')  
        #super().fit(X, y, **fit_params)  # No need to fit as models are pretrained
        return self
    
    def transform(self, X):
        # Ensure the estimator is fitted
        #if self.estimator is None:
        #    raise RuntimeError("The transformer has not been fitted with an estimator.")
        
        # Create AE s.t. this is the case
        #ls = self.estimator.encoder(X)
        #ls = pd.DataFrame(ls, index=X.index)
        return self.X_rdcd
    
    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

In [17]:
from sklearn.compose import ColumnTransformer

# Create the dynamic model selector
dynamic_selector = FoldAwareSelectFromModel(estimator=GradientBoostingSurvivalAnalysis(), threshold = "mean")
#dynamic_selector = SelectFromModel(pretrained_gb)
pdata_cols = ['TISSUE_FFPE', 'TISSUE_Fresh_frozen', 'TISSUE_Snap_frozen', 'AGE',
       'GLEASON_SCORE', 'PRE_OPERATIVE_PSA']
exprs_cols =  list(set(mp.X.columns) - set(pdata_cols))

ae = FoldAwareAE()
preprocessor = ColumnTransformer(
    transformers=[
        ('feature_selection', ae, exprs_cols),  # Apply feature selection
        ('other_features', 'passthrough', pdata_cols)         # Pass through other columns
    ]
)


# Define the pipeline
pipe_steps = [
    ('preprocessor', preprocessor),
    ('model', RandomSurvivalForest(n_estimators = 1))]

In [19]:
from sklearn.pipeline import Pipeline
Pipeline(pipe_steps).fit(mp.X)

NameError: name '__file__' is not defined

In [18]:
mp.do_modelling(pipe_steps, MODEL_CONFIG)

2025-01-09 18:58:30,965 - INFO - No additional monitoring detected
2025-01-09 18:58:30,967 - INFO - Start model training...
2025-01-09 18:58:30,968 - INFO - Input data shape: X=(1091, 13220)
2025-01-09 18:58:30,970 - INFO - Nested resampling...
2025-01-09 18:58:30,971 - INFO - Starting nested resampling...
2025-01-09 18:58:30,974 - INFO - Data shape: X=(1091, 13220), groups=9 unique
2025-01-09 18:58:30,979 - INFO - 
Outer fold 1
2025-01-09 18:58:31,088 - INFO - Test cohort: Atlanta_2014_Long


Fitting 8 folds for each of 1 candidates, totalling 8 fits


2025-01-09 18:58:33,191 - ERROR - Error during nested resampling: 
All the 8 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\laeti\PCaPrognostics\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\laeti\PCaPrognostics\.venv\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\laeti\PCaPrognostics\.venv\Lib\site-packages\sklearn\pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\laeti\PCaProgn

ValueError: 
All the 8 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\laeti\PCaPrognostics\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\laeti\PCaPrognostics\.venv\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\laeti\PCaPrognostics\.venv\Lib\site-packages\sklearn\pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\laeti\PCaPrognostics\.venv\Lib\site-packages\sklearn\pipeline.py", line 406, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\laeti\PCaPrognostics\.venv\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\laeti\PCaPrognostics\.venv\Lib\site-packages\sklearn\pipeline.py", line 1310, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\laeti\PCaPrognostics\.venv\Lib\site-packages\sklearn\utils\_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\laeti\PCaPrognostics\.venv\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\laeti\PCaPrognostics\.venv\Lib\site-packages\sklearn\compose\_column_transformer.py", line 976, in fit_transform
    result = self._call_func_on_transformers(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\laeti\PCaPrognostics\.venv\Lib\site-packages\sklearn\compose\_column_transformer.py", line 885, in _call_func_on_transformers
    return Parallel(n_jobs=self.n_jobs)(jobs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\laeti\PCaPrognostics\.venv\Lib\site-packages\sklearn\utils\parallel.py", line 74, in __call__
    return super().__call__(iterable_with_config)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\laeti\PCaPrognostics\.venv\Lib\site-packages\joblib\parallel.py", line 1918, in __call__
    return output if self.return_generator else list(output)
                                                ^^^^^^^^^^^^
  File "c:\Users\laeti\PCaPrognostics\.venv\Lib\site-packages\joblib\parallel.py", line 1847, in _get_sequential_output
    res = func(*args, **kwargs)
          ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\laeti\PCaPrognostics\.venv\Lib\site-packages\sklearn\utils\parallel.py", line 136, in __call__
    return self.function(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\laeti\PCaPrognostics\.venv\Lib\site-packages\sklearn\pipeline.py", line 1310, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\laeti\PCaPrognostics\.venv\Lib\site-packages\sklearn\utils\_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\laeti\AppData\Local\Temp\ipykernel_11980\1947023944.py", line 91, in fit_transform
  File "C:\Users\laeti\AppData\Local\Temp\ipykernel_11980\1947023944.py", line 56, in fit
NameError: name '__file__' is not defined. Did you mean: '__name__'?


In [4]:
pretrained_gb = GradientBoostingSurvivalAnalysis(n_estimators=500, learning_rate=0.1, random_state=1234, max_depth = 3, max_features = 'sqrt', min_samples_leaf= 10, min_samples_split =  4, subsample = 0.8)

pretrained_gb.fit(mp.X, mp.y)




In [5]:
mp.X.columns

Index(['TISSUE_FFPE', 'TISSUE_Fresh_frozen', 'TISSUE_Snap_frozen', 'AGE',
       'GLEASON_SCORE', 'PRE_OPERATIVE_PSA', 'ENSG00000000003',
       'ENSG00000000005', 'ENSG00000000419', 'ENSG00000000457',
       ...
       'ENSG00000277972', 'ENSG00000278053', 'ENSG00000278195',
       'ENSG00000278259', 'ENSG00000278311', 'ENSG00000278318',
       'ENSG00000278505', 'ENSG00000278535', 'ENSG00000278540',
       'ENSG00000282608'],
      dtype='object', length=13220)

In [26]:
from sklearn.pipeline import Pipeline

pipe = Pipeline(pipe_steps)
pipe.fit(mp.X, mp.y)

c:\Users\laeti\PCaPrognostics\pretrnd_models
c:\Users\laeti\PCaPrognostics\pretrnd_models\pretrnd_cmplt




In [27]:
len(pipe.named_steps['preprocessor']['feature_selection'].get_support(indices = True))

657

In [28]:
mp.do_modelling(pipe_steps, MODEL_CONFIG)

2025-01-08 18:48:48,223 - INFO - No additional monitoring detected
2025-01-08 18:48:48,225 - INFO - Start model training...
2025-01-08 18:48:48,225 - INFO - Input data shape: X=(1091, 13220)
2025-01-08 18:48:48,227 - INFO - Nested resampling...
2025-01-08 18:48:48,228 - INFO - Starting nested resampling...
2025-01-08 18:48:48,230 - INFO - Data shape: X=(1091, 13220), groups=9 unique
2025-01-08 18:48:48,232 - INFO - 
Outer fold 1
2025-01-08 18:48:48,311 - INFO - Test cohort: Atlanta_2014_Long


Fitting 8 folds for each of 1 candidates, totalling 8 fits
c:\Users\laeti\PCaPrognostics\pretrnd_models
c:\Users\laeti\PCaPrognostics\pretrnd_models\Atlanta_2014_Long


2025-01-08 18:49:04,952 - INFO - Best parameters: {'model__bootstrap': False, 'model__low_memory': True, 'model__max_features': 'sqrt', 'model__min_samples_split': 6, 'model__n_estimators': 1, 'model__n_jobs': -1, 'model__random_state': 1234}
2025-01-08 18:49:04,953 - INFO - Test score: 0.578
2025-01-08 18:49:04,955 - INFO - 
Outer fold 2
2025-01-08 18:49:05,064 - INFO - Test cohort: Belfast_2018_Jain


Fitting 8 folds for each of 1 candidates, totalling 8 fits
c:\Users\laeti\PCaPrognostics\pretrnd_models
c:\Users\laeti\PCaPrognostics\pretrnd_models\Belfast_2018_Jain


2025-01-08 18:49:13,676 - INFO - Best parameters: {'model__bootstrap': False, 'model__low_memory': True, 'model__max_features': 'sqrt', 'model__min_samples_split': 6, 'model__n_estimators': 1, 'model__n_jobs': -1, 'model__random_state': 1234}
2025-01-08 18:49:13,679 - INFO - Test score: 0.475
2025-01-08 18:49:13,680 - INFO - 
Outer fold 3
2025-01-08 18:49:13,750 - INFO - Test cohort: CPC_GENE_2017_Fraser


Fitting 8 folds for each of 1 candidates, totalling 8 fits
c:\Users\laeti\PCaPrognostics\pretrnd_models
c:\Users\laeti\PCaPrognostics\pretrnd_models\CPC_GENE_2017_Fraser


2025-01-08 18:49:21,792 - INFO - Best parameters: {'model__bootstrap': False, 'model__low_memory': True, 'model__max_features': 'sqrt', 'model__min_samples_split': 6, 'model__n_estimators': 1, 'model__n_jobs': -1, 'model__random_state': 1234}
2025-01-08 18:49:21,793 - INFO - Test score: 0.356
2025-01-08 18:49:21,794 - INFO - 
Outer fold 4
2025-01-08 18:49:21,862 - INFO - Test cohort: CPGEA_2020_Li


Fitting 8 folds for each of 1 candidates, totalling 8 fits
c:\Users\laeti\PCaPrognostics\pretrnd_models
c:\Users\laeti\PCaPrognostics\pretrnd_models\CPGEA_2020_Li


2025-01-08 18:49:33,122 - INFO - Best parameters: {'model__bootstrap': False, 'model__low_memory': True, 'model__max_features': 'sqrt', 'model__min_samples_split': 6, 'model__n_estimators': 1, 'model__n_jobs': -1, 'model__random_state': 1234}
2025-01-08 18:49:33,123 - INFO - Test score: 0.487
2025-01-08 18:49:33,124 - INFO - 
Outer fold 5
2025-01-08 18:49:33,195 - INFO - Test cohort: CamCap_2016_Ross_Adams


Fitting 8 folds for each of 1 candidates, totalling 8 fits
c:\Users\laeti\PCaPrognostics\pretrnd_models
c:\Users\laeti\PCaPrognostics\pretrnd_models\CamCap_2016_Ross_Adams


2025-01-08 18:49:40,836 - INFO - Best parameters: {'model__bootstrap': False, 'model__low_memory': True, 'model__max_features': 'sqrt', 'model__min_samples_split': 6, 'model__n_estimators': 1, 'model__n_jobs': -1, 'model__random_state': 1234}
2025-01-08 18:49:40,838 - INFO - Test score: 0.689
2025-01-08 18:49:40,839 - INFO - 
Outer fold 6
2025-01-08 18:49:40,917 - INFO - Test cohort: CancerMap_2017_Luca


Fitting 8 folds for each of 1 candidates, totalling 8 fits
c:\Users\laeti\PCaPrognostics\pretrnd_models
c:\Users\laeti\PCaPrognostics\pretrnd_models\CancerMap_2017_Luca


2025-01-08 18:49:48,773 - INFO - Best parameters: {'model__bootstrap': False, 'model__low_memory': True, 'model__max_features': 'sqrt', 'model__min_samples_split': 6, 'model__n_estimators': 1, 'model__n_jobs': -1, 'model__random_state': 1234}
2025-01-08 18:49:48,774 - INFO - Test score: 0.530
2025-01-08 18:49:48,776 - INFO - 
Outer fold 7
2025-01-08 18:49:48,853 - INFO - Test cohort: DKFZ_2018_Gerhauser


Fitting 8 folds for each of 1 candidates, totalling 8 fits
c:\Users\laeti\PCaPrognostics\pretrnd_models
c:\Users\laeti\PCaPrognostics\pretrnd_models\DKFZ_2018_Gerhauser


2025-01-08 18:50:00,578 - INFO - Best parameters: {'model__bootstrap': False, 'model__low_memory': True, 'model__max_features': 'sqrt', 'model__min_samples_split': 6, 'model__n_estimators': 1, 'model__n_jobs': -1, 'model__random_state': 1234}
2025-01-08 18:50:00,579 - INFO - Test score: 0.693
2025-01-08 18:50:00,580 - INFO - 
Outer fold 8
2025-01-08 18:50:00,651 - INFO - Test cohort: MSKCC_2010_Taylor


Fitting 8 folds for each of 1 candidates, totalling 8 fits
c:\Users\laeti\PCaPrognostics\pretrnd_models
c:\Users\laeti\PCaPrognostics\pretrnd_models\MSKCC_2010_Taylor


2025-01-08 18:50:08,657 - INFO - Best parameters: {'model__bootstrap': False, 'model__low_memory': True, 'model__max_features': 'sqrt', 'model__min_samples_split': 6, 'model__n_estimators': 1, 'model__n_jobs': -1, 'model__random_state': 1234}
2025-01-08 18:50:08,658 - INFO - Test score: 0.506
2025-01-08 18:50:08,659 - INFO - 
Outer fold 9
2025-01-08 18:50:08,730 - INFO - Test cohort: Stockholm_2016_Ross_Adams


Fitting 8 folds for each of 1 candidates, totalling 8 fits
c:\Users\laeti\PCaPrognostics\pretrnd_models
c:\Users\laeti\PCaPrognostics\pretrnd_models\Stockholm_2016_Ross_Adams


2025-01-08 18:50:17,949 - INFO - Best parameters: {'model__bootstrap': False, 'model__low_memory': True, 'model__max_features': 'sqrt', 'model__min_samples_split': 6, 'model__n_estimators': 1, 'model__n_jobs': -1, 'model__random_state': 1234}
2025-01-08 18:50:17,950 - INFO - Test score: 0.651
2025-01-08 18:50:17,951 - INFO - Aggregated results:
2025-01-08 18:50:17,952 - INFO - Mean score: 0.552 ± 0.106
2025-01-08 18:50:17,953 - INFO - Individual scores: [0.5783236994219653, 0.4753826016915022, 0.35550935550935553, 0.4870067372473532, 0.6887686062246279, 0.5298587043455079, 0.6930116472545758, 0.5062111801242236, 0.6505231184610193]
2025-01-08 18:50:17,992 - INFO - Saved CV results to c:\Users\laeti\PCaPrognostics\models\playground_tbd\results\results\test_cv.csv
2025-01-08 18:50:17,995 - INFO - Do HP Tuning for complete model; refit + set complete model
2025-01-08 18:50:17,997 - INFO - Do HP Tuning for complete model


Fitting 9 folds for each of 1 candidates, totalling 9 fits
c:\Users\laeti\PCaPrognostics\pretrnd_models
c:\Users\laeti\PCaPrognostics\pretrnd_models\pretrnd_cmplt


2025-01-08 18:50:29,920 - INFO - Saved model to c:\Users\laeti\PCaPrognostics\models\playground_tbd\results\model
2025-01-08 18:50:29,937 - INFO - Saved pipe to c:\Users\laeti\PCaPrognostics\models\playground_tbd\results\pipe


({'mean_score': 0.5516217389200145,
  'std_score': 0.10555576035528377,
  'fold_results': [{'test_cohort': 'Atlanta_2014_Long',
    'test_score': 0.5783236994219653,
    'best_params': {'model__bootstrap': False,
     'model__low_memory': True,
     'model__max_features': 'sqrt',
     'model__min_samples_split': 6,
     'model__n_estimators': 1,
     'model__n_jobs': -1,
     'model__random_state': 1234},
    'inner_cv_results': {'mean_fit_time': array([3.46003792]),
     'std_fit_time': array([1.58981936]),
     'mean_score_time': array([0.21807426]),
     'std_score_time': array([0.03239409]),
     'param_model__bootstrap': masked_array(data=[False],
                  mask=[False],
            fill_value=True),
     'param_model__low_memory': masked_array(data=[True],
                  mask=[False],
            fill_value=True),
     'param_model__max_features': masked_array(data=['sqrt'],
                  mask=[False],
            fill_value='?',
                 dtype=object),
   

In [39]:
idx = mp.cmplt_pipeline.named_steps['preprocessor']['feature_selection'].get_support(indices = True)
mod = mp.cmplt_pipeline.named_steps['model']
from sklearn.inspection import permutation_importance

result = permutation_importance(mp.cmplt_pipeline, mp.X, mp.y, n_repeats=1, random_state=1234)



KeyboardInterrupt: 

In [None]:
# class PretrainedRSFWrapper(RandomSurvivalForest):
#     def __init__(self, fold_model_path_template):
#         """
#         Parameters:
#         fold_model_path_template (str): Template for the model paths, e.g., "models/fold_{fold}.pkl"
#         """
#         self.fold_model_path_template = fold_model_path_template
#         self.model = None

#     def set_model(self, fold):
#         """Load the model for the current fold."""
#         model_path = self.fold_model_path_template.format(fold=fold)
#         self.model = joblib.load(model_path)
    
#     def get_model(self): 
#         return self.model

#     def fit(self, X, y=None):
#         return self

#     def predict(self, X):
#         if self.model is None:
#             raise ValueError("Model is not loaded. Call set_model() before predict.")
#         return self.model.predict(X)

#     def predict_proba(self, X):
#         if self.model is None:
#             raise ValueError("Model is not loaded. Call set_model() before predict_proba.")
#         return self.model.predict_proba(X)

In [5]:
# from sklearn.pipeline import Pipeline
# from sklearn.model_selection import GridSearchCV, StratifiedKFold
# from sklearn.preprocessing import StandardScaler
# from sklearn.feature_selection import SelectFromModel

# # Define your pipeline
# pipeline = Pipeline([
#     #('scaler', StandardScaler()),  # Example preprocessing step
#     ('selector', SelectFromModel(PretrainedRSFWrapper(fold_model_path_template="models/fold_{fold}.pkl"), prefit = True)), 
#     ('model', RandomSurvivalForest())
# ])

# # Custom Cross-Validation
# cv = StratifiedKFold(n_splits=3)
# param_grid = {
#    'model__n_estimators' : [10, 15]
# }
# # Custom GridSearchCV wrapper
# class FoldAwareGridSearchCV(GridSearchCV):
#     def __init__(self):
#         self.iteration_step = 0
#         self.og_estimator = None
#         super().__init__()
        
        
#     def fit(self, X, y=None, **params):
#         if isinstance(self.estimator, Pipeline) and self.iteration_step == 0: 
#             self.og_estimator = clone(self.estimator)
            
#         else: 
#         self.estimator = clone(self.og_estimator)
#         self.estimator.steps.insert(0,['selector',SelectFromModel(RandomSurvivalForest(), prefit = True)])
#         super().fit(X, y, **params)

# # Instantiate and use FoldAwareGridSearchCV
# grid_search = FoldAwareGridSearchCV(
#     pipeline,
#     param_grid=param_grid,
#     cv=cv,
#     verbose=3
# )

# grid_search.fit(X, mp.y)  # X, y are your data
