In [1]:
import numpy as np
import pandas as pd
from functools import reduce
from sksurv.metrics import (
    concordance_index_censored,
    concordance_index_ipcw,
    cumulative_dynamic_auc,
    integrated_brier_score,
    brier_score,
    as_concordance_index_ipcw_scorer,
    as_cumulative_dynamic_auc_scorer,
    as_integrated_brier_score_scorer,
)

from sklearn.model_selection import KFold
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

from sksurv.preprocessing import encode_categorical
from sksurv.ensemble import RandomSurvivalForest
from sksurv.util import Surv

import random as rn
import os
# Fix every random number source used in this notebook to a single seed.
SEED = 0
os.environ['PYTHONHASHSEED'] = '0'  # only affects interpreters started after this point
rn.seed(SEED)                       # Python's built-in `random` module
np.random.seed(SEED)                # NumPy's legacy global generator

In [2]:
# Read the feature table from disk.
df = pd.read_csv('/omics/odcf/analysis/OE0167_projects/dachs_genetic_data_platform/methylation_markers_new/processed_new/df_selectedf.csv')

# Predictors: every column except the ones listed here, which include the two
# outcome columns ('timey_PFS', 'PFS') used to build Y below.
non_feature_columns = ['id', 'Diagnosis_year', 'chemradther', 'timey', 'death_all', 'Location',
                       'timey_PFS', 'PFS']
X = df.drop(columns=non_feature_columns)

# Outcome: 'PFS' is passed as the event indicator and 'timey_PFS' as the
# observed time of the structured survival array.
Y = df[['timey_PFS', 'PFS']]
y_structured = Surv.from_arrays(event=Y['PFS'], time=Y['timey_PFS'])

# Encode the categorical predictor columns with scikit-survival's helper.
X_processed = encode_categorical(X)

  columns_to_encode = {nam for nam, s in table.iteritems() if _is_categorical_or_object(s)}
  for name, series in table.iteritems():


In [5]:
## Tuning space for the Bayesian hyper-parameter search.
# Keys carry the "estimator__" prefix because the forest is not tuned directly:
# the search cell wraps it in `as_integrated_brier_score_scorer`, so the forest
# is reached through the wrapper's `estimator` parameter.
param_space = {
    "estimator__n_estimators": Integer(100, 1000),
    "estimator__min_samples_split": Integer(6, 20),
    "estimator__min_samples_leaf": Integer(2, 10),
    # 'auto' is intentionally not listed: for RandomSurvivalForest it meant
    # sqrt(n_features), i.e. it duplicated 'sqrt' (wasting search iterations),
    # and recent scikit-learn / scikit-survival releases reject the value.
    "estimator__max_features": Categorical(['sqrt', 'log2']),
}

In [None]:
# Nested cross-validation of the random survival forest:
#   * outer loop  -> unbiased performance estimate on held-out folds
#   * inner loop  -> Bayesian hyper-parameter search on the outer training fold
n_folds_outer = 5
n_folds_inner = 3

# Exclusive upper end of the evaluation time grid (same unit as the time column).
# NOTE(review): the original code also computed the 90th percentile of the test
# times but never used it; the grid has always stopped at this fixed value.
# Confirm that every fold's follow-up actually covers this horizon.
MAX_EVAL_TIME = 15

cv_outer = KFold(n_splits=n_folds_outer, shuffle=True, random_state=SEED)

# One entry per outer fold; read by the summary cell below.
c_index_censored_scores = []
c_index_ipcw_scores = []
mean_dynamic_AUC = []
integrated_brier_scores = []
best_hyperparameters = []

for train_index, test_index in cv_outer.split(X_processed):
    X_train, X_test = X_processed.iloc[train_index], X_processed.iloc[test_index]
    y_train, y_test = y_structured[train_index], y_structured[test_index]

    # Evaluation time grid: unit steps starting at the 10th percentile of the
    # held-out fold's observed times, up to (not including) MAX_EVAL_TIME.
    lower = np.percentile(y_test["time"], 10)
    model_times = np.arange(lower, MAX_EVAL_TIME)

    # Inner CV used only for hyper-parameter tuning.
    cv_inner = KFold(n_splits=n_folds_inner, shuffle=True, random_state=SEED)

    # Base model; the search clones it for every candidate configuration.
    model = RandomSurvivalForest(random_state=SEED, n_jobs=-1)

    # Wrapping the forest makes its `score` the (negated) integrated Brier
    # score over `model_times`, which is what the search then maximises.
    search = BayesSearchCV(
        as_integrated_brier_score_scorer(model, times=model_times),
        param_space,
        cv=cv_inner,
        n_jobs=-1,
        n_iter=100,
        random_state=SEED,
    )

    result = search.fit(X_train, y_train)
    # `best_estimator_` is the scorer wrapper; `.estimator_` is the fitted forest inside it.
    best_model = result.best_estimator_.estimator_

    # Store the best hyper-parameters of this outer fold.
    best_hyperparameters.append(result.best_params_)

    # ---- Evaluate the tuned model on the held-out fold ----
    yhat = best_model.predict(X_test)  # risk scores

    c_index_censored = concordance_index_censored(y_test['event'], y_test['time'], yhat)[0]
    c_index_censored_scores.append(c_index_censored)

    c_index_ipcw = concordance_index_ipcw(y_train, y_test, yhat)[0]
    c_index_ipcw_scores.append(c_index_ipcw)

    # cumulative_dynamic_auc returns (AUC at each time point, mean AUC); keep the mean.
    auc_at_times, mean_auc = cumulative_dynamic_auc(y_train, y_test, yhat, model_times)
    mean_dynamic_AUC.append(mean_auc)

    # Each survival function is a step function that accepts an array of time
    # points, so evaluate the whole grid in one call per sample.
    survs = best_model.predict_survival_function(X_test)
    preds = np.asarray([fn(model_times) for fn in survs])
    brier_score_val = integrated_brier_score(y_train, y_test, preds, model_times)
    integrated_brier_scores.append(brier_score_val)
    

In [None]:
## Combine the best hyper-parameters and performance metrics, one row per outer fold.
folds = list(range(1, n_folds_outer + 1))
performance_nestedcv = pd.DataFrame({
    'Folds': folds,
    'Hyperparameters': best_hyperparameters,
    'C_index_censored': c_index_censored_scores,
    'C_index_ipcw': c_index_ipcw_scores,
    'Mean dynamic AUC': mean_dynamic_AUC,
    'Integrated Brier score': integrated_brier_scores,
})

# Persist the results first ...
performance_nestedcv.to_csv('/omics/odcf/analysis/OE0167_projects/dachs_genetic_data_platform/methylation_markers_new/MS_tem_results/nestedcvRFS_performance.csv')

# ... then leave the table as the cell's final expression: a notebook only
# renders the last expression of a cell, so placing it before `to_csv`
# (as it was originally) displayed nothing.
performance_nestedcv



Submit the above jobs to the DKFZ cluster.