In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import main_module as md

# figure fonts configuration
from matplotlib import rc
rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
rc('text', usetex=True)

main module is loaded
/home/rzhang98/code2025/CIBMTR_post_hct_survival/scripts/main_module.py


In [2]:
from sklearn.model_selection import KFold
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import set_config

from lifelines import CoxPHFitter
# import the score function
%run -i ../examples/concordance_index.ipynb

import xgboost as xgb
from catboost import CatBoostRegressor, Pool

from sksurv.ensemble import RandomSurvivalForest
from sksurv.util import Surv

In [3]:
# import data
from sklearn.model_selection import train_test_split
df_test= pd.read_csv("../data/test_validation_set.csv")
df_train = pd.read_csv("../data/train_set.csv")

## Data preprocessing pipelines
In this section, we create pipelines for preprocessing the data. The main goal here is to investigate if data imputation improves the performance.

In [4]:
# Naive preprocessor
# replace missing categorical variables by 'missing', replace missing numerical values by -1
class NaiveDataTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.columns = None
    
    def transform(self, X, y=None):
        X_transform = X.copy(deep = True)
        cat_cols = X_transform.select_dtypes(include = 'O').columns
        num_cols = X_transform.select_dtypes(exclude = 'O').columns
        X_transform[cat_cols] = X_transform[cat_cols].fillna("missing")
        X_transform[num_cols] = X_transform[num_cols].fillna(-1.0)
        return X_transform

    def fit(self, X, y=None):
        self.columns = X.columns
        return self 
    
    def get_feature_names_out(self, input_features = None):
        return self.columns

cat_cols = df_train.select_dtypes(include='O').columns
num_cols = df_train.select_dtypes(exclude='O').columns.drop(["ID", 'year_hct','efs', 'efs_time'])
other_cols = df_train.columns.drop(["ID", 'year_hct','efs', 'efs_time'])
# set_config(transform_output="pandas")
preproc_naive = Pipeline(
    steps = [('preprocessing',
                ColumnTransformer([('naive_missing', NaiveDataTransformer(), other_cols),
                                ('ID_year_dropper', 'drop', ["ID", 'year_hct'])],
                                    sparse_threshold=0,
                                    remainder='passthrough',
                                    verbose_feature_names_out=False,
                                    force_int_remainder_cols=False
                                ).set_output(transform="pandas")
            ),
            ('naive_one_hot_encode',
                ColumnTransformer([('one_hot', OneHotEncoder(drop='first',
                                                             min_frequency = 0.001,
                                                             handle_unknown='ignore',
                                                             sparse_output= False), cat_cols)],
                                    sparse_threshold=0,
                                    remainder='passthrough',
                                    verbose_feature_names_out=False,
                                    force_int_remainder_cols=False
                                ).set_output(transform="pandas")
            )
    ]
)

In [5]:
# Preprocessing based on KNN imputation 
class MissingValueTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, null_list = ["Missing Disease Status", "Missing disease status"]):
        self.null_list = null_list
        self.columns = None
    
    def transform(self, X, y=None):
        X_transform = X.copy(deep = True)
        X_transform.replace(self.null_list, np.nan, inplace = True)
        return X_transform

    def fit(self, X, y=None):
        self.columns = X.columns
        return self 
    
    def get_feature_names_out(self, input_features = None):
        return self.columns

cat_cols = df_train.select_dtypes(include='O').columns
preproc_sd = Pipeline(
    [   
        ('preprocessing',
                ColumnTransformer([
                                    ('cat_missing', MissingValueTransformer(), cat_cols),
                                    ('ID_year_dropper', 'drop', ["ID", 'year_hct'])],
                                    sparse_threshold=0,
                                    remainder='passthrough',
                                    verbose_feature_names_out=False,
                                    force_int_remainder_cols=False
                                ).set_output(transform="pandas")
        ),
        (
            "encode_and_scale",
            ColumnTransformer(
                [
                    ('one_hot', 
                    OneHotEncoder(drop='first',
                                    min_frequency = 0.001,
                                    handle_unknown='ignore',
                                    sparse_output= False
                    ), 
                    cat_cols
                    ),
                    ('scale', StandardScaler(), ['donor_age', 'age_at_hct', 'karnofsky_score'])
                ],
                sparse_threshold=0,
                remainder='passthrough',
                verbose_feature_names_out=False,
                force_int_remainder_cols=False
            ).set_output(transform="pandas")
        ),
        (
            "impute",
            KNNImputer().set_output(transform = "pandas")
        ),
    ]
)

## Modeling Methods

In this section, we implement the actual modeling methods including 
* CoxPH model
* XGboost AFT
* Catboost AFT
* Survival Random Foreast

In [6]:
# cox propotional harzard model
def cph_model(X_train_preproc, y_train, X_test_preproc):

    train_preproc = pd.concat([X_train_preproc, y_train], axis=1)
    cph = CoxPHFitter()
    cph.fit(train_preproc, duration_col='efs_time', event_col='efs')
    preds = cph.predict_partial_hazard(X_test_preproc)
    
    return preds

In [7]:
## XGboost
params = {'objective': 'survival:aft',
          'eval_metric': 'aft-nloglik',
          'aft_loss_distribution': 'normal',
         'aft_loss_distribution_scale': 0.80,
          'tree_method': 'hist', 'learning_rate': 0.05, 'max_depth': 6}
   
def xgb_aft_model(X_train_preproc, y_train, X_test_preproc, params = params):

    # remove special character
    X_train_preproc.columns = X_train_preproc.columns.str.replace('<','')
    X_test_preproc.columns = X_test_preproc.columns.str.replace('<','')

    y_lower_bound = y_train['efs_time'].copy(deep = True)
    y_upper_bound = y_train['efs_time'].copy(deep = True)
    y_upper_bound[y_train['efs'] == 0.0] = +np.inf

    dtrain = xgb.DMatrix(X_train_preproc)
    dtrain.set_float_info('label_lower_bound', y_lower_bound)
    dtrain.set_float_info('label_upper_bound', y_upper_bound)

    bst = xgb.train(params, dtrain, num_boost_round=500, evals=[(dtrain, 'train')], verbose_eval = 0)
    dtest = xgb.DMatrix(X_test_preproc)
    preds = bst.predict(dtest)

    return -preds

In [8]:
## Catboost

# catboost does not directly take one-hot encoding
# instead, it requires an explicit declaration of catgorical features

# Here are the modified pipelines for catboost
# The basic idea is to remove the one-hot encoding process

num_cols = df_train.select_dtypes(exclude='O').columns.drop(["ID", 'year_hct','efs', 'efs_time'])
cb_preproc_naive = NaiveDataTransformer()
cb_preproc_sd = Pipeline(
    [   
        ('preprocessing',
                ColumnTransformer([
                                    ('cat_missing', MissingValueTransformer(), cat_cols),
                                    ('ID_year_dropper', 'drop', ["ID", 'year_hct']),
                                    ('scale', StandardScaler(), ['donor_age', 'age_at_hct', 'karnofsky_score'])],
                                    sparse_threshold=0,
                                    remainder='passthrough',
                                    verbose_feature_names_out=False,
                                    force_int_remainder_cols=False
                                ).set_output(transform="pandas")
        ),
        ('impute',
                ColumnTransformer([("num_KNNimpute", KNNImputer(), num_cols),
                                   ("cat_indicate", NaiveDataTransformer(), cat_cols)],
                                    sparse_threshold=0,
                                    remainder='passthrough',
                                    verbose_feature_names_out=False,
                                    force_int_remainder_cols=False
                                ).set_output(transform="pandas")
        )
    ]
)


def cb_aft_model(X_train, y_train, X_test, y_test, pipeline):

    X_train_proc = pipeline.fit_transform(X_train)
    X_test_proc = pipeline.fit_transform(X_test)

    y_lower_train = y_train[['efs_time']].copy(deep = True)
    y_upper_train = y_train[['efs_time']].copy(deep = True)
    # in catboost, infinity is represented by -1
    y_upper_train.iloc[y_train['efs'] == 0.0] = -1

    train_label = np.concatenate((y_lower_train, y_upper_train), axis = 1)
    train_label = pd.DataFrame(train_label, columns = ['y_lower_train', 'y_upper_train'])
    cat_features = list(X_train.select_dtypes(include= 'O').columns)

    y_lower_test = y_test[['efs_time']].copy(deep = True)
    y_upper_test = y_test[['efs_time']].copy(deep = True)
    # in catboost, infinity is represented by -1
    y_upper_test.iloc[y_test['efs'] == 0.0] = -1
    
    test_label = np.concatenate((y_lower_test, y_upper_test), axis = 1)
    test_label = pd.DataFrame(test_label, columns = ['y_lower_test', 'y_upper_test'])

    train_pool = Pool(X_train_proc,label = train_label, cat_features= cat_features)
    test_pool = Pool(X_test_proc,label = test_label, cat_features= cat_features)

    model_normal = CatBoostRegressor(iterations=500,
                                 loss_function='SurvivalAft:dist=Normal',
                                 eval_metric='SurvivalAft',
                                 verbose=0
                                )
    _ = model_normal.fit(train_pool, eval_set=test_pool)
    preds = model_normal.predict(test_pool, prediction_type='Exponent')
    
    return -preds

In [9]:
#Random survival forest
def rsf_model(X_train_preproc, y_train, X_test_preproc):

    y_train = Surv.from_dataframe("efs", "efs_time", y_train)

    rsf = RandomSurvivalForest(
        n_estimators=30,
        max_depth=10,
        min_samples_split=20,
        min_samples_leaf=10,
        n_jobs=4,
        verbose=1,
        random_state=42
    )
    rsf.fit(X_train_preproc, y_train)
    surv_funcs = rsf.predict_survival_function(X_test_preproc, return_array=False)
    preds = np.array([-np.trapz(fn.y, fn.x) for fn in surv_funcs])
    
    return preds

In [10]:
## hybrid cph

def hybrid_cph_model(X_train_preproc, y_train, X_test_preproc):

    # create embedding with logistic regression
    train_idx = X_train_preproc.index
    test_idx = X_test_preproc.index
    col = X_train_preproc.columns.to_list()
    col.append("class")
    clf = LogisticRegression(max_iter=15000)
    clf.fit(X_train_preproc, y_train["efs"])
    X_train_preproc = pd.DataFrame(data = np.concatenate((X_train_preproc.to_numpy(), np.reshape(clf.predict(X_train_preproc), (-1, 1) )), axis=1), 
                                   index = train_idx,
                                   columns = col)

    X_test_preproc = pd.DataFrame(data = np.concatenate((X_test_preproc.to_numpy(), np.reshape(clf.predict(X_test_preproc), (-1, 1) )), axis=1), 
                                  index = test_idx,
                                  columns = col)

    # train cph
    train_preproc = pd.concat([X_train_preproc, y_train], axis=1)
    cph = CoxPHFitter()
    cph.fit(train_preproc, duration_col='efs_time', event_col='efs')
    preds = cph.predict_partial_hazard(X_test_preproc)
    
    return preds

In [11]:
## hybrid XGboost

params = {'objective': 'survival:aft',
          'eval_metric': 'aft-nloglik',
          'aft_loss_distribution': 'normal',
         'aft_loss_distribution_scale': 0.80,
          'tree_method': 'hist', 'learning_rate': 0.05, 'max_depth': 6}
   
def hybrid_xgb_aft_model(X_train_preproc, y_train, X_test_preproc, params = params):

    # create embedding with logistic regression
    train_idx = X_train_preproc.index
    test_idx = X_test_preproc.index
    col = X_train_preproc.columns.to_list()
    col.append("class")
    clf = LogisticRegression(max_iter=15000)
    clf.fit(X_train_preproc, y_train["efs"])
    X_train_preproc = pd.DataFrame(data = np.concatenate((X_train_preproc.to_numpy(), np.reshape(clf.predict(X_train_preproc), (-1, 1) )), axis=1), 
                                   index = train_idx,
                                   columns = col)

    X_test_preproc = pd.DataFrame(data = np.concatenate((X_test_preproc.to_numpy(), np.reshape(clf.predict(X_test_preproc), (-1, 1) )), axis=1), 
                                  index = test_idx,
                                  columns = col)

    # train XGboost
    # remove special character
    X_train_preproc.columns = X_train_preproc.columns.str.replace('<','')
    X_test_preproc.columns = X_test_preproc.columns.str.replace('<','')

    y_lower_bound = y_train['efs_time'].copy(deep = True)
    y_upper_bound = y_train['efs_time'].copy(deep = True)
    y_upper_bound[y_train['efs'] == 0.0] = +np.inf

    dtrain = xgb.DMatrix(X_train_preproc)
    dtrain.set_float_info('label_lower_bound', y_lower_bound)
    dtrain.set_float_info('label_upper_bound', y_upper_bound)

    bst = xgb.train(params, dtrain, num_boost_round=500, evals=[(dtrain, 'train')], verbose_eval = 0)
    dtest = xgb.DMatrix(X_test_preproc)
    preds = bst.predict(dtest)

    return -preds

In [12]:
## hybrid random survival forest

def hybrid_rsf_model(X_train_preproc, y_train, X_test_preproc):

    # create embedding with logistic regression
    train_idx = X_train_preproc.index
    test_idx = X_test_preproc.index
    col = X_train_preproc.columns.to_list()
    col.append("class")
    clf = LogisticRegression(max_iter=15000)
    clf.fit(X_train_preproc, y_train["efs"])
    X_train_preproc = pd.DataFrame(data = np.concatenate((X_train_preproc.to_numpy(), np.reshape(clf.predict(X_train_preproc), (-1, 1) )), axis=1), 
                                   index = train_idx,
                                   columns = col)

    X_test_preproc = pd.DataFrame(data = np.concatenate((X_test_preproc.to_numpy(), np.reshape(clf.predict(X_test_preproc), (-1, 1) )), axis=1), 
                                  index = test_idx,
                                  columns = col)

    # train random survival forest
    y_train = Surv.from_dataframe("efs", "efs_time", y_train)

    rsf = RandomSurvivalForest(
        n_estimators=30,
        max_depth=10,
        min_samples_split=20,
        min_samples_leaf=10,
        n_jobs=4,
        verbose=1,
        random_state=42
    )
    rsf.fit(X_train_preproc, y_train)
    surv_funcs = rsf.predict_survival_function(X_test_preproc, return_array=False)
    preds = np.array([-np.trapz(fn.y, fn.x) for fn in surv_funcs])
    
    return preds

In [13]:
## hybird rf-xgb

class RFCTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.columns = None
        self.rcf_classifier = None
    
    def transform(self, X, y=None):
        X_transform = X.copy(deep = True)
        proba_column = self.rcf_classifier.predict_proba(X_transform)[:, 1]
        X_transform["clf_proba"] = proba_column

        self.columns = X_transform.columns
        return X_transform

    def fit(self, X, y):
        RFC = RandomForestClassifier()
        RFC.fit(X, y)
        self.rcf_classifier = RFC
        self._is_fitted = True
        return self 
    
    def get_feature_names_out(self, input_features = None):
        return self.columns

    def __sklearn_is_fitted__(self):
        return hasattr(self, "_is_fitted") and self._is_fitted

cat_cols = df_train.select_dtypes(include='O').columns
num_cols = df_train.select_dtypes(exclude='O').columns.drop(["ID", 'year_hct', 'efs', 'efs_time'])

# Ela - pipeline
preproc_ela = Pipeline(steps=[
    ('preprocessor', MissingValueTransformer(
        null_list = ["Not done", "Not tested", "Other", "Missing disease status", "Non-resident of the U.S."])
    ),
    ('imputing one-hot encoding',
        ColumnTransformer(
            transformers=[
                ('cat_imputer', Pipeline([('imputer', SimpleImputer(strategy='most_frequent')),
                                        ('one-hot encoder', OneHotEncoder(drop='first', 
                                                                    sparse_output=False, 
                                                                    handle_unknown='ignore'))]), cat_cols),
                ('num_imputer', Pipeline([('imputer', SimpleImputer(strategy='mean')),
                                        ('scaler', StandardScaler())]), num_cols),
                ('ID_year_dropper', 'drop', ["ID", 'year_hct'])
            ],
            sparse_threshold=0,
            remainder='passthrough',
            verbose_feature_names_out=False,
            force_int_remainder_cols=False
        ).set_output(transform = 'pandas')
    ),
    ('random forest transform', RFCTransformer().set_output(transform = 'pandas'))
])

def rf_xgb(X_train, y_train, X_test, y_test):
    params = {
        'objective': 'survival:aft',
        'eval_metric': 'aft-nloglik',
        'tree_method': 'hist',
        'seed': 42, 
        'aft_loss_distribution': 'normal',
        'aft_loss_distribution_scale': 0.34089400351953153,
        'learning_rate': 0.07894387344725944,
        'max_depth': 8,
        'min_child_weight': 1,
        'subsample': 0.947394577078348,
        'colsample_bytree': 0.8323203114860168,
        'lambda': 0.40756304508622526,
        'alpha': 6.828765311809384
    }
    preproc_ela.fit(X_train, y_train['efs'])
    X_train_preproc = preproc_ela.transform(X_train)
    X_test_preproc = preproc_ela.transform(X_test)

    # remove special character
    X_train_preproc.columns = X_train_preproc.columns.str.replace('<','')
    X_test_preproc.columns = X_test_preproc.columns.str.replace('<','')

    y_lower_bound = y_train['efs_time'].copy(deep = True)
    y_upper_bound = y_train['efs_time'].copy(deep = True)
    y_upper_bound[y_train['efs'] == 0.0] = +np.inf

    dtrain = xgb.DMatrix(X_train_preproc)
    dtrain.set_float_info('label_lower_bound', y_lower_bound)
    dtrain.set_float_info('label_upper_bound', y_upper_bound)

    bst = xgb.train(params, dtrain, num_boost_round=500, evals=[(dtrain, 'train')], verbose_eval = 0)
    dtest = xgb.DMatrix(X_test_preproc)
    preds = bst.predict(dtest)

    return -preds
    

## Five-fold Cross validation

Here we use the five fold cross validation for getting a baseline results

In [14]:
# helper functions
# eval evalutes stratified c-index and c-index
def eval(preds, X_test, solution):
    prediction= pd.DataFrame({"ID":X_test["ID"], "prediction":preds})
    sc_score = score(solution.copy(deep=True), prediction.copy(deep=True), "ID")
    c_index = concordance_index(y_test['efs_time'], -preds, y_test['efs'])

    return sc_score, c_index

# file_output export all the information into a csv file
def file_output(filename, sc_indexes, columns):
    sc_mean = sc_indexes.mean()
    output = pd.DataFrame(np.concatenate(
                                        (sc_indexes.to_numpy(), np.expand_dims(sc_mean.to_numpy(), axis = 0)
                                     ), axis = 0), 
                                     index=[0,1,2,3,4, 'mean'],
                                     columns = columns)
    output.to_csv(filename, sep= '\t', index= True, header= True)

In [19]:
preproc_pipline = preproc_naive

n_splits = 5
kfold = KFold(n_splits = n_splits, shuffle = True, random_state = 42)
target_features = ['efs', 'efs_time']

methods_list = ['cph', 'h_cph', 'xgb_aft', 'h_xgb_aft', 'rsf', 'h_rsf', 'cat_aft']
sc_indexes = -1.0 * np.ones((n_splits, len(methods_list))) 
sc_indexes = pd.DataFrame(data = sc_indexes, columns = methods_list)

for i, (train_idx,test_idx) in enumerate(kfold.split(df_train)):
        X_train = df_train.iloc[train_idx].drop(columns = target_features)
        y_train = df_train.loc[train_idx, target_features]

        X_test = df_train.iloc[test_idx].drop(columns = target_features)
        y_test = df_train.loc[test_idx, target_features]

        preproc_pipline.fit(X_train)
        X_train_preproc = preproc_pipline.transform(X_train)
        X_test_preproc =preproc_pipline.transform(X_test)

        solution = df_train.iloc[test_idx]

        # Chi-Hao logistic hybrid versions
        # hybrid cph
        preds_hcph = hybrid_cph_model(X_train_preproc, y_train, X_test_preproc)
        score_hcph, c_index_hcph = eval(preds_hcph, X_test, solution)
        sc_indexes.loc[i, 'h_cph'] = score_hcph

        # hybrid XGB
        preds_hxgb = hybrid_xgb_aft_model(X_train_preproc, y_train, X_test_preproc)
        score_hxgb, c_index_hxgb = eval(preds_hxgb, X_test, solution)
        sc_indexes.loc[i, 'h_xgb_aft'] = score_hxgb

        # hybrid survival random forest
        preds_hrsf = hybrid_rsf_model(X_train_preproc, y_train, X_test_preproc)
        score_hrsf, c_index_hrsf = eval(preds_hrsf, X_test, solution)
        sc_indexes.loc[i, 'h_rsf'] = score_hrsf

        preds_cph = cph_model(X_train_preproc, y_train, X_test_preproc)
        preds_xgb = xgb_aft_model(X_train_preproc, y_train, X_test_preproc, params = params)
        preds_cb = cb_aft_model(X_train, y_train, X_test, y_test, cb_preproc_naive)
        preds_rsf = rsf_model(X_train_preproc, y_train, X_test_preproc)

        score_cph, c_index_cph = eval(preds_cph, X_test, solution)
        score_xgb, c_index_xgb = eval(preds_xgb, X_test, solution)
        score_cb, c_index_cb = eval(preds_cb, X_test, solution)
        score_rsf, c_index_rsf = eval(preds_rsf, X_test, solution)

        print(f"stratified c-index for fold {i}: \n \
                SC-index: cph: {score_cph}, xgb_aft: {score_xgb}, cat_aft: {score_cb}, rsf_aft: {score_rsf} \n \
                C_index: cph: {c_index_cph}, xgb_aft: {c_index_xgb}, cat_aft: {c_index_cb}, rsf_aft: {c_index_rsf} \n")

        sc_indexes.loc[i, 'cph'] = score_cph
        sc_indexes.loc[i, 'xgb_aft'] = score_xgb
        sc_indexes.loc[i, 'cat_aft'] = score_cb
        sc_indexes.loc[i, 'rsf'] = score_rsf

print("mean stratified c-index across all 5 folds:")
print(sc_indexes.mean())

file_output('output_naive(baseline)_cv.csv', sc_indexes, methods_list)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   14.6s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    5.6s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   16.5s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    5.8s finished


stratified c-index for fold 0: 
                 SC-index: cph: 0.652276376889573, xgb_aft: 0.6584019240108607, cat_aft: 0.6587152664116722, rsf_aft: 0.6251971163193465 
                 C_index: cph: 0.67454908235802, xgb_aft: 0.6760540015952347, cat_aft: 0.6821519360158316, rsf_aft: 0.6488515014630147 



[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   15.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    6.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   18.4s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    6.0s finished


stratified c-index for fold 1: 
                 SC-index: cph: 0.6443178893129566, xgb_aft: 0.6392215188149583, cat_aft: 0.6478186882133291, rsf_aft: 0.6212123949052654 
                 C_index: cph: 0.6679025860424361, xgb_aft: 0.6607663101337165, cat_aft: 0.67152380373496, rsf_aft: 0.6429413878124829 



[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   15.5s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    5.7s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   16.8s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    5.8s finished


stratified c-index for fold 2: 
                 SC-index: cph: 0.6619024320614599, xgb_aft: 0.6636890474633307, cat_aft: 0.6674275566820878, rsf_aft: 0.6369712769727391 
                 C_index: cph: 0.6739880730672791, xgb_aft: 0.6742245332664779, cat_aft: 0.6804133252445986, rsf_aft: 0.6498288447201191 



[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   16.7s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    6.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   18.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    8.0s finished


stratified c-index for fold 3: 
                 SC-index: cph: 0.6426435877769467, xgb_aft: 0.6461064348430947, cat_aft: 0.650319985544931, rsf_aft: 0.616889016685384 
                 C_index: cph: 0.6587793129897566, xgb_aft: 0.6639460815488878, cat_aft: 0.666539994984622, rsf_aft: 0.6312815014854256 



[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   15.5s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    5.5s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   21.4s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    7.6s finished


stratified c-index for fold 4: 
                 SC-index: cph: 0.652361726219943, xgb_aft: 0.6562894348770086, cat_aft: 0.6646352343262615, rsf_aft: 0.6226528966871737 
                 C_index: cph: 0.669041538178832, xgb_aft: 0.672468777695469, cat_aft: 0.6761054978437013, rsf_aft: 0.6457909825945641 

mean stratified c-index across all 5 folds:
cph          0.650700
h_cph        0.651429
xgb_aft      0.652742
h_xgb_aft    0.653336
rsf          0.624585
h_rsf        0.633964
cat_aft      0.657783
dtype: float64


In [16]:
preproc_pipline = preproc_sd

n_splits = 5
kfold = KFold(n_splits = n_splits, shuffle = True, random_state = 42)
target_features = ['efs', 'efs_time']

methods_list = ['cph', 'h_cph', 'xgb_aft', 'h_xgb_aft', 'rf_xgb', 'rsf', 'h_rsf', 'cat_aft']
sc_indexes = -1.0 * np.ones((n_splits, len(methods_list))) 
sc_indexes = pd.DataFrame(data = sc_indexes, columns = methods_list)

try:
    for i, (train_idx,test_idx) in enumerate(kfold.split(df_train)):

        X_train = df_train.iloc[train_idx].drop(columns = target_features)
        y_train = df_train.loc[train_idx, target_features]

        X_test = df_train.iloc[test_idx].drop(columns = target_features)
        y_test = df_train.loc[test_idx, target_features]

        preproc_pipline.fit(X_train)
        X_train_preproc = preproc_pipline.transform(X_train)
        X_test_preproc =preproc_pipline.transform(X_test)

        solution = df_train.iloc[test_idx]

        # Ela random forest hybrid XGB
        preds_rf_xgb = rf_xgb(X_train, y_train, X_test, y_test)
        score_rf_xgb, c_index_rf_xgb = eval(preds_rf_xgb, X_test, solution)
        sc_indexes.loc[i, 'rf_xgb'] = score_rf_xgb

        # Chi-Hao logistic hybrid versions
        # hybrid cph
        preds_hcph = hybrid_cph_model(X_train_preproc, y_train, X_test_preproc)
        score_hcph, c_index_hcph = eval(preds_hcph, X_test, solution)
        sc_indexes.loc[i, 'h_cph'] = score_hcph

        # hybrid XGB
        preds_hxgb = hybrid_xgb_aft_model(X_train_preproc, y_train, X_test_preproc)
        score_hxgb, c_index_hxgb = eval(preds_hxgb, X_test, solution)
        sc_indexes.loc[i, 'h_xgb_aft'] = score_hxgb

        # hybrid survival random forest
        preds_hrsf = hybrid_rsf_model(X_train_preproc, y_train, X_test_preproc)
        score_hrsf, c_index_hrsf = eval(preds_hrsf, X_test, solution)
        sc_indexes.loc[i, 'h_rsf'] = score_hrsf

        # Yang/Ray survival random forest
        preds_rsf = rsf_model(X_train_preproc, y_train, X_test_preproc)
        score_rsf, c_index_rsf = eval(preds_rsf, X_test, solution)
        sc_indexes.loc[i, 'rsf'] = score_rsf

        # Ray baseline Cox proportional harzard
        preds_cph = cph_model(X_train_preproc, y_train, X_test_preproc)
        score_cph, c_index_cph = eval(preds_cph, X_test, solution)
        sc_indexes.loc[i, 'cph'] = score_cph

        # Ela/Ruibo xgb aft
        preds_xgb = xgb_aft_model(X_train_preproc, y_train, X_test_preproc, params = params)
        score_xgb, c_index_xgb = eval(preds_xgb, X_test, solution)
        sc_indexes.loc[i, 'xgb_aft'] = score_xgb

        # Ruibo catboost aft
        preds_cb = cb_aft_model(X_train, y_train, X_test, y_test, cb_preproc_sd)
        score_cb, c_index_cb = eval(preds_cb, X_test, solution)
        sc_indexes.loc[i, 'cat_aft'] = score_cb

        print(f"stratified c-index for fold {i}: \n \
                SC-index: cph: {score_cph}, xgb_aft: {score_xgb},  cat_aft: {score_cb}, rsf_aft: {score_rsf} \n \
                C_index: cph: {c_index_cph}, xgb_aft: {c_index_xgb}, cat_aft: {c_index_cb}, rsf_aft: {c_index_rsf}")
    print("Ray pipeline mean stratified c-index across all 5 folds:")
    print(sc_indexes.mean())

    file_output('output_proc(Ray)_cv.csv', sc_indexes, methods_list)
except:
    print("failed")
    file_output('output_proc(Ray)_cv(ERROR).csv', sc_indexes, methods_list)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   17.5s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    5.8s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   16.6s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    5.7s finished


stratified c-index for fold 0: 
                 SC-index: cph: 0.6546097516619727, xgb_aft: 0.6528320804213839,  cat_aft: 0.6543832636607791, rsf_aft: 0.6260525341646062 
                 C_index: cph: 0.6750910944429793, xgb_aft: 0.6709875615389657, cat_aft: 0.6756497011710685, rsf_aft: 0.6476231397136433


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   16.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    5.5s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   17.6s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    6.1s finished


stratified c-index for fold 1: 
                 SC-index: cph: 0.6440651847196557, xgb_aft: 0.6423926885764277,  cat_aft: 0.6418738779362735, rsf_aft: 0.6191700445422199 
                 C_index: cph: 0.6692466709315013, xgb_aft: 0.6625115021144633, cat_aft: 0.6657436489371114, rsf_aft: 0.641404241925651


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   17.8s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    5.8s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   17.7s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    6.1s finished


stratified c-index for fold 2: 
                 SC-index: cph: 0.6612368552262808, xgb_aft: 0.6585458563856175,  cat_aft: 0.6659247025421783, rsf_aft: 0.6327720678215303 
                 C_index: cph: 0.6738423656938993, xgb_aft: 0.6730358867562773, cat_aft: 0.6776689502136942, rsf_aft: 0.644917117407697


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   16.8s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    5.8s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   17.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    5.9s finished


stratified c-index for fold 3: 
                 SC-index: cph: 0.6464834164525809, xgb_aft: 0.6476285379910794,  cat_aft: 0.6436857315381221, rsf_aft: 0.6177262659067099 
                 C_index: cph: 0.6605217827424174, xgb_aft: 0.6607113678924575, cat_aft: 0.660917003217517, rsf_aft: 0.6295392730887801


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   17.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    5.5s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   17.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    5.5s finished


stratified c-index for fold 4: 
                 SC-index: cph: 0.6502967729721472, xgb_aft: 0.6457667910653513,  cat_aft: 0.6572208978276771, rsf_aft: 0.6229223104806936 
                 C_index: cph: 0.6695021323716606, xgb_aft: 0.6678246722483505, cat_aft: 0.6721087365985036, rsf_aft: 0.6448617786736036
Ray pipeline mean stratified c-index across all 5 folds:
cph          0.651338
h_cph        0.652316
xgb_aft      0.649433
h_xgb_aft    0.649548
rf_xgb       0.631645
rsf          0.623729
h_rsf        0.631653
cat_aft      0.652618
dtype: float64
