In [4]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.model_selection import KFold
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import set_config

from lifelines import CoxPHFitter
# import the score function
%run -i examples/concordance_index.ipynb

import xgboost as xgb
from catboost import CatBoostRegressor, Pool

from sksurv.ensemble import RandomSurvivalForest
from sksurv.util import Surv

In [2]:
# Load the training set and the test/validation set from CSV files under data/
# (paths are relative to the notebook's working directory).
# NOTE(review): train_test_split is imported here but is not used in any of the
# cells visible in this part of the notebook; confirm it is still needed.
from sklearn.model_selection import train_test_split
df_test= pd.read_csv("data/test_validation_set.csv")
df_train = pd.read_csv("data/train_set.csv")

## Data preprocessing pipelines
In this section, we build the pipelines that preprocess the data. The main goal is to investigate whether imputing missing values improves model performance compared with a naive fill.

In [None]:
# Naive preprocessor
# replace missing categorical variables by 'missing', replace missing numerical values by -1
class NaiveDataTransformer(BaseEstimator, TransformerMixin):
    """Fill missing values with fixed sentinels; nothing is learned from the data.

    Missing entries in object-dtype columns become the string ``"missing"``;
    missing entries in every other column become ``-1.0``.
    """

    def __init__(self):
        # Set by fit() to the column labels of the frame it was fitted on.
        self.columns = None

    def fit(self, X, y=None):
        """Remember the column labels of ``X`` and return ``self``."""
        self.columns = X.columns
        return self

    def transform(self, X, y=None):
        """Return a deep copy of ``X`` with the sentinel fills applied."""
        filled = X.copy(deep = True)
        object_cols = filled.select_dtypes(include = 'O').columns
        non_object_cols = filled.select_dtypes(exclude = 'O').columns
        filled[object_cols] = filled[object_cols].fillna("missing")
        filled[non_object_cols] = filled[non_object_cols].fillna(-1.0)
        return filled

    def get_feature_names_out(self, input_features = None):
        """Return the column labels recorded by fit(); ``input_features`` is ignored."""
        return self.columns

# Column groups, derived from the dtypes of the training frame.
# cat_cols: every object-dtype column.
cat_cols = df_train.select_dtypes(include='O').columns
# num_cols: non-object columns without "ID", 'year_hct' and the event/duration
# columns 'efs' / 'efs_time'.
# NOTE(review): num_cols is not referenced by the pipeline built in this cell.
num_cols = df_train.select_dtypes(exclude='O').columns.drop(["ID", 'year_hct','efs', 'efs_time'])
# other_cols: all columns (object and non-object) except the four dropped above;
# these are the columns handed to NaiveDataTransformer.
other_cols = df_train.columns.drop(["ID", 'year_hct','efs', 'efs_time'])
# set_config(transform_output="pandas")
# Naive pipeline, two stages, both configured to emit pandas DataFrames:
#   1. 'preprocessing'        - sentinel-fill other_cols, drop "ID" and 'year_hct',
#                               pass any remaining columns through unchanged.
#   2. 'naive_one_hot_encode' - one-hot encode cat_cols, pass the rest through.
preproc_naive = Pipeline(
    steps = [('preprocessing',
                ColumnTransformer([('naive_missing', NaiveDataTransformer(), other_cols),
                                ('ID_year_dropper', 'drop', ["ID", 'year_hct'])],
                                    sparse_threshold=0,
                                    remainder='passthrough',
                                    verbose_feature_names_out=False,
                                    force_int_remainder_cols=False
                                ).set_output(transform="pandas")
            ),
            ('naive_one_hot_encode',
                # drop='first' removes one indicator column per encoded feature;
                # sparse_output=False makes the encoder return a dense array.
                ColumnTransformer([('one_hot', OneHotEncoder(drop='first',
                                                             min_frequency = 0.001,
                                                             handle_unknown='ignore',
                                                             sparse_output= False), cat_cols)],
                                    sparse_threshold=0,
                                    remainder='passthrough',
                                    verbose_feature_names_out=False,
                                    force_int_remainder_cols=False
                                ).set_output(transform="pandas")
            )
    ]
)

In [None]:
# Preprocessing based on KNN imputation 
class MissingValueTransformer(BaseEstimator, TransformerMixin):
    """Turn placeholder values that stand for "no value" into real NaNs.

    Parameters
    ----------
    null_list : list
        Values to replace with ``np.nan`` wherever they occur in the frame.
        The default list object is shared between instances; this class only
        reads it and never mutates it.
    """

    def __init__(self, null_list = ["Missing Disease Status", "Missing disease status"]):
        self.null_list = null_list
        # Set by fit() to the column labels of the frame it was fitted on.
        self.columns = None

    def fit(self, X, y=None):
        """Remember the column labels of ``X`` and return ``self``."""
        self.columns = X.columns
        return self

    def transform(self, X, y=None):
        """Return a new frame in which every ``null_list`` value is NaN."""
        # replace() without inplace=True leaves X untouched and returns a new frame.
        cleaned = X.replace(self.null_list, np.nan)
        return cleaned

    def get_feature_names_out(self, input_features = None):
        """Return the column labels recorded by fit(); ``input_features`` is ignored."""
        return self.columns

# Object-dtype columns of the training frame (recomputed here so the cell does
# not depend on the value left behind by the previous cell).
cat_cols = df_train.select_dtypes(include='O').columns
# KNN-imputation pipeline, three stages, each configured to emit pandas DataFrames:
#   1. 'preprocessing'    - turn the placeholder strings listed in
#                           MissingValueTransformer into NaN for cat_cols,
#                           drop "ID" and 'year_hct', pass the rest through.
#   2. 'encode_and_scale' - one-hot encode cat_cols and standardise
#                           'donor_age', 'age_at_hct' and 'karnofsky_score';
#                           all other columns pass through unchanged.
#   3. 'impute'           - KNNImputer applied to the whole frame produced by
#                           stage 2 (encoded, scaled and passthrough columns alike).
preproc_sd = Pipeline(
    [   
        ('preprocessing',
                ColumnTransformer([
                                    ('cat_missing', MissingValueTransformer(), cat_cols),
                                    ('ID_year_dropper', 'drop', ["ID", 'year_hct'])],
                                    sparse_threshold=0,
                                    remainder='passthrough',
                                    verbose_feature_names_out=False,
                                    force_int_remainder_cols=False
                                ).set_output(transform="pandas")
        ),
        (
            "encode_and_scale",
            ColumnTransformer(
                [
                    # Same encoder settings as the naive pipeline's one-hot stage.
                    ('one_hot', 
                    OneHotEncoder(drop='first',
                                    min_frequency = 0.001,
                                    handle_unknown='ignore',
                                    sparse_output= False
                    ), 
                    cat_cols
                    ),
                    ('scale', StandardScaler(), ['donor_age', 'age_at_hct', 'karnofsky_score'])
                ],
                sparse_threshold=0,
                remainder='passthrough',
                verbose_feature_names_out=False,
                force_int_remainder_cols=False
            ).set_output(transform="pandas")
        ),
        (
            "impute",
            # KNNImputer is used with its default settings.
            KNNImputer().set_output(transform = "pandas")
        ),
    ]
)

## Modeling Methods

In this section, we implement the modeling methods:
* CoxPH model
* XGBoost AFT
* CatBoost AFT
* Random Survival Forest

In [None]:
# Cox proportional hazards model
def cph_model(X_train_preproc, y_train, X_test_preproc):
    """Fit a Cox proportional hazards model and score the test rows.

    Parameters
    ----------
    X_train_preproc : pandas.DataFrame
        Preprocessed training features.
    y_train : pandas.DataFrame
        Frame holding the 'efs_time' (duration) and 'efs' (event) columns;
        it is concatenated column-wise with the features before fitting.
    X_test_preproc : pandas.DataFrame
        Preprocessed test features.

    Returns
    -------
    The result of ``CoxPHFitter.predict_partial_hazard`` on the test features.
    """
    # The fitter takes features and targets in one frame.
    fit_frame = pd.concat([X_train_preproc, y_train], axis=1)

    fitter = CoxPHFitter()
    fitter.fit(fit_frame, duration_col='efs_time', event_col='efs')

    return fitter.predict_partial_hazard(X_test_preproc)

In [None]:
## XGBoost
# Default hyper-parameters for the XGBoost accelerated-failure-time objective.
params = {'objective': 'survival:aft',
          'eval_metric': 'aft-nloglik',
          'aft_loss_distribution': 'normal',
         'aft_loss_distribution_scale': 0.80,
          'tree_method': 'hist', 'learning_rate': 0.05, 'max_depth': 6}
   
def xgb_aft_model(X_train_preproc, y_train, X_test_preproc, params = params):
    """Train an XGBoost AFT model and return negated test predictions.

    The caller's frames are left untouched: feature names are cleaned on
    copies, so a frame can safely be reused for another model afterwards.

    Parameters
    ----------
    X_train_preproc, X_test_preproc : pandas.DataFrame
        Preprocessed features with string column labels.
    y_train : pandas.DataFrame
        Must contain 'efs_time' and 'efs'. Rows with ``efs == 0.0`` are treated
        as right-censored (upper bound set to +inf).
    params : dict
        Parameters forwarded to ``xgb.train``.

    Returns
    -------
    numpy.ndarray
        ``-bst.predict(dtest)``, the negated model output for the test rows.
    """
    # remove special character ('<' in feature names) on copies, not on the
    # frames owned by the caller
    X_train_clean = X_train_preproc.copy()
    X_train_clean.columns = X_train_clean.columns.str.replace('<','')
    X_test_clean = X_test_preproc.copy()
    X_test_clean.columns = X_test_clean.columns.str.replace('<','')

    # Interval labels: [t, t] for observed events, [t, +inf) for censored rows.
    y_lower_bound = y_train['efs_time'].copy(deep = True)
    y_upper_bound = y_train['efs_time'].copy(deep = True)
    y_upper_bound[y_train['efs'] == 0.0] = +np.inf

    dtrain = xgb.DMatrix(X_train_clean)
    dtrain.set_float_info('label_lower_bound', y_lower_bound)
    dtrain.set_float_info('label_upper_bound', y_upper_bound)

    bst = xgb.train(params, dtrain, num_boost_round=500, evals=[(dtrain, 'train')], verbose_eval = 0)
    dtest = xgb.DMatrix(X_test_clean)
    preds = bst.predict(dtest)

    return -preds

In [None]:
## CatBoost

# CatBoost is not given one-hot encoded inputs here;
# instead, the categorical features are declared explicitly (see cb_aft_model).

# The pipelines below are the CatBoost variants of the earlier ones:
# the same steps with the one-hot encoding stage removed.

# Non-object columns without "ID", 'year_hct' and the event/duration columns.
num_cols = df_train.select_dtypes(exclude='O').columns.drop(["ID", 'year_hct','efs', 'efs_time'])
# Naive variant: sentinel-fill every column of whatever frame it is given.
cb_preproc_naive = NaiveDataTransformer()
# KNN variant, two stages, both configured to emit pandas DataFrames:
#   1. 'preprocessing' - placeholder strings -> NaN for cat_cols, drop "ID" and
#                        'year_hct', standardise three numeric columns.
#   2. 'impute'        - KNNImputer on num_cols; cat_cols get the "missing"
#                        sentinel from NaiveDataTransformer.
# cat_cols is the module-level value assigned in an earlier cell.
cb_preproc_sd = Pipeline(
    [   
        ('preprocessing',
                ColumnTransformer([
                                    ('cat_missing', MissingValueTransformer(), cat_cols),
                                    ('ID_year_dropper', 'drop', ["ID", 'year_hct']),
                                    ('scale', StandardScaler(), ['donor_age', 'age_at_hct', 'karnofsky_score'])],
                                    sparse_threshold=0,
                                    remainder='passthrough',
                                    verbose_feature_names_out=False,
                                    force_int_remainder_cols=False
                                ).set_output(transform="pandas")
        ),
        ('impute',
                ColumnTransformer([("num_KNNimpute", KNNImputer(), num_cols),
                                   ("cat_indicate", NaiveDataTransformer(), cat_cols)],
                                    sparse_threshold=0,
                                    remainder='passthrough',
                                    verbose_feature_names_out=False,
                                    force_int_remainder_cols=False
                                ).set_output(transform="pandas")
        )
    ]
)


def cb_aft_model(X_train, y_train, X_test, y_test, pipeline):
    """Train a CatBoost AFT model and return negated test predictions.

    The preprocessing ``pipeline`` is fitted on the training features only and
    then applied to the test features, so statistics learned by the pipeline
    (scaling, imputation) never come from the test rows.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Raw (un-preprocessed) features. The object-dtype columns of ``X_train``
        are declared to CatBoost as categorical features.
    y_train, y_test : pandas.DataFrame
        Must contain 'efs_time' and 'efs'. Rows with ``efs == 0.0`` are treated
        as right-censored.
    pipeline : transformer
        Object exposing ``fit_transform`` and ``transform``. It is fitted in
        place by this call.

    Returns
    -------
    numpy.ndarray
        ``-model.predict(test_pool, prediction_type='Exponent')``.
    """
    def _interval_labels(y, lower_name, upper_name):
        # Two-column [lower, upper] label frame for the SurvivalAft loss.
        lower = y[['efs_time']].copy(deep = True)
        upper = y[['efs_time']].copy(deep = True)
        # in catboost, infinity is represented by -1
        upper.loc[(y['efs'] == 0.0).to_numpy(), 'efs_time'] = -1
        return pd.DataFrame(np.concatenate((lower, upper), axis = 1),
                            columns = [lower_name, upper_name])

    # Fit the preprocessing on the training rows, then only *apply* it to the
    # test rows (the previous fit_transform on X_test refitted it on test data).
    X_train_proc = pipeline.fit_transform(X_train)
    X_test_proc = pipeline.transform(X_test)

    train_label = _interval_labels(y_train, 'y_lower_train', 'y_upper_train')
    test_label = _interval_labels(y_test, 'y_lower_test', 'y_upper_test')

    cat_features = list(X_train.select_dtypes(include= 'O').columns)

    train_pool = Pool(X_train_proc,label = train_label, cat_features= cat_features)
    test_pool = Pool(X_test_proc,label = test_label, cat_features= cat_features)

    model_normal = CatBoostRegressor(iterations=500,
                                 loss_function='SurvivalAft:dist=Normal',
                                 eval_metric='SurvivalAft',
                                 verbose=0
                                )
    # NOTE(review): the test fold is passed as eval_set. If CatBoost's
    # best-iteration selection is active for this configuration, the returned
    # model is tuned on the same rows it is scored on - confirm this is intended.
    _ = model_normal.fit(train_pool, eval_set=test_pool)
    preds = model_normal.predict(test_pool, prediction_type='Exponent')

    return -preds

In [None]:
# Random survival forest
def rsf_model(X_train_preproc, y_train, X_test_preproc):
    """Fit a random survival forest and score the test rows.

    Each test row's score is the negated area under its predicted survival
    function (trapezoidal rule over the function's own ``x`` grid).

    Parameters
    ----------
    X_train_preproc, X_test_preproc : pandas.DataFrame
        Preprocessed features.
    y_train : pandas.DataFrame
        Must contain 'efs' (event indicator) and 'efs_time' (duration).

    Returns
    -------
    numpy.ndarray
        One score per test row.
    """
    y_train = Surv.from_dataframe("efs", "efs_time", y_train)

    rsf = RandomSurvivalForest(
        n_estimators=30,
        max_depth=10,
        min_samples_split=20,
        min_samples_leaf=10,
        n_jobs=4,
        verbose=1,
        random_state=42
    )
    rsf.fit(X_train_preproc, y_train)
    surv_funcs = rsf.predict_survival_function(X_test_preproc, return_array=False)

    # np.trapz was renamed to np.trapezoid in NumPy 2.0 and the old name is
    # deprecated; use whichever this NumPy provides.
    trapezoid = getattr(np, "trapezoid", None) or np.trapz
    preds = np.array([-trapezoid(fn.y, fn.x) for fn in surv_funcs])

    return preds

In [None]:
## hybrid cph

def hybrid_cph_model(X_train_preproc, y_train, X_test_preproc):
    """Cox model on the features plus a logistic-regression class label.

    A LogisticRegression is fitted on the training features against
    ``y_train["efs"]``. Its hard class prediction is appended to both feature
    frames as a column named "class", and a CoxPHFitter is then fitted on the
    augmented training frame.

    NOTE(review): the "class" column of the training frame comes from
    predictions on the same rows the classifier was fitted on; confirm that
    in-sample predictions are intended.

    Returns
    -------
    The result of ``CoxPHFitter.predict_partial_hazard`` on the augmented
    test frame.
    """
    # Stage 1: classifier whose prediction becomes an extra feature.
    stage_one = LogisticRegression(max_iter=15000)
    stage_one.fit(X_train_preproc, y_train["efs"])

    augmented_cols = X_train_preproc.columns.to_list()
    augmented_cols.append("class")

    def _with_class(frame):
        # Append the predicted class as the last column, keeping the row index.
        predicted = np.reshape(stage_one.predict(frame), (-1, 1))
        return pd.DataFrame(data = np.concatenate((frame.to_numpy(), predicted), axis=1),
                            index = frame.index,
                            columns = augmented_cols)

    train_features = _with_class(X_train_preproc)
    test_features = _with_class(X_test_preproc)

    # Stage 2: Cox model on features + targets in one frame.
    fit_frame = pd.concat([train_features, y_train], axis=1)
    fitter = CoxPHFitter()
    fitter.fit(fit_frame, duration_col='efs_time', event_col='efs')

    return fitter.predict_partial_hazard(test_features)

In [None]:
## hybrid XGboost

# Default hyper-parameters for the XGBoost AFT objective used by the hybrid model.
params = {'objective': 'survival:aft',
          'eval_metric': 'aft-nloglik',
          'aft_loss_distribution': 'normal',
         'aft_loss_distribution_scale': 0.80,
          'tree_method': 'hist', 'learning_rate': 0.05, 'max_depth': 6}
   
def hybrid_xgb_aft_model(X_train_preproc, y_train, X_test_preproc, params = params):
    """XGBoost AFT model on the features plus a logistic-regression class label.

    A LogisticRegression is fitted on the training features against
    ``y_train["efs"]``. Its hard class prediction is appended to both feature
    frames as a column named "class" before the booster is trained.

    NOTE(review): the "class" column of the training frame comes from
    predictions on the same rows the classifier was fitted on; confirm that
    in-sample predictions are intended.

    Returns
    -------
    numpy.ndarray
        ``-bst.predict(dtest)``, the negated model output for the test rows.
    """
    # Stage 1: classifier whose prediction becomes an extra feature.
    stage_one = LogisticRegression(max_iter=15000)
    stage_one.fit(X_train_preproc, y_train["efs"])

    augmented_cols = X_train_preproc.columns.to_list()
    augmented_cols.append("class")

    def _with_class(frame):
        # Append the predicted class as the last column, keeping the row index.
        predicted = np.reshape(stage_one.predict(frame), (-1, 1))
        return pd.DataFrame(data = np.concatenate((frame.to_numpy(), predicted), axis=1),
                            index = frame.index,
                            columns = augmented_cols)

    train_features = _with_class(X_train_preproc)
    test_features = _with_class(X_test_preproc)

    # Stage 2: XGBoost.
    # remove special character ('<') from the feature names of the local frames
    for features in (train_features, test_features):
        features.columns = features.columns.str.replace('<','')

    # Interval labels: [t, t] for observed events, [t, +inf) for censored rows.
    durations = y_train['efs_time']
    y_lower_bound = durations.copy(deep = True)
    y_upper_bound = durations.copy(deep = True)
    y_upper_bound[y_train['efs'] == 0.0] = +np.inf

    dtrain = xgb.DMatrix(train_features)
    dtrain.set_float_info('label_lower_bound', y_lower_bound)
    dtrain.set_float_info('label_upper_bound', y_upper_bound)

    booster = xgb.train(params, dtrain, num_boost_round=500, evals=[(dtrain, 'train')], verbose_eval = 0)
    raw_preds = booster.predict(xgb.DMatrix(test_features))

    return -raw_preds

In [None]:
## hybrid random survival forest

def hybrid_rsf_model(X_train_preproc, y_train, X_test_preproc):
    """Random survival forest on the features plus a logistic-regression class label.

    A LogisticRegression is fitted on the training features against
    ``y_train["efs"]``. Its hard class prediction is appended to both feature
    frames as a column named "class" before the forest is fitted. Each test
    row's score is the negated area under its predicted survival function.

    NOTE(review): the "class" column of the training frame comes from
    predictions on the same rows the classifier was fitted on; confirm that
    in-sample predictions are intended.

    Returns
    -------
    numpy.ndarray
        One score per test row.
    """
    # create embedding with logistic regression
    clf = LogisticRegression(max_iter=15000)
    clf.fit(X_train_preproc, y_train["efs"])

    col = X_train_preproc.columns.to_list()
    col.append("class")

    def _with_class(frame):
        # Append the predicted class as the last column, keeping the row index.
        predicted = np.reshape(clf.predict(frame), (-1, 1))
        return pd.DataFrame(data = np.concatenate((frame.to_numpy(), predicted), axis=1),
                            index = frame.index,
                            columns = col)

    X_train_aug = _with_class(X_train_preproc)
    X_test_aug = _with_class(X_test_preproc)

    # train random survival forest
    y_struct = Surv.from_dataframe("efs", "efs_time", y_train)

    rsf = RandomSurvivalForest(
        n_estimators=30,
        max_depth=10,
        min_samples_split=20,
        min_samples_leaf=10,
        n_jobs=4,
        verbose=1,
        random_state=42
    )
    rsf.fit(X_train_aug, y_struct)
    surv_funcs = rsf.predict_survival_function(X_test_aug, return_array=False)

    # np.trapz was renamed to np.trapezoid in NumPy 2.0 and the old name is
    # deprecated; use whichever this NumPy provides.
    trapezoid = getattr(np, "trapezoid", None) or np.trapz
    preds = np.array([-trapezoid(fn.y, fn.x) for fn in surv_funcs])

    return preds

In [None]:
## hybrid rf-xgb

class RFCTransformer(BaseEstimator, TransformerMixin):
    """Append a random-forest class probability as the ``clf_proba`` column.

    ``fit`` trains a RandomForestClassifier on ``(X, y)``. ``transform``
    returns a deep copy of ``X`` with one extra column, ``clf_proba``, holding
    the second column of ``predict_proba``.

    Parameters
    ----------
    random_state : int or None, default=None
        Forwarded to RandomForestClassifier. The default keeps the forest
        unseeded, as before; pass an int for reproducible results.

    NOTE(review): when fit and transform see the same rows (as in a pipeline's
    training pass), ``clf_proba`` is an in-sample prediction; confirm that is
    the intended feature.
    """

    def __init__(self, random_state=None):
        self.random_state = random_state
        # Output column labels; set by fit().
        self.columns = None
        # Fitted RandomForestClassifier; set by fit().
        self.rcf_classifier = None
    
    def fit(self, X, y):
        """Train the forest and record the output column labels."""
        RFC = RandomForestClassifier(random_state=self.random_state)
        RFC.fit(X, y)
        self.rcf_classifier = RFC

        # Record the output names here rather than in transform(), so that
        # get_feature_names_out() is usable as soon as the estimator is fitted.
        out_columns = list(X.columns)
        if "clf_proba" not in out_columns:
            out_columns.append("clf_proba")
        self.columns = pd.Index(out_columns)

        self._is_fitted = True
        return self 

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the ``clf_proba`` column added."""
        X_transform = X.copy(deep = True)
        proba_column = self.rcf_classifier.predict_proba(X_transform)[:, 1]
        X_transform["clf_proba"] = proba_column
        return X_transform
    
    def get_feature_names_out(self, input_features = None):
        """Return the output column labels recorded by fit(); ``input_features`` is ignored."""
        return self.columns

    def __sklearn_is_fitted__(self):
        """Report whether fit() has completed on this instance."""
        return getattr(self, "_is_fitted", False)

# Column groups, derived from the dtypes of the training frame.
cat_cols = df_train.select_dtypes(include='O').columns
num_cols = df_train.select_dtypes(exclude='O').columns.drop(["ID", 'year_hct', 'efs', 'efs_time'])

# Ela - pipeline
# Three steps:
#   1. 'preprocessor'              - replace the listed placeholder strings with NaN
#                                    across the whole input frame.
#   2. 'imputing one-hot encoding' - cat_cols: most-frequent imputation, then one-hot;
#                                    num_cols: mean imputation, then standard scaling;
#                                    "ID" and 'year_hct' are dropped, everything else
#                                    passes through. Emits a pandas DataFrame.
#   3. 'random forest transform'   - RFCTransformer appends the 'clf_proba' column.
#                                    Its fit() requires y, so the pipeline must be
#                                    fitted with a target.
preproc_ela = Pipeline(steps=[
    ('preprocessor', MissingValueTransformer(
        null_list = ["Not done", "Not tested", "Other", "Missing disease status", "Non-resident of the U.S."])
    ),
    ('imputing one-hot encoding',
        ColumnTransformer(
            transformers=[
                ('cat_imputer', Pipeline([('imputer', SimpleImputer(strategy='most_frequent')),
                                        ('one-hot encoder', OneHotEncoder(drop='first', 
                                                                    sparse_output=False, 
                                                                    handle_unknown='ignore'))]), cat_cols),
                ('num_imputer', Pipeline([('imputer', SimpleImputer(strategy='mean')),
                                        ('scaler', StandardScaler())]), num_cols),
                ('ID_year_dropper', 'drop', ["ID", 'year_hct'])
            ],
            sparse_threshold=0,
            remainder='passthrough',
            verbose_feature_names_out=False,
            force_int_remainder_cols=False
        ).set_output(transform = 'pandas')
    ),
    ('random forest transform', RFCTransformer().set_output(transform = 'pandas'))
])

def rf_xgb(X_train, y_train, X_test, y_test):
    """Fit ``preproc_ela`` on the training data, then train an XGBoost AFT model.

    The module-level ``preproc_ela`` pipeline is fitted in place on
    ``(X_train, y_train['efs'])`` and used to transform both feature frames.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Raw (un-preprocessed) features.
    y_train : pandas.DataFrame
        Must contain 'efs_time' and 'efs'. Rows with ``efs == 0.0`` are treated
        as right-censored (upper bound set to +inf).
    y_test : pandas.DataFrame
        Not used by this function.

    Returns
    -------
    numpy.ndarray
        ``-bst.predict(dtest)``, the negated model output for the test rows.
    """
    xgb_params = {
        'objective': 'survival:aft',
        'eval_metric': 'aft-nloglik',
        'tree_method': 'hist',
        'seed': 42, 
        'aft_loss_distribution': 'normal',
        'aft_loss_distribution_scale': 0.34089400351953153,
        'learning_rate': 0.07894387344725944,
        'max_depth': 8,
        'min_child_weight': 1,
        'subsample': 0.947394577078348,
        'colsample_bytree': 0.8323203114860168,
        'lambda': 0.40756304508622526,
        'alpha': 6.828765311809384
    }

    # Preprocess: fit on the training rows only, then transform train and test.
    preproc_ela.fit(X_train, y_train['efs'])
    train_features = preproc_ela.transform(X_train)
    test_features = preproc_ela.transform(X_test)

    # remove special character ('<') from the feature names
    for features in (train_features, test_features):
        features.columns = features.columns.str.replace('<','')

    # Interval labels: [t, t] for observed events, [t, +inf) for censored rows.
    durations = y_train['efs_time']
    y_lower_bound = durations.copy(deep = True)
    y_upper_bound = durations.copy(deep = True)
    y_upper_bound[y_train['efs'] == 0.0] = +np.inf

    dtrain = xgb.DMatrix(train_features)
    dtrain.set_float_info('label_lower_bound', y_lower_bound)
    dtrain.set_float_info('label_upper_bound', y_upper_bound)

    booster = xgb.train(xgb_params, dtrain, num_boost_round=500, evals=[(dtrain, 'train')], verbose_eval = 0)
    raw_preds = booster.predict(xgb.DMatrix(test_features))

    return -raw_preds

In [None]:
# helper functions
# eval evaluates the stratified c-index and the plain c-index
def eval(preds, X_test, solution):
    """Score one set of predictions with both metrics.

    Parameters
    ----------
    preds : array-like
        One prediction per row of ``X_test``; it is negated before being
        passed to ``concordance_index``.
    X_test : pandas.DataFrame
        Must contain an "ID" column, used to label the predictions.
    solution : pandas.DataFrame
        Ground truth passed to ``score``. It must also provide the 'efs_time'
        and 'efs' columns, row-aligned with ``preds``, for the plain c-index.
        NOTE(review): assumes ``solution`` carries these columns; confirm
        against the callers.

    Returns
    -------
    tuple
        ``(sc_score, c_index)``.
    """
    prediction = pd.DataFrame({"ID": X_test["ID"], "prediction": preds})
    # score() receives copies so it cannot modify the caller's frames.
    sc_score = score(solution.copy(deep=True), prediction.copy(deep=True), "ID")
    # Read the targets from the `solution` argument instead of a global
    # `y_test`, so the result does not depend on leftover notebook state.
    c_index = concordance_index(solution['efs_time'], -preds, solution['efs'])

    return sc_score, c_index

# file_output exports the per-fold scores and their mean to a tab-separated file
def file_output(filename, sc_indexes, columns):
    """Write per-fold scores plus a final 'mean' row to ``filename``.

    Parameters
    ----------
    filename : str or path-like
        Destination file; written tab-separated with header and index.
    sc_indexes : pandas.DataFrame
        One row per fold, one column per scored configuration.
    columns : list
        Column labels for the written table; needs one label per column of
        ``sc_indexes``.

    The rows are labelled 0..n-1 followed by 'mean'. The number of folds is
    taken from ``sc_indexes``, so any fold count works (previously exactly
    five rows were required).
    """
    sc_mean = sc_indexes.mean()
    table = np.concatenate(
        (sc_indexes.to_numpy(), np.expand_dims(sc_mean.to_numpy(), axis = 0)),
        axis = 0)
    row_labels = [*range(len(sc_indexes)), 'mean']
    output = pd.DataFrame(table, index=row_labels, columns = columns)
    output.to_csv(filename, sep= '\t', index= True, header= True)