In [2]:
import pandas as pd
import numpy as np
import optuna

from sklearn.model_selection import KFold,GridSearchCV
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sksurv.preprocessing import encode_categorical
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sksurv.util import Surv
from lohrasb.best_estimator import BaseModel

import xgboost as xgb
from xgboost import XGBRegressor

from lohrasb.best_estimator import BaseModel
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from xgbse.metrics import (
    concordance_index,
    approx_brier_score
)
from xgbse import (
    XGBSEKaplanNeighbors,
    XGBSEKaplanTree,
    XGBSEBootstrapEstimator)

from xgbse.converters import (
    convert_data_to_xgb_format,
    convert_to_structured
)
from xgbse.metrics import (
    concordance_index,
    approx_brier_score,
      dist_calibration_score
)

from sksurv.metrics import (
    concordance_index_censored,
    concordance_index_ipcw,
    cumulative_dynamic_auc,
    integrated_brier_score,
    brier_score,
    as_concordance_index_ipcw_scorer,
    as_cumulative_dynamic_auc_scorer,
    as_integrated_brier_score_scorer,
)

import random as rn
import os
# set random seed
SEED = 0
np.random.seed(SEED)
rn.seed(SEED)
os.environ['PYTHONHASHSEED'] = '0'

from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# load data
df = pd.read_csv('/omics/odcf/analysis/OE0167_projects/dachs_genetic_data_platform/methylation_markers_new/processed_new/df_selectedf.csv')

In [4]:
# select the x, and y variables 
X = df.drop(['id', 'Diagnosis_year', 'chemradther', 'timey', 'death_all', 'Location',
               'timey_PFS', 'PFS'], axis=1)
               
Y = df[['timey_PFS', 'PFS']]

y_structured = Surv.from_arrays(Y['PFS'], Y['timey_PFS'])

In [5]:
## determine preprocessor
# Identify the column types
numerical_columns_selector = selector(dtype_exclude=object)
numerical_columns = numerical_columns_selector(X)

ordinal_columns = ['TNM_adj']
binary_columns = ['Sex']

# Define the transformers
binary_transformer = OneHotEncoder(drop = 'first') ## must only retain one variable otherwise cause error 
ordinal_transformer = OrdinalEncoder(categories=[['II', 'III']])
continuous_transformer = StandardScaler()

# Define the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('binary', binary_transformer, binary_columns),
        ('ordinal', ordinal_transformer, ordinal_columns),
        ('continuous', continuous_transformer, numerical_columns)])
    

In [6]:
### too computationally consuming and kinda complicated to run nested CV, so just try train test split
X_train, X_test, y_train, y_test = train_test_split(X, y_structured, test_size=1/5, random_state=SEED)


In [7]:
## further split the training set for tunning
X_train, X_vali, y_train, y_vali = train_test_split(X_train, y_train, test_size=1/3, random_state=SEED)

In [8]:
## preprocess the data and make the data for XGboost
X_scaler = preprocessor.fit(X_train)
X_train =  X_scaler.transform(X_train)
X_vali = X_scaler.transform(X_vali)
X_test = X_scaler.transform(X_test) 

dtrain = convert_data_to_xgb_format(X_train, y_train, 'survival:cox')
dval = convert_data_to_xgb_format(X_vali, y_vali, 'survival:cox')
dtest = convert_data_to_xgb_format(X_test, y_test, 'survival:cox') 

### first try ordinary XGBoost (survival:cox) ##

In [17]:
## define the hyperparameters and search 
base_params = {'verbosity': 1,
              'objective': 'survival:cox',
              'eval_metric': 'cox-nloglik',
               'booster': 'dart',
              'tree_method': 'hist'}

def objective(trial):
    params = {'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1),
              'max_depth': trial.suggest_int('max_depth', 3, 18),
              'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
              'gamma': trial.suggest_int('gamma', 1, 9),
              'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
              'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0)}  # Search space
    params.update(base_params)
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, 'valid-cox-nloglik')
    bst = xgb.train(params, dtrain, num_boost_round=10000,
                    evals=[(dtrain, 'train'), (dval, 'valid')],callbacks=[pruning_callback],
                    early_stopping_rounds=50, verbose_eval=False)
    if bst.best_iteration >= 25:
        return bst.best_score
    else:
        return np.inf  # Reject models with < 25 trees

In [18]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=200)
print('Completed hyperparameter tuning with best cox-nloglik = {}.'.format(study.best_trial.value))
params = {}
params.update(base_params)
params.update(study.best_trial.params)

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are not used.

Parameters: { "n_estimators" } are

In [19]:
bst = xgb.train(params, dtrain, num_boost_round=10000,
                evals=[(dtrain, 'train'), (dval, 'valid')],
                early_stopping_rounds=50)

Parameters: { "n_estimators" } are not used.

[0]	train-cox-nloglik:6.05315	valid-cox-nloglik:5.45092
[1]	train-cox-nloglik:5.96628	valid-cox-nloglik:5.43211
[2]	train-cox-nloglik:5.86947	valid-cox-nloglik:5.40919
[3]	train-cox-nloglik:5.78630	valid-cox-nloglik:5.40503
[4]	train-cox-nloglik:5.70808	valid-cox-nloglik:5.39108
[5]	train-cox-nloglik:5.63606	valid-cox-nloglik:5.37498
[6]	train-cox-nloglik:5.57926	valid-cox-nloglik:5.36536
[7]	train-cox-nloglik:5.52540	valid-cox-nloglik:5.36185
[8]	train-cox-nloglik:5.46793	valid-cox-nloglik:5.36384
[9]	train-cox-nloglik:5.42420	valid-cox-nloglik:5.37257
[10]	train-cox-nloglik:5.36943	valid-cox-nloglik:5.36818
[11]	train-cox-nloglik:5.32366	valid-cox-nloglik:5.36003
[12]	train-cox-nloglik:5.29215	valid-cox-nloglik:5.35230
[13]	train-cox-nloglik:5.25844	valid-cox-nloglik:5.34776
[14]	train-cox-nloglik:5.24600	valid-cox-nloglik:5.34845
[15]	train-cox-nloglik:5.21401	valid-cox-nloglik:5.34073
[16]	train-cox-nloglik:5.18592	valid-cox-nloglik:5.3

In [23]:
preds = bst.predict(dval)
print(concordance_index_censored(y_vali['event'], y_vali['time'], preds)[0])

0.6450429559796245


In [24]:
print(concordance_index_ipcw(y_train, y_vali, preds)[0])

0.6489715215266553


In [25]:
## performanc in the test set, it's getting even worse
preds = bst.predict(dtest)
print(concordance_index_censored(y_test['event'], y_test['time'], preds)[0])

0.5825544142418337


In [26]:
print(concordance_index_ipcw(y_train, y_test, preds)[0])

0.5950579303644151


### 2. ordinary XGBoost (survival:aft) ##

In [27]:
dtrain = convert_data_to_xgb_format(X_train, y_train, 'survival:aft')
dval = convert_data_to_xgb_format(X_vali, y_vali, 'survival:aft')
dtest = convert_data_to_xgb_format(X_test, y_test, 'survival:aft') 

In [30]:
## define the hyperparameters and search 
base_params = {'verbosity': 0,
               'objective': 'survival:aft',
              'eval_metric': 'aft-nloglik',
              'tree_method': 'hist'}

def objective(trial):
    params = {'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1),
              'max_depth': trial.suggest_int('max_depth', 3, 18),
              'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
              'gamma': trial.suggest_int('gamma', 1, 9),
              'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
              'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0)}  # Search space
    params.update(base_params)
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, 'valid-aft-nloglik')
    bst = xgb.train(params, dtrain, num_boost_round=10000,
                    evals=[(dtrain, 'train'), (dval, 'valid')],callbacks=[pruning_callback],
                    early_stopping_rounds=50, verbose_eval=False)
    if bst.best_iteration >= 25:
        return bst.best_score
    else:
        return np.inf  # Reject models with < 2 trees

In [31]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=200)
print('Completed hyperparameter tuning with best aft-nloglik = {}.'.format(study.best_trial.value))
params = {}
params.update(base_params)
params.update(study.best_trial.params)

Completed hyperparameter tuning with best aft-nloglik = 1.9807462198667574.


In [32]:
bst = xgb.train(params, dtrain, num_boost_round=10000,
                evals=[(dtrain, 'train'), (dval, 'valid')],
                early_stopping_rounds=50)

[0]	train-aft-nloglik:4.78499	valid-aft-nloglik:4.94169
[1]	train-aft-nloglik:4.32281	valid-aft-nloglik:4.54882
[2]	train-aft-nloglik:3.91886	valid-aft-nloglik:4.19251
[3]	train-aft-nloglik:3.56974	valid-aft-nloglik:3.88182
[4]	train-aft-nloglik:3.26527	valid-aft-nloglik:3.62575
[5]	train-aft-nloglik:3.00201	valid-aft-nloglik:3.40400
[6]	train-aft-nloglik:2.77096	valid-aft-nloglik:3.20577
[7]	train-aft-nloglik:2.57123	valid-aft-nloglik:3.03893
[8]	train-aft-nloglik:2.39569	valid-aft-nloglik:2.89976
[9]	train-aft-nloglik:2.24407	valid-aft-nloglik:2.78364
[10]	train-aft-nloglik:2.10982	valid-aft-nloglik:2.67932
[11]	train-aft-nloglik:1.99075	valid-aft-nloglik:2.57952
[12]	train-aft-nloglik:1.88671	valid-aft-nloglik:2.49602
[13]	train-aft-nloglik:1.79509	valid-aft-nloglik:2.42432
[14]	train-aft-nloglik:1.71498	valid-aft-nloglik:2.36489
[15]	train-aft-nloglik:1.64373	valid-aft-nloglik:2.31197
[16]	train-aft-nloglik:1.58156	valid-aft-nloglik:2.26927
[17]	train-aft-nloglik:1.52679	valid-aft-

In [35]:
preds = bst.predict(dval)
print(concordance_index_censored(y_vali['event'], y_vali['time'], -preds)[0])

0.6491484832357637


In [36]:
## performanc in the test set,
preds = bst.predict(dtest)
print(concordance_index_censored(y_test['event'], y_test['time'], -preds)[0])

0.6101331043366252


### 3. XGboost with survival embedding


#####  xgbse._kaplan_neighbors, no tunning

In [38]:
from xgbse._kaplan_neighbors import DEFAULT_PARAMS

In [39]:
DEFAULT_PARAMS

{'objective': 'survival:aft',
 'eval_metric': 'aft-nloglik',
 'aft_loss_distribution': 'normal',
 'aft_loss_distribution_scale': 1,
 'tree_method': 'hist',
 'learning_rate': 0.05,
 'max_depth': 8,
 'booster': 'dart',
 'subsample': 0.5,
 'min_child_weight': 50,
 'colsample_bynode': 0.5}

In [40]:
xgbse_model = XGBSEKaplanNeighbors(DEFAULT_PARAMS, n_neighbors=30)

In [41]:
lower, upper = np.percentile(y_test["time"], [10, 90])  
TIME_BINS =  np.arange(lower, upper + 1)
xgbse_model.fit(
        pd.DataFrame(X_train), y_train,
        validation_data = (pd.DataFrame(X_vali), y_vali),
        early_stopping_rounds=10,
        time_bins=TIME_BINS
    )

In [42]:
preds = xgbse_model.predict(pd.DataFrame(X_test))

In [43]:
concordance_index(y_test, preds)

0.6534498133897018

In [44]:
approx_brier_score(y_test, preds)

0.1974326659061161

although the performance was better than the original XGBoost, but still worse than the Cox

#####  XGBSEKaplanTree with XGBSEBootstrapEstimator gird search
can only tune the hyperparameters outside of the dictionary

In [9]:
### too computationally consuming and kinda complicated to run nested CV, so just try train test split
X_train, X_test, y_train, y_test = train_test_split(X, y_structured, test_size=1/5, random_state=SEED)

In [10]:
## preprocess the data and make the data for XGboost
X_scaler = preprocessor.fit(X_train)
X_train =  X_scaler.transform(X_train)
#X_vali = X_scaler.transform(X_vali)
X_test = X_scaler.transform(X_test) 

dtrain = convert_data_to_xgb_format(X_train, y_train, 'survival:cox')
#dval = convert_data_to_xgb_format(X_vali, y_vali, 'survival:cox')
dtest = convert_data_to_xgb_format(X_test, y_test, 'survival:cox') 

In [17]:
estimator_params = {
    'n_estimators' :[100, 200, 400, 500, 800, 1000]
}

PARAMS_TREE = {
    'objective': 'survival:aft',
    'eval_metric': 'cox-nloglik',
    'tree_method': 'exact', 
    'max_depth': 100, 
    'booster':'dart', 
    'subsample': 0.5,
    'min_child_weight': 30, 
    'colsample_bynode': 0.5
}  ### it was impossible to tune the hyperparameters inside the dictionary...

In [18]:
base_model = XGBSEKaplanTree(PARAMS_TREE)
lower, upper = np.percentile(y_train["time"], [10, 90])
TIME_BINS = np.arange(lower, 15)
estimator=XGBSEBootstrapEstimator(base_model)
fit_params = {"time_bins":TIME_BINS}

In [20]:
obj = BaseModel().optimize_by_gridsearchcv(
            estimator=estimator,
            fit_params = fit_params,
            estimator_params=estimator_params,
            measure_of_accuracy=make_scorer(concordance_index, greater_is_better=True),
            verbose=3,
            n_jobs=-1,
            random_state=42,
            cv=KFold(3)).fit(pd.DataFrame(X_train), y_train)

2023-10-19 11:10:19,199 :: root :: The optimization will be based on make_scorer(concordance_index) metric!
Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3] END ..................n_estimators=100;, score=0.616 total time=  19.1s
[CV 2/3] END ..................n_estimators=100;, score=0.648 total time=  18.8s
[CV 3/3] END ..................n_estimators=100;, score=0.630 total time=  19.3s
[CV 1/3] END ..................n_estimators=200;, score=0.614 total time=  36.4s
[CV 2/3] END ..................n_estimators=200;, score=0.649 total time=  38.6s
[CV 3/3] END ..................n_estimators=200;, score=0.628 total time=  38.6s
[CV 1/3] END ..................n_estimators=400;, score=0.615 total time= 1.3min
[CV 2/3] END ..................n_estimators=400;, score=0.645 total time= 1.3min
[CV 3/3] END ..................n_estimators=400;, score=0.628 total time= 1.3min
[CV 1/3] END ..................n_estimators=500;, score=0.614 total time= 1.6min
[CV 2/3] END .........