In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
import optuna
from sklearn.model_selection import KFold, StratifiedKFold
import xgboost as xgb
from lifelines import CoxPHFitter



np.random.seed(42)

In [2]:
%run -i ../examples/concordance_index.ipynb

In this notebook I tried AFT XGBoost and Cox Proportionla Hazard XGBoost. The scores are low. I tried three things:
1. AFT XGBoost using the full set of features with imputation and scaling, parameters from Optuna study 
2. AFT XGBoost using the full set of features, no preprocessing, parameters from the same Optuna study
3. AFT XGBoost using the set of features coming from lasso plus race_group, preprocessing, parameters from a new Optuna study
4. Cox Proportional Hazard XGBoost using the full set of features with imputation and scaling, parameters from a new Optuna study

As a baseline, I ran Cox PH regressor. I didn't use cross validation for CPH score, as this was just a reference. To obtain best parameters for XGBoost, I ran an Optuna study.

Overall, the results from all experiments are dissapointing. The baseline score is much better (0.0.6488453642587069). The scores are:
1. 0.44892073693943507
2. 0.42750284722382664
3. 0.5
4. 0.0.514463294984588

The scores were obtained by evaluating models on the validation set, coming from test_validation_set.csv, split into two, with random seed 42. 

As you can see, changing the size of the feature set as well as the type of XGBoost had a pretty big effect on the score. One could experiment further with feature selection and imputation techniques. However, an out-of-the-box Cox PH regressor performed so much better that it calls into question the validity of using XGBoost.


In [3]:
# Importing data:

df_train = pd.read_csv("../data/train_set.csv")

df_train_copy = df_train.copy(deep=True)

# Categorical columns

categ_columns = df_train_copy.select_dtypes(include = ['object']).columns

to_replace = ["Not done", "Not tested", "Other", "Missing disease status", "Non-resident of the U.S."]
df_train_copy.loc[:,categ_columns] = df_train_copy[categ_columns].replace(to_replace, "missing")
df_train_copy.loc[:,categ_columns] = df_train_copy[categ_columns].fillna('missing')

# Numerical columns:

num_columns = df_train_copy.select_dtypes(include = ['float64']).columns
df_train_copy.loc[:, num_columns] = df_train_copy[num_columns].fillna(-1.0)

# Same for validation set:

df_val_test = pd.read_csv('../data/test_validation_set.csv')
df_val1, df_test  = train_test_split(df_val_test, test_size=0.5,random_state=42)
df_val1_copy =df_val1.copy(deep=True)
df_val1_copy.loc[:,categ_columns] = df_val1_copy[categ_columns].replace(to_replace, "missing")
df_val1_copy.loc[:,categ_columns] = df_val1_copy[categ_columns].fillna('missing')

df_val1_copy.loc[:, num_columns] = df_val1_copy[num_columns].fillna(-1.0)

y_train = df_train_copy.iloc[:,-2:]

In [4]:
# Preprocessing pipeline:

def create_preprocessor(categ_data, num_data):
    
    cat_imputer = SimpleImputer(strategy='most_frequent')
    #num_imputer = KNNImputer(n_neighbors=5)
    num_imputer = SimpleImputer(strategy='mean')
    scaler = StandardScaler()

    num_pipeline = Pipeline([
        ('imputer', num_imputer),
        ('scaler', scaler)
    ])

    cat_pipeline = Pipeline([
        ('imputer', cat_imputer),
        ('encoder', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('cat_imputer', cat_pipeline, categ_data),
            ('num_imputer', num_pipeline, num_data)
        ],
        verbose_feature_names_out=False
    )

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor)
    ])
    
    return pipeline


In [5]:
categ_data = df_train_copy.select_dtypes(include='object').columns.tolist()
num_data = df_train_copy.drop(["efs", "efs_time","ID"], axis=1).select_dtypes(include='number').columns.tolist()

processor = create_preprocessor(categ_data, num_data)
X_train = processor.fit_transform(df_train_copy)
X_Val = processor.transform(df_val1_copy)
feature_names = processor.named_steps['preprocessor'].get_feature_names_out()

Cox Proportional Hazard as a baseline:

In [6]:
CPH = CoxPHFitter()
train_data = pd.DataFrame(X_train, columns = feature_names, index = df_train_copy.index)
train_plus = pd.concat([train_data, df_train_copy[['efs', 'efs_time']]], axis =1)
val_data = pd.DataFrame(X_Val, columns = feature_names, index = df_val1_copy.index)


In [7]:
CPH.fit(train_plus, duration_col='efs_time', event_col='efs')
predicted_hazards = CPH.predict_partial_hazard(val_data)




In [8]:
submissionCPH = pd.DataFrame({'ID': df_val1_copy["ID"], 'prediction': predicted_hazards}) 
score_cph = score(df_val1_copy.copy(deep=True), submissionCPH.copy(deep=True), "ID")
print(f" The stratified concordance score of Cox Proportional Hazard: {score_cph}.")

 The stratified concordance score of Cox Proportional Hazard: 0.6488453642587069.


### 1. AFT XGBoost with preprocessing the data

In [9]:
def objective(trial):
    # Sample hyperparameters from a reasonable search space
    params = {
        'objective': 'survival:aft',
        'eval_metric': 'aft-nloglik',
        'aft_loss_distribution': trial.suggest_categorical('aft_loss_distribution', ['logistic', 'normal']),
        'aft_loss_distribution_scale': trial.suggest_loguniform('aft_loss_distribution_scale', 0.1, 10.0),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
        'lambda': trial.suggest_loguniform('lambda', 0.1, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 0.1, 10.0),
        'tree_method': 'hist', 
        'seed': 42
    }

    df_train_copy1 = df_train_copy.copy()  
    y_train1 = y_train.copy()
    categ_data = df_train_copy1.select_dtypes(include='object').columns.tolist()
    num_data = df_train_copy1.drop(["efs", "efs_time", "ID"], axis=1).select_dtypes(include='number').columns.tolist()

    skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

    results = []

    for train_index, val_index in skf.split(df_train_copy1, df_train_copy['efs']): 
        X_train, X_val = df_train_copy1.iloc[train_index], df_train_copy.iloc[val_index]
        y_tr, y_val = y_train1.iloc[train_index], y_train1.iloc[val_index]
        y_lower_bound_tr = np.where(y_tr['efs']==0, y_tr['efs_time'],y_tr['efs_time'])
        y_lower_bound_val = np.where(y_val['efs']==0, y_val['efs_time'],y_val['efs_time'])
        y_upper_bound_tr = np.where(y_tr['efs']==0, np.inf,y_tr['efs_time'] )
        y_upper_bound_val = np.where(y_val['efs']==0, np.inf,y_val['efs_time'] )

        preprocessor = create_preprocessor(categ_data, num_data)

        X_train_transformed = preprocessor.fit_transform(X_train)
        X_val_transformed = preprocessor.transform(X_val)

        
        dtrain_cv = xgb.DMatrix(X_train_transformed)
        dval_cv = xgb.DMatrix(X_val_transformed)
        dtrain_cv.set_float_info('label_lower_bound', y_lower_bound_tr)
        dtrain_cv.set_float_info('label_upper_bound', y_upper_bound_tr)

        dval_cv.set_float_info('label_lower_bound', y_lower_bound_val)
        dval_cv.set_float_info('label_upper_bound', y_upper_bound_val)


        booster = xgb.train(
            params=params,
            dtrain=dtrain_cv,
            num_boost_round=100,
            evals=[(dval_cv, "validation")],
            early_stopping_rounds=10,
            verbose_eval=False
        )


    
        preds = booster.predict(dval_cv)
        submission = pd.DataFrame({'ID': df_train_copy.loc[val_index]["ID"], 'prediction': preds}) 
        score_aft1= score(df_train_copy.iloc[val_index].copy(deep=True), submission.copy(deep=True), "ID")
        results.append(score_aft1)


    return -np.mean(results)

study = optuna.create_study(direction='minimize') 
study.optimize(objective, n_trials=100)  

best_params = study.best_params
print(f"Best parameters: {best_params}")


[I 2025-04-13 19:09:52,770] A new study created in memory with name: no-name-addc6930-2c34-4903-957d-c31769cf969a
  'aft_loss_distribution_scale': trial.suggest_loguniform('aft_loss_distribution_scale', 0.1, 10.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'lambda': trial.suggest_loguniform('lambda', 0.1, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 0.1, 10.0),
[I 2025-04-13 19:09:54,356] Trial 0 finished with value: -0.34992034799294797 and parameters: {'aft_loss_distribution': 'logistic', 'aft_loss_distribution_scale': 2.8737126349956177, 'learning_rate': 0.0012327304651387286, 'max_depth': 4, 'min_child_weight': 3, 'subsample': 0.7873266453129999, 'colsample_bytree': 0.9047447513468163, 'lambda': 0.12554374302864005, 'alpha': 0.4882867481019298}. Best is trial 0 with value: -0.34992034799294797.
  'aft_loss_d

Best parameters: {'aft_loss_distribution': 'logistic', 'aft_loss_distribution_scale': 0.2072777696479105, 'learning_rate': 0.051476738630183136, 'max_depth': 3, 'min_child_weight': 8, 'subsample': 0.6347551391126972, 'colsample_bytree': 0.9208506777871813, 'lambda': 3.819690810957991, 'alpha': 0.5813388572207947}


In [10]:
best_params

{'aft_loss_distribution': 'logistic',
 'aft_loss_distribution_scale': 0.2072777696479105,
 'learning_rate': 0.051476738630183136,
 'max_depth': 3,
 'min_child_weight': 8,
 'subsample': 0.6347551391126972,
 'colsample_bytree': 0.9208506777871813,
 'lambda': 3.819690810957991,
 'alpha': 0.5813388572207947}

In [11]:
# Final parameters:

final_params = {
    'objective': 'survival:aft',
    'eval_metric': 'aft-nloglik',
    'tree_method': 'hist',
    'seed': 42, 
    **best_params 
}


In [12]:
# Training on best parameters from Optuna study:

categ_data = df_train_copy.select_dtypes(include='object').columns.tolist()
num_data = df_train_copy.drop(["efs", "efs_time","ID"], axis=1).select_dtypes(include='number').columns.tolist()
processor = create_preprocessor(categ_data, num_data)
X_train = processor.fit_transform(df_train_copy)
y_train = df_train.iloc[:,-2:]

X_val =processor.transform(df_val1_copy)

y_lower_bound = np.where(y_train['efs']==0, y_train['efs_time'],y_train['efs_time'])
y_upper_bound = np.where(y_train['efs']==0, np.inf,y_train['efs_time'] )
dtrain = xgb.DMatrix(X_train)
dtrain.set_float_info('label', y_train['efs_time']) 

dtrain.set_float_info('label_lower_bound', y_lower_bound)
dtrain.set_float_info('label_upper_bound', y_upper_bound)
booster1 = xgb.train(final_params, dtrain, num_boost_round=100,
                evals=[(dtrain, 'train')])

[0]	train-aft-nloglik:14.88027
[1]	train-aft-nloglik:25.27849
[2]	train-aft-nloglik:19.20058
[3]	train-aft-nloglik:16.42351
[4]	train-aft-nloglik:15.11110
[5]	train-aft-nloglik:15.75894
[6]	train-aft-nloglik:15.74598
[7]	train-aft-nloglik:19.30874
[8]	train-aft-nloglik:16.95310
[9]	train-aft-nloglik:16.66659
[10]	train-aft-nloglik:18.61549
[11]	train-aft-nloglik:15.62832
[12]	train-aft-nloglik:15.61294
[13]	train-aft-nloglik:15.58665
[14]	train-aft-nloglik:15.05935
[15]	train-aft-nloglik:15.00094
[16]	train-aft-nloglik:14.85324
[17]	train-aft-nloglik:14.51772
[18]	train-aft-nloglik:14.44167
[19]	train-aft-nloglik:14.30917
[20]	train-aft-nloglik:14.24688
[21]	train-aft-nloglik:14.18847
[22]	train-aft-nloglik:14.10204
[23]	train-aft-nloglik:14.07636
[24]	train-aft-nloglik:14.03789
[25]	train-aft-nloglik:13.90322
[26]	train-aft-nloglik:13.85577
[27]	train-aft-nloglik:13.83931
[28]	train-aft-nloglik:13.65399
[29]	train-aft-nloglik:13.53644
[30]	train-aft-nloglik:13.47225
[31]	train-aft-nlo

In [13]:
X_Val1  = xgb.DMatrix(X_Val)
aft_preds1 = booster1.predict(X_Val1)

In [14]:
submission1 = pd.DataFrame({'ID': df_val1_copy["ID"], 'prediction': aft_preds1}) 
score_aft1= score(df_val1_copy.copy(deep=True), submission1.copy(deep=True), "ID")
print(f" The stratified concordance score of AFT XGBoost: {score_aft1}.")

 The stratified concordance score of AFT XGBoost: 0.44892073693943507.


### 2. AFT XGBoost without preprocessing the data

In [15]:
for col in df_train_copy.select_dtypes(include='object').columns:
    df_train_copy[col] = df_train_copy[col].astype('category')
train_no_prep = df_train_copy.drop(["efs", "efs_time","ID"], axis=1)
val_no_prep = df_val1_copy.drop(["efs", "efs_time","ID"], axis=1)
dtrain1 = xgb.DMatrix(train_no_prep,enable_categorical=True)
dtrain1.set_float_info('label', y_train['efs_time']) 
dtrain1.set_float_info('label_lower_bound', y_lower_bound)
dtrain1.set_float_info('label_upper_bound', y_upper_bound)
booster2 = xgb.train(final_params, dtrain1, num_boost_round=100,
                evals=[(dtrain1, 'train')])

[0]	train-aft-nloglik:14.72714
[1]	train-aft-nloglik:15.92770
[2]	train-aft-nloglik:19.29353
[3]	train-aft-nloglik:19.48729
[4]	train-aft-nloglik:19.29764
[5]	train-aft-nloglik:14.44131
[6]	train-aft-nloglik:14.54911
[7]	train-aft-nloglik:13.81510
[8]	train-aft-nloglik:13.97067
[9]	train-aft-nloglik:14.17884
[10]	train-aft-nloglik:13.67268
[11]	train-aft-nloglik:13.50080
[12]	train-aft-nloglik:13.15776
[13]	train-aft-nloglik:12.79776
[14]	train-aft-nloglik:12.69050
[15]	train-aft-nloglik:12.28743
[16]	train-aft-nloglik:12.04957
[17]	train-aft-nloglik:11.94855
[18]	train-aft-nloglik:11.82336
[19]	train-aft-nloglik:11.77791
[20]	train-aft-nloglik:11.73911
[21]	train-aft-nloglik:11.57185
[22]	train-aft-nloglik:11.34831
[23]	train-aft-nloglik:11.10052
[24]	train-aft-nloglik:11.05807
[25]	train-aft-nloglik:11.02198
[26]	train-aft-nloglik:11.01110
[27]	train-aft-nloglik:10.99257
[28]	train-aft-nloglik:10.97160
[29]	train-aft-nloglik:10.95865
[30]	train-aft-nloglik:10.94972
[31]	train-aft-nlo

In [16]:
for col in val_no_prep.select_dtypes(include='object').columns:
    val_no_prep[col] = val_no_prep[col].astype('category')

X_Val1  = xgb.DMatrix(val_no_prep, enable_categorical = True)
aft_preds2 = booster2.predict(X_Val1)


In [17]:
submission2 = pd.DataFrame({'ID': df_val1_copy["ID"], 'prediction': aft_preds2}) 
score_aft2= score(df_val1_copy.copy(deep=True), submission2.copy(deep=True), "ID")
print(f" The stratified concordance score of AFT XGBoost w/o imputation: {score_aft2}.")

 The stratified concordance score of AFT XGBoost w/o imputation: 0.42750284722382664.


### 3. AFT XGBoost with a smaller set of features and with preprocessing

In [18]:
lasso_features = [ 'hla_match_drb1_low', 'age_at_hct', 'comorbidity_score', 'karnofsky_score', 'hla_match_drb1_high']    
categ_data1 = ['race_group']

new_features = lasso_features + categ_data1

In [19]:
lasso_df_train_copy = df_train_copy[new_features]
lasso_df_val1_copy = df_val1_copy[lasso_df_train_copy.columns]

In [20]:
num_data1 = lasso_features
processor1 = create_preprocessor(categ_data1, num_data1)
X_train1 = processor1.fit_transform(lasso_df_train_copy)
X_Val1 = processor1.transform(lasso_df_val1_copy)
feature_names1 = processor1.named_steps['preprocessor'].get_feature_names_out()

Cox Proportional Hazard regression for comparison:

In [21]:
CPH1 = CoxPHFitter()
train_data1 = pd.DataFrame(X_train1, columns = feature_names1, index = df_train_copy.index)
train_plus1 = pd.concat([train_data1, df_train_copy[['efs', 'efs_time']]], axis =1)
val_data1 = pd.DataFrame(X_Val1, columns = feature_names1, index = df_val1_copy.index)
CPH1.fit(train_plus1, duration_col='efs_time', event_col='efs')
predicted_hazards1 = CPH1.predict_partial_hazard(val_data1)
submissionCPH1 = pd.DataFrame({'ID': df_val1_copy["ID"], 'prediction': predicted_hazards1}) 
score_cph1 = score(df_val1_copy.copy(deep=True), submissionCPH1.copy(deep=True), "ID")
print(score_cph1)


0.5767127119512953


AFT XGBoost with a smaller feature set:

In [22]:
dtrain2 = xgb.DMatrix(X_train1)
X_Val2  = xgb.DMatrix(X_Val1)
dtrain2.set_float_info('label', y_train['efs_time']) 
dtrain2.set_float_info('label_lower_bound', y_lower_bound)
dtrain2.set_float_info('label_upper_bound', y_upper_bound)

In [23]:
def objective(trial):
    # Sample hyperparameters from a reasonable search space
    params = {
        'objective': 'survival:aft',
        'eval_metric': 'aft-nloglik',
        'aft_loss_distribution': trial.suggest_categorical('aft_loss_distribution', ['logistic', 'normal']),
        'aft_loss_distribution_scale': trial.suggest_loguniform('aft_loss_distribution_scale', 0.1, 10.0),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
        'lambda': trial.suggest_loguniform('lambda', 0.1, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 0.1, 10.0),
        'tree_method': 'hist', 
        'seed': 42
    }

    df_train_copy1 = lasso_df_train_copy.copy()  
    y_train1 = y_train.copy()
    categ_data = df_train_copy1.select_dtypes(include='object').columns.tolist()
    num_data = df_train_copy1.select_dtypes(include='number').columns.tolist()

    skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

    results = []

    for train_index, val_index in skf.split(df_train_copy1, df_train_copy['efs']): 
        X_train, X_val = df_train_copy1.iloc[train_index], df_train_copy.iloc[val_index]
        y_tr, y_val = y_train1.iloc[train_index], y_train1.iloc[val_index]
        y_lower_bound_tr = np.where(y_tr['efs']==0, y_tr['efs_time'],y_tr['efs_time'])
        y_lower_bound_val = np.where(y_val['efs']==0, y_val['efs_time'],y_val['efs_time'])
        y_upper_bound_tr = np.where(y_tr['efs']==0, np.inf,y_tr['efs_time'] )
        y_upper_bound_val = np.where(y_val['efs']==0, np.inf,y_val['efs_time'] )

        preprocessor = create_preprocessor(categ_data, num_data)

        X_train_transformed = preprocessor.fit_transform(X_train)
        X_val_transformed = preprocessor.transform(X_val)

        
        dtrain_cv = xgb.DMatrix(X_train_transformed)
        dval_cv = xgb.DMatrix(X_val_transformed)
        dtrain_cv.set_float_info('label_lower_bound', y_lower_bound_tr)
        dtrain_cv.set_float_info('label_upper_bound', y_upper_bound_tr)

        dval_cv.set_float_info('label_lower_bound', y_lower_bound_val)
        dval_cv.set_float_info('label_upper_bound', y_upper_bound_val)


        booster = xgb.train(
            params=params,
            dtrain=dtrain_cv,
            num_boost_round=100,
            evals=[(dval_cv, "validation")],
            early_stopping_rounds=10,
            verbose_eval=False
        )


    
        preds = booster.predict(dval_cv)
        submission = pd.DataFrame({'ID': df_train_copy.loc[val_index]["ID"], 'prediction': preds}) 
        score_aft1= score(df_train_copy.iloc[val_index].copy(deep=True), submission.copy(deep=True), "ID")
        results.append(score_aft1)


    return -np.mean(results)

study_lasso = optuna.create_study(direction='minimize') 
study_lasso.optimize(objective, n_trials=100)  

best_params_lasso = study_lasso.best_params
print(f"Best parameters: {best_params_lasso}")


[I 2025-04-13 19:12:10,609] A new study created in memory with name: no-name-aa165680-e295-4919-a3e0-80ceb03b874b
  'aft_loss_distribution_scale': trial.suggest_loguniform('aft_loss_distribution_scale', 0.1, 10.0),
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'lambda': trial.suggest_loguniform('lambda', 0.1, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 0.1, 10.0),
  merged_df_race_dict = dict(merged_df.groupby(['race_group']).groups)
  merged_df_race_dict = dict(merged_df.groupby(['race_group']).groups)
  merged_df_race_dict = dict(merged_df.groupby(['race_group']).groups)
  merged_df_race_dict = dict(merged_df.groupby(['race_group']).groups)
[I 2025-04-13 19:12:10,946] Trial 0 finished with value: -0.4547256492535058 and parameters: {'aft_loss_distribution': 'logistic', 'aft_loss_distribution_scale': 8.07115333483

Best parameters: {'aft_loss_distribution': 'normal', 'aft_loss_distribution_scale': 0.12463912873796922, 'learning_rate': 0.010579800855265184, 'max_depth': 4, 'min_child_weight': 4, 'subsample': 0.9981721889588449, 'colsample_bytree': 0.5001495900534939, 'lambda': 9.389063799429849, 'alpha': 1.5101704937715863}


In [24]:
# Final parameters:

final_params1 = {
    'objective': 'survival:aft',
    'eval_metric': 'aft-nloglik',
    'tree_method': 'hist',
    'seed': 42, 
    **best_params_lasso 
}

In [25]:
booster3 = xgb.train(final_params1, dtrain2, num_boost_round=100,
                evals=[(dtrain2, 'train')])

[0]	train-aft-nloglik:27.60615
[1]	train-aft-nloglik:27.60532
[2]	train-aft-nloglik:27.60449
[3]	train-aft-nloglik:27.60357
[4]	train-aft-nloglik:27.60255
[5]	train-aft-nloglik:27.60147
[6]	train-aft-nloglik:27.60033
[7]	train-aft-nloglik:27.59914
[8]	train-aft-nloglik:27.59787
[9]	train-aft-nloglik:27.59649
[10]	train-aft-nloglik:27.59505
[11]	train-aft-nloglik:27.59352
[12]	train-aft-nloglik:27.59189
[13]	train-aft-nloglik:27.59015
[14]	train-aft-nloglik:27.58837
[15]	train-aft-nloglik:27.58659
[16]	train-aft-nloglik:27.58479
[17]	train-aft-nloglik:27.58296
[18]	train-aft-nloglik:27.58105
[19]	train-aft-nloglik:27.57910
[20]	train-aft-nloglik:27.57714
[21]	train-aft-nloglik:27.57520
[22]	train-aft-nloglik:27.57319
[23]	train-aft-nloglik:27.57104
[24]	train-aft-nloglik:27.56876
[25]	train-aft-nloglik:27.56638
[26]	train-aft-nloglik:27.56395
[27]	train-aft-nloglik:27.56140
[28]	train-aft-nloglik:27.55873
[29]	train-aft-nloglik:27.55588
[30]	train-aft-nloglik:27.55297
[31]	train-aft-nlo

In [26]:
aft_preds3 = booster3.predict(X_Val2)
submission3 = pd.DataFrame({'ID': df_val1_copy["ID"], 'prediction': aft_preds3}) 
score_aft3= score(df_val1_copy.copy(deep=True), submission3.copy(deep=True), "ID")
print(f" The stratified concordance score of AFT XGBoost with smaller feature set: {score_aft3}.")


 The stratified concordance score of AFT XGBoost with smaller feature set: 0.5.


In [27]:
best_params

{'aft_loss_distribution': 'logistic',
 'aft_loss_distribution_scale': 0.2072777696479105,
 'learning_rate': 0.051476738630183136,
 'max_depth': 3,
 'min_child_weight': 8,
 'subsample': 0.6347551391126972,
 'colsample_bytree': 0.9208506777871813,
 'lambda': 3.819690810957991,
 'alpha': 0.5813388572207947}

### 4. Cox Proportional Hazard with XGBoost and preprocessing the data

In [36]:
time = y_train['efs_time'].values  
event = y_train['efs'].values 

In [37]:
categ_data = df_train_copy.select_dtypes(include='object').columns.tolist()
num_data = df_train_copy.drop(["efs", "efs_time","ID"], axis=1).select_dtypes(include='number').columns.tolist()

processor = create_preprocessor(categ_data, num_data)
X_train = processor.fit_transform(df_train_copy)
X_Val = processor.transform(df_val1_copy)
feature_names = processor.named_steps['preprocessor'].get_feature_names_out()

In [38]:

def objective(trial):

    params = {
        'objective': 'survival:cox',  
        'eval_metric': 'cox-nloglik', 
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
        'lambda': trial.suggest_loguniform('lambda', 0.1, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 0.1, 10.0),
        'tree_method': 'hist',  
        'seed': 42
    }
    
    df_train_copy1 = df_train_copy.copy()  
    y_train1 = y_train.copy()
    categ_data = df_train_copy1.select_dtypes(include='object').columns.tolist()
    num_data = df_train_copy1.drop(["efs", "efs_time", "ID"], axis=1).select_dtypes(include='number').columns.tolist()

    skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

    results = []  

    for train_index, val_index in skf.split(df_train_copy1, df_train_copy['efs']):
        X_train, X_val = df_train_copy1.iloc[train_index], df_train_copy1.iloc[val_index]
        time = y_train1['efs_time'].values[train_index]  
        event = y_train1['efs'].values[train_index]  

        preprocessor = create_preprocessor(categ_data, num_data)
        X_train_transformed = preprocessor.fit_transform(X_train)
        X_val_transformed = preprocessor.transform(X_val)
        
        dtrain_cv = xgb.DMatrix(X_train_transformed, label=time, weight=event)
        
        dval_cv = xgb.DMatrix(X_val_transformed)
        
        booster = xgb.train(
            params=params,
            dtrain=dtrain_cv,
            num_boost_round=100,
            evals=[(dval_cv, "validation")],
            early_stopping_rounds=10,
            verbose_eval=False
        )
        
        preds = booster.predict(dval_cv)
        submission = pd.DataFrame({'ID':df_train_copy.loc[val_index]['ID'],'prediction':preds})
        
        score_xgcph = score(df_train_copy.iloc[val_index].copy(deep=True), submission.copy(deep=True), "ID")
        results.append(score_xgcph)

    return -np.mean(results)


XGCPH_study = optuna.create_study(direction='minimize')  
XGCPH_study.optimize(objective, n_trials=100)  

best_XGCPH_params = XGCPH_study.best_params
print(f"Best parameters: {best_XGCPH_params}")


[I 2025-04-13 19:17:16,345] A new study created in memory with name: no-name-9e5d612d-84fa-49d4-87cc-9e65369a18a8
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
  'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
  'lambda': trial.suggest_loguniform('lambda', 0.1, 10.0),
  'alpha': trial.suggest_loguniform('alpha', 0.1, 10.0),
  merged_df_race_dict = dict(merged_df.groupby(['race_group']).groups)
  merged_df_race_dict = dict(merged_df.groupby(['race_group']).groups)
  merged_df_race_dict = dict(merged_df.groupby(['race_group']).groups)
  merged_df_race_dict = dict(merged_df.groupby(['race_group']).groups)
[I 2025-04-13 19:17:16,577] Trial 0 finished with value: -0.5107577039078376 and parameters: {'learning_rate': 0.0011032412581873376, 'max_depth': 8, 'min_child_weight': 7, 'subsample': 0.6575540225284406, 'colsample_bytree': 0.5914004001332642, 'lambda': 6.298335841815216, '

Best parameters: {'learning_rate': 0.0026818461035399894, 'max_depth': 9, 'min_child_weight': 4, 'subsample': 0.9933602131439074, 'colsample_bytree': 0.9308683544915706, 'lambda': 8.329209443640135, 'alpha': 0.13387430423219943}


In [39]:
XGCPH_final_params  = {
    'objective': 'survival:cox',
    'eval_metric': 'cox-nloglik',
    'tree_method': 'hist',
    'seed': 42, 
    **best_XGCPH_params 
}

In [40]:
dtrain = xgb.DMatrix(X_train, label=time, weight=event)
dtest = xgb.DMatrix(X_Val)

In [41]:
boosterCPH = xgb.train(XGCPH_final_params, dtrain, num_boost_round=15, evals=[(dtrain, 'train')])

predictionCPH = boosterCPH.predict(dtest)

[0]	train-cox-nloglik:9.04517
[1]	train-cox-nloglik:9.04482
[2]	train-cox-nloglik:9.04443
[3]	train-cox-nloglik:9.04407
[4]	train-cox-nloglik:9.04379
[5]	train-cox-nloglik:9.04372
[6]	train-cox-nloglik:9.04362
[7]	train-cox-nloglik:9.04353
[8]	train-cox-nloglik:9.04317
[9]	train-cox-nloglik:9.04283
[10]	train-cox-nloglik:9.04250
[11]	train-cox-nloglik:9.04216
[12]	train-cox-nloglik:9.04184
[13]	train-cox-nloglik:9.04153
[14]	train-cox-nloglik:9.04123


In [42]:
submissionXGCPH = pd.DataFrame({'ID': df_val1_copy["ID"], 'prediction': predictionCPH}) 
score_CPH= score(df_val1_copy.copy(deep=True), submissionXGCPH.copy(deep=True), "ID")
print(f" The stratified concordance score of Cox PH XGBoost: {score_CPH}.")


 The stratified concordance score of Cox PH XGBoost: 0.514463294984588.
