In [25]:
import pandas as pd
import numpy as numpy
import xgboost as xgb

In [48]:
train = pd.read_csv('../data/train_set.csv')
X_train = train.iloc[:,:-2]
y_train = train.iloc[:,-2:]

In [51]:
X_train.head()

Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,...,karnofsky_score,hepatic_mild,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10
0,16466,Intermediate,No,Poor,No,1.0,4.0,TBI + Cy +- Other,No,4.0,...,90.0,No,,Related,"N/A, Mel not given",5.0,No,1.0,No,7.0
1,4038,Intermediate,No,Poor,No,2.0,8.0,No TBI,No,6.0,...,60.0,No,,Unrelated,MEL,8.0,No,2.0,No,10.0
2,7006,High,Yes,Favorable,No,2.0,8.0,No TBI,No,6.0,...,90.0,No,Permissive mismatched,Unrelated,"N/A, Mel not given",8.0,No,2.0,Yes,10.0
3,9798,High - TED AML case <missing cytogenetics,No,Poor,Yes,2.0,8.0,No TBI,No,6.0,...,90.0,No,Permissive mismatched,Unrelated,MEL,8.0,No,2.0,No,10.0
4,14496,TBD cytogenetics,No,,No,2.0,8.0,No TBI,No,6.0,...,80.0,No,Permissive mismatched,Unrelated,"N/A, Mel not given",8.0,No,2.0,No,10.0


In [52]:
y_train.head()

Unnamed: 0,efs,efs_time
0,0.0,93.779
1,1.0,12.088
2,0.0,25.724
3,0.0,43.373
4,1.0,8.593


### Accelerated Failure Time model with XGBoost (predicts survival time directly):

In [30]:
y_lower_bound = np.where(y_train['efs']==0, y_train['efs_time'],y_train['efs_time'])
y_upper_bound = np.where(y_train['efs']==0, np.inf,y_train['efs_time'] )

In [74]:
# A very small example:

train_set1=X_train[['hla_high_res_8','hla_match_drb1_high']]
dtrain = xgb.DMatrix(train_set1)
dtrain.set_float_info('label_lower_bound', y_lower_bound)
dtrain.set_float_info('label_upper_bound', y_upper_bound)

params = {'objective': 'survival:aft',
          'eval_metric': 'aft-nloglik',
          'aft_loss_distribution': 'normal',
          'aft_loss_distribution_scale': 1.20,
          'tree_method': 'hist', 'learning_rate': 0.05, 'max_depth': 2}
booster1 = xgb.train(params, dtrain, num_boost_round=5,
                evals=[(dtrain, 'train')])

[0]	train-aft-nloglik:6.32165
[1]	train-aft-nloglik:5.95112
[2]	train-aft-nloglik:5.61608
[3]	train-aft-nloglik:5.31312
[4]	train-aft-nloglik:5.03916


In [81]:
# Making predictions for a single input:

train_columns = ['hla_high_res_8', 'hla_match_drb1_high'] 
new_data = pd.DataFrame([[7, 1]], columns=train_columns)
dnew = xgb.DMatrix(new_data)
prediction1 = booster1.predict(dnew)
print(prediction1)
predicted_survival_time = np.exp(prediction1)
print(f"The number of months the patient is expected to survive is {predicted_survival_time[0]}.")


[1.2035155]
The number of months the patient is expected to survive is 3.3318095207214355.


In [99]:
#!pip install lifelines

#### Concordance index for  aft

In [None]:
from lifelines.utils import concordance_index

val = pd.read_csv('../data/test_validation_set.csv')
X_test = val.iloc[:,:-2][train_columns]
y_test = val.iloc[:,-2:]
d_test = xgb.DMatrix(X_test)
aft_preds = booster1.predict(d_test)

c_index_aft = concordance_index(y_test['efs_time'].values, aft_preds, event_observed=y_test['efs'].values)
print("C-index for AFT model:", c_index_aft)

C-index for Cox model: 0.5191337236715146


### Cox Proportional Hazards Model with XGBoost (regression):

In [None]:
time = y_train['efs_time'].values  
event = y_train['efs'].values 

dtrain = xgb.DMatrix(train_set1, label=time, weight=event)

params = {
    'objective': 'survival:cox', 
    'eval_metric': 'cox-nloglik',
    'tree_method': 'hist',       
    'learning_rate': 0.05,
    'max_depth': 2
}

booster2 = xgb.train(params, dtrain, num_boost_round=5, evals=[(dtrain, 'train')])

prediction2 = bst.predict(dnew)




[0]	train-cox-nloglik:9.04482
[1]	train-cox-nloglik:9.04452
[2]	train-cox-nloglik:9.04440
[3]	train-cox-nloglik:9.04437
[4]	train-cox-nloglik:9.04439


In [80]:
hazard_ratios = np.exp(prediction2)
hazard_ratios


array([110634.26], dtype=float32)

In [110]:
cox_preds = booster2.predict(d_test)

c_index_cox = concordance_index(y_test['efs_time'].values, cox_preds, event_observed=y_test['efs'].values)

print("C-index for Cox model:", c_index_cox)

C-index for Cox model: 0.4886050077393845
