## XGBoost Survival Embeddings

Examples from: https://github.com/loft-br/xgboost-survival-embeddings

#### Install PyTorch and import dependencies

In [1]:

! pip3 install torch==1.12.1
import sys
sys.path.append('/usr/local/lib/python3.10/site-packages')
import torch
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from lohrasb.best_estimator import BaseModel
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from xgbse.converters import convert_to_structured
from xgbse.metrics import (
    concordance_index,
    approx_brier_score
)
from xgbse import (
    XGBSEKaplanNeighbors,
    XGBSEKaplanTree,
    XGBSEBootstrapEstimator
)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m
2023-02-21 20:45:32,287 :: matplotlib :: matplotlib data path: /Users/hjavedani/Documents/Lohrasb/.venv/lib/python3.10/site-packages/matplotlib/mpl-data
2023-02-21 20:45:32,294 :: matplotlib :: CONFIGDIR=/Users/hjavedani/.matplotlib
2023-02-21 20:45:32,296 :: matplotlib :: interactive is False
2023-02-21 20:45:32,297 :: matplotlib :: platform is darwin
2023-02-21 20:45:33,387 :: matplotlib :: CACHEDIR=/Users/hjavedani/.matplotlib
2023-02-21 20:45:33,391 :: matplotlib.font_manager :: Using fontManager instance from /Users/hjavedani/.matplotlib/fontlist-v330.json


#### Example : XGBoost Survival Embeddings (XGBSEKaplanNeighbors)
  


For more information, see: https://loft-br.github.io/xgboost-survival-embeddings/examples/confidence_interval.html



In [2]:
from pycox.datasets import metabric

# Load the METABRIC dataset shipped with pycox as a DataFrame
# (feature columns plus `duration` and `event`).
df = metabric.read_df()

# Preview the first five rows.
df.head(n=5)

2023-02-21 20:45:34,011 :: h5py._conv :: Creating converter from 7 to 5
2023-02-21 20:45:34,011 :: h5py._conv :: Creating converter from 5 to 7
2023-02-21 20:45:34,012 :: h5py._conv :: Creating converter from 7 to 5
2023-02-21 20:45:34,013 :: h5py._conv :: Creating converter from 5 to 7


Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,duration,event
0,5.603834,7.811392,10.797988,5.967607,1.0,1.0,0.0,1.0,56.84,99.333336,0
1,5.284882,9.581043,10.20462,5.66497,1.0,0.0,0.0,1.0,85.940002,95.73333,1
2,5.920251,6.776564,12.431715,5.873857,0.0,1.0,0.0,1.0,48.439999,140.233337,0
3,6.654017,5.341846,8.646379,5.655888,0.0,0.0,0.0,0.0,66.910004,239.300003,0
4,5.456747,5.339741,10.555724,6.008429,1.0,0.0,0.0,1.0,67.849998,56.933334,1


#### Define labels and train-test split 


In [3]:
# Features: every column except the two outcome columns.
X = df.drop(columns=['duration', 'event'])
# Target: xgbse's converter builds it from the duration and event columns.
y = convert_to_structured(df['duration'], df['event'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

#### Example 1: Basic usage


In [4]:
# Candidate values for `n_neighbors` explored by the search.
estimator_params = {'n_neighbors': [50, 70]}

# No extra fit arguments are supplied for this example.
fit_params = None

# The xgbse estimator is only instantiated here. Direct fit/predict calls on
# it are not run in this cell; the instance is passed on to
# BaseModel().optimize_by_randomsearchcv instead.
estimator = XGBSEKaplanNeighbors()

#### Define BaseModel optimized by random search

In [5]:
# Scorer built from xgbse's concordance index; larger values are better.
c_index_scorer = make_scorer(concordance_index, greater_is_better=True)

# Random search over `estimator_params`: 2 sampled candidates, 2-fold CV,
# all available cores, fixed seed.
obj = BaseModel().optimize_by_randomsearchcv(
    estimator=estimator,
    fit_params=fit_params,
    estimator_params=estimator_params,
    measure_of_accuracy=c_index_scorer,
    verbose=3,
    n_jobs=-1,
    n_iter=2,
    random_state=42,
    cv=KFold(2),
)


#### Define Pipeline and predict

In [6]:
# Single-step pipeline whose only step is the random-search object.
pipeline = Pipeline(steps=[('obj', obj)])

# Fit on the training split, then predict for the held-out rows.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

2023-02-21 20:45:34,909 :: dev :: The optimization will be based on make_scorer(concordance_index) metric!
2023-02-21 20:45:34,909 :: dev :: The optimization will be based on make_scorer(concordance_index) metric!
Fitting 2 folds for each of 2 candidates, totalling 4 fits
[CV 1/2] END ....................n_neighbors=50;, score=0.631 total time= 4.1min
[CV 2/2] END ....................n_neighbors=50;, score=0.622 total time=  43.7s
[CV 1/2] END ....................n_neighbors=70;, score=0.632 total time=  48.6s
[CV 2/2] END ....................n_neighbors=70;, score=0.619 total time=  52.4s
2023-02-21 20:52:56,014 :: dev :: The optimization will be based on make_scorer(concordance_index) metric!
2023-02-21 20:52:56,014 :: dev :: The optimization will be based on make_scorer(concordance_index) metric!


#### Metrics

In [7]:


# Report both xgbse metrics for the held-out predictions, one per line.
print(
    f'C-index: {concordance_index(y_test, y_pred)}',
    f'Avg. Brier Score: {approx_brier_score(y_test, y_pred)}',
    sep='\n',
)


C-index: 0.6290448103547145
Avg. Brier Score: 0.1583786250333026


#### Example 2: Define estimator and set its arguments 


In [8]:
# XGBoost parameters handed to the tree-based xgbse model below.
PARAMS_TREE = {
    'objective': 'survival:cox',
    'eval_metric': 'cox-nloglik',
    'tree_method': 'hist',
    'max_depth': 100,
    'booster': 'dart',
    'subsample': 1.0,
    'min_child_weight': 50,
    'colsample_bynode': 1.0,
}
base_model = XGBSEKaplanTree(PARAMS_TREE)

# Candidate values for `n_estimators` explored by the search.
estimator_params = {'n_estimators': [100, 200]}


In [9]:
# Evenly spaced grid 15, 30, ..., 300 (the stop value 315 is exclusive).
TIME_BINS = np.arange(start=15, stop=315, step=15)

#### Define estimator and fit params


In [10]:
# Fit-time keyword arguments: the time grid is supplied as `time_bins`.
fit_params = {"time_bins": TIME_BINS}

# Bootstrap estimator built around the tree-based base model.
estimator = XGBSEBootstrapEstimator(base_model)


#### Define BaseModel estimator using random search CV

In [11]:

# Settings for the random search: 2 sampled candidates, 2-fold CV, all
# available cores, fixed seed. The Brier score is a loss, so the scorer is
# created with greater_is_better=False.
search_settings = {
    "estimator": estimator,
    "fit_params": fit_params,
    "estimator_params": estimator_params,
    "measure_of_accuracy": make_scorer(approx_brier_score, greater_is_better=False),
    "verbose": 3,
    "n_jobs": -1,
    "n_iter": 2,
    "random_state": 42,
    "cv": KFold(2),
}
obj = BaseModel().optimize_by_randomsearchcv(**search_settings)



#### Build sklearn pipeline

In [12]:


# Single-step pipeline whose only step is the random-search object.
pipeline = Pipeline(steps=[("obj", obj)])


#### Run Pipeline

In [13]:
# Fit on the training split and predict for the held-out rows in one chain
# (Pipeline.fit returns the pipeline itself).
y_pred = pipeline.fit(X_train, y_train).predict(X_test)


2023-02-21 20:52:56,942 :: dev :: The optimization will be based on make_scorer(approx_brier_score, greater_is_better=False) metric!
2023-02-21 20:52:56,942 :: dev :: The optimization will be based on make_scorer(approx_brier_score, greater_is_better=False) metric!
Fitting 2 folds for each of 2 candidates, totalling 4 fits
[CV 1/2] END .................n_estimators=100;, score=-0.173 total time=   3.1s
[CV 2/2] END .................n_estimators=100;, score=-0.179 total time=   2.8s
[CV 1/2] END .................n_estimators=200;, score=-0.173 total time=   5.4s
[CV 2/2] END .................n_estimators=200;, score=-0.179 total time=   5.2s
2023-02-21 20:53:20,125 :: dev :: The optimization will be based on make_scorer(approx_brier_score, greater_is_better=False) metric!
2023-02-21 20:53:20,125 :: dev :: The optimization will be based on make_scorer(approx_brier_score, greater_is_better=False) metric!


#### Check performance of the pipeline

In [14]:
# Report both xgbse metrics for the held-out predictions, one per line.
print(
    f'C-index: {concordance_index(y_test, y_pred)}',
    f'Avg. Brier Score: {approx_brier_score(y_test, y_pred)}',
    sep='\n',
)


C-index: 0.6395715573031867
Avg. Brier Score: 0.17269133802814857
