### XGBoost Survival Embeddings

Examples from: https://github.com/loft-br/xgboost-survival-embeddings

#### Install required libraries

In [1]:
! pip install git+https://github.com/TorkamaniLab/lohrasb.git --force-reinstall
! pip install pandas  category_encoders feature-engine scikit-learn optuna xgbse pycox
! python setup.py install && python -c "import torch" 

Collecting git+https://github.com/TorkamaniLab/lohrasb.git
  Cloning https://github.com/TorkamaniLab/lohrasb.git to /private/var/folders/v1/xbcjnd1x5rn7ct1m_rnsblk80000gp/T/pip-req-build-xhzpdidh
  Running command git clone --filter=blob:none --quiet https://github.com/TorkamaniLab/lohrasb.git /private/var/folders/v1/xbcjnd1x5rn7ct1m_rnsblk80000gp/T/pip-req-build-xhzpdidh
  Resolved https://github.com/TorkamaniLab/lohrasb.git to commit 65dcc33f9b4ada2ad7454b776068e957b1b11b31
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting aiosignal==1.3.1 (from lohrasb==4.1.0)
  Using cached aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Collecting alembic==1.12.0 (from lohrasb==4.1.0)
  Obtaining dependency information for alembic==1.12.0 from https://files.pythonhosted.org/packages/a2/8b/46919127496036c8e990b2b236454a0d8655fd46e1df2fd35610a9cbc842/alembic-1.12.0-py3-none-any.whl.metadata
  Using cached alembic-1.12.0-py3-none-any.whl.metadata (7.2 kB)
Collecting ansi2html==1.8.0 (from loh

In [2]:
# Standard library imports
import sys  # used for sys.path below and for sys.version in the version report

# NOTE(review): hard-coded, machine-specific site-packages path for Python 3.10; the
# recorded output of this cell reports Python 3.7.8 - confirm this entry is still needed.
sys.path.append('/usr/local/lib/python3.10/site-packages')

# Third-party library imports
import numpy as np  # np.arange builds the time bins in Example 2
import pandas as pd  # only pd.__version__ is referenced in the visible cells
import sklearn  # imported as a package so sklearn.__version__ can be printed
import torch  # only torch.__version__ is referenced in the visible cells

# Scikit-learn utilities: CV splitter, train/test split, pipeline, scorer factory
from sklearn.model_selection import KFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer

# xgbse: label conversion, evaluation metrics and the survival estimators
from xgbse.converters import convert_to_structured
from xgbse.metrics import concordance_index, approx_brier_score
from xgbse import XGBSEKaplanNeighbors, XGBSEKaplanTree, XGBSEBootstrapEstimator

# lohrasb supplies BaseModel, whose optimize_by_randomsearchcv is used in both examples;
# the bare package imports below are needed for the __version__ report.
import lohrasb
from lohrasb.best_estimator import BaseModel
import torch  # NOTE(review): duplicate of the torch import above; harmless but redundant
import xgbse

# Report the interpreter and library versions this notebook runs with
# (one line per component, emitted by a single print call).
print(
    f'Python version : {sys.version}',
    f'lohrasb version : {lohrasb.__version__}',
    f'sklearn version : {sklearn.__version__}',
    f'pandas version : {pd.__version__}',
    f'numpy version : {np.__version__}',
    f'xgbse version : {xgbse.__version__}',
    f'torch version : {torch.__version__}',
    sep='\n',
)


  from .autonotebook import tqdm as notebook_tqdm


[Errno 2] No such file or directory: '/Users/hjavedani/.pyenv/versions/3.7.8/lib/python3.7/site-packages/lohrasb/config.yaml'
In this module, the default logging will be applied. The error is [Errno 2] No such file or directory: '/Users/hjavedani/.pyenv/versions/3.7.8/lib/python3.7/site-packages/lohrasb/config.yaml' which will be skipped!
default logger setting is applied !


2023-09-03 18:09:54,866	INFO util.py:90 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2023-09-03 18:09:55,893	INFO util.py:90 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


Python version : 3.7.8 (default, Feb 27 2023, 18:11:31) 
[Clang 14.0.0 (clang-1400.0.29.202)]
lohrasb version : 4.1.0
sklearn version : 1.0.2
pandas version : 1.3.5
numpy version : 1.21.6
xgbse version : 0.2.3
torch version : 1.13.1


#### Example : XGBoost Survival Embeddings (XGBSEKaplanNeighbors)
  


For more information, refer to: https://loft-br.github.io/xgboost-survival-embeddings/examples/confidence_interval.html



In [3]:
from pycox.datasets import metabric
# Load the metabric dataset as a DataFrame. The recorded output shows it being
# downloaded when no local copy is available. The preview lists covariates x0..x8 plus
# the `duration` and `event` columns that the next cell turns into survival labels.
df = metabric.read_df()
df.head()  # preview the first rows

Dataset 'metabric' not locally available. Downloading...
Done


Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,duration,event
0,5.603834,7.811392,10.797988,5.967607,1.0,1.0,0.0,1.0,56.84,99.333336,0
1,5.284882,9.581043,10.20462,5.66497,1.0,0.0,0.0,1.0,85.940002,95.73333,1
2,5.920251,6.776564,12.431715,5.873857,0.0,1.0,0.0,1.0,48.439999,140.233337,0
3,6.654017,5.341846,8.646379,5.655888,0.0,0.0,0.0,0.0,66.910004,239.300003,0
4,5.456747,5.339741,10.555724,6.008429,1.0,0.0,0.0,1.0,67.849998,56.933334,1


#### Define labels and train-test split 


In [4]:
# Features: every column except the two survival-label columns
X = df.drop(columns=['duration', 'event'])
# Labels: duration and event combined by xgbse's convert_to_structured
y = convert_to_structured(df['duration'], df['event'])

# Hold out 20% of the rows as a test set, with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

#### Example 1: Basic usage


In [5]:
# No fit-time arguments in this example: fit_params is not referenced by kwargs below
fit_params = None

# Estimator to tune and the candidate values for its n_neighbors hyper-parameter
estimator = XGBSEKaplanNeighbors()
estimator_params = dict(n_neighbors=[50, 70])

# Settings handed to BaseModel().optimize_by_randomsearchcv in the next cell
kwargs = dict(
    # extra params for model if any
    main_random_kwargs={},
    # search settings (presumably forwarded to sklearn's RandomizedSearchCV - confirm)
    random_search_kwargs=dict(
        estimator=estimator,
        param_distributions=estimator_params,
        # concordance index is scored with greater_is_better=True
        scoring=make_scorer(concordance_index, greater_is_better=True),
        n_jobs=-1,
        cv=KFold(2),
        n_iter=2,
        random_state=42,
        refit=True,
    ),
)



#### Define BaseModel optimized by random search

In [6]:
# Build the random-search-backed object from the settings collected in `kwargs`
obj = BaseModel().optimize_by_randomsearchcv(kwargs=kwargs)


#### Define Pipeline and predict

In [7]:
# Single-step pipeline whose only step is `obj`
pipeline = Pipeline(steps=[('obj', obj)])

# Fit on the training split, then predict for the held-out split
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

#### Metrics

In [8]:


# Score the held-out predictions with both xgbse metrics, one per output line
print(
    f'C-index: {concordance_index(y_test, y_pred)}',
    f'Avg. Brier Score: {approx_brier_score(y_test, y_pred)}',
    sep='\n',
)


C-index: 0.6290448103547145
Avg. Brier Score: 0.1583786250333026


#### Example 2: Define estimator and set its arguments 


In [9]:
# Candidate n_estimators values, used as param_distributions in the search config below
estimator_params = dict(n_estimators=[100, 200])

# XGBoost parameters handed to the XGBSEKaplanTree base model
PARAMS_TREE = dict(
    objective='survival:cox',
    eval_metric='cox-nloglik',
    tree_method='hist',
    max_depth=100,
    booster='dart',
    subsample=1.0,
    min_child_weight=50,
    colsample_bynode=1.0,
)
base_model = XGBSEKaplanTree(PARAMS_TREE)


In [10]:
# Evenly spaced grid 15, 30, ..., 300 (20 values), later passed to fit as `time_bins`
TIME_BINS = np.arange(start=15, stop=315, step=15)

#### Define estimator and fit params


In [11]:
# Fit-time keyword arguments: the TIME_BINS grid under the `time_bins` key
fit_params = dict(time_bins=TIME_BINS)
# Bootstrap estimator built around the XGBSEKaplanTree base model
estimator = XGBSEBootstrapEstimator(base_model)


#### Define BaseModel estimator using random search CV

In [12]:
# Settings for BaseModel().optimize_by_randomsearchcv
kwargs = dict(
    main_random_kwargs={},
    # presumably forwarded to the estimator's fit call - confirm against lohrasb docs
    fit_random_kwargs=fit_params,
    # search settings (presumably forwarded to sklearn's RandomizedSearchCV - confirm)
    random_search_kwargs=dict(
        estimator=estimator,
        param_distributions=estimator_params,
        # Brier score is scored with greater_is_better=False
        scoring=make_scorer(approx_brier_score, greater_is_better=False),
        n_jobs=-1,
        cv=KFold(2),
        n_iter=2,
        random_state=42,
        refit=True,
    ),
)

# Build the random-search-backed object from the settings above
obj = BaseModel().optimize_by_randomsearchcv(kwargs=kwargs)




#### Build sklearn pipeline

In [13]:


# Single-step pipeline whose only step is `obj`
pipeline = Pipeline(steps=[('obj', obj)])


#### Run Pipeline

In [14]:
# Fit on the training split, then predict for the held-out split
y_pred = pipeline.fit(X_train, y_train).predict(X_test)


#### Check performance of the pipeline

In [15]:
# Score the held-out predictions with both xgbse metrics, one per output line
print(
    f'C-index: {concordance_index(y_test, y_pred)}',
    f'Avg. Brier Score: {approx_brier_score(y_test, y_pred)}',
    sep='\n',
)


C-index: 0.6395715573031867
Avg. Brier Score: 0.17269133802814857
