In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, precision_recall_curve, roc_curve, auc
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.metrics import matthews_corrcoef
from sklearn.preprocessing import LabelEncoder

from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler

import matplotlib.pyplot as plt 
import seaborn as sns

import time 
import sys 
import os
from tqdm import tqdm
import itertools
import json
import pickle

import xgboost as xgb
from xgboost import plot_tree
import lifelines

sys.path.append('./../src/')
from utils import *
from utils_xgboost import *

# Best parameter

In [2]:
params = {'verbosity': 0,
              'objective': 'survival:aft',
              'eval_metric': 'aft-nloglik',
              'tree_method': 'hist',
              'learning_rate': 0.01,
              'aft_loss_distribution': 'logistic',
              'aft_loss_distribution_scale': 1.2,
              'max_depth': 10,
              'lambda': 0.01,
              'alpha': 0.1}


num_boost_round = 500
test_size=0.3
seeds = [999, 7, 42, 1995, 1303, 2405, 1996, 200, 0, 777]

In [3]:
# dataset
data_df = pd.read_csv('./../Data/breast_cancer/1000_features_survival_3classes.csv',
                      index_col=0).drop(['index'],axis=1)

data_df_event_time = data_df[['event', 'time']]


data_df = pd.get_dummies(data_df.drop(['event', 'time'], axis=1),dtype=int)
scaler = MinMaxScaler()
data_df = pd.DataFrame(scaler.fit_transform(data_df), columns=data_df.columns)
data_df['event'] = [int(e) for e in data_df_event_time['event']]
data_df['time'] = data_df_event_time['time']

data_df = data_df.fillna(data_df.mean())

val_ls = []
test_ls = []
elapsed_time_ls = []

for seed in tqdm(seeds):
    data_train, data_tmp = train_test_split(data_df, test_size=test_size, random_state=seed)
    data_val, data_test = train_test_split(data_tmp, test_size=test_size, random_state=seed)
    
    X_train = data_train.drop(['event', 'time','y'], axis=1)
    y_lower_train = data_train['time']
    y_upper_train = np.array([t if e else np.inf for t,e in zip(data_train['time'], data_train['event'])])
    dtrain = xgb.DMatrix(X_train.values)
    dtrain.set_float_info('label_lower_bound', y_lower_train)
    dtrain.set_float_info('label_upper_bound', y_upper_train)
    
    X_val = data_val.drop(['event', 'time','y'], axis=1)
    y_lower_val = data_val['time']
    y_upper_val = np.array([t if e else np.inf for t,e in zip(data_val['time'], data_val['event'])])
    dvalid = xgb.DMatrix(X_val.values)
    dvalid.set_float_info('label_lower_bound', y_lower_val)
    dvalid.set_float_info('label_upper_bound', y_upper_val)
    
    X_test = data_test.drop(['event', 'time','y'], axis=1)
    # X_test = pd.get_dummies(X_test, dtype=int)
    y_lower_test = data_test['time']
    y_upper_test = np.array([t if e else np.inf for t,e in zip(data_test['time'], data_test['event'])])
    dtest = xgb.DMatrix(X_test.values)
    dtest.set_float_info('label_lower_bound', y_lower_test)
    dtest.set_float_info('label_upper_bound', y_upper_test)
    
    start = time.time()
    bst = xgb.train(params, dtrain, num_boost_round=num_boost_round,
                    evals=[(dtrain, 'train'), (dvalid, 'valid')],
                    early_stopping_rounds=50, verbose_eval=False)
    end = time.time()
    elapsed_time_ls = elapsed_time_ls + [end-start]
    
    # Run prediction on the validation set
    df = pd.DataFrame({'Label (lower bound)': y_lower_val,
                       'Label (upper bound)': y_upper_val,
                       'Predicted label': bst.predict(dvalid)})
    
    val_ls = val_ls + [lifelines.utils.concordance_index(event_times = data_val['time'], 
                                      predicted_scores = df['Predicted label'], 
                                      event_observed = data_val['event'])]
    
    # Run prediction on the validation set
    df = pd.DataFrame({'Label (lower bound)': y_lower_test,
                       'Label (upper bound)': y_upper_test,
                       'Predicted label': bst.predict(dtest)})
    
    
    test_ls = test_ls + [lifelines.utils.concordance_index(event_times = data_test['time'], 
                                      predicted_scores = df['Predicted label'], 
                                      event_observed = data_test['event'])]


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [01:02<00:00, 62.68s/it]


In [9]:
print("Valid:",np.mean(val_ls), "Test:",np.mean(test_ls), "Elapsed time:", np.mean(elapsed_time_ls))
val_ls, test_ls, elapsed_time_ls


Valid: 0.7217400400588532 Test: 0.6987346872424727 Elapsed time: 56.54037404060364


([0.7405298341173557,
  0.7283249617820485,
  0.6840081429540827,
  0.7060506050605061,
  0.6753731343283582,
  0.7619270346117867,
  0.7119680225723019,
  0.7765814266487214,
  0.747538831765478,
  0.6850984067478912],
 [0.7337526205450734,
  0.7516425755584757,
  0.6550724637681159,
  0.6197836166924265,
  0.6818181818181818,
  0.8142589118198874,
  0.6375,
  0.8690364826941066,
  0.6941176470588235,
  0.5303643724696356],
 [53.92562985420227,
  55.571752071380615,
  60.55286741256714,
  53.668174505233765,
  55.136093616485596,
  53.71381735801697,
  60.97635793685913,
  58.58614492416382,
  55.26191329956055,
  58.01098942756653])