In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pathlib
import sklearn 
from sksurv.functions import StepFunction
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.metrics import cumulative_dynamic_auc 
from model_evaluation import evaluate_model
from sksurv.column import encode_categorical
from sksurv.ensemble import RandomSurvivalForest
from dotenv import load_dotenv
from pipeline import create_pipeline
from preprocessing import prepare_train_test
import os
from xgbse import XGBSEKaplanNeighbors, XGBSEDebiasedBCE,XGBSEStackedWeibull
from xgbse.converters import convert_to_structured
# importing metrics
from xgbse.metrics import (
    concordance_index,
    approx_brier_score,
    dist_calibration_score
)

In [2]:
from preprocessing import load_data

# Read variables from a local .env file into the process environment, then
# look up the data root under the "root_folder" key (None when it is unset).
load_dotenv()
root = os.getenv("root_folder")

# load_data(root) returns four objects; going by the names they are unpacked
# into, these are the phenotype and read-count tables for the train and test
# splits.
(
    pheno_df_train,
    pheno_df_test,
    readcounts_df_train,
    readcounts_df_test,
) = load_data(root)


In [7]:
 
# Covariates used by the models below: a minimal two-feature set.
# A wider clinical set is kept on the next line as a disabled alternative.
covariates = ['Sex', 'Age']
# covariates = ['Age', 'BodyMassIndex', 'Smoking', 'BPTreatment', 'SystolicBP', 'NonHDLcholesterol']

# prepare_train_test receives both phenotype tables plus the covariate list
# and returns five objects, unpacked here into the design matrices, the
# targets and the test sample ids.
(
    X_train,
    X_test,
    y_train,
    y_test,
    test_sample_ids,
) = prepare_train_test(pheno_df_train, pheno_df_test, covariates)


In [4]:
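# Alternative (disabled): select all covariates, i.e. every column of the
# joined phenotype + read-count table except 'Event' and 'Event_time'.
# NOTE(review): as written, prepare_train_test would still receive pheno_df_*
# rather than the joined df_* frames -- revisit before uncommenting.
# df_train = pheno_df_train.join(readcounts_df_train)
# df_test = pheno_df_test.join(readcounts_df_test)
# selection = (df_train.columns != 'Event') & (df_train.columns != 'Event_time')
# covariates = df_train.columns[selection]
# X_train, X_test, y_train, y_test, test_sample_ids = prepare_train_test(pheno_df_train, pheno_df_test, covariates)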


In [8]:
from model_evaluation import evaluate_xgb_model
from xgboost_wrapper import XGBS

# XGBoost settings for the 'survival:aft' objective.
# NOTE(review): this dict is not passed to XGBS() below, so within this cell
# it has no effect on the fitted model -- confirm whether the wrapper is meant
# to receive it.
xgb_params = dict(
    aft_loss_distribution='normal',
    aft_loss_distribution_scale=1,
    booster='dart',
    colsample_bynode=0.5,
    eval_metric='aft-nloglik',
    learning_rate=0.05,
    max_depth=8,
    min_child_weight=50,
    objective='survival:aft',
    subsample=0.5,
    tree_method='hist',
)

# Wrap the estimator in the project pipeline and fit it on the training split.
# No validation data or early-stopping arguments are passed to fit(); the
# options previously sketched next to this call were:
#   num_boost_round=1000, validation_data=None, early_stopping_rounds=None, verbose_eval=0
xgb_model = XGBS()
model = create_pipeline(xgb_model)
model.fit(X_train, y_train)

# Predictions for the test samples.
preds = model.predict(X_test)

# Last expression of the cell, so its return value is what the notebook
# displays (a train/test table of concordance scores in the saved output).
evaluate_xgb_model(model, X_train, X_test, y_train, y_test)


Unnamed: 0,Harrell C,Concordance index IPCW
train,0.735132,0.691686
test,0.722202,0.661502


In [9]:
# XGBoost settings for the 'survival:aft' objective; here they are handed to
# the XGBSE stacked-Weibull estimator.
xgb_params = {
    'aft_loss_distribution': 'normal',
    'aft_loss_distribution_scale': 1,
    'booster': 'dart',
    'colsample_bynode': 0.5,
    'eval_metric': 'aft-nloglik',
    'learning_rate': 0.05,
    'max_depth': 8,
    'min_child_weight': 50,
    'objective': 'survival:aft',
    'subsample': 0.5,
    'tree_method': 'hist',
}

# Build the estimator, wrap it in the project pipeline and fit on the training
# split. No validation data or early-stopping arguments are passed to fit().
xgb_model = XGBSEStackedWeibull(xgb_params)
model = create_pipeline(xgb_model)
model.fit(X_train, y_train)

# Predictions for the test samples; they are fed to the xgbse metrics below.
preds = model.predict(X_test)

# Report each metric right after computing it.
c_index = concordance_index(y_test, preds)
print(f'C-index: {c_index}')

avg_brier = approx_brier_score(y_test, preds)
print(f'Avg. Brier Score: {avg_brier}')

# Printed as a boolean: whether the calibration score exceeds 0.05.
# NOTE(review): presumably the score is a p-value under the xgbse defaults,
# so True would mean "no evidence of miscalibration" -- confirm.
d_calibration_ok = dist_calibration_score(y_test, preds) > 0.05
print(f'D-Calibration: {d_calibration_ok}')

C-index: 0.7122934941271162
Avg. Brier Score: 0.014045313691248828
D-Calibration: True


In [14]:
from model_evaluation import evaluate_xgbse_model

# Score the fitted XGBSE pipeline on the train and test splits. The call must
# use exactly the name imported above: the plural spelling
# `evaluate_xgbse_models` is not imported in the visible cells and would raise
# NameError on a fresh kernel.
# Last expression of the cell, so its return value is what the notebook
# displays (a train/test table of concordance scores in the saved output).
evaluate_xgbse_model(model, X_train, X_test, y_train, y_test)

Unnamed: 0,Harrell C,Concordance index IPCW
train,0.714462,0.652576
test,0.71983,0.660205
