In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pathlib
import sklearn 
from sksurv.functions import StepFunction
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.metrics import cumulative_dynamic_auc 
from model_evaluation import evaluate_model
from sksurv.column import encode_categorical
from sksurv.ensemble import RandomSurvivalForest
from dotenv import load_dotenv
from pipeline import create_pipeline
from preprocessing import prepare_train_test
import os
from xgbse import XGBSEKaplanNeighbors, XGBSEDebiasedBCE,XGBSEStackedWeibull
from xgbse.converters import convert_to_structured
from xgboost_wrapper import XGBSurvival
# importing metrics
from xgbse.metrics import (
    concordance_index,
    approx_brier_score,
    dist_calibration_score
)
from optuna import create_study
from optuna.samplers import TPESampler
from optuna.integration import XGBoostPruningCallback
from sklearn.model_selection import RepeatedKFold

# repeated K-folds
N_SPLITS = 10
N_REPEATS = 1

# Optuna
N_TRIALS = 100
MULTIVARIATE = True

# XGBoost
EARLY_STOPPING_ROUNDS = 100

RS = 124  # random state
N_JOBS =-1  # number of parallel threads

In [2]:
load_dotenv()
root = os.environ.get("root_folder")

from preprocessing import load_data

pheno_df_train, pheno_df_test, readcounts_df_train, readcounts_df_test = load_data(root)


In [3]:
 
covariates = ['Sex', 'Age']
#covariates = ['Age', 'BodyMassIndex', 'Smoking', 'BPTreatment','SystolicBP', 'NonHDLcholesterol']

X_train, X_test, y_train, y_test, test_sample_ids = prepare_train_test(pheno_df_train, pheno_df_test, covariates)


In [4]:
def objective(
    trial,
    X,
    y,
    random_state=22,
    n_splits=3,
    n_repeats=2,
    n_jobs=1,
    early_stopping_rounds=50,
):
    # XGBoost parameters
    
    params = {
    "objective": "survival:aft",
    "eval_metric": "aft-nloglik",
    "aft_loss_distribution": "normal",
    "aft_loss_distribution_scale": 1.20,
    "tree_method": "hist",
    "learning_rate": trial.suggest_float("learning_rate", 5e-3, 5e-2, log = True),
    "max_depth": trial.suggest_int("max_depth", 2, 12),
    "booster": "dart", 
    "subsample": trial.suggest_float("subsample", 0.4, 0.8, log = True),
    "alpha": trial.suggest_float("alpha", 0.01, 10.0, log = True),
    "lambda": trial.suggest_float("lambda", 1e-8, 10.0, log = True),
    "gamma": trial.suggest_float("lambda", 1e-8, 10.0, log = True),
    }
    
  
    model = XGBSurvival(params, num_boost_round = 10000)
    
    rkf = RepeatedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=random_state
    )
    X_values = X.values
    y_values = y
    score = 0
    for train_index, test_index in rkf.split(X_values):
        X_A, X_B = X_values[train_index, :], X_values[test_index, :]
        y_A, y_B = y_values[train_index], y_values[test_index]
        model.fit(
            X_A,
            y_A,
            validation_data=(X_B, y_B),
            verbose_eval=0,
            early_stopping_rounds=early_stopping_rounds,
        )
        score += model.score(X_B, y_B)
    score /= n_repeats
    return score

In [5]:
sampler = TPESampler(seed=RS, multivariate=MULTIVARIATE)
study = create_study(direction="maximize", sampler=sampler)
study.optimize(
    lambda trial: objective(
        trial,
        X_train,
        y_train,
        random_state=RS,
        n_splits=N_SPLITS,
        n_repeats=N_REPEATS,
        n_jobs=1,
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    ),
    n_trials=N_TRIALS,
    n_jobs=1,
)

# display params
hp = study.best_params
for key, value in hp.items():
    print(f"{key:>20s} : {value}")
print(f"{'best objective value':>20s} : {study.best_value}")

[32m[I 2022-11-01 10:50:25,279][0m A new study created in memory with name: no-name-51179677-d53a-40e6-b736-156d5b99a719[0m
[32m[I 2022-11-01 10:58:16,805][0m Trial 0 finished with value: 6.25956059930631 and parameters: {'learning_rate': 0.006383147943705678, 'max_depth': 10, 'subsample': 0.5947624376832011, 'alpha': 0.23698646958939215, 'lambda': 2.8997003928473344e-05}. Best is trial 0 with value: 6.25956059930631.[0m
[32m[I 2022-11-01 11:01:50,117][0m Trial 1 finished with value: 6.211760128110495 and parameters: {'learning_rate': 0.009396352712286402, 'max_depth': 9, 'subsample': 0.5261503118106888, 'alpha': 0.0699388814119597, 'lambda': 0.10453876091537753}. Best is trial 0 with value: 6.25956059930631.[0m
[32m[I 2022-11-01 11:06:06,481][0m Trial 2 finished with value: 6.986572267741128 and parameters: {'learning_rate': 0.019863883897580023, 'max_depth': 3, 'subsample': 0.5240026227682785, 'alpha': 8.413274807593929, 'lambda': 0.00016158093114227923}. Best is trial 2 w

       learning_rate : 0.032833188230587194
           max_depth : 10
           subsample : 0.61828926669036
               alpha : 0.012673256334558281
              lambda : 6.468264510932119
best objective value : 7.077211389875966


Retrain the model with the optimal parameters

hp  = {'learning_rate': 0.032833188230587194,
 'max_depth': 10,
 'subsample': 0.61828926669036,
 'alpha': 0.012673256334558281,
 'lambda': 6.468264510932119}

In [6]:
hp

{'learning_rate': 0.032833188230587194,
 'max_depth': 10,
 'subsample': 0.61828926669036,
 'alpha': 0.012673256334558281,
 'lambda': 6.468264510932119}

In [5]:
hp = {'learning_rate': 0.032833188230587194,
 'max_depth': 10,
 'subsample': 0.61828926669036,
 'alpha': 0.012673256334558281,
 'lambda': 6.468264510932119}

In [6]:
params = {
    "objective": "survival:aft",
    "eval_metric": "aft-nloglik",
    "aft_loss_distribution": "normal",
    "aft_loss_distribution_scale": 1.20,
    "tree_method": "hist",
    "booster": "dart", 
    'learning_rate': 0.032833188230587194,
    'max_depth': 10,
    'subsample': 0.61828926669036,
    'alpha': 0.012673256334558281,
    'lambda': 6.468264510932119}
    
hp = params
hp["verbosity"] = 0
hp["n_estimators"] = 10000
hp["seed"] = RS

model = XGBSurvival(hp, num_boost_round= 10000)
rkf = RepeatedKFold(n_splits=N_SPLITS, n_repeats=N_REPEATS, random_state=RS)
X_values = X_train.values
y_values = y_train
score = 0
for train_index, test_index in rkf.split(X_values):
    X_A, X_B = X_values[train_index, :], X_values[test_index, :]
    y_A, y_B = y_values[train_index], y_values[test_index]
    model.fit(
        X_A,
        y_A,
        validation_data=(X_B, y_B),
        verbose_eval = 0,
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    )
    score += model.score(X_test.values(), y_test)
score /= N_REPEATS * N_SPLITS

ValueError: DataFrame.dtypes for data must be int, float, bool or category.  When
                categorical type is supplied, DMatrix parameter `enable_categorical` must
                be set to `True`.Sex, Age

In [7]:
model.fit(
    X_A,
    y_A,
    validation_data=(X_B, y_B),
    verbose_eval=0,
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
)


<xgboost_wrapper.XGBSurvival at 0x7f389924ce50>

In [9]:
X_test

Unnamed: 0,Sex,Age
Simulated_2211,0,73.86
Simulated_1629,0,48.07
Simulated_1690,1,49.06
Simulated_1367,1,58.499
Simulated_3387,1,47.242
...,...,...
Simulated_1628,1,56.091
Simulated_3424,0,46.889
Simulated_2215,1,70.841
Simulated_3426,1,25.92


In [11]:
model.score(X_test.values, y_test)


0.6606803824195129

In [25]:
import xgboost as xgb
xgb.DMatrix(X_train)

ValueError: DataFrame.dtypes for data must be int, float, bool or category.  When
                categorical type is supplied, DMatrix parameter `enable_categorical` must
                be set to `True`.Sex, Age