In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
import optuna
from sklearn.model_selection import KFold, StratifiedKFold
import xgboost as xgb
from lifelines import CoxPHFitter
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")


np.random.seed(42)

In [2]:
%run -i ../examples/concordance_index.ipynb

In this notebook I tried AFT XGBoost I tried the  following:
1. Split the data into train and validation set
2. Fitted the Cox PH regressor and evaluated it on the validation set with score of 0.0.6539783375777234;
3. Used random forest classifier to predict efs, and added the predictions to the feature set;
4. Obtained hyper parameters from an Optuna study;
5. Performed unstratified 8-fold cross-validation and got the mean stratified C-score of 0.8860016472162744 (either data leakage or model is overfitting);
6. Evaluated the model on the validation set and got the score of 0.6468318190816368. 

I didn't use any sophisticated preprocessing, just the simple imputer.

In [3]:
# Importing data:

data = pd.read_csv("../data/train_set.csv")

df_train_copy = data.copy(deep=True)

# Categorical columns

categ_columns = df_train_copy.select_dtypes(include = ['object']).columns

to_replace = ["Not done", "Not tested", "Other", "Missing disease status", "Non-resident of the U.S."]
df_train_copy.loc[:,categ_columns] = df_train_copy[categ_columns].replace(to_replace, "missing")
df_train_copy.loc[:,categ_columns] = df_train_copy[categ_columns].fillna('missing')

# Numerical columns:

num_columns = df_train_copy.select_dtypes(include = ['float64']).columns
df_train_copy.loc[:, num_columns] = df_train_copy[num_columns].fillna(-1.0)
target_features = ['efs', 'efs_time']


In [4]:
# Preprocessing pipeline:

def create_preprocessor(categ_data, num_data):
    
    cat_imputer = SimpleImputer(strategy='most_frequent')
    num_imputer = SimpleImputer(strategy='mean')
    scaler = StandardScaler()

    num_pipeline = Pipeline([
        ('imputer', num_imputer),
        ('scaler', scaler)
    ])

    cat_pipeline = Pipeline([
        ('imputer', cat_imputer),
        ('encoder', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('cat_imputer', cat_pipeline, categ_data),
            ('num_imputer', num_pipeline, num_data)
        ],
        verbose_feature_names_out=False
    )

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor)
    ])
    
    return pipeline


In [5]:
# Splitting into train and validation sets 

df_train, df_val  = train_test_split(df_train_copy, test_size=0.2,random_state=42,stratify=df_train_copy['efs'])
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

In [6]:
# Encoding both sets for Cox PH regressor:

categ_data = df_train.select_dtypes(include='object').columns.tolist()
num_data = df_train.drop(["efs", "efs_time","ID"], axis=1).select_dtypes(include='number').columns.tolist()

processor = create_preprocessor(categ_data, num_data)
X_train = processor.fit_transform(df_train)
X_Val = processor.transform(df_val)
feature_names = processor.named_steps['preprocessor'].get_feature_names_out()

Cox Proportional Hazard as a baseline:

In [7]:
CPH = CoxPHFitter()
train_data = pd.DataFrame(X_train, columns = feature_names, index = df_train.index)
train_plus = pd.concat([train_data, df_train[['efs', 'efs_time']]], axis =1)
val_data = pd.DataFrame(X_Val, columns = feature_names, index = df_val.index)


In [8]:
CPH.fit(train_plus, duration_col='efs_time', event_col='efs')
predicted_hazards = CPH.predict_partial_hazard(val_data)

In [9]:
submissionCPH = pd.DataFrame({'ID': df_val["ID"], 'prediction': predicted_hazards}) 
score_cph = score(df_val.copy(deep=True), submissionCPH.copy(deep=True), "ID")
print(f" The stratified concordance score of Cox Proportional Hazard: {score_cph}.")

 The stratified concordance score of Cox Proportional Hazard: 0.6539783375777234.


### 1. Using the Random Forest Classifier to predict whether or not a row is censored

In [10]:
# Same preprocessed stuff

y_train_forest = df_train['efs']

y_val_forest = df_val['efs']
RFC = RandomForestClassifier()
RFC.fit(X_train,y_train_forest)

In [11]:
# Since by the below the training data has balanced labels, I can use accuracy as a metric

print(y_train_forest.shape)
print(y_train_forest.sum())

(18432,)
9942.0


In [12]:
y_pred = RFC.predict(X_Val)
accuracy = accuracy_score(y_val_forest,y_pred)
print(f"The accuracy of the Random Forest regressor on the validation set: {accuracy}")

The accuracy of the Random Forest regressor on the validation set: 0.6742621527777778


In [13]:
# Adding a new column

proba_column1 = RFC.predict_proba(X_train)[:, 1] 
df_train1 = df_train.copy(deep=True)
df_train1["clf_proba"] = proba_column1

proba_column2 = RFC.predict_proba(X_Val)[:, 1]
df_val1 = df_val.copy(deep=True)
df_val1["clf_proba"] = proba_column2


### 2. Optuna study to get hyper parameters

In [14]:
def objective(trial):
    # Sample hyperparameters from a 9 dimensional search space
    params = {
        'objective': 'survival:aft',
        'eval_metric': 'aft-nloglik',
        'aft_loss_distribution': trial.suggest_categorical('aft_loss_distribution', ['logistic', 'normal']),
        'aft_loss_distribution_scale': trial.suggest_loguniform('aft_loss_distribution_scale', 0.1, 10.0),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
        'lambda': trial.suggest_loguniform('lambda', 0.1, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 0.1, 10.0),
        'tree_method': 'hist', 
        'seed': 42
    }

    df_train_copy1 = df_train1.copy()  
    y_train1 = df_train[['efs','efs_time']]
    categ_data = df_train_copy1.select_dtypes(include='object').columns.tolist()
    num_data = df_train_copy1.drop(["efs", "efs_time", "ID"], axis=1).select_dtypes(include='number').columns.tolist()

    skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

    results = []

    for train_index, val_index in skf.split(df_train_copy1, df_train_copy1['efs']): 
        X_train, X_val = df_train_copy1.iloc[train_index], df_train_copy1.iloc[val_index]
        y_tr, y_val = y_train1.iloc[train_index], y_train1.iloc[val_index]
        
        y_lower_bound_tr = y_tr['efs_time']        
        y_lower_bound_val = y_val['efs_time'] 
        y_upper_bound_tr = np.where(y_tr['efs']==0, np.inf,y_tr['efs_time'] )
        y_upper_bound_val = np.where(y_val['efs']==0, np.inf,y_val['efs_time'] )

        preprocessor = create_preprocessor(categ_data, num_data)

        X_train_transformed = preprocessor.fit_transform(X_train)
        X_val_transformed = preprocessor.transform(X_val)

        
        dtrain_cv = xgb.DMatrix(X_train_transformed)
        dval_cv = xgb.DMatrix(X_val_transformed)
        dtrain_cv.set_float_info('label_lower_bound', y_lower_bound_tr)
        dtrain_cv.set_float_info('label_upper_bound', y_upper_bound_tr)

        dval_cv.set_float_info('label_lower_bound', y_lower_bound_val)
        dval_cv.set_float_info('label_upper_bound', y_upper_bound_val)


        booster = xgb.train(
            params=params,
            dtrain=dtrain_cv,
            num_boost_round=200,
            evals=[(dval_cv, "validation")],
            verbose_eval=False
        )


    
        preds = -booster.predict(dval_cv)
        submission = pd.DataFrame({'ID': X_val["ID"], 'prediction': preds}) 
        score_aft= score(X_val.copy(deep=True), submission.copy(deep=True), "ID")
        results.append(score_aft)


    return np.mean(results)

pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(direction='maximize',pruner = pruner, sampler=optuna.samplers.TPESampler(seed=42)) 
study.optimize(objective, n_trials=100)  

best_params = study.best_params
print(f"Best parameters: {best_params}")


[I 2025-04-18 00:40:46,559] A new study created in memory with name: no-name-a7d39cca-795d-4d05-9689-63e7e772a383
[I 2025-04-18 00:40:48,330] Trial 0 finished with value: 0.8658437595683127 and parameters: {'aft_loss_distribution': 'normal', 'aft_loss_distribution_scale': 2.9106359131330706, 'learning_rate': 0.030049873591901578, 'max_depth': 4, 'min_child_weight': 2, 'subsample': 0.6232334448672797, 'colsample_bytree': 0.9464704583099741, 'lambda': 1.5930522616241016, 'alpha': 2.6070247583707675}. Best is trial 0 with value: 0.8658437595683127.
[I 2025-04-18 00:40:50,047] Trial 1 finished with value: 0.8285386216616077 and parameters: {'aft_loss_distribution': 'normal', 'aft_loss_distribution_scale': 4.622589001020831, 'learning_rate': 0.009445600138094694, 'max_depth': 4, 'min_child_weight': 2, 'subsample': 0.721696897183815, 'colsample_bytree': 0.8099025726528951, 'lambda': 0.7309539835912913, 'alpha': 0.38234752246751863}. Best is trial 0 with value: 0.8658437595683127.
[I 2025-04-

Best parameters: {'aft_loss_distribution': 'normal', 'aft_loss_distribution_scale': 0.3563253359779637, 'learning_rate': 0.09063846463307294, 'max_depth': 7, 'min_child_weight': 5, 'subsample': 0.8356992326488509, 'colsample_bytree': 0.9690171324318362, 'lambda': 2.9287633042919166, 'alpha': 7.542816373061915}


In [17]:
# Final parameters:

final_params = {
    'objective': 'survival:aft',
    'eval_metric': 'aft-nloglik',
    'tree_method': 'hist',
    'seed': 42, 
    **best_params
}

### 3. 8-fold cross validation using parameters from Optuna study:

In [18]:
n_splits = 8
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
target_features = ['efs', 'efs_time']
results = []
for i, (train_idx,test_idx) in enumerate(kfold.split(df_train1)):
   
    X_train = df_train1.iloc[train_idx].drop(columns = target_features)
    y_train = df_train1.loc[train_idx,target_features]
    X_val = df_train1.iloc[test_idx].drop(columns = target_features)
    y_test = df_train1.loc[test_idx, target_features]

    y_lower_bound_tr = y_train['efs_time']        
    y_lower_bound_val = y_test['efs_time'] 
    y_upper_bound_tr = np.where(y_train['efs']==0, np.inf,y_train['efs_time'] )
    y_upper_bound_val = np.where(y_test['efs']==0, np.inf,y_test['efs_time'] )

    categ_data = df_train1.select_dtypes(include='object').columns.tolist()
    num_data = df_train1.drop(["efs", "efs_time", "ID"], axis=1).select_dtypes(include='number').columns.tolist()

    processor = create_preprocessor(categ_data, num_data)
    X_train_processed = processor.fit_transform(X_train)
    X_val_processed =processor.transform(X_val)

    dtrain = xgb.DMatrix(X_train_processed)
    dtrain.set_float_info('label', y_train['efs_time']) 
    dtrain.set_float_info('label_lower_bound', y_lower_bound_tr)
    dtrain.set_float_info('label_upper_bound', y_upper_bound_tr)

    booster1 = xgb.train(final_params, dtrain, num_boost_round=100,
                evals=[(dtrain, 'train')],verbose_eval=False)
    
    X_Val1  = xgb.DMatrix(X_val_processed)
    X_Val1.set_float_info('label', y_test['efs_time'])
    X_Val1.set_float_info('label_lower_bound', y_lower_bound_val)
    X_Val1.set_float_info('label_upper_bound', y_upper_bound_val)



    aft_preds = -booster1.predict(X_Val1)
    submission = pd.DataFrame({'ID': df_train.iloc[test_idx]["ID"], 'prediction': aft_preds}) 
    score_aft = score(df_train.iloc[test_idx].copy(deep=True), submission.copy(deep=True), "ID")
    results.append(score_aft)

    
    print(f"stratified c-index for fold {i}: \n \
            SC-index: score_aft xgb_aft: {score_aft}")
print(f"The mean performance is {np.mean(results)} ")

stratified c-index for fold 0: 
             SC-index: score_aft xgb_aft: 0.8856383400667769
stratified c-index for fold 1: 
             SC-index: score_aft xgb_aft: 0.8835790703629994
stratified c-index for fold 2: 
             SC-index: score_aft xgb_aft: 0.8883686643334778
stratified c-index for fold 3: 
             SC-index: score_aft xgb_aft: 0.8786552380994533
stratified c-index for fold 4: 
             SC-index: score_aft xgb_aft: 0.8922312671656006
stratified c-index for fold 5: 
             SC-index: score_aft xgb_aft: 0.8892520185221634
stratified c-index for fold 6: 
             SC-index: score_aft xgb_aft: 0.8860390558231056
stratified c-index for fold 7: 
             SC-index: score_aft xgb_aft: 0.8842495233566185
The mean performance is 0.8860016472162744 


### 4. Evaluation on the df_val set:

In [19]:
# Preprocessing data_train, since we added a new column:

categ_data = df_train.select_dtypes(include='object').columns.tolist()
num_data = df_train.drop(["efs", "efs_time","ID"], axis=1).select_dtypes(include='number').columns.tolist()

processor = create_preprocessor(categ_data, num_data)
X_train1 = processor.fit_transform(df_train1)
X_Val1 = processor.transform(df_val1)


# Training on best parameters from Optuna study:

y_lower_bound = df_train1['efs_time']
y_upper_bound = np.where(df_train1['efs']==0, np.inf,df_train1['efs_time'] )
dtrain = xgb.DMatrix(X_train1)
dtrain.set_float_info('label', df_train1['efs_time']) 

dtrain.set_float_info('label_lower_bound', y_lower_bound)
dtrain.set_float_info('label_upper_bound', y_upper_bound)
booster1 = xgb.train(final_params, dtrain, num_boost_round=100,
                evals=[(dtrain, 'train')],
                verbose_eval=False)

# Evaluating:

X_Val11  = xgb.DMatrix(X_Val1)
aft_preds1 = -booster1.predict(X_Val11)
submission1 = pd.DataFrame({'ID': df_val["ID"], 'prediction': aft_preds1}) 
score_aft1= score(df_val.copy(deep=True), submission1.copy(deep=True), "ID")
print(f" The stratified concordance score of AFT XGBoost on the validation set: {score_aft1}.")

 The stratified concordance score of AFT XGBoost on the validation set: 0.6468318190816368.
