In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import xgboost as xgb
from lifelines.utils import concordance_index
import optuna

import warnings
warnings.filterwarnings("ignore")

In [13]:
data = pd.read_csv('train.csv')
X = data.drop(['efs_time', 'efs', 'ID'], axis=1) 
y = data[['efs_time', 'efs']]

# test data
data_test = pd.read_csv('test.csv')
X_test = data_test.drop(columns=['ID'], axis=1)

def preprocess_data(df: pd.DataFrame):
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns

    for col in categorical_cols:
        value_counts = df[col].value_counts()
        category_map = {cat: idx for idx, cat in enumerate(value_counts.index)}
        df[col] = df[col].map(category_map)
        df[col] = df[col].astype('category')
    
    df['year_hct'] = df['year_hct'] - 2000
    return df

X = preprocess_data(X)
X_test = preprocess_data(X_test)

In [31]:
def objective(trial):
    param = {
        'objective': 'survival:aft',
        # 'eval_metric': 'aft-nloglik',
        'tree_method': 'hist',
        'lambda': trial.suggest_float('lambda', 1e-6, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-6, 10.0, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-2, 0.1),
        'subsample': trial.suggest_uniform('subsample', 0.8, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.8, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'random_state': 42,
        'verbosity': 0
    }
    
    X_val_5folds = pd.DataFrame([], columns=X.columns)
    y_val_5folds = pd.DataFrame([], columns=y.columns)   
    y_pred_5folds = []

    skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
    for train_index, val_index in skf.split(X, X['race_group']):
        X_tr, X_val = X.iloc[train_index], X.iloc[val_index]
        y_tr, y_val = y.iloc[train_index], y.iloc[val_index]
        
        # Create lower and upper bounds for AFT model
        lower_bounds = y_tr['efs_time'].values  # Lower bound is observed time for all
        
        # For upper bound: same as observed time for events, infinity for censored
        upper_bounds = y_tr['efs_time'].values.copy()
        upper_bounds[y_tr['efs'] == 0] = float('inf')  # Set censored observations to infinity
        
        # Create validation bounds too
        val_lower_bounds = y_val['efs_time'].values
        val_upper_bounds = y_val['efs_time'].values.copy()
        val_upper_bounds[y_val['efs'] == 0] = float('inf')
        
        # Create DMatrix with proper label bounds
        dtrain = xgb.DMatrix(
            X_tr, 
            label=y_tr['efs_time'],
            enable_categorical=True,
            feature_weights=None,
            label_lower_bound=lower_bounds,
            label_upper_bound=upper_bounds
        )
        
        dval = xgb.DMatrix(
            X_val, 
            label=y_val['efs_time'], 
            enable_categorical=True,
            feature_weights=None,
            label_lower_bound=val_lower_bounds,
            label_upper_bound=val_upper_bounds
        )
        
        model = xgb.train(
            param, 
            dtrain, 
            evals=[(dval, 'eval')], 
            num_boost_round=1000,
            early_stopping_rounds=100, 
            verbose_eval=False
        )

        X_val_5folds = pd.concat([X_val_5folds, X_val], axis=0, ignore_index=True)
        y_val_5folds = pd.concat([y_val_5folds, y_val], axis=0, ignore_index=True)
        preds = model.predict(dval)
        y_pred_5folds += list(preds)

    race_groups = X_val_5folds['race_group'].unique()
    c_index_scores_by_race = []
    for race in race_groups:
        race_mask = X_val_5folds['race_group'] == race
        c_index_race = concordance_index(
            y_val_5folds['efs_time'][race_mask], 
            np.array(y_pred_5folds)[race_mask], 
            y_val_5folds['efs'][race_mask]
        )
        c_index_scores_by_race.append(c_index_race)

    return np.mean(c_index_scores_by_race) - np.sqrt(np.var(c_index_scores_by_race))

In [32]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=3, n_jobs=1,
               show_progress_bar=True)

print('Best hyperparameters:', study.best_params)
print('Best C-index:', study.best_value)

[I 2025-02-27 14:35:53,674] A new study created in memory with name: no-name-79ff1882-e228-4012-a16a-012261e80618


  0%|          | 0/3 [00:00<?, ?it/s]

[I 2025-02-27 14:35:56,696] Trial 0 finished with value: 0.6649324128038797 and parameters: {'lambda': 0.0008583933121248923, 'alpha': 2.6655344100412997, 'learning_rate': 0.098400465628949, 'subsample': 0.9184946812328655, 'colsample_bytree': 0.9878581857773272, 'max_depth': 4, 'min_child_weight': 8}. Best is trial 0 with value: 0.6649324128038797.
[I 2025-02-27 14:36:05,186] Trial 1 finished with value: 0.6647546032694026 and parameters: {'lambda': 1.5702965989264461, 'alpha': 0.007953893879969672, 'learning_rate': 0.02557106160030199, 'subsample': 0.9348231222585843, 'colsample_bytree': 0.8999920032919763, 'max_depth': 6, 'min_child_weight': 3}. Best is trial 0 with value: 0.6649324128038797.
[I 2025-02-27 14:36:08,575] Trial 2 finished with value: 0.6627480190355703 and parameters: {'lambda': 0.21078566223939313, 'alpha': 0.027667958495350357, 'learning_rate': 0.06795783245395481, 'subsample': 0.9156206571980972, 'colsample_bytree': 0.8886727617944985, 'max_depth': 6, 'min_child_we