In [12]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
import lightgbm as lgb
import xgboost as xgb
import optuna
from sklearn.preprocessing import LabelEncoder 
from lifelines.utils import concordance_index
import bisect

In [29]:
data = pd.read_csv('train.csv')
X = data.drop(['efs_time', 'efs', 'ID'], axis=1) 
y_time = data['efs_time']
y_event = data['efs']

# test data
data_test = pd.read_csv('test.csv')
X_test = data_test.drop(columns=['ID'], axis=1)

def preprocess_data(df: pd.DataFrame):
    numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns

    for col in categorical_cols:
        value_counts = df[col].value_counts()
        category_map = {cat: idx for idx, cat in enumerate(value_counts.index)}
        df[col] = df[col].map(category_map)
        df[col] = df[col].astype('category')
    
    df['year_hct'] = df['year_hct'] - 2000
    return df

X = preprocess_data(X)
X_test = preprocess_data(X_test)

In [5]:
import numpy as np
import bisect

class CindexObjectiveSurrogate:
    def __init__(self, time, event):
        """
        Build a list of valid pairs (i, j) such that sample i had an event (event==1)
        and time[i] < time[j]. The surrogate loss is computed over these pairs.
        """
        self.time = time
        self.event = event
        self.valid_pairs = []
        n = len(time)
        for i in range(n):
            if event[i] == 1:  # i must have the event observed
                for j in range(n):
                    if time[i] < time[j]:
                        self.valid_pairs.append((i, j))
                        
    def __call__(self, y_pred, dataset=None):
        """
        For each valid pair (i, j), we use the surrogate loss:
        
            L_{ij} = log(1 + exp(-(y_pred[i] - y_pred[j])))
        
        The gradients and Hessians are computed from this loss.
        """
        n = len(y_pred)
        gradients = np.zeros(n)
        hessians = np.zeros(n)
        
        # For numerical stability, we can later consider vectorized implementation
        for (i, j) in self.valid_pairs:
            delta = y_pred[i] - y_pred[j]
            # Compute sigmoid over (r_j - r_i) using the logistic function.
            # Note: sigmoid = 1/(1+exp(delta)) is equivalent to σ(r_j - r_i)
            sigmoid_ij = 1.0 / (1.0 + np.exp(delta))
            
            # The derivative of L_ij with respect to y_pred[i] is -sigmoid_ij,
            # and for y_pred[j] it is +sigmoid_ij.
            gradients[i] -= sigmoid_ij
            gradients[j] += sigmoid_ij
            
            # The second derivative for this pair is sigmoid*(1-sigmoid)
            second_deriv = sigmoid_ij * (1 - sigmoid_ij)
            hessians[i] += second_deriv
            hessians[j] += second_deriv
            
        return gradients, hessians

# Example usage in the objective function for LightGBM:
def objective_surrogate(trial):
    params = {
        'metric': 'custom',
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 25, 127),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-06, 10., log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-06, 10., log=True),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.7, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.7, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'verbose': -1,
        'num_threads': 4,
        'seed': 42
    }

    # ... Stratified K-fold and data splitting as in your current code ...
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    race_groups = X['race_group'].unique()
    fold_scores = []
    fold = -1

    for train_idx, val_idx in skf.split(X, X['race_group']):
        fold += 1
        print(f'Running in fold {fold}...')
        
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_time_train, y_time_val = y_time.iloc[train_idx], y_time.iloc[val_idx]
        y_event_train, y_event_val = y_event.iloc[train_idx], y_event.iloc[val_idx]

        train_data = lgb.Dataset(X_train, label=y_time_train)
        val_data = lgb.Dataset(X_val, label=y_time_val)

        def cindex_eval(y_pred, data_val):
            # Calculate stratified C-index for validation data
            race_specific_scores = []
            for race in race_groups:
                race_mask = X_val['race_group'] == race
                if sum(race_mask) > 1:  # Only calculate if we have at least 2 samples
                    surv = np.array([(e, t) for e, t in zip(y_event_val[race_mask], y_time_val[race_mask])], 
                                    dtype=[('event', bool), ('time', float)])
                    race_cindex = concordance_index_censored(surv['event'], surv['time'], y_pred[race_mask])[0]
                    race_specific_scores.append(race_cindex)
            stratified_cindex = np.mean(race_specific_scores) - np.std(race_specific_scores)
            return 'stratified-c-index', stratified_cindex, True
        
        # Use our surrogate objective function here.
        params['objective'] = CindexObjectiveSurrogate(y_time_train.values, y_event_train.values)
        
        model = lgb.train(
            params,
            train_data,
            valid_sets=[val_data],
            num_boost_round=1000,
            feval=cindex_eval,
            callbacks=[lgb.early_stopping(stopping_rounds=100)]
        )
        
        y_pred = model.predict(X_val)
        stratified_cindex = cindex_eval(y_pred, X_val)[1]
        fold_scores.append(stratified_cindex)
    
    print('5 folds stratified C-index:', fold_scores)
    return np.mean(fold_scores)

In [None]:
# Now create and run the study
study = optuna.create_study(direction='maximize')
study.optimize(objective_surrogate, n_trials=3, n_jobs=1)
best_params = study.best_params

## Custom Objective Function with LightGBM:

In [31]:
class CoxObjective:
    def __init__(self, time, event):
        self.time = time
        self.event = event
        self.sorted_indices = np.argsort(self.time)
        self.sorted_time = self.time[self.sorted_indices]
        self.sorted_event = self.event[self.sorted_indices]
        self.event_positions_sorted = [i for i in range(len(self.sorted_event)) if self.sorted_event[i] == 1]

    def __call__(self, y_pred, dataset=None):  # Added dataset parameter with default None
        sorted_exp_f = np.exp(y_pred[self.sorted_indices])
        sum_risk = np.cumsum(sorted_exp_f[::-1])[::-1]
        
        sum_1_over_sum_risk = []
        sum_1_over_sum_risk_squared = []
        for event_pos in self.event_positions_sorted:
            sum_r = sum_risk[event_pos]
            sum_1_over_sum_risk.append(1 / sum_r)
            sum_1_over_sum_risk_squared.append(1 / sum_r**2)
        
        cum_sum_1_over_sum_risk = np.cumsum(sum_1_over_sum_risk)
        cum_sum_1_over_sum_risk_squared = np.cumsum(sum_1_over_sum_risk_squared)
        
        gradients = np.zeros(len(y_pred))
        hessians = np.zeros(len(y_pred))
        sorted_position = {self.sorted_indices[k]: k for k in range(len(self.sorted_indices))}
        
        for k in range(len(y_pred)):
            sorted_pos_k = sorted_position[k]
            index = bisect.bisect_right(self.event_positions_sorted, sorted_pos_k)
            sum_1_over_sum_risk_k = cum_sum_1_over_sum_risk[index-1] if index > 0 else 0
            sum_1_over_sum_risk_squared_k = cum_sum_1_over_sum_risk_squared[index-1] if index > 0 else 0
            
            exp_f_k = np.exp(y_pred[k])
            sum_term_grad = exp_f_k * sum_1_over_sum_risk_k
            sum_term_hess = exp_f_k * sum_1_over_sum_risk_k - exp_f_k**2 * sum_1_over_sum_risk_squared_k
            
            gradient_k = - self.event[k] + sum_term_grad
            hessian_k = sum_term_hess
            
            gradients[k] = gradient_k
            hessians[k] = hessian_k
            
        return gradients, hessians

def objective(trial):
    
    params = {
        'metric': 'custom',
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 25, 127),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'max_bin': trial.suggest_int('max_bin', 3, 255),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-06, 10., log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-06, 10., log=True),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.7, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.7, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'verbose': -1,
        'num_threads': 4,
        'seed': 42
    }
    
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    race_groups = X['race_group'].unique()
    fold_scores = []
    fold = -1

    for train_idx, val_idx in skf.split(X, X['race_group']):
        fold += 1
        print(f'Running in fold {fold}...')
        
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_time_train, y_time_val = y_time.iloc[train_idx], y_time.iloc[val_idx]
        y_event_train, y_event_val = y_event.iloc[train_idx], y_event.iloc[val_idx]

        train_data = lgb.Dataset(X_train, label= y_time_train)
        val_data = lgb.Dataset(X_val, label= y_time_val)

        def cindex_eval(y_pred, data_val):
            # Calculate stratified C-index for validation data
            race_specific_scores = []
            for race in race_groups:
                race_mask = X_val['race_group'] == race
                if sum(race_mask) > 1:  # Only calculate if we have at least 2 samples
                    surv = np.array([(e, t) for e, t in zip(y_event_val[race_mask], y_time_val[race_mask])], 
                                dtype=[('event', bool), ('time', float)])
                    race_cindex = concordance_index(surv['time'], -y_pred[race_mask], surv['event'])
                    race_specific_scores.append(race_cindex)
            
            stratified_cindex = np.mean(race_specific_scores) - np.std(race_specific_scores)
            return 'stratified-c-index', stratified_cindex, True
        
        params['objective'] = CoxObjective(y_time_train.values, y_event_train.values)
        model = lgb.train(
            params,
            train_data,
            valid_sets=[val_data],
            num_boost_round=1000,
            feval=cindex_eval,
            callbacks=[lgb.early_stopping(stopping_rounds=100)]
        )
        y_pred = model.predict(X_val)
        
        # Calculate stratified C-index for fold evaluation
        stratified_cindex = cindex_eval(y_pred, X_val)[1]
        fold_scores.append(stratified_cindex)
    
    print('5 folds stratified C-index:', fold_scores)
    return np.mean(fold_scores)

In [32]:
# Now create and run the study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=3, n_jobs=1)
best_params = study.best_params

[I 2025-02-26 11:00:34,354] A new study created in memory with name: no-name-4c815de0-c9c8-4a37-bee7-060c9a9ddaa2


Running in fold 0...
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's stratified-c-index: 0.671053
Running in fold 1...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[776]	valid_0's stratified-c-index: 0.664565
Running in fold 2...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[614]	valid_0's stratified-c-index: 0.672109
Running in fold 3...
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[984]	valid_0's stratified-c-index: 0.669044
Running in fold 4...
Training until validation scores don't improve for 100 rounds


[I 2025-02-26 11:06:39,627] Trial 0 finished with value: 0.6704280886415624 and parameters: {'num_leaves': 26, 'max_depth': 7, 'max_bin': 178, 'learning_rate': 0.024701283706556516, 'reg_alpha': 2.2640045684783586e-05, 'reg_lambda': 2.866864083510606e-06, 'min_data_in_leaf': 55, 'feature_fraction': 0.9463253128280514, 'bagging_fraction': 0.8981480225202592, 'bagging_freq': 8}. Best is trial 0 with value: 0.6704280886415624.


Early stopping, best iteration is:
[832]	valid_0's stratified-c-index: 0.675369
5 folds stratified C-index: [np.float64(0.6710527002178232), np.float64(0.664565452217577), np.float64(0.6721085150747396), np.float64(0.6690443135295748), np.float64(0.675369462168098)]
Running in fold 0...
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[982]	valid_0's stratified-c-index: 0.665779
Running in fold 1...
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[999]	valid_0's stratified-c-index: 0.659575
Running in fold 2...
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[989]	valid_0's stratified-c-index: 0.673413
Running in fold 3...
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[999]	valid_0's stratified-c-index: 0.662404
Running in fold 4...
Training

[I 2025-02-26 11:13:09,562] Trial 1 finished with value: 0.6669354039554529 and parameters: {'num_leaves': 32, 'max_depth': 9, 'max_bin': 245, 'learning_rate': 0.01105369021228455, 'reg_alpha': 0.011585450817545157, 'reg_lambda': 2.104775345738505, 'min_data_in_leaf': 23, 'feature_fraction': 0.8553374410563878, 'bagging_fraction': 0.9778146175573622, 'bagging_freq': 6}. Best is trial 0 with value: 0.6704280886415624.


Early stopping, best iteration is:
[870]	valid_0's stratified-c-index: 0.673506
5 folds stratified C-index: [np.float64(0.665779074388085), np.float64(0.6595745113166227), np.float64(0.6734133029584342), np.float64(0.6624043771488111), np.float64(0.6735057539653119)]
Running in fold 0...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[306]	valid_0's stratified-c-index: 0.669684
Running in fold 1...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[304]	valid_0's stratified-c-index: 0.66374
Running in fold 2...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[122]	valid_0's stratified-c-index: 0.671538
Running in fold 3...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[220]	valid_0's stratified-c-index: 0.661494
Running in fold 4...
Training until validation scores don't improve for 100 round

[I 2025-02-26 11:15:17,929] Trial 2 finished with value: 0.6670483068478383 and parameters: {'num_leaves': 96, 'max_depth': 10, 'max_bin': 20, 'learning_rate': 0.04978326254933003, 'reg_alpha': 4.95885108498113e-05, 'reg_lambda': 0.10620592668788614, 'min_data_in_leaf': 86, 'feature_fraction': 0.9885774934760958, 'bagging_fraction': 0.7412056800719447, 'bagging_freq': 3}. Best is trial 0 with value: 0.6704280886415624.


Early stopping, best iteration is:
[186]	valid_0's stratified-c-index: 0.668786
5 folds stratified C-index: [np.float64(0.6696837215682377), np.float64(0.6637399361917257), np.float64(0.6715378111677521), np.float64(0.661494319595486), np.float64(0.66878574571599)]


In [34]:
study.best_value

0.6704280886415624

## XGBoost model as Kaggle:

In [36]:
train = X
test = X_test
FEATURES = X.columns
print(len(FEATURES))

57


In [41]:
from xgboost import XGBRegressor, XGBClassifier
import xgboost
print("Using XGBoost version",xgboost.__version__)
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

FOLDS = 5
kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

oof_xgb = np.zeros(len(train))
pred_efs = np.zeros(len(test))

for i, (train_index, test_index) in enumerate(kf.split(train, y_event)):

    print("#"*25)
    print(f"### Fold {i+1}")
    print("#"*25)
    
    x_train = train.loc[train_index, FEATURES].copy()
    y_train = y_event.loc[train_index]
    x_valid = train.loc[test_index, FEATURES].copy()
    y_valid = y_event.loc[test_index]
    x_test = test[FEATURES].copy()

    model_xgb = XGBClassifier(
        device="cuda",
        max_depth=3,  
        colsample_bytree=0.7129400756425178, 
        subsample=0.8185881823156917, 
        n_estimators=20_000, 
        learning_rate=0.04425768131771064,  
        eval_metric="auc", 
        early_stopping_rounds=50, 
        objective='binary:logistic',
        scale_pos_weight=1.5379160847615545,  
        min_child_weight=4,
        enable_categorical=True,
        gamma=3.1330719334577584
    )
    model_xgb.fit(
        x_train, y_train,
        eval_set=[(x_valid, y_valid)],  
        verbose=100
    )

    # INFER OOF (Probabilities -> Binary)
    oof_xgb[test_index] = (model_xgb.predict_proba(x_valid)[:, 1] > 0.5).astype(int)
    # INFER TEST (Probabilities -> Average Probs)
    pred_efs += model_xgb.predict_proba(x_test)[:, 1]

# COMPUTE AVERAGE TEST PREDS
pred_efs = (pred_efs / FOLDS > 0.5).astype(int)

# EVALUATE PERFORMANCE
accuracy = accuracy_score(y_event, oof_xgb)
f1 = f1_score(y_event, oof_xgb)
roc_auc = roc_auc_score(y_event, oof_xgb)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")

Using XGBoost version 2.1.3
#########################
### Fold 1
#########################
[0]	validation_0-auc:0.65127
[100]	validation_0-auc:0.72622
[200]	validation_0-auc:0.73465
[300]	validation_0-auc:0.73841
[400]	validation_0-auc:0.74028
[500]	validation_0-auc:0.74112
[597]	validation_0-auc:0.74135
#########################
### Fold 2
#########################
[0]	validation_0-auc:0.67067
[100]	validation_0-auc:0.75028
[200]	validation_0-auc:0.75782
[300]	validation_0-auc:0.76242
[400]	validation_0-auc:0.76424
[500]	validation_0-auc:0.76544
[600]	validation_0-auc:0.76587
[681]	validation_0-auc:0.76621
#########################
### Fold 3
#########################
[0]	validation_0-auc:0.66956
[100]	validation_0-auc:0.74831
[200]	validation_0-auc:0.75756
[300]	validation_0-auc:0.76059
[400]	validation_0-auc:0.76242
[500]	validation_0-auc:0.76330
[600]	validation_0-auc:0.76422
[644]	validation_0-auc:0.76401
#########################
### Fold 4
#########################
[0]	validatio