In [33]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
import lightgbm as lgb
import xgboost as xgb
import optuna
from sklearn.preprocessing import LabelEncoder 
from lifelines.utils import concordance_index
import bisect

In [34]:
data = pd.read_csv('train.csv')
X = data.drop(['efs_time', 'efs', 'ID'], axis=1) 
y_time = data['efs_time']
y_event = data['efs']

# test data
data_test = pd.read_csv('test.csv')
X_test = data_test.drop(columns=['ID'], axis=1)

numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

X[numerical_cols] = X[numerical_cols].fillna(X[numerical_cols].mean())
X_test[numerical_cols] = X_test[numerical_cols].fillna(X_test[numerical_cols].mean())
# X[categorical_cols] = X[categorical_cols].fillna(X[categorical_cols].mode().iloc[0])
# le_dict = {}
# for col in categorical_cols:
#     le = LabelEncoder()
#     X[col] = le.fit_transform(X[col])
#     le_dict[col] = le

# # Handle categorical columns
for col in categorical_cols:
    X[col] = X[col].astype('object')
    X_test[col] = X_test[col].astype('object')
    value_counts = X[col].value_counts()
    category_map = {cat: idx for idx, cat in enumerate(value_counts.index)}
    X[col] = X[col].map(category_map)
    X_test[col] = X_test[col].map(category_map)

In [35]:
X

Unnamed: 0,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,graft_type,...,karnofsky_score,hepatic_mild,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10
0,3.0,0.0,,0.0,1.764516,6.876801,0,0.0,6.0,1,...,90.00000,0.0,,1.0,0.0,8.0,0.0,2.0,0.0,10.0
1,0.0,0.0,1.0,0.0,2.000000,8.000000,3,0.0,6.0,0,...,90.00000,0.0,0.0,0.0,0.0,8.0,0.0,2.0,1.0,10.0
2,3.0,0.0,,0.0,2.000000,8.000000,0,0.0,6.0,1,...,90.00000,0.0,0.0,0.0,0.0,8.0,0.0,2.0,0.0,10.0
3,2.0,0.0,1.0,0.0,2.000000,8.000000,0,0.0,6.0,1,...,90.00000,1.0,0.0,1.0,0.0,8.0,0.0,2.0,0.0,10.0
4,2.0,0.0,,0.0,2.000000,8.000000,0,0.0,6.0,0,...,90.00000,0.0,0.0,0.0,1.0,8.0,0.0,2.0,0.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28795,7.0,,2.0,0.0,2.000000,8.000000,0,0.0,6.0,0,...,83.83208,,3.0,,0.0,8.0,,2.0,0.0,10.0
28796,2.0,0.0,0.0,1.0,1.000000,4.000000,0,0.0,5.0,0,...,90.00000,0.0,1.0,0.0,0.0,6.0,1.0,1.0,1.0,8.0
28797,4.0,,0.0,,2.000000,8.000000,0,,6.0,0,...,90.00000,,1.0,1.0,0.0,8.0,,2.0,0.0,10.0
28798,3.0,0.0,0.0,0.0,1.000000,4.000000,0,0.0,3.0,0,...,90.00000,0.0,0.0,0.0,1.0,4.0,0.0,1.0,0.0,5.0


In [36]:
X_test

Unnamed: 0,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,graft_type,...,karnofsky_score,hepatic_mild,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10
0,3,0,,0,2.0,8.0,0,0,6.0,1,...,90.0,0,,1,0,8.0,0,2.0,0,10.0
1,0,0,1.0,0,2.0,8.0,3,0,6.0,0,...,90.0,0,0.0,0,0,8.0,0,2.0,1,10.0
2,3,0,,0,2.0,8.0,0,0,6.0,1,...,90.0,0,0.0,0,0,8.0,0,2.0,0,10.0


In [5]:
import numpy as np
import bisect

class CindexObjectiveSurrogate:
    def __init__(self, time, event):
        """
        Build a list of valid pairs (i, j) such that sample i had an event (event==1)
        and time[i] < time[j]. The surrogate loss is computed over these pairs.
        """
        self.time = time
        self.event = event
        self.valid_pairs = []
        n = len(time)
        for i in range(n):
            if event[i] == 1:  # i must have the event observed
                for j in range(n):
                    if time[i] < time[j]:
                        self.valid_pairs.append((i, j))
                        
    def __call__(self, y_pred, dataset=None):
        """
        For each valid pair (i, j), we use the surrogate loss:
        
            L_{ij} = log(1 + exp(-(y_pred[i] - y_pred[j])))
        
        The gradients and Hessians are computed from this loss.
        """
        n = len(y_pred)
        gradients = np.zeros(n)
        hessians = np.zeros(n)
        
        # For numerical stability, we can later consider vectorized implementation
        for (i, j) in self.valid_pairs:
            delta = y_pred[i] - y_pred[j]
            # Compute sigmoid over (r_j - r_i) using the logistic function.
            # Note: sigmoid = 1/(1+exp(delta)) is equivalent to σ(r_j - r_i)
            sigmoid_ij = 1.0 / (1.0 + np.exp(delta))
            
            # The derivative of L_ij with respect to y_pred[i] is -sigmoid_ij,
            # and for y_pred[j] it is +sigmoid_ij.
            gradients[i] -= sigmoid_ij
            gradients[j] += sigmoid_ij
            
            # The second derivative for this pair is sigmoid*(1-sigmoid)
            second_deriv = sigmoid_ij * (1 - sigmoid_ij)
            hessians[i] += second_deriv
            hessians[j] += second_deriv
            
        return gradients, hessians

# Example usage in the objective function for LightGBM:
def objective_surrogate(trial):
    params = {
        'metric': 'custom',
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 25, 127),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-06, 10., log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-06, 10., log=True),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.7, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.7, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'verbose': -1,
        'num_threads': 4,
        'seed': 42
    }

    # ... Stratified K-fold and data splitting as in your current code ...
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    race_groups = X['race_group'].unique()
    fold_scores = []
    fold = -1

    for train_idx, val_idx in skf.split(X, X['race_group']):
        fold += 1
        print(f'Running in fold {fold}...')
        
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_time_train, y_time_val = y_time.iloc[train_idx], y_time.iloc[val_idx]
        y_event_train, y_event_val = y_event.iloc[train_idx], y_event.iloc[val_idx]

        train_data = lgb.Dataset(X_train, label=y_time_train)
        val_data = lgb.Dataset(X_val, label=y_time_val)

        def cindex_eval(y_pred, data_val):
            # Calculate stratified C-index for validation data
            race_specific_scores = []
            for race in race_groups:
                race_mask = X_val['race_group'] == race
                if sum(race_mask) > 1:  # Only calculate if we have at least 2 samples
                    surv = np.array([(e, t) for e, t in zip(y_event_val[race_mask], y_time_val[race_mask])], 
                                    dtype=[('event', bool), ('time', float)])
                    race_cindex = concordance_index_censored(surv['event'], surv['time'], y_pred[race_mask])[0]
                    race_specific_scores.append(race_cindex)
            stratified_cindex = np.mean(race_specific_scores) - np.std(race_specific_scores)
            return 'stratified-c-index', stratified_cindex, True
        
        # Use our surrogate objective function here.
        params['objective'] = CindexObjectiveSurrogate(y_time_train.values, y_event_train.values)
        
        model = lgb.train(
            params,
            train_data,
            valid_sets=[val_data],
            num_boost_round=1000,
            feval=cindex_eval,
            callbacks=[lgb.early_stopping(stopping_rounds=100)]
        )
        
        y_pred = model.predict(X_val)
        stratified_cindex = cindex_eval(y_pred, X_val)[1]
        fold_scores.append(stratified_cindex)
    
    print('5 folds stratified C-index:', fold_scores)
    return np.mean(fold_scores)

In [None]:
# Now create and run the study
study = optuna.create_study(direction='maximize')
study.optimize(objective_surrogate, n_trials=3, n_jobs=1)
best_params = study.best_params

In [37]:
class CoxObjective:
    def __init__(self, time, event):
        self.time = time
        self.event = event
        self.sorted_indices = np.argsort(self.time)
        self.sorted_time = self.time[self.sorted_indices]
        self.sorted_event = self.event[self.sorted_indices]
        self.event_positions_sorted = [i for i in range(len(self.sorted_event)) if self.sorted_event[i] == 1]

    def __call__(self, y_pred, dataset=None):  # Added dataset parameter with default None
        sorted_exp_f = np.exp(y_pred[self.sorted_indices])
        sum_risk = np.cumsum(sorted_exp_f[::-1])[::-1]
        
        sum_1_over_sum_risk = []
        sum_1_over_sum_risk_squared = []
        for event_pos in self.event_positions_sorted:
            sum_r = sum_risk[event_pos]
            sum_1_over_sum_risk.append(1 / sum_r)
            sum_1_over_sum_risk_squared.append(1 / sum_r**2)
        
        cum_sum_1_over_sum_risk = np.cumsum(sum_1_over_sum_risk)
        cum_sum_1_over_sum_risk_squared = np.cumsum(sum_1_over_sum_risk_squared)
        
        gradients = np.zeros(len(y_pred))
        hessians = np.zeros(len(y_pred))
        sorted_position = {self.sorted_indices[k]: k for k in range(len(self.sorted_indices))}
        
        for k in range(len(y_pred)):
            sorted_pos_k = sorted_position[k]
            index = bisect.bisect_right(self.event_positions_sorted, sorted_pos_k)
            sum_1_over_sum_risk_k = cum_sum_1_over_sum_risk[index-1] if index > 0 else 0
            sum_1_over_sum_risk_squared_k = cum_sum_1_over_sum_risk_squared[index-1] if index > 0 else 0
            
            exp_f_k = np.exp(y_pred[k])
            sum_term_grad = exp_f_k * sum_1_over_sum_risk_k
            sum_term_hess = exp_f_k * sum_1_over_sum_risk_k - exp_f_k**2 * sum_1_over_sum_risk_squared_k
            
            gradient_k = - self.event[k] + sum_term_grad
            hessian_k = sum_term_hess
            
            gradients[k] = gradient_k
            hessians[k] = hessian_k
            
        return gradients, hessians

def objective(trial):
    
    params = {
        'metric': 'custom',
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 25, 127),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-06, 10., log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-06, 10., log=True),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.7, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.7, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'verbose': -1,
        'num_threads': 4,
        'seed': 42
    }
    
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    race_groups = X['race_group'].unique()
    fold_scores = []
    fold = -1

    for train_idx, val_idx in skf.split(X, X['race_group']):
        fold += 1
        print(f'Running in fold {fold}...')
        
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_time_train, y_time_val = y_time.iloc[train_idx], y_time.iloc[val_idx]
        y_event_train, y_event_val = y_event.iloc[train_idx], y_event.iloc[val_idx]

        train_data = lgb.Dataset(X_train, label= y_time_train)
        val_data = lgb.Dataset(X_val, label= y_time_val)

        def cindex_eval(y_pred, data_val):
            # Calculate stratified C-index for validation data
            race_specific_scores = []
            for race in race_groups:
                race_mask = X_val['race_group'] == race
                if sum(race_mask) > 1:  # Only calculate if we have at least 2 samples
                    surv = np.array([(e, t) for e, t in zip(y_event_val[race_mask], y_time_val[race_mask])], 
                                dtype=[('event', bool), ('time', float)])
                    race_cindex = concordance_index(surv['event'], y_pred[race_mask], surv['time'])
                    race_specific_scores.append(race_cindex)
            
            stratified_cindex = np.mean(race_specific_scores) - np.std(race_specific_scores)
            return 'stratified-c-index', stratified_cindex, True
        
        params['objective'] = CoxObjective(y_time_train.values, y_event_train.values)
        model = lgb.train(
            params,
            train_data,
            valid_sets=[val_data],
            num_boost_round=1000,
            feval=cindex_eval,
            callbacks=[lgb.early_stopping(stopping_rounds=100)]
        )
        y_pred = model.predict(X_val)
        
        # Calculate stratified C-index for fold evaluation
        stratified_cindex = cindex_eval(y_pred, X_val)[1]
        fold_scores.append(stratified_cindex)
    
    print('5 folds stratified C-index:', fold_scores)
    return np.mean(fold_scores)

In [38]:
# Now create and run the study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=3, n_jobs=1)
best_params = study.best_params

[I 2025-02-24 17:44:04,719] A new study created in memory with name: no-name-9113d3d6-c89a-413b-a49c-5cee2486d91a


Running in fold 0...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[107]	valid_0's stratified-c-index: 0.718966
Running in fold 1...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[78]	valid_0's stratified-c-index: 0.721776
Running in fold 2...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[39]	valid_0's stratified-c-index: 0.725852
Running in fold 3...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[83]	valid_0's stratified-c-index: 0.713933
Running in fold 4...
Training until validation scores don't improve for 100 rounds


[I 2025-02-24 17:45:13,730] Trial 0 finished with value: 0.7220796515934068 and parameters: {'num_leaves': 73, 'max_depth': 9, 'learning_rate': 0.0905460791939017, 'reg_alpha': 3.9642148648504008, 'reg_lambda': 1.188854019285974e-06, 'min_data_in_leaf': 90, 'feature_fraction': 0.9323510167185858, 'bagging_fraction': 0.800955338665778, 'bagging_freq': 3}. Best is trial 0 with value: 0.7220796515934068.


Early stopping, best iteration is:
[103]	valid_0's stratified-c-index: 0.729871
5 folds stratified C-index: [np.float64(0.7189661779135742), np.float64(0.7217760440542735), np.float64(0.7258523591041617), np.float64(0.7139328007274621), np.float64(0.729870876167562)]
Running in fold 0...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[123]	valid_0's stratified-c-index: 0.717496
Running in fold 1...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[205]	valid_0's stratified-c-index: 0.720873
Running in fold 2...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[60]	valid_0's stratified-c-index: 0.726741
Running in fold 3...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[86]	valid_0's stratified-c-index: 0.713395
Running in fold 4...
Training until validation scores don't improve for 100 rounds

[I 2025-02-24 17:46:33,543] Trial 1 finished with value: 0.7215109597668492 and parameters: {'num_leaves': 54, 'max_depth': 7, 'learning_rate': 0.07720557641384572, 'reg_alpha': 1.8291334798280068, 'reg_lambda': 0.0002275917577701212, 'min_data_in_leaf': 33, 'feature_fraction': 0.844942750504041, 'bagging_fraction': 0.7428564108981732, 'bagging_freq': 8}. Best is trial 0 with value: 0.7220796515934068.


Early stopping, best iteration is:
[93]	valid_0's stratified-c-index: 0.72905
5 folds stratified C-index: [np.float64(0.7174957822562686), np.float64(0.7208730275188117), np.float64(0.7267410762140424), np.float64(0.713394951784614), np.float64(0.7290499610605099)]
Running in fold 0...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[707]	valid_0's stratified-c-index: 0.725643
Running in fold 1...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[706]	valid_0's stratified-c-index: 0.731299
Running in fold 2...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[364]	valid_0's stratified-c-index: 0.732104
Running in fold 3...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[720]	valid_0's stratified-c-index: 0.718065
Running in fold 4...
Training until validation scores don't improve for 100 rounds

[I 2025-02-24 17:50:48,168] Trial 2 finished with value: 0.7284111787287877 and parameters: {'num_leaves': 25, 'max_depth': 8, 'learning_rate': 0.02129900343077506, 'reg_alpha': 0.0010957371286396205, 'reg_lambda': 1.5292051575208883e-06, 'min_data_in_leaf': 92, 'feature_fraction': 0.8117343187919308, 'bagging_fraction': 0.9315789476929082, 'bagging_freq': 9}. Best is trial 2 with value: 0.7284111787287877.


Early stopping, best iteration is:
[451]	valid_0's stratified-c-index: 0.734945
5 folds stratified C-index: [np.float64(0.7256428659369099), np.float64(0.7312991122394129), np.float64(0.7321041709096052), np.float64(0.7180647565604734), np.float64(0.7349449879975373)]


In [39]:
best_params

{'num_leaves': 25,
 'max_depth': 8,
 'learning_rate': 0.02129900343077506,
 'reg_alpha': 0.0010957371286396205,
 'reg_lambda': 1.5292051575208883e-06,
 'min_data_in_leaf': 92,
 'feature_fraction': 0.8117343187919308,
 'bagging_fraction': 0.9315789476929082,
 'bagging_freq': 9}

In [41]:
# create a list to store y_pred_test for 5 folds
y_pred_test_all = []
fold_scores = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in skf.split(X, X['race_group']):
        
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_time_train, y_time_val = y_time.iloc[train_idx], y_time.iloc[val_idx]
    y_event_train, y_event_val = y_event.iloc[train_idx], y_event.iloc[val_idx]

    train_data = lgb.Dataset(X_train, label= y_time_train)
    val_data = lgb.Dataset(X_val, label= y_time_val)

    def cindex_eval(y_pred, data_val):
        # Calculate stratified C-index for validation data
        race_groups = X['race_group'].unique()
        race_specific_scores = []
        for race in race_groups:
            race_mask = X_val['race_group'] == race
            if sum(race_mask) > 1:  # Only calculate if we have at least 2 samples
                surv = np.array([(e, t) for e, t in zip(y_event_val[race_mask], y_time_val[race_mask])], 
                            dtype=[('event', bool), ('time', float)])
                race_cindex = concordance_index(surv['event'], y_pred[race_mask], surv['time'])
                race_specific_scores.append(race_cindex)
        
        stratified_cindex = np.mean(race_specific_scores) - np.std(race_specific_scores)
        return 'stratified-c-index', stratified_cindex, True
    
    best_params['objective'] = CoxObjective(y_time_train.values, y_event_train.values)
    best_params['verbose'] = -1
    best_params['seed'] = 42
    model = lgb.train(
        best_params,
        train_data,
        valid_sets=[val_data],
        num_boost_round=1000,
        feval=cindex_eval,
        callbacks=[lgb.early_stopping(stopping_rounds=100)]
    )
    y_pred = model.predict(X_val)
    y_pred_test = model.predict(X_test)
    y_pred_test_all.append(y_pred_test)

    # Calculate stratified C-index for fold evaluation
    stratified_cindex = cindex_eval(y_pred, X_val)[1]
    fold_scores.append(stratified_cindex)

print('Mean of 5 folds C-index:', np.mean(fold_scores))

y_pred_test_all = np.array(y_pred_test_all)
y_pred_test_mean = np.mean(y_pred_test_all, axis=0)

submission = pd.DataFrame({'ID': data_test['ID'], 'prediction': y_pred_test_mean})
submission.to_csv('submission.csv', index=False)
submission

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[707]	valid_0's stratified-c-index: 0.725643
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[706]	valid_0's stratified-c-index: 0.731299
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[364]	valid_0's stratified-c-index: 0.732104
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[720]	valid_0's stratified-c-index: 0.718065
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[451]	valid_0's stratified-c-index: 0.734945
Mean of 5 folds C-index: 0.7284111787287877


Unnamed: 0,ID,prediction
0,28800,-0.980617
1,28801,0.660733
2,28802,-1.751341


In [42]:
print(lgb.__version__)

4.5.0
