In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
import lightgbm as lgb
import xgboost as xgb
import optuna
from sklearn.preprocessing import LabelEncoder 
from lifelines.utils import concordance_index
import bisect

In [4]:
data = pd.read_csv('train.csv')
X = data.drop(['efs_time', 'efs', 'ID'], axis=1) 
y_time = data['efs_time']
y_event = data['efs']

# test data
data_test = pd.read_csv('test.csv')
X_test = data_test.drop(columns=['ID'], axis=1)

numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
all_cols = X.columns

for col in categorical_cols:
    X[col] = X[col].astype('object')
    X_test[col] = X_test[col].astype('object')
    value_counts = X[col].value_counts()
    category_map = {cat: idx for idx, cat in enumerate(value_counts.index)}
    X[col] = X[col].map(category_map)
    X_test[col] = X_test[col].map(category_map)

In [5]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline

imputer = IterativeImputer(
    estimator=BayesianRidge(),
    sample_posterior=True,
    max_iter=30,
    random_state=42
)

X_num_processed = imputer.fit_transform(X[numerical_cols])
X[numerical_cols] = X_num_processed
X

Unnamed: 0,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,graft_type,...,karnofsky_score,hepatic_mild,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10
0,3.0,0.0,,0.0,0.574103,6.574146,0,0.0,6.0,1,...,90.000000,0.0,,1.0,0.0,8.0,0.0,2.0,0.0,10.0
1,0.0,0.0,1.0,0.0,2.000000,8.000000,3,0.0,6.0,0,...,90.000000,0.0,0.0,0.0,0.0,8.0,0.0,2.0,1.0,10.0
2,3.0,0.0,,0.0,2.000000,8.000000,0,0.0,6.0,1,...,90.000000,0.0,0.0,0.0,0.0,8.0,0.0,2.0,0.0,10.0
3,2.0,0.0,1.0,0.0,2.000000,8.000000,0,0.0,6.0,1,...,90.000000,1.0,0.0,1.0,0.0,8.0,0.0,2.0,0.0,10.0
4,2.0,0.0,,0.0,2.000000,8.000000,0,0.0,6.0,0,...,90.000000,0.0,0.0,0.0,1.0,8.0,0.0,2.0,0.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28795,7.0,,2.0,0.0,2.000000,8.000000,0,0.0,6.0,0,...,120.994586,,3.0,,0.0,8.0,,2.0,0.0,10.0
28796,2.0,0.0,0.0,1.0,1.000000,4.000000,0,0.0,5.0,0,...,90.000000,0.0,1.0,0.0,0.0,6.0,1.0,1.0,1.0,8.0
28797,4.0,,0.0,,2.000000,8.000000,0,,6.0,0,...,90.000000,,1.0,1.0,0.0,8.0,,2.0,0.0,10.0
28798,3.0,0.0,0.0,0.0,1.000000,4.000000,0,0.0,3.0,0,...,90.000000,0.0,0.0,0.0,1.0,4.0,0.0,1.0,0.0,5.0


In [8]:
class CoxObjective:
    def __init__(self, time, event):
        self.time = time
        self.event = event
        self.sorted_indices = np.argsort(self.time)
        self.sorted_time = self.time[self.sorted_indices]
        self.sorted_event = self.event[self.sorted_indices]
        self.event_positions_sorted = [i for i in range(len(self.sorted_event)) if self.sorted_event[i] == 1]

    def __call__(self, y_pred, dataset=None):  # Added dataset parameter with default None
        sorted_exp_f = np.exp(y_pred[self.sorted_indices])
        sum_risk = np.cumsum(sorted_exp_f[::-1])[::-1]
        
        sum_1_over_sum_risk = []
        sum_1_over_sum_risk_squared = []
        for event_pos in self.event_positions_sorted:
            sum_r = sum_risk[event_pos]
            sum_1_over_sum_risk.append(1 / sum_r)
            sum_1_over_sum_risk_squared.append(1 / sum_r**2)
        
        cum_sum_1_over_sum_risk = np.cumsum(sum_1_over_sum_risk)
        cum_sum_1_over_sum_risk_squared = np.cumsum(sum_1_over_sum_risk_squared)
        
        gradients = np.zeros(len(y_pred))
        hessians = np.zeros(len(y_pred))
        sorted_position = {self.sorted_indices[k]: k for k in range(len(self.sorted_indices))}
        
        for k in range(len(y_pred)):
            sorted_pos_k = sorted_position[k]
            index = bisect.bisect_right(self.event_positions_sorted, sorted_pos_k)
            sum_1_over_sum_risk_k = cum_sum_1_over_sum_risk[index-1] if index > 0 else 0
            sum_1_over_sum_risk_squared_k = cum_sum_1_over_sum_risk_squared[index-1] if index > 0 else 0
            
            exp_f_k = np.exp(y_pred[k])
            sum_term_grad = exp_f_k * sum_1_over_sum_risk_k
            sum_term_hess = exp_f_k * sum_1_over_sum_risk_k - exp_f_k**2 * sum_1_over_sum_risk_squared_k
            
            gradient_k = - self.event[k] + sum_term_grad
            hessian_k = sum_term_hess
            
            gradients[k] = gradient_k
            hessians[k] = hessian_k
            
        return gradients, hessians

def objective(trial):
    
    params = {
        'metric': 'custom',
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 25, 127),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'max_bin': trial.suggest_int('max_bin', 3, 255),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-06, 10., log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-06, 10., log=True),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.7, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.7, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'verbose': -1,
        'num_threads': 4,
        'seed': 42
    }
    
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    race_groups = X['race_group'].unique()
    fold_scores = []
    fold = -1

    for train_idx, val_idx in skf.split(X, X['race_group']):
        fold += 1
        print(f'Running in fold {fold}...')
        
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_time_train, y_time_val = y_time.iloc[train_idx], y_time.iloc[val_idx]
        y_event_train, y_event_val = y_event.iloc[train_idx], y_event.iloc[val_idx]

        train_data = lgb.Dataset(X_train, label= y_time_train)
        val_data = lgb.Dataset(X_val, label= y_time_val)

        def cindex_eval(y_pred, data_val):
            # Calculate stratified C-index for validation data
            race_specific_scores = []
            for race in race_groups:
                race_mask = X_val['race_group'] == race
                if sum(race_mask) > 1:  # Only calculate if we have at least 2 samples
                    surv = np.array([(e, t) for e, t in zip(y_event_val[race_mask], y_time_val[race_mask])], 
                                dtype=[('event', bool), ('time', float)])
                    race_cindex = concordance_index(surv['time'], -y_pred[race_mask], surv['event'])
                    race_specific_scores.append(race_cindex)
            
            stratified_cindex = np.mean(race_specific_scores) - np.std(race_specific_scores)
            return 'stratified-c-index', stratified_cindex, True
        
        params['objective'] = CoxObjective(y_time_train.values, y_event_train.values)
        model = lgb.train(
            params,
            train_data,
            valid_sets=[val_data],
            num_boost_round=1000,
            feval=cindex_eval,
            callbacks=[lgb.early_stopping(stopping_rounds=100)]
        )
        y_pred = model.predict(X_val)
        
        # Calculate stratified C-index for fold evaluation
        stratified_cindex = cindex_eval(y_pred, X_val)[1]
        fold_scores.append(stratified_cindex)
    
    print('5 folds stratified C-index:', fold_scores)
    return np.mean(fold_scores)

In [9]:
# Now create and run the study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=3, n_jobs=1)
best_params = study.best_params

[I 2025-02-26 15:27:55,326] A new study created in memory with name: no-name-de56ec8a-af05-46d8-a7a2-2dc5673f1bae


Running in fold 0...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[418]	valid_0's stratified-c-index: 0.664029
Running in fold 1...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[534]	valid_0's stratified-c-index: 0.662963
Running in fold 2...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[251]	valid_0's stratified-c-index: 0.668421
Running in fold 3...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[492]	valid_0's stratified-c-index: 0.664821
Running in fold 4...
Training until validation scores don't improve for 100 rounds


[I 2025-02-26 15:31:18,499] Trial 0 finished with value: 0.6665323176646996 and parameters: {'num_leaves': 64, 'max_depth': 4, 'max_bin': 241, 'learning_rate': 0.09548561069547094, 'reg_alpha': 4.069325033588869e-06, 'reg_lambda': 2.8094442984516377e-06, 'min_data_in_leaf': 59, 'feature_fraction': 0.7947136709269988, 'bagging_fraction': 0.9681454593802129, 'bagging_freq': 7}. Best is trial 0 with value: 0.6665323176646996.


Early stopping, best iteration is:
[486]	valid_0's stratified-c-index: 0.672428
5 folds stratified C-index: [0.6640289542265714, 0.6629628434035976, 0.668421430784625, 0.6648206950600698, 0.6724276648486338]
Running in fold 0...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[607]	valid_0's stratified-c-index: 0.667937
Running in fold 1...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[486]	valid_0's stratified-c-index: 0.666931
Running in fold 2...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[200]	valid_0's stratified-c-index: 0.668988
Running in fold 3...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[360]	valid_0's stratified-c-index: 0.665124
Running in fold 4...
Training until validation scores don't improve for 100 rounds


[I 2025-02-26 15:34:34,297] Trial 1 finished with value: 0.6679511959222973 and parameters: {'num_leaves': 41, 'max_depth': 10, 'max_bin': 162, 'learning_rate': 0.05783691248195078, 'reg_alpha': 1.7750297012921608, 'reg_lambda': 3.2832460448631146e-06, 'min_data_in_leaf': 32, 'feature_fraction': 0.8612046322726409, 'bagging_fraction': 0.9534056924117531, 'bagging_freq': 6}. Best is trial 1 with value: 0.6679511959222973.


Early stopping, best iteration is:
[360]	valid_0's stratified-c-index: 0.670775
5 folds stratified C-index: [0.6679374006675757, 0.6669314595423271, 0.6689879226222802, 0.6651241674494656, 0.6707750293298386]
Running in fold 0...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[773]	valid_0's stratified-c-index: 0.666904
Running in fold 1...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[897]	valid_0's stratified-c-index: 0.662459
Running in fold 2...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[408]	valid_0's stratified-c-index: 0.668936
Running in fold 3...
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[548]	valid_0's stratified-c-index: 0.665968
Running in fold 4...
Training until validation scores don't improve for 100 rounds


[I 2025-02-26 15:39:05,801] Trial 2 finished with value: 0.6665353876692947 and parameters: {'num_leaves': 114, 'max_depth': 4, 'max_bin': 249, 'learning_rate': 0.06167006314288751, 'reg_alpha': 0.0004835591795563421, 'reg_lambda': 1.7798765760254021e-06, 'min_data_in_leaf': 48, 'feature_fraction': 0.861925397288651, 'bagging_fraction': 0.7632104972857008, 'bagging_freq': 1}. Best is trial 1 with value: 0.6679511959222973.


Early stopping, best iteration is:
[304]	valid_0's stratified-c-index: 0.668409
5 folds stratified C-index: [0.6669043052333506, 0.662458892803685, 0.6689364851573686, 0.6659681100736667, 0.6684091450784025]


In [45]:
study.best_value

0.6600664169611014

In [None]:
study.best_params