In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
import lightgbm as lgb
import xgboost as xgb
import optuna
from sklearn.preprocessing import LabelEncoder 
from sksurv.metrics import concordance_index_censored
import bisect

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Load the training table and keep only the rows with efs == 1.
# Consequence: every entry of y_event below equals 1.
data = pd.read_csv('train.csv')
data = data.loc[data['efs'] == 1].reset_index(drop=True)

# Features are every column except the two target columns.
# NOTE(review): the `ID` column stays in X and is therefore fed to the model
# as a feature -- confirm this is intended.
X = data.drop(columns=['efs_time', 'efs'])
y_time = data['efs_time'].values
y_event = data['efs'].values

numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# Frequency-rank encoding of the categorical columns: the most frequent
# category becomes 0, the next one 1, and so on. Missing values are not
# counted by value_counts(), so they have no entry in the map and remain NaN.
for col in categorical_cols:
    X[col] = X[col].astype('object')
    value_counts = X[col].value_counts()
    category_map = dict(zip(value_counts.index, range(len(value_counts))))
    X[col] = X[col].map(category_map)

In [16]:
# Display the feature frame after the categorical columns were replaced by
# integer codes; cells that were missing in the source data show up as NaN.
X

Unnamed: 0,ID,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,...,karnofsky_score,hepatic_mild,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10
0,1,0.0,0.0,1.0,0.0,2.0,8.0,3,0.0,6.0,...,90.0,0.0,0.0,0.0,0.0,8.0,0.0,2.0,1.0,10.0
1,5,1.0,0.0,0.0,1.0,2.0,7.0,1,0.0,4.0,...,90.0,0.0,0.0,0.0,0.0,5.0,0.0,2.0,1.0,6.0
2,8,0.0,0.0,5.0,0.0,,,1,0.0,6.0,...,90.0,0.0,,0.0,0.0,8.0,0.0,,1.0,10.0
3,9,0.0,0.0,1.0,0.0,2.0,8.0,0,0.0,6.0,...,70.0,0.0,1.0,1.0,0.0,8.0,0.0,2.0,0.0,9.0
4,10,0.0,0.0,1.0,1.0,2.0,8.0,0,0.0,6.0,...,90.0,0.0,0.0,1.0,0.0,8.0,1.0,2.0,0.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15527,28791,0.0,0.0,4.0,0.0,1.0,4.0,1,0.0,3.0,...,70.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,5.0
15528,28792,4.0,0.0,,0.0,2.0,8.0,0,0.0,6.0,...,80.0,0.0,2.0,1.0,0.0,8.0,0.0,2.0,0.0,10.0
15529,28793,4.0,0.0,,0.0,2.0,8.0,0,0.0,6.0,...,100.0,0.0,1.0,,1.0,8.0,0.0,2.0,0.0,10.0
15530,28794,2.0,0.0,,0.0,2.0,8.0,0,0.0,6.0,...,90.0,0.0,0.0,1.0,0.0,8.0,0.0,2.0,0.0,10.0


In [18]:
class CoxObjective:
    """Gradient/Hessian of the negative Cox partial log-likelihood.

    Instances are callables with the ``(y_pred, dataset)`` signature, so they
    can be used as a custom boosting objective. ``y_pred`` holds one raw score
    (log relative hazard) per sample, in the same order as ``time``/``event``.

    With ``S_i = sum(exp(f_j) for j with time_j >= time_i)`` the per-sample
    derivatives are::

        grad_k = -event_k + sum_{i: event_i == 1, time_i <= time_k} exp(f_k) / S_i
        hess_k =            sum_{i: event_i == 1, time_i <= time_k}
                                (exp(f_k) / S_i - exp(2 f_k) / S_i**2)

    Tied times are handled with the Breslow convention: the risk set of an
    event contains *every* sample whose time is >= the event time, regardless
    of how ``argsort`` happens to order the ties.

    All ratios are evaluated in log space, so large raw scores do not overflow
    ``exp`` and cannot produce ``inf``/``nan`` gradients.

    Parameters
    ----------
    time : array-like of shape (n,)
        Observed time of each sample.
    event : array-like of shape (n,)
        Event indicator; entries equal to 1 (or True) mark observed events.
    """

    def __init__(self, time, event):
        self.time = np.asarray(time)
        self.event = np.asarray(event)
        self.sorted_indices = np.argsort(self.time)
        self.sorted_time = self.time[self.sorted_indices]
        self.sorted_event = self.event[self.sorted_indices]

        event_positions = np.flatnonzero(self.sorted_event == 1)
        # Kept as a plain list of ints, as before.
        self.event_positions_sorted = event_positions.tolist()

        # Everything below depends only on time/event, so it is computed once
        # here instead of on every boosting round.
        event_times = self.sorted_time[event_positions]
        # First sorted position sharing each event's time: start of its risk set.
        self._risk_start = np.searchsorted(self.sorted_time, event_times, side='left')
        # For each sample (original order): number of events with time <= its time.
        self._events_up_to = np.searchsorted(event_times, self.time, side='right')
        # Float copy so boolean indicators can take part in arithmetic.
        self._event_value = self.event.astype(float)

    @staticmethod
    def _prefix_logsumexp(log_terms):
        """Return ``p`` with ``p[m] = log(sum(exp(log_terms[:m])))`` and ``p[0] = -inf``."""
        prefix = np.full(log_terms.size + 1, -np.inf)
        if log_terms.size:
            prefix[1:] = np.logaddexp.accumulate(log_terms)
        return prefix

    def __call__(self, y_pred, dataset=None):
        """Return ``(gradients, hessians)`` for the raw scores ``y_pred``.

        Parameters
        ----------
        y_pred : array-like of shape (n,)
            Raw scores, aligned with the ``time``/``event`` given at construction.
        dataset : object, optional
            Accepted for compatibility with the caller's signature; not used.

        Returns
        -------
        tuple of two float ndarrays of shape (n,)
        """
        scores = np.asarray(y_pred, dtype=float)
        if scores.size == 0:
            return np.zeros(0), np.zeros(0)

        # log S at every sorted position: running log-sum-exp from the tail.
        sorted_scores = scores[self.sorted_indices]
        log_risk = np.logaddexp.accumulate(sorted_scores[::-1])[::-1]

        # log(1 / S_i) for each event, then log of the running sums of
        # 1 / S_i and 1 / S_i**2 over events in increasing time order.
        log_inv_risk = -log_risk[self._risk_start]
        log_cum_inv = self._prefix_logsumexp(log_inv_risk)
        log_cum_inv_sq = self._prefix_logsumexp(2.0 * log_inv_risk)

        # Every sample counted here belongs to the risk sets involved, so
        # exp(f_k) / S_i <= 1 and both exponentials stay finite.
        first_order = np.exp(scores + log_cum_inv[self._events_up_to])
        second_order = np.exp(2.0 * scores + log_cum_inv_sq[self._events_up_to])

        gradients = first_order - self._event_value
        # Can be exactly 0, e.g. for samples whose time precedes every event.
        hessians = first_order - second_order
        return gradients, hessians

def objective(trial):
    """Optuna objective: mean stratified C-index over 5 cross-validation folds.

    For the hyper-parameters suggested by ``trial``, a LightGBM model is
    trained per fold with ``CoxObjective`` as custom objective and early
    stopping on the validation fold. A fold's score is the mean minus the
    standard deviation of the per-``race_group`` concordance indices on its
    validation rows.

    Reads the notebook-level ``X``, ``y_time`` and ``y_event``.

    Parameters
    ----------
    trial : optuna trial
        Supplies the hyper-parameter suggestions.

    Returns
    -------
    float
        Mean of the five fold scores (to be maximised).
    """
    # Keep only rows where both targets are present.
    mask = ~(np.isnan(y_time) | np.isnan(y_event))
    X_aligned = X.copy()[mask].reset_index(drop=True)
    y_time_aligned = pd.Series(y_time[mask]).reset_index(drop=True)
    y_event_aligned = pd.Series(y_event[mask]).reset_index(drop=True)

    params = {
        'metric': 'custom',
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 25, 127),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-06, 10., log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-06, 10., log=True),
        'verbose': -1,
        'num_threads': 4
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    race_groups = X_aligned['race_group'].unique()
    fold_scores = []

    # Folds are stratified on race_group so each group appears in every fold.
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_aligned, X_aligned['race_group'])):
        print(f'Running in fold {fold}...')

        X_train = X_aligned.iloc[train_idx]
        X_val = X_aligned.iloc[val_idx]
        y_time_train = y_time_aligned.iloc[train_idx]
        y_time_val = y_time_aligned.iloc[val_idx]
        y_event_train = y_event_aligned.iloc[train_idx]
        y_event_val = y_event_aligned.iloc[val_idx]

        train_data = lgb.Dataset(X_train, label=y_time_train)
        val_data = lgb.Dataset(X_val, label=y_time_val)

        def cindex_eval(y_pred, data_val):
            """Custom eval metric: per-race C-index, aggregated as mean - std."""
            race_specific_scores = []
            for race in race_groups:
                race_mask = X_val['race_group'] == race
                if race_mask.sum() <= 1:
                    # Fewer than two samples: skip this group.
                    continue
                events = y_event_val[race_mask].to_numpy().astype(bool)
                times = y_time_val[race_mask].to_numpy().astype(float)
                race_cindex = concordance_index_censored(events, times, y_pred[race_mask])[0]
                race_specific_scores.append(race_cindex)

            stratified_cindex = np.mean(race_specific_scores) - np.std(race_specific_scores)
            return 'stratified-c-index', stratified_cindex, True

        # A fresh objective per fold: it is bound to this fold's training targets.
        params['objective'] = CoxObjective(y_time_train.values, y_event_train.values)
        model = lgb.train(
            params,
            train_data,
            num_boost_round=1000,
            valid_sets=[val_data],
            feval=cindex_eval,
            callbacks=[lgb.early_stopping(stopping_rounds=50)]
        )

        # Score the fold with the same metric that drove early stopping.
        y_pred = model.predict(X_val)
        fold_scores.append(cindex_eval(y_pred, X_val)[1])

    print('5 folds stratified C-index:', fold_scores)
    return np.mean(fold_scores)

In [None]:
# Create and run the hyper-parameter search.
# The sampler is given a fixed seed so that re-running the notebook proposes
# the same sequence of trials. TPESampler is the sampler Optuna uses by default
# for a single-objective study, so only the seeding changes, not the algorithm.
# NOTE(review): run-to-run identical scores also require the training itself
# to be deterministic -- confirm for the LightGBM settings in use.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)
best_params = study.best_params

[I 2025-02-21 17:38:30,132] A new study created in memory with name: no-name-092af8a2-a741-42ab-bfac-3d1de89faf63


Running in fold 0...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[997]	valid_0's stratified-c-index: 0.709042
Running in fold 1...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[999]	valid_0's stratified-c-index: 0.727137
Running in fold 2...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[999]	valid_0's stratified-c-index: 0.711961
Running in fold 3...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's stratified-c-index: 0.711866
Running in fold 4...
Training until validation scores don't improve for 50 rounds


[I 2025-02-21 17:44:58,865] Trial 0 finished with value: 0.7161992653805479 and parameters: {'num_leaves': 29, 'max_depth': 6, 'learning_rate': 0.012577695075945087, 'reg_alpha': 0.009183357577677596, 'reg_lambda': 0.0006874486086363565}. Best is trial 0 with value: 0.7161992653805479.


Did not meet early stopping. Best iteration is:
[1000]	valid_0's stratified-c-index: 0.720989
5 folds stratified C-index: [np.float64(0.7090416958595009), np.float64(0.7271372267063081), np.float64(0.7119614785884153), np.float64(0.711866454330352), np.float64(0.7209894714181628)]
Running in fold 0...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[993]	valid_0's stratified-c-index: 0.72243
Running in fold 1...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[967]	valid_0's stratified-c-index: 0.737666
Running in fold 2...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[997]	valid_0's stratified-c-index: 0.717711
Running in fold 3...
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[939]	valid_0's stratified-c-index: 0.729661
Running in fold 4...
Training un

[I 2025-02-21 17:51:00,293] Trial 1 finished with value: 0.7271560926653035 and parameters: {'num_leaves': 82, 'max_depth': 4, 'learning_rate': 0.07534182401548442, 'reg_alpha': 0.00018165661680599106, 'reg_lambda': 1.5950299581495986}. Best is trial 1 with value: 0.7271560926653035.


Early stopping, best iteration is:
[811]	valid_0's stratified-c-index: 0.728314
5 folds stratified C-index: [np.float64(0.7224295678951765), np.float64(0.7376659676470104), np.float64(0.7177106695048671), np.float64(0.72966065306304), np.float64(0.728313605216423)]
Running in fold 0...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[998]	valid_0's stratified-c-index: 0.721341
Running in fold 1...
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[909]	valid_0's stratified-c-index: 0.73519
Running in fold 2...
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[750]	valid_0's stratified-c-index: 0.716853
Running in fold 3...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's stratified-c-index: 0.723572
Running in fold 4...
Training until validation scores don't 

[I 2025-02-21 17:56:48,833] Trial 2 finished with value: 0.7255722570986277 and parameters: {'num_leaves': 38, 'max_depth': 6, 'learning_rate': 0.026414214427472396, 'reg_alpha': 2.0192350254758398, 'reg_lambda': 0.0003333110417926715}. Best is trial 1 with value: 0.7271560926653035.


Did not meet early stopping. Best iteration is:
[997]	valid_0's stratified-c-index: 0.730905
5 folds stratified C-index: [np.float64(0.7213408028505349), np.float64(0.7351897172952444), np.float64(0.7168530365975581), np.float64(0.7235723193002467), np.float64(0.7309054094495551)]
Running in fold 0...
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[404]	valid_0's stratified-c-index: 0.721301
Running in fold 1...
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[527]	valid_0's stratified-c-index: 0.733298
Running in fold 2...
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[367]	valid_0's stratified-c-index: 0.72483
Running in fold 3...
Training until validation scores don't improve for 50 rounds
