In [64]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import xgboost as xgb
from lifelines.utils import concordance_index
import optuna

In [77]:
# Đọc dữ liệu train và test
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Xác định các cột numerical và categorical
numerical_cols = train_data.select_dtypes(include=['float64', 'int64']).columns.drop(['efs_time', 'efs'])
categorical_cols = train_data.select_dtypes(include=['object']).columns

# Biến mục tiêu và biến sự kiện
target = 'efs_time'  # Thời gian sống sót
event = 'efs'  # Biến nhị phân chỉ sự kiện xảy ra hay không

In [79]:
mask_event = train_data[train_data[event] == 1].index
mask_event

Index([    1,     5,     8,     9,    10,    11,    13,    14,    17,    18,
       ...
       28784, 28786, 28787, 28789, 28790, 28791, 28792, 28793, 28794, 28796],
      dtype='int64', length=15532)

In [66]:
# # Xử lý missing data
# numerical_imputer = SimpleImputer(strategy='mean')  # Điền giá trị trung bình cho numerical
# categorical_imputer = SimpleImputer(strategy='most_frequent')  # Điền giá trị phổ biến nhất cho categorical

# # Chuẩn hóa numerical data
# scaler = StandardScaler()

# # Mã hóa categorical data
# encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# # Pipeline cho numerical và categorical data
# numerical_pipeline = Pipeline([
#     ('imputer', numerical_imputer),
#     # ('scaler', scaler)
# ])

# categorical_pipeline = Pipeline([
#     ('imputer', categorical_imputer),
#     # ('encoder', encoder)
# ])

# # Kết hợp pipeline
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numerical_pipeline, numerical_cols),
#         ('cat', categorical_pipeline, categorical_cols)
    # ])

In [67]:
numerical_cols = train_data.select_dtypes(include=['float64', 'int64']).columns.drop(['efs_time', 'efs'])
categorical_cols = train_data.select_dtypes(include=['object', 'category']).columns

for col in categorical_cols:
    train_data[col] = train_data[col].astype('category')
    test_data[col] = test_data[col].astype('category')

    value_counts = train_data[col].value_counts()
    categorical_map = {cat: idx for idx, cat in enumerate(value_counts.index)}
    train_data[col] = train_data[col].map(categorical_map)
    test_data[col] = test_data[col].map(categorical_map)

In [68]:
# Tách đặc trưng và nhãn
X_train = train_data.drop([target, event, 'ID'], axis=1)
y_train = train_data[[target, event]]

# Áp dụng preprocessor
# X_train_processed = preprocessor.fit_transform(X_train)

# # Chuyển dữ liệu test
# X_test_processed = preprocessor.transform(test_data)

In [69]:
X_train_processed = X_train
X_test_processed = test_data

In [70]:
X_train_processed

Unnamed: 0,dri_score,psych_disturb,cyto_score,diabetes,hla_match_c_high,hla_high_res_8,tbi_status,arrhythmia,hla_low_res_6,graft_type,...,karnofsky_score,hepatic_mild,tce_div_match,donor_related,melphalan_dose,hla_low_res_8,cardiac,hla_match_drb1_high,pulm_moderate,hla_low_res_10
0,3,0,,0,,,0,0,6.0,1,...,90.0,0,,1,0,8.0,0,2.0,0,10.0
1,0,0,1,0,2.0,8.0,3,0,6.0,0,...,90.0,0,0,0,0,8.0,0,2.0,1,10.0
2,3,0,,0,2.0,8.0,0,0,6.0,1,...,90.0,0,0,0,0,8.0,0,2.0,0,10.0
3,2,0,1,0,2.0,8.0,0,0,6.0,1,...,90.0,1,0,1,0,8.0,0,2.0,0,10.0
4,2,0,,0,2.0,8.0,0,0,6.0,0,...,90.0,0,0,0,1,8.0,0,2.0,0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28795,7,,2,0,2.0,8.0,0,0,6.0,0,...,,,3,,0,8.0,,2.0,0,10.0
28796,2,0,0,1,1.0,4.0,0,0,5.0,0,...,90.0,0,1,0,0,6.0,1,1.0,1,8.0
28797,4,,0,,2.0,8.0,0,,6.0,0,...,90.0,,1,1,0,8.0,,2.0,0,10.0
28798,3,0,0,0,1.0,4.0,0,0,3.0,0,...,90.0,0,0,0,1,4.0,0,1.0,0,5.0


In [71]:
clinical_vars = ['dri_score', 'prim_disease_hct', 'cyto_score', 'cyto_score_detail', 'mrd_hct']
patient_donor_vars = ['age_at_hct', 'donor_age', 'sex_match', 'race_group', 'ethnicity', 'karnofsky_score', 'donor_related']
hct_vars = ['year_hct', 'graft_type', 'prod_type', 'conditioning_intensity', 'tbi_status', 'gvhd_proph', 'in_vivo_tcd', 'melphalan_dose']
hla_vars = ['hla_match_a_high', 'hla_match_b_high', 'hla_match_c_high', 'hla_match_drb1_high', 'hla_match_dqb1_high', 'hla_match_a_low', 'hla_match_b_low', 'hla_match_c_low', 'hla_match_drb1_low', 'hla_match_dqb1_low', 'hla_high_res_6', 'hla_high_res_8', 'hla_high_res_10', 'hla_low_res_6', 'hla_low_res_8', 'hla_nmdp_6', 'tce_imm_match', 'tce_match', 'tce_div_match']
comorbidity_vars = ['comorbidity_score', 'diabetes', 'cardiac', 'pulm_severe', 'pulm_moderate', 'hepatic_severe', 'hepatic_mild', 'renal_issue', 'psych_disturb', 'vent_hist', 'prior_tumor', 'peptic_ulcer', 'rheum_issue', 'obesity', 'arrhythmia', 'rituximab', 'cmv_status']

In [72]:
X_train_processed = X_train_processed[clinical_vars + patient_donor_vars + hct_vars + comorbidity_vars]
X_test_processed = X_test_processed[clinical_vars + patient_donor_vars + hct_vars + comorbidity_vars]
X_train_processed

Unnamed: 0,dri_score,prim_disease_hct,cyto_score,cyto_score_detail,mrd_hct,age_at_hct,donor_age,sex_match,race_group,ethnicity,...,renal_issue,psych_disturb,vent_hist,prior_tumor,peptic_ulcer,rheum_issue,obesity,arrhythmia,rituximab,cmv_status
0,3,5,,,,9.942,,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,1,0,1,43.705,72.290,3,1,0,...,0,0,0,0,0,0,0,0,0,0
2,3,11,,,,33.997,,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,0,1,0,1,43.245,29.230,0,2,0,...,0,0,0,0,0,0,0,0,0,0
4,2,4,,,,29.740,56.810,2,4,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28795,7,0,2,0,0,51.136,24.212,2,0,0,...,0,,0,1,0,,0,0,0,3
28796,2,1,0,3,1,18.075,30.770,2,5,1,...,0,0,0,0,0,0,0,0,0,1
28797,4,3,0,1,,51.005,22.627,2,5,0,...,,,0,0,,,0,,,1
28798,3,3,0,,,0.044,58.074,0,3,0,...,,0,0,1,,0,,0,0,0


In [80]:
X_train_processed = X_train_processed.iloc[mask_event].reset_index(drop=True)
y_train = y_train.iloc[mask_event].reset_index(drop=True)
X_train_processed.shape, y_train.shape

((15532, 37), (15532, 2))

In [81]:
def objective(trial):
    # Không gian hyperparameters
    param = {
        'objective': 'survival:cox',
        'eval_metric': 'cox-nloglik',
        'booster': 'gbtree',
        'lambda': trial.suggest_float('lambda', 1e-6, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-6, 10.0, log=True),
        'eta': trial.suggest_float('eta', 1e-3, 0.1),
        'subsample': trial.suggest_uniform('subsample', 0.8, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.8, 1.0),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'random_state': 42,
        'verbosity': 0
    }
    
    X_val_5folds = pd.DataFrame([], columns=X_train_processed.columns)
    y_val_5folds = pd.DataFrame([], columns=y_train.columns)   
    y_pred_5folds = []

    # Chia dữ liệu bằng cross-validation 5 folds
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    for train_index, val_index in skf.split(X_train_processed, X_train_processed['race_group']):
        X_tr, X_val = X_train_processed.iloc[train_index], X_train_processed.iloc[val_index]
        y_tr, y_val = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # Chuẩn bị DMatrix
        dtrain = xgb.DMatrix(X_tr, label=y_tr[target], enable_categorical=True)
        dval = xgb.DMatrix(X_val, label=y_val[target], enable_categorical=True)
        
        # def stratified_cindex(preds, dtrain):

        # Huấn luyện mô hình
        model = xgb.train(param, 
                          dtrain, 
                          evals=[(dval, 'eval')], 
                          num_boost_round=1000,
                          early_stopping_rounds=100, 
                          verbose_eval=False
                          )
        
        # Dự đoán
        X_val_5folds = pd.concat([X_val_5folds, X_val], axis=0, ignore_index=True)
        y_val_5folds = pd.concat([y_val_5folds, y_val], axis=0, ignore_index=True)
        preds = model.predict(dval)
        y_pred_5folds += list(preds)
        
    # Tính stratified C-index theo race_group
    race_groups = X_val_5folds['race_group'].unique()
    c_index_scores_by_race = []
    for race in race_groups:
        race_mask = X_val_5folds['race_group'] == race
        c_index_race = concordance_index(
            y_val_5folds[target][race_mask], 
            -np.array(y_pred_5folds)[race_mask], 
            y_val_5folds[event][race_mask]
        )
        c_index_scores_by_race.append(c_index_race)
    
    # Calculate mean and standard deviation of C-index across race groups
    mean_c_index = np.mean(c_index_scores_by_race)
    std_c_index = np.std(c_index_scores_by_race)
    return mean_c_index - std_c_index

In [82]:
import warnings
warnings.filterwarnings("ignore")

In [83]:
# Tạo study để tối ưu hóa
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5, n_jobs=2)

# In kết quả tốt nhất
print('Best hyperparameters:', study.best_params)
print('Best C-index:', study.best_value)

[I 2025-02-22 16:46:22,357] A new study created in memory with name: no-name-a57d4d6d-8ac7-4008-8ccf-c88dc176c84b
[I 2025-02-22 16:46:27,368] Trial 0 finished with value: 0.7227226756931134 and parameters: {'lambda': 0.00016552404010262972, 'alpha': 0.05046214196629911, 'eta': 0.08978235214606893, 'subsample': 0.8882694440773812, 'colsample_bytree': 0.9632899891730016, 'max_depth': 5, 'min_child_weight': 9}. Best is trial 0 with value: 0.7227226756931134.
[I 2025-02-22 16:46:28,907] Trial 1 finished with value: 0.7168814354658578 and parameters: {'lambda': 3.844125306275081e-05, 'alpha': 0.001585809128411936, 'eta': 0.049617264770991515, 'subsample': 0.8713932636002142, 'colsample_bytree': 0.8226448369956537, 'max_depth': 10, 'min_child_weight': 6}. Best is trial 0 with value: 0.7227226756931134.
[I 2025-02-22 16:46:33,669] Trial 3 finished with value: 0.7206090385081027 and parameters: {'lambda': 0.8887428320534907, 'alpha': 0.0033661111118043797, 'eta': 0.08541464098735459, 'subsampl

Best hyperparameters: {'lambda': 0.00016552404010262972, 'alpha': 0.05046214196629911, 'eta': 0.08978235214606893, 'subsample': 0.8882694440773812, 'colsample_bytree': 0.9632899891730016, 'max_depth': 5, 'min_child_weight': 9}
Best C-index: 0.7227226756931134


In [86]:
concordance_index(y_train[target], -y_train[target], y_train[event])

0.0