In [1]:
!pip install /kaggle/input/pip-install-lifelines/autograd-1.7.0-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/autograd-gamma-0.5.0.tar.gz
!pip install /kaggle/input/pip-install-lifelines/interface_meta-1.3.0-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/formulaic-1.0.2-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/lifelines-0.30.0-py3-none-any.whl

Processing /kaggle/input/pip-install-lifelines/autograd-1.7.0-py3-none-any.whl
autograd is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.
Processing /kaggle/input/pip-install-lifelines/autograd-gamma-0.5.0.tar.gz
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: autograd-gamma
  Building wheel for autograd-gamma (setup.py) ... [?25l[?25hdone
  Created wheel for autograd-gamma: filename=autograd_gamma-0.5.0-py3-none-any.whl size=4031 sha256=e6ae1e66bccf54dcaf3730c49a03a695f0a1ecba53253cc91725d93b1d616a2d
  Stored in directory: /root/.cache/pip/wheels/6b/b5/e0/4c79e15c0b5f2c15ecf613c720bb20daab20a666eb67135155
Successfully built autograd-gamma
Installing collected packages: autograd-gamma
Successfully installed autograd-gamma-0.5.0
Processing /kaggle/input/pip-install-lifelines/interface_meta-1.3.0-py3-none-any.whl
Installing collected packages: interface-

In [2]:
"""
To evaluate the equitable prediction of transplant survival outcomes,
we use the concordance index (C-index) between a series of event
times and a predicted score across each race group.
 
It represents the global assessment of the model discrimination power:
this is the model’s ability to correctly provide a reliable ranking
of the survival times based on the individual risk scores.
 
The concordance index is a value between 0 and 1 where:
 
0.5 is the expected result from random predictions,
1.0 is perfect concordance (with no censoring, otherwise <1.0),
0.0 is perfect anti-concordance (with no censoring, otherwise >0.0)

"""

import pandas as pd
import pandas.api.types
import numpy as np
from lifelines.utils import concordance_index

class ParticipantVisibleError(Exception):
    pass


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    >>> import pandas as pd
    >>> row_id_column_name = "id"
    >>> y_pred = {'prediction': {0: 1.0, 1: 0.0, 2: 1.0}}
    >>> y_pred = pd.DataFrame(y_pred)
    >>> y_pred.insert(0, row_id_column_name, range(len(y_pred)))
    >>> y_true = { 'efs': {0: 1.0, 1: 0.0, 2: 0.0}, 'efs_time': {0: 25.1234,1: 250.1234,2: 2500.1234}, 'race_group': {0: 'race_group_1', 1: 'race_group_1', 2: 'race_group_1'}}
    >>> y_true = pd.DataFrame(y_true)
    >>> y_true.insert(0, row_id_column_name, range(len(y_true)))
    >>> score(y_true.copy(), y_pred.copy(), row_id_column_name)
    0.75
    """
    
    del solution[row_id_column_name]
    del submission[row_id_column_name]
    
    event_label = 'efs'
    interval_label = 'efs_time'
    prediction_label = 'prediction'
    for col in submission.columns:
        if not pandas.api.types.is_numeric_dtype(submission[col]):
            raise ParticipantVisibleError(f'Submission column {col} must be a number')
    # Merging solution and submission dfs on ID
    merged_df = pd.concat([solution, submission], axis=1)
    merged_df.reset_index(inplace=True)
    merged_df_race_dict = dict(merged_df.groupby(['race_group']).groups)
    metric_list = []
    for race in merged_df_race_dict.keys():
        # Retrieving values from y_test based on index
        indices = sorted(merged_df_race_dict[race])
        merged_df_race = merged_df.iloc[indices]
        # Calculate the concordance index
        c_index_race = concordance_index(
                        merged_df_race[interval_label],
                        -merged_df_race[prediction_label],
                        merged_df_race[event_label])
        metric_list.append(c_index_race)
    return float(np.mean(metric_list)-np.sqrt(np.var(metric_list)))

In [3]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [4]:
test = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/test.csv")
print("Test shape:", test.shape )


train = pd.read_csv("/kaggle/input/equity-post-HCT-survival-predictions/train.csv")
print("Train shape:",train.shape)

Test shape: (3, 58)
Train shape: (28800, 60)


In [5]:
from lifelines import KaplanMeierFitter, NelsonAalenFitter

def transform_kmf(df, time_col='efs_time', event_col='efs'):
    """
    Transform using survival probability estimates
    """
    kmf = KaplanMeierFitter()
    kmf.fit(df[time_col], df[event_col])
    y = kmf.survival_function_at_times(df[time_col]).values
    return y


In [6]:
train['y'] = transform_kmf(train, 'efs_time', 'efs')

# try adding -0.1
train.loc[train['efs'] == 0, 'y'] -= 0.1

In [7]:
to_rmv =['ID', 'efs', 'efs_time', 'y']
features = [col for col in train.columns if not col in to_rmv]
cat = [c for c in features if train[c].dtype == 'object']
num = [c for c in features if train[c].dtype != 'object']
target = 'y'
print(f'There are {len(features)} features')
print(f'There are {len(cat)} for catagorial and {len(num)} for numerical')

There are 57 features
There are 35 for catagorial and 22 for numerical


# Feature eng

In [8]:
for c in cat:
    train[c].fillna('Missing', inplace=True)
    test[c].fillna('Missing', inplace=True)

In [9]:
def convert_64_to_32(df, num_features):
    for c in num_features:
        if df[c].dtype == 'float64':
            df[c] = df[c].astype('float32')
        else:
            df[c] = df[c].astype('int32')
    return df

train = convert_64_to_32(train, num)
test = convert_64_to_32(test, num)

In [10]:
def clean_columns(df):

    value_mappings = {
        'cmv_status': {
            '+/+': 'Positive_Positive',
            '+/-': 'Positive_Negative',
            '-/+': 'Negative_Positive',
            '-/-': 'Negative_Negative'
        },
        'tbi_status': {
            'No TBI': 'No_Total_Body_Irradiation',
            'TBI + Cy +- Other': 'TBI_with_Cyclophosphamide_and_Other',
            'TBI +- Other, <=cGy': 'TBI_with_Other_Low_Dose',
            'TBI +- Other, >cGy': 'TBI_with_Other_High_Dose',
            'TBI +- Other, -cGy, single': 'TBI_with_Other_Single_Dose',
            'TBI +- Other, unknown dose': 'TBI_with_Other_Unknown_Dose',
            'TBI +- Other, -cGy, unknown dose': 'TBI_with_Other_Unknown_Dose',
            'TBI +- Other, -cGy, fractionated': 'TBI_with_Other_Fractionated_Dose'
        },
        'dri_score': {
            'Intermediate': 'Intermediate_Risk',
            'N/A - pediatric': 'Not_Applicable_Pediatric',
            'High': 'High_Risk',
            'N/A - non-malignant indication': 'Not_Applicable_Non_Malignant',
            'TBD cytogenetics': 'To_Be_Determined_Cytogenetics',
            'Low': 'Low_Risk',
            'High - TED AML case <missing cytogenetics': 'High_Risk_TED_AML_Missing_Cytogenetics',
            'Intermediate - TED AML case <missing cytogenetics': 'Intermediate_Risk_TED_AML_Missing_Cytogenetics',
            'N/A - disease not classifiable': 'Not_Applicable_Disease_Not_Classifiable',
            'Very high': 'Very_High_Risk',
            'Missing disease status': 'Missing'
        },
        'tce_imm_match': {
            'P/P': 'Perfect_Perfect',
            'G/G': 'Good_Good',
            'H/H': 'Heterozygous_Heterozygous',
            'G/B': 'Good_Bad',
            'H/B': 'Heterozygous_Bad',
            'P/H': 'Perfect_Heterozygous',
            'P/B': 'Perfect_Bad',
            'P/G': 'Perfect_Good'
        },
        'gvhd_proph': {
            'FK+ MMF +- others': 'Tacrolimus_MMF_with_Others',
            'Cyclophosphamide alone': 'Cyclophosphamide_Alone',
            'FK+ MTX +- others(not MMF)': 'Tacrolimus_MTX_with_Others_Not_MMF',
            'Cyclophosphamide +- others': 'Cyclophosphamide_with_Others',
            'CSA + MMF +- others(not FK)': 'Cyclosporine_MMF_with_Others_Not_Tacrolimus',
            'FKalone': 'Tacrolimus_Alone',
            'Other GVHD Prophylaxis': 'Other_GVHD_Prophylaxis',
            'TDEPLETION alone': 'TCell_Depletion_Alone',
            'TDEPLETION +- other': 'TCell_Depletion_with_Others',
            'No GvHD Prophylaxis': 'No_GVHD_Prophylaxis',
            'CDselect alone': 'CD_Selection_Alone',
            'CSA + MTX +- others(not MMF,FK)': 'Cyclosporine_MTX_with_Others_Not_MMF_Tacrolimus',
            'CSA alone': 'Cyclosporine_Alone',
            'Parent Q = yes, but no agent': 'Parent_Yes_No_Agent',
            'CDselect +- other': 'CD_Selection_with_Others',
            'CSA +- others(not FK,MMF,MTX)': 'Cyclosporine_with_Others_Not_Tacrolimus_MMF_MTX',
            'FK+- others(not MMF,MTX)': 'Tacrolimus_with_Others_Not_MMF_MTX'
        }
    }

    for col, mappings in value_mappings.items():
        if col in df.columns:
            df[col] = df[col].astype(str).map(mappings).fillna(df[col])

    return df

def clean_space(df):
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].apply(lambda x: x.replace(' ', '_') if isinstance(x, str) else x)
    return df

def clean_not_done(df):
    df = df.map(lambda x: 'Missing' if x == 'Not done' else x)
    return df

def clean_not_tested(df):
    df = df.map(lambda x: 'Missing' if x == 'Not tested' else x)
    return df

def clean_dri_score(score):
    if isinstance(score, str) and 'Missing disease status' in score:
        return 'Missing'
    return score

def clean_conditioning_intensity(score):
    if isinstance(score, str) and 'No drugs reported' in score:
        return 'Missing'
    return score

In [11]:
train = clean_columns(train)
test = clean_columns(test)

# train = clean_not_done(train)
# test = clean_not_done(test)

# train = clean_not_tested(train)
# test = clean_not_tested(test)

train['dri_score'] = train['dri_score'].apply(clean_dri_score)
test['dri_score'] = test['dri_score'].apply(clean_dri_score)

train['conditioning_intensity'] = train['conditioning_intensity'].apply(clean_conditioning_intensity)
test['conditioning_intensity'] = test['conditioning_intensity'].apply(clean_conditioning_intensity)

train = clean_space(train)
test = clean_space(test)

# LightGBM

In [12]:
from sklearn.preprocessing import LabelEncoder


def encode_features_lgb(train, test, cat, num):
    train = train.copy()
    test = test.copy()
    
    for c in cat:
        encoder = LabelEncoder()

        # Fit and transform the training data
        train[c] = train[c].astype(str)
        train[c] = encoder.fit_transform(train[c])
        train[c] = train[c].astype('int32').astype('category')

        # Transform the test data using the encoder fitted on the training data
        test[c] = test[c].astype(str)
        test[c] = encoder.transform(test[c])
        test[c] = test[c].astype('int32').astype('category')

    return train, test

train_lgb, test_lgb = encode_features_lgb(train, test, cat, num)

In [13]:
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn.metrics import *
import lightgbm as lgb
from lightgbm import LGBMRegressor

def train_lgbm(train, test, model_params, features, cat_features, target):

    fix_params = {
        'device': "gpu",
        'n_estimators': 10000,
        'objective': 'regression',
        'early_stopping_rounds':20,
        'verbose': -1
    }
    model_params.update(fix_params)
    
    FOLDS = 10
    kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
    
    oof_lgb = np.zeros(len(train))
    pred_lgb = np.zeros(len(test))

    print('Training model with the followin parameters')
    for k, v in model_params.items():
        print(f'{k} : {v}')

    for i, (t_idx, v_idx) in enumerate(kf.split(train)):

        X_train = train.iloc[t_idx][features].copy()
        y_train = train.iloc[t_idx][target]
        X_valid = train.iloc[v_idx][features].copy()
        y_valid = train.iloc[v_idx][target]
        X_test = test[features].copy()

        model_lgb = LGBMRegressor(**model_params)
        model_lgb.fit(
            X_train, np.log1p(y_train),
            eval_set=[(X_valid, np.log1p(y_valid))],
        )  
        
        y_valid_preds = np.expm1(model_lgb.predict(X_valid))
        oof_lgb[v_idx] = y_valid_preds
        pred_lgb += np.expm1(model_lgb.predict(X_test))

        
        fold_rmse = mean_squared_error(y_valid, y_valid_preds, squared=False)
        print("#"*25)
        print(f"### Fold {i+1} \n")
        print(f"Fold {i+1} RMSE: {fold_rmse}")
        print("#"*25)

    pred_lgb /= FOLDS

    return model_lgb, oof_lgb, pred_lgb


# XGBoost

In [14]:
def encode_features_xgb(train, test, label_encode_features, one_hot_encode_features, features):

    # Label Encoding
    label_encoders = {}
    for feature in label_encode_features:
        encoder = LabelEncoder()
        train[feature] = train[feature].astype(str)
        train[feature] = encoder.fit_transform(train[feature])
        test[feature] = test[feature].astype(str)
        test[feature] = encoder.transform(test[feature])
        label_encoders[feature] = encoder

    # One-Hot Encoding
    train_one_hot = pd.get_dummies(train[one_hot_encode_features], prefix=one_hot_encode_features)
    test_one_hot = pd.get_dummies(test[one_hot_encode_features], prefix=one_hot_encode_features)

    train_one_hot, test_one_hot = train_one_hot.align(test_one_hot, join="outer", axis=1, fill_value=0)

    train_xgb = pd.concat([train, train_one_hot], axis=1)
    test_xgb = pd.concat([test, test_one_hot], axis=1)

    features_xgb = features.copy()
    features_xgb.extend(train_one_hot.columns)
    features_xgb = [f for f in features_xgb if f not in one_hot_encode_features]

    return train_xgb, test_xgb, features_xgb

In [15]:
# Define features for label and one-hot encoding
label_encode_features = [
    "dri_score", "psych_disturb", "cyto_score", "diabetes", 
    "arrhythmia", "vent_hist", "renal_issue", "pulm_severe", 
    "cmv_status", "tce_imm_match", "rituximab", "cyto_score_detail", 
    "conditioning_intensity", "ethnicity", "obesity", "mrd_hct", 
    "in_vivo_tcd", "tce_match", "hepatic_severe", "prior_tumor", 
    "peptic_ulcer", "gvhd_proph", "rheum_issue", "sex_match", 
    "hepatic_mild", "tce_div_match", "donor_related", "melphalan_dose", 
    "cardiac", "pulm_moderate"
]

one_hot_encode_features = [
    "tbi_status", "graft_type", "prod_type", "prim_disease_hct", "race_group"
]

# Apply the encoding
train_xgb, test_xgb, features_xgb = encode_features_xgb(train, test, label_encode_features, one_hot_encode_features, features)


In [16]:
def object_to_cat(df):
    for col in df.select_dtypes(include='object').columns:
        df[col] = df[col].astype('category')
    return df
train_xgb2 = object_to_cat(train)
test_xgb2 = object_to_cat(test)

In [17]:
from xgboost import XGBRegressor

def train_XGB(train, test, model_params, features, target):
    fix_params = {
        'tree_method': 'hist',
        'device': 'cuda', 
        'objective': 'reg:squarederror',
        'n_estimators': 10000,
        'early_stopping_rounds': 20,
        'eval_metric': 'rmse',
        'enable_categorical': True
    }
    model_params.update(fix_params)

    FOLDS = 10
    kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

    oof_xgb = np.zeros(len(train))
    pred_xgb = np.zeros(len(test))

    print('Training XGBoost model with the following parameters:')
    for k, v in model_params.items():
        print(f'{k} : {v}')

    for i, (t_idx, v_idx) in enumerate(kf.split(train)):
        X_train = train.iloc[t_idx][features].copy()
        y_train = train.iloc[t_idx][target]
        X_valid = train.iloc[v_idx][features].copy()
        y_valid = train.iloc[v_idx][target]
        X_test = test[features].copy()

        model_xgb = XGBRegressor(**model_params)
        model_xgb.fit(
            X_train, np.log1p(y_train),
            eval_set=[(X_valid, np.log1p(y_valid))],
            verbose=False
        )

        y_valid_preds = np.expm1(model_xgb.predict(X_valid))
        oof_xgb[v_idx] = y_valid_preds
        pred_xgb += np.expm1(model_xgb.predict(X_test))

        fold_rmse = mean_squared_error(y_valid, y_valid_preds, squared=False)
        print("#" * 25)
        print(f"### Fold {i+1} \n")
        print(f"Fold {i+1} RMSE: {fold_rmse}")
        print("#" * 25)

    pred_xgb /= FOLDS

    return model_xgb, oof_xgb, pred_xgb

# CatBoost

In [18]:
train_cat = train.copy()
test_cat = test.copy()

In [19]:
from catboost import CatBoostRegressor

def train_CAT(train, test, model_params, features, cat_features, target):
    fix_params = {
        'task_type': "GPU",
        'iterations': 10000,
        'loss_function': 'RMSE',
        'early_stopping_rounds': 20,
    }
    model_params.update(fix_params)

    FOLDS = 10
    kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)

    oof_cat = np.zeros(len(train))
    pred_cat = np.zeros(len(test))

    print('Training model with the following parameters')
    for k, v in model_params.items():
        print(f'{k} : {v}')

    for i, (t_idx, v_idx) in enumerate(kf.split(train)):
        X_train = train.iloc[t_idx][features].copy()
        y_train = train.iloc[t_idx][target]
        X_valid = train.iloc[v_idx][features].copy()
        y_valid = train.iloc[v_idx][target]
        X_test = test[features].copy()


        model_cat = CatBoostRegressor(**model_params)
        model_cat.fit(
            X_train, np.log1p(y_train),
            cat_features=cat_features,
            eval_set=[(X_valid, np.log1p(y_valid))],
            verbose=False
        )

        y_valid_preds = np.expm1(model_cat.predict(X_valid))
        oof_cat[v_idx] = y_valid_preds
        pred_cat += np.expm1(model_cat.predict(X_test))

        fold_rmse = mean_squared_error(y_valid, y_valid_preds, squared=False)
        print("#" * 25)
        print(f"### Fold {i + 1} \n")
        print(f"Fold {i + 1} RMSE: {fold_rmse}")
        print("#" * 25)

    pred_cat /= FOLDS

    return model_cat, oof_cat, pred_cat


# Ensemble

In [20]:
model_lgb_params = {
    'max_depth': 5,
    'learning_rate': 0.01,
    'colsample_bytree': 0.40481123886709114,
    'subsample': 0.7673666426617842,
    'num_leaves': 46,
    'min_child_samples': 34,
    'lambda_l1': 0.0032893870728708495,
    'lambda_l2': 1.5780171816002318e-06,
    'bagging_freq': 5,
    'cat_features': cat,
    'n_estimators': 5000,
    'objective': 'regression',
    'early_stopping_rounds': 20
}

model_xgb_params = {
    'max_depth': 4, 
    'learning_rate': 0.09180807102095336,
    'colsample_bytree': 0.3809438487844099, 
    'subsample': 0.844622438351228, 
    'n_estimators': 419, 
    'min_child_weight': 3.714743419003562, 
    'reg_alpha': 5.80197653137552e-06, 
    'reg_lambda': 1.2374095115455325e-08, 
    'gamma': 0.0037460722016019465
}

model_xgb2_params = {
    "max_depth": 9,
    "learning_rate": 0.018203874021653552,
    "colsample_bytree": 0.41392312362600636,
    "subsample": 0.870771567534879,
    "n_estimators": 10000,
    "min_child_weight": 6.587958958652532,
    "reg_alpha": 1.675358492618636e-07,
    "reg_lambda": 0.004228750471811781,
    "gamma": 0.02009243264106564,
    "tree_method": "gpu_hist",
    "objective": "reg:squarederror",
    "early_stopping_rounds": 20,
    "eval_metric": "rmse",
    "enable_categorical": True
}


model_cat_params = {
    'depth': 5, 
    'learning_rate': 0.05259516359861675, 
    'bagging_temperature':0.5,
    'l2_leaf_reg': 6.806823646372654e-06, 
    'random_strength': 5.889614035287661
}

model_lgb, oof_lgb, pred_lgb = train_lgbm(train_lgb, test_lgb, model_lgb_params, features, cat, target)
model_xgb, oof_xgb, pred_xgb = train_XGB(train_xgb, test_xgb, model_xgb_params, features_xgb, target)
model_xgb2, oof_xgb2, pred_xgb2 = train_XGB(train_xgb2, test_xgb2, model_xgb2_params, features, target)
# model_cat, oof_cat, pred_cat = train_CAT(train_cat, test_cat, model_cat_params, features, cat, target)

oof_ensemble = (oof_lgb + oof_xgb + oof_xgb2) / 3
pred_ensemble = (pred_lgb + pred_xgb + pred_xgb2) / 3

y_true = train[["ID","efs","efs_time","race_group"]].copy()
y_pred = train[["ID"]].copy()
y_pred["prediction"] = oof_ensemble
m = score(y_true, y_pred, "ID")
print(f"\nOverall CV for KaplanMeier =", m)
# Overall CV for KaplanMeier = 0.6744979717032406 (with 2 xgb)
# Overall CV for KaplanMeier = 0.6742222404834405 (with 1 xgb)
# Overall CV for KaplanMeier = 0.6745101916302204 (with 2 xgb, avg preds)
# Overall CV for KaplanMeier = 0.6771615394911454 (when -= 0.1 for kmf)
# Overall CV for KaplanMeier = 0.6770523750451444 (removing nor done)

Training model with the followin parameters
max_depth : 5
learning_rate : 0.01
colsample_bytree : 0.40481123886709114
subsample : 0.7673666426617842
num_leaves : 46
min_child_samples : 34
lambda_l1 : 0.0032893870728708495
lambda_l2 : 1.5780171816002318e-06
bagging_freq : 5
cat_features : ['dri_score', 'psych_disturb', 'cyto_score', 'diabetes', 'tbi_status', 'arrhythmia', 'graft_type', 'vent_hist', 'renal_issue', 'pulm_severe', 'prim_disease_hct', 'cmv_status', 'tce_imm_match', 'rituximab', 'prod_type', 'cyto_score_detail', 'conditioning_intensity', 'ethnicity', 'obesity', 'mrd_hct', 'in_vivo_tcd', 'tce_match', 'hepatic_severe', 'prior_tumor', 'peptic_ulcer', 'gvhd_proph', 'rheum_issue', 'sex_match', 'race_group', 'hepatic_mild', 'tce_div_match', 'donor_related', 'melphalan_dose', 'cardiac', 'pulm_moderate']
n_estimators : 10000
objective : regression
early_stopping_rounds : 20
device : gpu
verbose : -1
#########################
### Fold 1 

Fold 1 RMSE: 0.19412947247339082
############

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




#########################
### Fold 1 

Fold 1 RMSE: 0.19485636660309585
#########################
#########################
### Fold 2 

Fold 2 RMSE: 0.19061653480287635
#########################
#########################
### Fold 3 

Fold 3 RMSE: 0.192958637381658
#########################
#########################
### Fold 4 

Fold 4 RMSE: 0.19419730042728078
#########################
#########################
### Fold 5 

Fold 5 RMSE: 0.19156243688891625
#########################
#########################
### Fold 6 

Fold 6 RMSE: 0.19468112145322947
#########################
#########################
### Fold 7 

Fold 7 RMSE: 0.19709612390544606
#########################
#########################
### Fold 8 

Fold 8 RMSE: 0.19438781176211123
#########################
#########################
### Fold 9 

Fold 9 RMSE: 0.19659049605845522
#########################
#########################
### Fold 10 

Fold 10 RMSE: 0.19365635180806168
#########################
Training XGBoost mod

  merged_df_race_dict = dict(merged_df.groupby(['race_group']).groups)



Overall CV for KaplanMeier = 0.6770501592495527


# Submission

In [21]:
sample_submission = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/sample_submission.csv')
submission = sample_submission.copy()
submission['prediction'] = pred_ensemble
submission.to_csv('/kaggle/working/submission.csv', index=False)
submission

Unnamed: 0,ID,prediction
0,28800,0.420952
1,28801,0.589429
2,28802,0.366986
