In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import warnings

# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Silence every Python warning for the remainder of the notebook.
warnings.filterwarnings("ignore")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install /kaggle/input/pip-install-lifelines/autograd-1.7.0-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/autograd-gamma-0.5.0.tar.gz
!pip install /kaggle/input/pip-install-lifelines/interface_meta-1.3.0-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/formulaic-1.0.2-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/lifelines-0.30.0-py3-none-any.whl


In [None]:
from lifelines import KaplanMeierFitter, CoxPHFitter, NelsonAalenFitter
from metric import score
from scipy.stats import rankdata 
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from xgboost import XGBRegressor, XGBClassifier
import xgboost as xgb
from lightgbm import LGBMRegressor,LGBMClassifier
import lightgbm as lgb
from catboost import CatBoostRegressor, CatBoostClassifier
import catboost as cb
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from pathlib import Path
import optuna

In [None]:
def stratified_c_index(df, predictions):
    """
    Evaluate predictions with the `score` function imported from `metric`.

    Parameters:
    - df: frame holding the "ID", "efs", "efs_time" and "race_group" columns
    - predictions: one prediction per row of `df`, in the same row order

    Returns:
    - whatever `score` returns for the given solution / submission pair
    """
    solution = df[["ID", "efs", "efs_time","race_group"]].copy()

    submission = df[["ID"]].copy()
    submission["prediction"] = predictions

    # `score` receives fresh copies of both frames, keyed by the "ID" column.
    return score(solution.copy(), submission.copy(),"ID")

In [None]:
class settings:
    """Notebook-wide configuration: input paths, CV setup and column roles."""

    # Cross-validation / reproducibility
    seed = 42
    n_splits = 10

    # Name of the derived regression target, and the columns that are never
    # used as model features.
    target = 'target'
    col_ignore = ['efs_time', 'efs', target, 'ID']

    # Competition input files
    train_link = "/kaggle/input/equity-post-HCT-survival-predictions/train.csv"
    test_link =  "/kaggle/input/equity-post-HCT-survival-predictions/test.csv"
    sub_link = "/kaggle/input/equity-post-HCT-survival-predictions/sample_submission.csv"
    

In [None]:
# Read the competition train and test tables from the paths held in `settings`.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (settings.train_link, settings.test_link)
)

In [None]:
# Preview the first rows only instead of echoing the whole training frame.
train.head()

In [None]:
class SurvModel:
    """
    K-fold training wrapper around a regressor fitted on a survival-derived target.

    Typical use: `preprocessing` -> `fit_predict` -> `save_predictions`.
    """

    def __init__(self, model, n_splits = settings.n_splits, random_state = settings.seed, estimator = 'na'):
        """
        Initialise the SurvModel class

        Parameters:
        - model: A model instance exposing `fit` and `predict`
        - n_splits: Number of folds for k-fold cross validation
        - random_state: Seed for the fold shuffling, for reproducibility
        - estimator: Which target to build in `preprocessing`:
                     'km' (log Kaplan-Meier survival), 'na' (negative Nelson-Aalen
                     cumulative hazard), anything else (efs_time, negated where efs == 0)
        """

        self.model = model
        self.n_splits = n_splits
        self.random_state = random_state
        self.estimator = estimator
        self.oof = None      # out-of-fold predictions, filled by fit_predict
        self.preds = None    # fold-averaged test predictions, filled by fit_predict
        self.scores = []     # per-fold metric values from the latest fit_predict
        self.cats = []       # names of the object-dtype (categorical) feature columns

    def preprocessing(self, train, test):
        """
        Preprocess the train and test data.

        Builds the target column on a copy of `train` (the caller's frame is left
        untouched), then encodes the features on the concatenation of train and
        test so both share the same category codes.

        Parameters
        - train: training data, must contain 'efs' and 'efs_time'
        - test: testing data

        Returns:
        - train: copy of the training rows with the target column and encoded features
        - test: encoded test rows without the target, 'efs' and 'efs_time' columns
        """
        # Copy first: adding the target in place would leak state into the
        # caller's DataFrame across repeated calls.
        train = train.copy()
        # Start clean so calling preprocessing twice does not duplicate entries.
        self.cats = []

        if self.estimator == 'km':
            kmf = KaplanMeierFitter()
            kmf.fit(durations = train['efs_time'], event_observed = train['efs'])
            train[settings.target] = kmf.survival_function_at_times(train['efs_time']).values
            # NOTE: np.log yields -inf wherever the survival estimate is exactly 0.
            train[settings.target] = np.log(train[settings.target])
            print("#"*25,"KM Estmator Fitted", "#"*25)

        elif self.estimator == 'na':
            naf = NelsonAalenFitter()
            naf.fit(durations = train['efs_time'], event_observed = train['efs'])
            train[settings.target] = -naf.cumulative_hazard_at_times(train['efs_time']).values
            print("#"*25,"Nelson-Aalen Estmator Fitted", "#"*25)

        else:
            # Signed time target: efs_time, negated for rows with efs == 0.
            train[settings.target] = train.efs_time.copy()
            train.loc[train.efs==0,settings.target]*=-1
            print("#"*25,"CPH Fitted", "#"*25)

        features = [col for col in train.columns if col not in settings.col_ignore]
        n_train = len(train)

        # Combined test and train to label encode
        combined = pd.concat([train,test], axis = 0, ignore_index = True)

        for col in features:
            if combined[col].dtype == "object":
                self.cats.append(col)
                combined[col] = combined[col].fillna("NAN")
                combined[col],_ = combined[col].factorize()
                combined[col] -= combined[col].min()
                combined[col] = combined[col].astype("int32")
                combined[col] = combined[col].astype("category")

            else:
                # Downcast 64-bit numerics to 32-bit.
                if combined[col].dtype=="float64":
                    combined[col] = combined[col].astype("float32")
                if combined[col].dtype=="int64":
                    combined[col] = combined[col].astype("int32")

        train = combined.iloc[:n_train].copy()
        test = combined.iloc[n_train:].reset_index(drop=True).copy().drop([settings.target,'efs','efs_time'], axis = 1)
        return train, test

    def fit_predict(self, train, test, catboost = False):
        """
        Fit the model using k-fold cross validation and collect oof predictions.

        Note: the train and test data include ID, efs and efs_time as we need it for the metric.
              We need to remove them for training but add them back in for metric collection.

        Parameters:
        - train: the training data
        - test: the test data from kaggle
        - catboost: if True, call `fit` with the categorical feature names,
                    verbose = 0 and the validation fold as eval_set

        Returns:
        - OOF predictions
        - Predictions for the test data (mean over the folds)
        - Mean of the per-fold scores
        """
        # Collect features
        features = [col for col in train.columns if col not in settings.col_ignore]
        print("#"*25,"Features","#"*25)
        print(f"There are {len(features)} features.")
        print(features)

        # Initialise k-fold cross validation
        print("#"*25,"Initialise Cross-Validation","#"*25)

        kf = KFold(n_splits = self.n_splits, shuffle=True, random_state=self.random_state)

        # Predictions for OOF and test; scores are reset so a second call
        # does not average in the folds of a previous run.
        self.oof = np.zeros(len(train))
        self.preds = np.zeros(len(test))
        self.scores = []

        # Loop-invariant views of the data.
        score_cols = ["ID", "efs", "efs_time","race_group"]
        x_all = train[features]
        y_all = train[settings.target]
        x_test = test[features]

        # Cross-validation loop. KFold yields positional indices, so rows are
        # selected with .iloc, which is correct for any index on `train`.
        for fold, (train_idx, val_idx) in enumerate(kf.split(train)):

            print("#"*25,f"Fold {fold+1}","#"*25)
            # Split data
            x_train, x_val = x_all.iloc[train_idx].copy(), x_all.iloc[val_idx].copy()
            y_train, y_val = y_all.iloc[train_idx].copy(), y_all.iloc[val_idx].copy()

            x_score = train.iloc[val_idx][score_cols].copy()

            # Train on training set
            if catboost:
                self.model.fit(x_train, y_train, cat_features = self.cats, verbose = 0, eval_set=[(x_val,y_val)])
            else:
                self.model.fit(x_train, y_train)

            # Predict on validation set
            val_preds = self.model.predict(x_val)
            self.oof[val_idx] = val_preds

            # Calculate score
            m = stratified_c_index(x_score, val_preds)
            self.scores.append(m)
            print(f"Fold {fold+1}: Stratified C-index: {m}")

            # Each fold contributes an equal share of the test prediction.
            self.preds += self.model.predict(x_test)/self.n_splits

        train_score = train[score_cols].copy()
        print("#"*25,"Cross-Validation Completed","#"*25)
        print(f"Average Stratified C-Index: {np.mean(self.scores)}")
        print(f"Overall Stratified C-Index: {stratified_c_index(train_score, self.oof)}")
        return self.oof, self.preds, np.mean(self.scores)

    def save_predictions(self, filename,train, test, output_dir = "/kaggle/working"):
        """
        Save predictions in a csv.

        Writes `<output_dir>/train/<filename>_train.csv` (train plus OOF predictions)
        and `<output_dir>/test/<filename>_test.csv` (test plus test predictions).

        Parameters:
        - filename: model_name
        - train: train data for ID
        - test: test data for ID
        - output_dir: base directory that receives the `train` and `test` folders

        Raises:
        - RuntimeError: if `fit_predict` has not been run yet
        """
        if self.oof is None or self.preds is None:
            raise RuntimeError("fit_predict must be called before save_predictions")

        directory_train = Path(output_dir) / "train"
        directory_test = Path(output_dir) / "test"

        directory_train.mkdir(parents = True, exist_ok = True)
        print(f"Directory '{directory_train}' created successfully")
        directory_test.mkdir(parents = True, exist_ok = True)
        print(f"Directory '{directory_test}' created successfully")

        # oof train preds
        train_csv = train.copy()
        train_csv["prediction"] = self.oof

        # Write into the directories created above rather than a path relative
        # to the current working directory.
        train_csv.to_csv(directory_train / (filename+"_train.csv"), index = False)

        # test
        test_csv = test.copy()
        test_csv["prediction"] = self.preds

        test_csv.to_csv(directory_test / (filename+"_test.csv"), index = False)

        print(f"Out of fold predictions and test predictions have been saved")
        

In [None]:
class Ensembler:
    """Blend saved model predictions by summing their per-column ranks."""

    def __init__(self, train_folder, test_folder):
        """
        Initialise class with the train and test folders.

        Parameters:
        - train_folder: Path to the train folder containing OOF predictions
        - test_folder: Path to the folder containing the test predictions
        """

        self.train_folder = Path(train_folder)
        self.test_folder = Path(test_folder)
        self.scores = []  # per-fold metric values from the latest `ranking` call

    def load_files(self, folder_path):
        """
        Load the 'prediction' column of every csv file in a folder.

        Files are visited in sorted order; the n-th file that has a 'prediction'
        column becomes column `prediction_n`. Files that cannot be read or that
        lack the column are reported and skipped.

        Parameters:
        - folder_path: folder to scan for `*.csv` files

        Returns:
        - DataFrame with one `prediction_n` column per loaded file plus an 'ID'
          column taken from the last loaded file that has one; an empty DataFrame
          when nothing could be loaded.
        """
        predictions = {}
        ids = None

        # loop through the csv files
        for file in sorted(Path(folder_path).glob("*.csv")):
            try:
                df = pd.read_csv(file)
            except Exception as e:
                # Best effort: report the unreadable file and keep going.
                print(f"Error loading {file.name}: {e}")
                continue

            if "prediction" not in df.columns:
                print(f"'prediction' column not found in {file.name}")
                continue

            predictions[f"prediction_{len(predictions) + 1}"] = df["prediction"].reset_index(drop=True)
            # Remember the IDs of a file that was actually used, instead of
            # relying on whichever frame happened to be read last.
            if "ID" in df.columns:
                ids = df["ID"].reset_index(drop=True)
            print(f"Loaded 'prediction' column from {file.name}")

        if not predictions:
            print("No predictions found.")
            return pd.DataFrame()

        predictions = pd.DataFrame(predictions)
        if ids is not None:
            predictions["ID"] = ids
        return predictions

    def concatenate_train_test(self, train):
        """
        Concatenate train and test folder data into another train and test set.

        Parameters:
        - train: we need this to get the target variable in the training set

        Returns
        - train: Another training set based on the oof predictions.
        - test: Another test set based on preds.
        """
        print(f"Loading predictions from the train folder {self.train_folder}")
        train_predictions = self.load_files(self.train_folder)
        train_predictions[settings.target] = train[settings.target].reset_index(drop=True)

        print(f"Loading predictions from the test folder {self.test_folder}")
        test_predictions = self.load_files(self.test_folder)

        return train_predictions, test_predictions

    @staticmethod
    def _rank_sum(frame, features):
        """Return the element-wise sum of `rankdata` over the given columns of `frame`."""
        total = np.zeros(len(frame))
        for col in features:
            total += rankdata(frame[col])
        return total

    def ranking(self, train, test, original_train):
        """
        Score a rank-sum blend of the prediction columns, per fold and overall.

        Nothing is fitted: within each validation fold the prediction columns are
        ranked and the ranks summed, and that sum is scored.

        Parameters:
        - train: frame of OOF prediction columns (from `concatenate_train_test`)
        - test: not used by the scoring; kept so existing calls keep working
        - original_train: frame with "ID", "efs", "efs_time" and "race_group",
                          row-aligned with `train`
        """
        # Collect features
        features = [col for col in train.columns if col not in settings.col_ignore]
        print("#"*25,"Features","#"*25)
        print(f"There are {len(features)} features.")
        print(features)

        # Initialise k-fold cross validation
        print("#"*25,"Initialise Cross-Validation","#"*25)

        kf = KFold(n_splits = settings.n_splits, shuffle=True, random_state=settings.seed)

        # Reset so repeated calls do not average in earlier runs.
        self.scores = []
        score_cols = ["ID", "efs", "efs_time","race_group"]

        # Cross-validation loop. KFold yields positional indices, hence .iloc.
        for fold, (_, val_idx) in enumerate(kf.split(train)):

            print("#"*25,f"Fold {fold+1}","#"*25)

            x_score = original_train.iloc[val_idx][score_cols].copy()

            # Rank-sum blend computed within the validation fold
            val_preds = self._rank_sum(train.iloc[val_idx], features)

            # Calculate score
            m = stratified_c_index(x_score, val_preds)
            self.scores.append(m)
            print(f"Fold {fold+1}: Stratified C-index: {m}")

        overall_pred = self._rank_sum(train, features)
        overall_score = original_train[score_cols].copy()

        print("#"*25,"Cross-Validation Completed","#"*25)
        print(f"Average Stratified C-Index: {np.mean(self.scores)}")
        print(f"Overall Stratified C-Index: {stratified_c_index(overall_score, overall_pred)}")

        

        
        
        

# Model Training

In [None]:
# Target variants passed to SurvModel(estimator=...), in training order.
estimators = [
    'km',
    'na',
    'cox',
]

In [None]:
# LightGBM hyper-parameters shared by every run in this cell.
params = {
    'metric': 'rmse',
    'random_state': 42,
    'reg_alpha': 1.3502289561482839,
    'reg_lambda': 7.699709830405624,
    'colsample_bytree': 0.5,
    'subsample': 0.7,
    "learning_rate": 0.09286925251836978,
    'max_depth': 10,
    'num_leaves' : 232,
    'min_child_samples': 125,
    'cat_smooth' : 32,
}

for estimator in estimators:
    # There is no LightGBM run for the 'cox' target.
    if estimator == 'cox':
        continue

    model = LGBMRegressor(**params, device="gpu", objective="regression", verbose=-1)

    # Build the target, cross-validate, then persist OOF and test predictions.
    pipeline = SurvModel(model, estimator = estimator)
    df_train, df_test = pipeline.preprocessing(train, test)
    oof_lgb, preds_lgb,_ = pipeline.fit_predict(df_train,df_test)
    pipeline.save_predictions("LightGBM_"+estimator, df_train, df_test)

In [None]:
# Hyper-parameters shared by every XGBoost run in this cell.
xgb_common = dict(
    device="cuda",
    max_depth=3,
    colsample_bytree=0.5,
    subsample=0.8,
    n_estimators=2000,
    learning_rate=0.02,
    enable_categorical=True,
)

for estimator in estimators:
    if estimator == 'cox':
        # The 'cox' target is trained with XGBoost's Cox objective.
        model = XGBRegressor(
            **xgb_common,
            objective="survival:cox",
            eval_metric='cox-nloglik',
        )
    else:
        # `verbosity` is the constructor's logging control; `verbose` is not
        # a constructor argument of XGBRegressor.
        model = XGBRegressor(**xgb_common, verbosity=0)

    # Build the target, cross-validate, then persist OOF and test predictions.
    pipeline = SurvModel(model, estimator = estimator)
    df_train, df_test = pipeline.preprocessing(train, test)
    oof_xgb, preds_xgb,_ = pipeline.fit_predict(df_train,df_test)
    pipeline.save_predictions("XGBoost_"+estimator, df_train, df_test)

In [None]:
for estimator in estimators:
    # Options common to both configurations.
    cat_kwargs = {'grow_policy': 'Lossguide', 'random_state': settings.seed}

    # 'cox' uses CatBoost's Cox loss; the other targets train on the GPU.
    if estimator == 'cox':
        cat_kwargs['loss_function'] = 'Cox'
    else:
        cat_kwargs['task_type'] = "GPU"
    model = CatBoostRegressor(**cat_kwargs)

    # Build the target, cross-validate (CatBoost fit path), then persist predictions.
    pipeline = SurvModel(model, estimator = estimator)
    df_train, df_test = pipeline.preprocessing(train, test)
    oof_cat, preds_cat,_ = pipeline.fit_predict(df_train, df_test, catboost = True)
    pipeline.save_predictions("Catboost_"+estimator, df_train, df_test)

# Ensembling

In [None]:
# Gather every saved OOF / test prediction file and score their rank-sum blend.
# `df_train` is the frame left over from the most recent training cell.
esm = Ensembler(train_folder = "/kaggle/working/train", test_folder = "/kaggle/working/test")
final_train, final_test = esm.concatenate_train_test(df_train)
esm.ranking(final_train, final_test, original_train = df_train)

# Submission

In [None]:
# Blend the test predictions: rank each non-ignored column and add the ranks.
blend_cols = [col for col in final_test.columns if col not in settings.col_ignore]
final_pred = sum(
    (rankdata(final_test[col]) for col in blend_cols),
    np.zeros(len(final_test)),
)


final_pred

In [None]:
# Put the blended scores into the sample-submission template and write it out.
submission = pd.read_csv(settings.sub_link).assign(prediction = final_pred)

submission.to_csv("submission.csv", index = False)
submission