In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.model_selection import KFold
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import set_config

from lifelines import CoxPHFitter
# import the score function
%run -i ../examples/concordance_index.ipynb

In [None]:
# import data
from sklearn.model_selection import train_test_split
# Training data: used below for the cross-validated hyper-parameter search.
df_train = pd.read_csv("../data/train_set.csv")
# Second CSV kept in its own frame, separate from the training data.
df_test = pd.read_csv("../data/test_validation_set.csv")

In [None]:
# Preprocessing based on KNN imputation 
class MissingValueTransformer(BaseEstimator, TransformerMixin):
    """Turn placeholder "missing" labels into real ``np.nan`` values.

    Parameters
    ----------
    null_list : list of str
        Values that should be treated as missing. Every occurrence in the
        transformed frame is replaced by ``np.nan``.
        NOTE: the default is a single list object shared by all instances
        that rely on it; it is only read here, never mutated.

    Attributes
    ----------
    columns : pandas.Index or None
        Column labels of the frame seen in ``fit``; ``None`` before fitting.
    """

    def __init__(self, null_list = ["Missing Disease Status", "Missing disease status"]):
        self.null_list = null_list
        self.columns = None

    def fit(self, X, y=None):
        """Remember the column labels of ``X``; nothing else is learned."""
        self.columns = X.columns
        return self

    def transform(self, X, y=None):
        """Return a new frame with every ``null_list`` value set to ``np.nan``.

        ``X`` itself is left untouched: the non-inplace ``replace`` hands
        back a separate object.
        """
        return X.replace(self.null_list, np.nan)

    def get_feature_names_out(self, input_features = None):
        """Return the column labels stored by ``fit`` (``input_features`` is ignored)."""
        return self.columns

# Object-dtype columns of the training frame are the ones handled as categorical.
cat_cols = df_train.select_dtypes(include="O").columns

# Three-stage preprocessing; each stage emits a pandas DataFrame so that the
# next stage can still address columns by name.
preproc_sd = Pipeline(
    steps=[
        # Stage 1: map the "missing" placeholder labels in the categorical
        # columns to NaN, drop "ID" and "year_hct", pass everything else through.
        (
            "preprocessing",
            ColumnTransformer(
                transformers=[
                    ("cat_missing", MissingValueTransformer(), cat_cols),
                    ("ID_year_dropper", "drop", ["ID", "year_hct"]),
                ],
                remainder="passthrough",
                sparse_threshold=0,
                verbose_feature_names_out=False,
                force_int_remainder_cols=False,
            ).set_output(transform="pandas"),
        ),
        # Stage 2: one-hot encode the categorical columns and standardise
        # three numeric columns; the remaining columns pass through unchanged.
        (
            "encode_and_scale",
            ColumnTransformer(
                transformers=[
                    (
                        "one_hot",
                        OneHotEncoder(
                            drop="first",
                            min_frequency=0.001,
                            handle_unknown="ignore",
                            sparse_output=False,
                        ),
                        cat_cols,
                    ),
                    (
                        "scale",
                        StandardScaler(),
                        ["donor_age", "age_at_hct", "karnofsky_score"],
                    ),
                ],
                remainder="passthrough",
                sparse_threshold=0,
                verbose_feature_names_out=False,
                force_int_remainder_cols=False,
            ).set_output(transform="pandas"),
        ),
        # Stage 3: KNN imputation with the estimator's default settings.
        ("impute", KNNImputer().set_output(transform="pandas")),
    ]
)

# Cox proportional hazards model (with l2 / l1 regularization parameters)
def cph_model(train_preproc, X_test_preproc, l2, l1):
    """Fit a penalized Cox model and score the held-out rows.

    Parameters
    ----------
    train_preproc :
        Training table handed to ``CoxPHFitter.fit``; it must contain the
        ``'efs_time'`` (duration) and ``'efs'`` (event) columns.
    X_test_preproc :
        Rows for which partial hazards are predicted.
    l2 :
        Forwarded as ``penalizer``. Despite the name, lifelines documents
        this as the overall penalty strength, not a pure L2 weight.
    l1 :
        Forwarded as ``l1_ratio``, which lifelines documents as the mix
        between the L1 and L2 penalties.

    Returns
    -------
    The result of ``predict_partial_hazard`` for ``X_test_preproc``.
    """
    fitter = CoxPHFitter(penalizer=l2, l1_ratio=l1)
    fitter.fit(train_preproc, duration_col='efs_time', event_col='efs')
    return fitter.predict_partial_hazard(X_test_preproc)

In [None]:
def eval(preds, X_test, solution):
    """Score predictions with the externally loaded ``score`` function.

    NOTE: this name shadows Python's builtin ``eval`` for the rest of the
    notebook session.

    Parameters
    ----------
    preds :
        Predicted values, stored in the ``"prediction"`` column.
    X_test :
        Frame providing the ``"ID"`` column paired with ``preds``.
    solution :
        Ground-truth frame passed to ``score``.

    Returns
    -------
    Whatever ``score`` returns for the (solution, prediction, "ID") triple.
    """
    submission = pd.DataFrame({"ID": X_test["ID"], "prediction": preds})
    # Deep copies are passed so that ``score`` works on its own data and the
    # caller's frames stay as they are.
    return score(solution.copy(deep=True), submission.copy(deep=True), "ID")

## Fine-tuning the L1 / L2 regularization hyperparameters

In [None]:
# Grid of values tried for each of the two regularization settings.
penalizer_list = [0.0001, 0.001, 0.01]
l1_ratio_list = [0.0, 0.01, 0.05]

# Human-readable labels, one per grid value, in the same order as the grids.
penalizer_lables = [f"l2 = {value}" for value in penalizer_list]
l1_ratio_table = [f"l1 = {value}" for value in l1_ratio_list]

In [None]:
# Grid search over (penalizer, l1_ratio) with K-fold cross-validation.
preproc_pipline = preproc_sd

n_splits = 5
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42)
target_features = ['efs', 'efs_time']
# One score per (fold, penalizer, l1_ratio); every cell starts at -1.0 and is
# overwritten once the corresponding combination has been evaluated.
sc_indexes = np.full((n_splits, len(penalizer_list), len(l1_ratio_list)), -1.0)

for i, (train_idx, test_idx) in enumerate(kfold.split(df_train)):

    # KFold.split yields *positional* row indices, so every selection uses
    # .iloc. Selecting the targets with .loc only works by coincidence when
    # df_train carries a default 0..n-1 index.
    X_train = df_train.iloc[train_idx].drop(columns=target_features)
    y_train = df_train[target_features].iloc[train_idx]

    X_test = df_train.iloc[test_idx].drop(columns=target_features)
    y_test = df_train[target_features].iloc[test_idx]

    # Fit the preprocessing on the training fold only, then apply it to both.
    X_train_preproc = preproc_pipline.fit_transform(X_train)
    X_test_preproc = preproc_pipline.transform(X_test)

    # The Cox fitter takes features and duration/event columns in one table.
    train_preproc = pd.concat([X_train_preproc, y_train], axis=1)
    # Untransformed held-out rows (including targets) serve as ground truth.
    solution = df_train.iloc[test_idx]

    for j, p in enumerate(penalizer_list):
        for k, l1 in enumerate(l1_ratio_list):
            preds_cph = cph_model(train_preproc, X_test_preproc, p, l1)
            score_cph = eval(preds_cph, X_test, solution)
            sc_indexes[i, j, k] = score_cph
            print(f"stratified c-index for fold {i}, penalizer {p}, l1_ratio {l1}: {score_cph}")

In [None]:
# Average the per-fold scores (axis 0 is the fold axis) and label the grid:
# rows follow penalizer_list, columns follow l1_ratio_list.
sc_indexes_mean = pd.DataFrame(
    sc_indexes.mean(axis=0),
    index=penalizer_lables,
    columns=l1_ratio_table,
)

print(f"mean stratified c-index: \n {sc_indexes_mean}")
# Persist the table; note that the file is tab-separated despite its .csv name.
sc_indexes_mean.to_csv("finetuning_cph_results.csv", sep='\t', header=True, index=True)