In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import main_module as md

# figure fonts configuration: serif family with "Computer Modern" as the serif
# face, and text.usetex switched on for all figure text
from matplotlib import rc
rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
rc('text', usetex=True)

In [3]:
# Load the combined validation/test file and split it in half (fixed seed),
# then load the training set.
from sklearn.model_selection import train_test_split

df_val_test = pd.read_csv("../data/test_validation_set.csv")
df_val, df_test = train_test_split(
    df_val_test,
    train_size=0.5,
    random_state=41,
    shuffle=True,
)
df_train = pd.read_csv("../data/train_set.csv")

# Project wrapper around the training file; its `clean` call is asked to
# "replace" the placeholder labels below with 'missing_cat'.
hct_df = md.hct("../data/train_set.csv")
hct_df.data = hct_df.clean(
    method="replace",
    params=[
        ["Not done", "Not tested", "Other", "Missing disease status", "Non-resident of the U.S."],
        'missing_cat',
    ],
)

In [4]:
from sklearn.model_selection import KFold
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import set_config

### Carefully designed preprocessor 1

In [5]:
class MissingIndicateTransformer(BaseEstimator, TransformerMixin):
    """Recode placeholder labels and categorical NaNs as the string "missing".

    Parameters
    ----------
    null_list : list of str
        Values that are replaced by "missing" wherever they occur in the frame.
    """

    def __init__(self, null_list = ["Not done", "Not tested", "Other", "Missing disease status", "Non-resident of the U.S."]):
        # Stored untouched (never mutated), as scikit-learn expects of
        # constructor parameters.
        self.null_list = null_list
        self.columns = None

    def fit(self, X, y=None):
        """Record the input column labels; nothing else is learned."""
        self.columns = X.columns
        return self

    def transform(self, X, y=None):
        """Return a recoded deep copy of ``X``; ``X`` itself is left unchanged."""
        recoded = X.copy(deep=True)
        recoded = recoded.replace(self.null_list, "missing")
        # Remaining NaNs are filled only in object-dtype (categorical) columns.
        object_cols = recoded.select_dtypes(include='O').columns
        recoded[object_cols] = recoded[object_cols].fillna("missing")
        return recoded

    def get_feature_names_out(self, input_features = None):
        """Output names are the column labels recorded by ``fit``."""
        return self.columns

In [6]:
# Column groups come from the training frame: object-dtype columns are treated
# as categorical; the remaining columns, minus identifiers and targets, as numeric.
cat_cols = df_train.select_dtypes(include='O').columns
num_cols = df_train.select_dtypes(exclude='O').columns.drop(["ID", 'year_hct','efs', 'efs_time'])

# Options shared by both ColumnTransformer stages.
_ct_options = dict(sparse_threshold=0,
                   remainder='passthrough',
                   verbose_feature_names_out=False,
                   force_int_remainder_cols=False)

# Stage 1: recode missing categoricals, drop ID / year_hct, standardise numerics.
_clean_and_scale = ColumnTransformer(
    [('cat_missing', MissingIndicateTransformer(), cat_cols),
     ('ID_year_dropper', 'drop', ["ID", 'year_hct']),
     ('scale', StandardScaler(), num_cols)],
    **_ct_options
).set_output(transform="pandas")

# Stage 2: one-hot encode categoricals; impute donor_age with the median and
# every other numeric column with a 5-nearest-neighbour imputer.
_encode_and_impute = ColumnTransformer(
    [('one_hot', OneHotEncoder(drop='first',
                               min_frequency = 0.001,
                               handle_unknown='ignore',
                               sparse_output= False), cat_cols),
     ('impute_donor_age', SimpleImputer(strategy="median"), ['donor_age']),
     ('impute_other', KNNImputer(n_neighbors=5), num_cols.drop(['donor_age']))],
    **_ct_options
).set_output(transform="pandas")

preproc = Pipeline(
    steps = [('preprocessing', _clean_and_scale),
             ('one_hot_encode_and_impute', _encode_and_impute)]
)

In [22]:
# Fit the two-stage preprocessor on the full training frame, then apply it to
# that same frame.
df_train_preproc = preproc.fit(df_train).transform(df_train)

In [23]:
# Preview the first rows of the preprocessed training frame
df_train_preproc.head()

Unnamed: 0,dri_score_High - TED AML case <missing cytogenetics,dri_score_Intermediate,dri_score_Intermediate - TED AML case <missing cytogenetics,dri_score_Low,dri_score_N/A - disease not classifiable,dri_score_N/A - non-malignant indication,dri_score_N/A - pediatric,dri_score_TBD cytogenetics,dri_score_Very high,dri_score_missing,...,age_at_hct,hla_match_a_low,hla_match_b_high,comorbidity_score,karnofsky_score,hla_low_res_8,hla_match_drb1_high,hla_low_res_10,efs,efs_time
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.630684,0.635095,-1.499348,-0.853537,0.557682,-1.214733,-1.535178,-0.882139,0.0,93.779
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.558801,0.635095,0.647705,0.146609,-2.168356,0.700292,0.635422,0.708859,1.0,12.088
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.229453,0.635095,0.647705,1.146755,0.557682,0.700292,0.635422,0.708859,0.0,25.724
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.83306,0.635095,0.647705,-0.853537,0.557682,0.700292,0.635422,0.708859,0.0,43.373
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,-1.638944,0.635095,0.647705,-0.853537,-0.350997,0.700292,0.635422,0.708859,1.0,8.593


In [24]:
# Report the percentage of missing values per feature in the preprocessed frame
hct_df.report_missing_values(df_train_preproc)

Unnamed: 0,Feature,Percentage Missing
0,dri_score_High - TED AML case <missing cytogen...,0.0
1,dri_score_Intermediate,0.0
2,dri_score_Intermediate - TED AML case <missing...,0.0
3,dri_score_Low,0.0
4,dri_score_N/A - disease not classifiable,0.0
...,...,...
155,hla_low_res_8,0.0
156,hla_match_drb1_high,0.0
157,hla_low_res_10,0.0
158,efs,0.0


### K-fold cross-validation

In [7]:
from lifelines import CoxPHFitter
# import the score function
%run -i ../examples/concordance_index.ipynb

In [None]:
# 10-fold cross-validation of an unpenalised Cox proportional-hazards model on
# the training set. Per fold, `scores` receives the stratified c-index and
# `c_indexes` the plain c-index.
n_splits = 10
kfold = KFold(n_splits = n_splits, shuffle = True, random_state = 42)
c_indexes = np.zeros(n_splits)
scores = np.zeros(n_splits)
target_features = ['efs', 'efs_time']

for i, (train_idx, test_idx) in enumerate(kfold.split(df_train)):

    # KFold yields *positional* indices, so every slice goes through .iloc.
    # (Mixing .iloc for features with .loc for targets only agrees while
    # df_train carries a default RangeIndex.)
    fold_train = df_train.iloc[train_idx]
    fold_test = df_train.iloc[test_idx]

    X_train = fold_train.drop(columns = target_features)
    y_train = fold_train[target_features]

    X_test = fold_test.drop(columns = target_features)
    y_test = fold_test[target_features]

    # fit_transform preprocesses the training fold once; a separate fit()
    # followed by transform() runs the preprocessing steps over it twice.
    X_train_preproc = preproc.fit_transform(X_train)
    X_test_preproc = preproc.transform(X_test)

    # Re-attach the targets (aligned on the row index) for lifelines.
    train_preproc = pd.concat([X_train_preproc, y_train], axis=1)
    test_preproc = pd.concat([X_test_preproc, y_test], axis=1)

    cph = CoxPHFitter()
    cph.fit(train_preproc, duration_col='efs_time', event_col='efs')
    preds = cph.predict_partial_hazard(X_test_preproc)

    solution = fold_test
    prediction = pd.DataFrame({"ID":X_test["ID"], "prediction":preds})
    scores[i] = score(solution.copy(deep=True), prediction.copy(deep=True), "ID")
    # Hazards are negated before being passed as the predicted score.
    c_indexes[i] = concordance_index(y_test['efs_time'], -preds, y_test['efs'])
# wall time noted for the original version of this cell: 27m 56.2s



In [39]:
# Per-fold results of the cross-validation loop: `scores` holds the stratified
# c-index and `c_indexes` the plain c-index
print(f"stratified c-index: \n {scores} \n, c-index: \n {c_indexes}")

stratified c-index: 
 [0.647206   0.65912036 0.65153799 0.63008074 0.64940412 0.66133796
 0.65071265 0.63895584 0.65555021 0.64092212] 
, c-index: 
 [0.67327646 0.67850871 0.6711965  0.66579483 0.67326945 0.67590336
 0.66382557 0.65796142 0.67129549 0.66691247]


### Naive preprocessor

In [35]:
# Naive preprocessor
# NaNs in object-dtype columns become the string 'missing'; NaNs in all other
# columns become -1.0
class NaiveDataTransformer(BaseEstimator, TransformerMixin):
    """Fill missing values with fixed sentinels, chosen by column dtype."""

    def __init__(self):
        self.columns = None

    def fit(self, X, y=None):
        """Record the input column labels; nothing else is learned."""
        self.columns = X.columns
        return self

    def transform(self, X, y=None):
        """Return a filled deep copy of ``X``; ``X`` itself is left unchanged."""
        filled = X.copy(deep = True)
        object_cols = filled.select_dtypes(include = 'O').columns
        non_object_cols = filled.select_dtypes(exclude = 'O').columns
        for cols, sentinel in ((object_cols, "missing"), (non_object_cols, -1.0)):
            filled[cols] = filled[cols].fillna(sentinel)
        return filled

    def get_feature_names_out(self, input_features = None):
        """Output names are the column labels recorded by ``fit``."""
        return self.columns

# Column groups derived from the training frame (identifiers and targets excluded).
cat_cols = df_train.select_dtypes(include='O').columns
num_cols = df_train.select_dtypes(exclude='O').columns.drop(["ID", 'year_hct','efs', 'efs_time'])
other_cols = df_train.columns.drop(["ID", 'year_hct','efs', 'efs_time'])

# Options shared by both ColumnTransformer stages.
_ct_options = dict(sparse_threshold=0,
                   remainder='passthrough',
                   verbose_feature_names_out=False,
                   force_int_remainder_cols=False)

# Stage 1: sentinel-fill every feature column and drop ID / year_hct.
_naive_fill = ColumnTransformer(
    [('naive_missing', NaiveDataTransformer(), other_cols),
     ('ID_year_dropper', 'drop', ["ID", 'year_hct'])],
    **_ct_options
).set_output(transform="pandas")

# Stage 2: one-hot encode the categorical columns; everything else passes through.
_naive_encode = ColumnTransformer(
    [('one_hot', OneHotEncoder(drop='first',
                               min_frequency = 0.001,
                               handle_unknown='ignore',
                               sparse_output= False), cat_cols)],
    **_ct_options
).set_output(transform="pandas")

preproc_naive = Pipeline(
    steps = [('preprocessing', _naive_fill),
             ('naive_one_hot_encode', _naive_encode)]
)

### Preprocess with interaction

In [80]:
# Drop low-variance columns: any column whose variance on the fitting data is
# below a threshold (this includes, but is not limited to, all-zero columns)
class LowVarDropTransformer(BaseEstimator, TransformerMixin):
    """Remove columns whose variance, measured during ``fit``, is too small.

    Parameters
    ----------
    threshold : float, default=0.2
        Columns with ``X.var() < threshold`` on the fitting data are dropped.
        The default reproduces the previously hard-coded cutoff.
    """

    def __init__(self, threshold = 0.2):
        # Stored untouched under the same name so that get_params / clone
        # can round-trip the setting.
        self.threshold = threshold
        self.columns = None
        self.low_var_cols = None

    def transform(self, X, y=None):
        """Return a new frame without the columns selected in ``fit``."""
        # drop() returns a new frame, so the caller's X is not modified.
        return X.drop(columns = self.low_var_cols)

    def fit(self, X, y=None):
        """Select the columns to drop and remember the surviving column labels."""
        self.low_var_cols = X.columns[X.var() < self.threshold]
        self.columns = X.columns.drop(self.low_var_cols)
        return self

    def get_feature_names_out(self, input_features = None):
        """Output names are the labels of the columns kept by ``fit``."""
        return self.columns

In [81]:
from sklearn.preprocessing import PolynomialFeatures

# Column groups derived from the training frame (identifiers and targets excluded).
cat_cols = df_train.select_dtypes(include='O').columns
num_cols = df_train.select_dtypes(exclude='O').columns.drop(["ID", 'year_hct','efs', 'efs_time'])
other_cols = df_train.columns.drop(["ID", 'year_hct','efs', 'efs_time'])

# Options shared by both ColumnTransformer stages.
_ct_options = dict(sparse_threshold=0,
                   remainder='passthrough',
                   verbose_feature_names_out=False,
                   force_int_remainder_cols=False)

# Stage 1: sentinel-fill every feature column and drop ID / year_hct.
_interact_fill = ColumnTransformer(
    [('naive_missing', NaiveDataTransformer(), other_cols),
     ('ID_year_dropper', 'drop', ["ID", 'year_hct'])],
    **_ct_options
).set_output(transform="pandas")

# Stage 2: one-hot encode categoricals and standardise numerics.
_interact_encode_scale = ColumnTransformer(
    [('one_hot', OneHotEncoder(drop='first',
                               min_frequency = 0.001,
                               handle_unknown='ignore',
                               sparse_output= False), cat_cols),
     ('scale', StandardScaler(), num_cols)],
    **_ct_options
).set_output(transform="pandas")

# Stage 3: add all pairwise interaction terms (no squares, no bias column).
_pairwise_terms = PolynomialFeatures(2, interaction_only=True, include_bias=False).set_output(transform = "pandas")

# Stage 4: discard columns whose variance is too small.
_low_var_filter = LowVarDropTransformer().set_output(transform="pandas")

preproc_interact = Pipeline(
    steps = [('preprocessing', _interact_fill),
             ('naive_one_hot_encode', _interact_encode_scale),
             ('add_interaction', _pairwise_terms),
             ('drop_low_variance_columns', _low_var_filter)]
)

### CPH: test and validation stratified C-index performance

In [28]:
def cph_report(df_train, df_test, pipeline, penalizer = 0.0, l1_ratio = 0.0):
    """Fit a Cox proportional-hazards model and score it on a held-out frame.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows; must contain the target columns 'efs' and 'efs_time'.
    df_test : pandas.DataFrame
        Evaluation rows; must contain 'efs', 'efs_time' and 'ID'.
    pipeline : fitted or unfitted transformer pipeline
        Preprocessor applied to the features. It is (re)fitted in place on the
        training features, so the caller's object is modified.
    penalizer, l1_ratio : float
        Regularisation settings forwarded to ``CoxPHFitter``.

    Returns
    -------
    tuple
        ``(stratified_c_index, c_index)`` on ``df_test``; both are also printed.
    """
    target_features = ['efs', 'efs_time']
    X_train = df_train.drop(columns = target_features)
    y_train = df_train[target_features]

    X_test = df_test.drop(columns= target_features)
    y_test = df_test[target_features]

    # fit_transform preprocesses the training features once; a separate fit()
    # followed by transform() runs the preprocessing steps over them twice.
    X_train_preproc = pipeline.fit_transform(X_train)
    X_test_preproc = pipeline.transform(X_test)

    # lifelines expects features and targets in one frame (aligned on index).
    # The evaluation frame is never passed to the model with its targets, so
    # no combined test frame is built.
    train_preproc = pd.concat([X_train_preproc, y_train], axis=1)

    cph = CoxPHFitter(penalizer = penalizer, l1_ratio= l1_ratio)
    cph.fit(train_preproc, duration_col='efs_time', event_col='efs')
    preds = cph.predict_partial_hazard(X_test_preproc)

    solution = df_test
    prediction = pd.DataFrame({"ID":X_test["ID"], "prediction":preds})
    # Copies are passed so the scorer cannot modify the caller's frames.
    test_SCIndex = score(solution.copy(deep=True), prediction.copy(deep=True), "ID")
    # Hazards are negated before being passed as the predicted score.
    test_C_index = concordance_index(y_test['efs_time'], -preds, y_test['efs'])
    print(f"        stratified c-index: {test_SCIndex}, c-index: {test_C_index}")

    return test_SCIndex, test_C_index

In [29]:
# naive preprocessor
print("preformance of the naive preprocessor:")
# Grid search over the CoxPH penalizer and l1_ratio, scored on the validation set
penalizer_list = [0.0, 0.01, 0.2, 0.5]
l1_ratio_list = [0.0, 0.2, 0.5, 0.8]
# Result grids are sized from the parameter lists (rows: penalizer, columns:
# l1_ratio) so the lists can be edited freely; -1 marks cells not yet evaluated.
grid_shape = (len(penalizer_list), len(l1_ratio_list))
tuning_sc_results = np.full(grid_shape, -1.0)
tuning_c_results = np.full(grid_shape, -1.0)
for i, p in enumerate(penalizer_list):
    for j, l in enumerate(l1_ratio_list):
        print(f"    performance on the validation set with penalizer {p}, l1_ratio {l}:")
        test_SCIndex, test_C_index = cph_report(df_train, df_val, preproc_naive, penalizer= p, l1_ratio= l)
        tuning_sc_results[i, j] = test_SCIndex
        tuning_c_results[i, j] = test_C_index

preformance of the naive preprocessor:
    performance on the validation set with penalizer 0.0, l1_ratio 0.0:
        stratified c-index: 0.6534229834010374, c-index: 0.6708225220550078
    performance on the validation set with penalizer 0.0, l1_ratio 0.2:
        stratified c-index: 0.6534229834010374, c-index: 0.6708225220550078
    performance on the validation set with penalizer 0.0, l1_ratio 0.5:
        stratified c-index: 0.6534229834010374, c-index: 0.6708225220550078
    performance on the validation set with penalizer 0.0, l1_ratio 0.8:
        stratified c-index: 0.6534229834010374, c-index: 0.6708225220550078
    performance on the validation set with penalizer 0.01, l1_ratio 0.0:
        stratified c-index: 0.6535804328851086, c-index: 0.6712052413077322
    performance on the validation set with penalizer 0.01, l1_ratio 0.2:
        stratified c-index: 0.6549412042902181, c-index: 0.6722267526626634
    performance on the validation set with penalizer 0.01, l1_ratio 0.5

In [31]:
# performance of the naive preprocessor on the test set, with penalizer 0.01
# and l1_ratio 0.2
cph_report(df_train, df_test, preproc_naive, penalizer= 0.01, l1_ratio= 0.2)

        stratified c-index: 0.6399288633895288, c-index: 0.6626878416658459


(0.6399288633895288, np.float64(0.6626878416658459))

In [71]:
# designed (imputing) preprocessor, evaluated with penalizer 0.01 and l1_ratio 0.2
print("preformance of the imputing preprocessor:")
# performance on the validation set
sc_index, c_index = cph_report(df_train, df_val, preproc, penalizer= 0.01, l1_ratio= 0.2)
# performance on the test set (overwrites sc_index / c_index from the line above)
sc_index, c_index = cph_report(df_train, df_test, preproc, penalizer= 0.01, l1_ratio= 0.2)

preformance of the imputing preprocessor:
        stratified c-index: 0.6552338517599439, c-index: 0.6732788444905725
        stratified c-index: 0.6346773936914096, c-index: 0.6584861584234605


In [None]:
# interaction preprocessor, evaluated with penalizer 0.01 and l1_ratio 0.2
print("preformance of the interact preprocessor:")
# performance on the validation set
sc_index, c_index = cph_report(df_train, df_val, preproc_interact, penalizer= 0.01, l1_ratio= 0.2)
# performance on the test set (overwrites sc_index / c_index from the line above)
sc_index, c_index = cph_report(df_train, df_test, preproc_interact, penalizer= 0.01, l1_ratio= 0.2)

In [82]:
# Fit the interaction preprocessor on the training features (targets removed),
# apply it to the same features, and preview the result.
_train_features = df_train.drop(columns=['efs','efs_time'])
X_train_preproc = preproc_interact.fit(_train_features).transform(_train_features)
X_train_preproc.head()



Unnamed: 0,dri_score_Intermediate,cyto_score_Poor,cyto_score_missing,graft_type_Peripheral blood,prim_disease_hct_ALL,tce_imm_match_P/P,tce_imm_match_missing,prod_type_PB,cyto_score_detail_Intermediate,cyto_score_detail_missing,...,comorbidity_score karnofsky_score,comorbidity_score hla_low_res_8,comorbidity_score hla_match_drb1_high,comorbidity_score hla_low_res_10,karnofsky_score hla_low_res_8,karnofsky_score hla_match_drb1_high,karnofsky_score hla_low_res_10,hla_low_res_8 hla_match_drb1_high,hla_low_res_8 hla_low_res_10,hla_match_drb1_high hla_low_res_10
0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,-0.396712,0.252989,0.336118,-0.005824,-0.146954,-0.19524,0.003383,0.124508,-0.002158,-0.002866
1,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,...,-0.195461,0.116394,0.104726,0.125059,-0.812556,-0.731103,-0.873051,0.43536,0.519888,0.467773
2,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.55736,0.807646,0.726685,0.867775,0.333917,0.300444,0.358777,0.43536,0.519888,0.467773
3,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,-0.396712,-0.574858,-0.517232,-0.617656,0.333917,0.300444,0.358777,0.43536,0.519888,0.467773
4,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.057313,-0.574858,-0.517232,-0.617656,-0.048241,-0.043405,-0.051832,0.43536,0.519888,0.467773


In [None]:
# Per-column variance of the interaction-preprocessed training features
X_train_preproc.var()

dri_score_Intermediate                 0.230984
cyto_score_Poor                        0.212923
cyto_score_missing                     0.201069
graft_type_Peripheral blood            0.203851
prim_disease_hct_ALL                   0.202574
                                         ...   
karnofsky_score hla_match_drb1_high    1.101140
karnofsky_score hla_low_res_10         1.200311
hla_low_res_8 hla_match_drb1_high      2.600477
hla_low_res_8 hla_low_res_10           2.094742
hla_match_drb1_high hla_low_res_10     2.107862
Length: 849, dtype: float64

In [85]:
# Number of columns whose variance exceeds 0.24
(X_train_preproc.var() > 0.24).sum()

np.int64(737)