# Barlow Cox PH model, penalized

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime

import pickle

from weighted_concordance import *

In [2]:
# pd.set_option('display.max_columns', None,'display.max_rows',20)

In [3]:
from lifelines import CoxPHFitter
from lifelines.utils import datetimes_to_durations, k_fold_cross_validation, concordance_index

In [4]:
# importing data
X_tr = pd.read_csv("D:/compiled_data/X_tr.csv").set_index("csid")

y = pd.read_csv("D:/compiled_data/y.csv").set_index("csid")

Converting date-times from string format to datetime format.

In [5]:
def date_time_conversion(date):
    return(datetime.strptime(date,"%Y-%m-%dT%H:%M:%SZ"))

In [6]:
# Coverting the times into datetime format
y_tr = y.assign(
    event = y.ep_CKB0020_combined_ep,
    start_time = y.study_date.map(date_time_conversion),
    event_time = y.ep_CKB0020_combined_datedeveloped.map(date_time_conversion)
).drop(columns = y.columns)
y_tr["duration"] = datetimes_to_durations(y_tr["start_time"],y_tr["event_time"])[0]
y_tr["subcohort"] = y["subcohort"]
y_tr = y_tr.drop(columns = ["start_time","event_time"])
y_tr

Unnamed: 0_level_0,event,duration,subcohort
csid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-00229-1799093,1,1192.0,False
2022-00229-4744170,1,1690.0,False
2022-00229-1082517,1,2035.0,False
2022-00229-3397875,1,2530.0,False
2022-00229-4372175,1,2216.0,False
...,...,...,...
2022-00229-5565114,0,4495.0,True
2022-00229-4902412,0,3448.0,True
2022-00229-4823882,0,895.0,True
2022-00229-4439450,0,5084.0,True


## Fitting Model

In [8]:
# creating a case dataframe
cases_barlow_df = X_tr.join(y_tr).query("subcohort == False")

cases_barlow_df = cases_barlow_df.assign(
    # setting events outside subcohort to start just before they occur
    start_time = lambda df: df["duration"] - 0.001,
    # adding appropriate weight
    weight = 1,
    subcohort = False
)

# creating a subcohort dataframe
subcohort_barlow_df = X_tr.join(y_tr).query("subcohort == True")

# sampling proportion of cohort
alpha = len(subcohort_barlow_df)/51272400
print(alpha)

subcohort_barlow_df = subcohort_barlow_df.assign(
        # if it is a case, the weight should be the same as the subcohort until close to the time of the event. 
        duration = lambda df: np.where(df["event"], df["duration"] - 0.001, df["duration"]), 
        # the events start from the origin
        start_time = 0, 
        event = False,
        weight = 1/alpha,
        subcohort = True
    )

case_subcohort_barlow_df = pd.concat([cases_barlow_df,subcohort_barlow_df]).drop(columns = "subcohort")

1.1604684001529088e-05


In [9]:
cph = CoxPHFitter(penalizer = 1.0)
cph.fit(case_subcohort_barlow_df, entry_col = "start_time", duration_col = "duration",event_col = "event",weights_col = "weight",robust = True)

<lifelines.CoxPHFitter: fitted with 5.1273e+07 total observations, 5.12724e+07 right-censored observations>

Unlike unregularised, dimension does not seem to have an issue. Now we need to select the regulariser using cross-validation.

See "Unweighted Cox PH with PCA" file for more detailed code annotation. Errors are suppressed as usually there is a successful analysis on at least one of the folds, so we ignore the failures.

In [14]:
from sklearn.model_selection import KFold

In [15]:
cox_barlow_penalizer_scores = pd.DataFrame({
    'penalizer' : [],
    'l1_ratio': [],
    'score': []
})
cox_barlow_penalizer_scores

Unnamed: 0,penalizer,l1_ratio,score


In [16]:
for i in range(6,9):
    for j in range(0,3):
    
        kf = KFold(n_splits = 15)
        kf.get_n_splits(X_tr)

        scores = []

        for train_index, test_index in kf.split(X_tr):
            X_tr_train, X_tr_test = X_tr.iloc[train_index], X_tr.iloc[test_index]
            y_tr_train, y_tr_test = y_tr.iloc[train_index], y_tr.iloc[test_index]

            # creating a case dataframe
            cases_barlow_df = X_tr_train.join(y_tr_train).query("subcohort == False")

            cases_barlow_df = cases_barlow_df.assign(
                # setting events outside subcohort to start just before they occur
                start_time = lambda df: df["duration"] - 0.001,
                # adding appropriate weight
                weight = 1,
                subcohort = False
            )

            # creating a subcohort dataframe
            subcohort_barlow_df = X_tr_train.join(y_tr_train).query("subcohort == True")

            # sampling proportion of cohort
            alpha = len(subcohort_barlow_df)/51272400

            subcohort_barlow_df = subcohort_barlow_df.assign(
                    # if it is a case, the weight should be the same as the subcohort until close to the time of the event. 
                    duration = lambda df: np.where(df["event"], df["duration"] - 0.001, df["duration"]), 
                    # the events start from the origin
                    start_time = 0, 
                    event = False,
                    weight = 1/alpha,
                    subcohort = True
                )

            case_subcohort_barlow_df = pd.concat([cases_barlow_df,subcohort_barlow_df]).drop(columns = "subcohort")

            cph = CoxPHFitter(penalizer = i/5, l1_ratio = j/3)
            try:
                cph.fit(case_subcohort_barlow_df, entry_col = "start_time", duration_col = "duration",event_col = "event",weights_col = "weight",robust = True)

                event_times = y_tr_test["duration"]
                event_observed = y_tr_test["event"]

                test_preds = cph.predict_partial_hazard(X_tr_test)
                scores.append(weighted_concordance(event_times,test_preds, event_observed,0.0011604684001529089))
            except:
                pass
            
        cox_barlow_penalizer_scores = cox_barlow_penalizer_scores.append({
            'penalizer' : i/5,
            'l1_ratio': j/3,
            'score': np.mean(scores)
        }, ignore_index = True)



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

In [17]:
cox_barlow_penalizer_scores.pivot(index = "penalizer", columns = "l1_ratio", values = "score")

l1_ratio,0.000000,0.333333,0.666667
penalizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.2,0.671928,0.610125,0.60866
1.4,0.673086,0.614546,0.608733
1.6,0.672781,0.614476,0.608595


In [23]:
for i in range(11,12):
    for j in range(0,1):
        
        print('penalizer =', i/10, 'l1_ratio =', j/5)
    
        kf = KFold(n_splits = 15)
        kf.get_n_splits(X_tr)

        scores = []

        for train_index, test_index in kf.split(X_tr):
            X_tr_train, X_tr_test = X_tr.iloc[train_index], X_tr.iloc[test_index]
            y_tr_train, y_tr_test = y_tr.iloc[train_index], y_tr.iloc[test_index]

            # creating a case dataframe
            cases_barlow_df = X_tr_train.join(y_tr_train).query("subcohort == False")

            cases_barlow_df = cases_barlow_df.assign(
                # setting events outside subcohort to start just before they occur
                start_time = lambda df: df["duration"] - 0.001,
                # adding appropriate weight
                weight = 1,
                subcohort = False
            )

            # creating a subcohort dataframe
            subcohort_barlow_df = X_tr_train.join(y_tr_train).query("subcohort == True")

            # sampling proportion of cohort
            alpha = len(subcohort_barlow_df)/51272400

            subcohort_barlow_df = subcohort_barlow_df.assign(
                    # if it is a case, the weight should be the same as the subcohort until close to the time of the event. 
                    duration = lambda df: np.where(df["event"], df["duration"] - 0.001, df["duration"]), 
                    # the events start from the origin
                    start_time = 0, 
                    event = False,
                    weight = 1/alpha,
                    subcohort = True
                )

            case_subcohort_barlow_df = pd.concat([cases_barlow_df,subcohort_barlow_df]).drop(columns = "subcohort")

            cph = CoxPHFitter(penalizer = i/10, l1_ratio = j/3)
            try:
                cph.fit(case_subcohort_barlow_df, entry_col = "start_time", duration_col = "duration",event_col = "event",weights_col = "weight",robust = True)

                event_times = y_tr_test["duration"]
                event_observed = y_tr_test["event"]

                test_preds = cph.predict_partial_hazard(X_tr_test)
                scores.append(weighted_concordance(event_times,test_preds, event_observed,0.0011604684001529089))
            except:
                pass
            
        cox_barlow_penalizer_scores = cox_barlow_penalizer_scores.append({
            'penalizer' : i/10,
            'l1_ratio': j/3,
            'score': np.mean(scores)
        }, ignore_index = True)


penalizer = 1.1 l1_ratio = 0.0



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

In [24]:
cox_barlow_penalizer_scores#.pivot(index = "penalizer", columns = "l1_ratio", values = "score")

Unnamed: 0,penalizer,l1_ratio,score
0,1.2,0.0,0.671928
1,1.2,0.333333,0.610125
2,1.2,0.666667,0.60866
3,1.4,0.0,0.673086
4,1.4,0.333333,0.614546
5,1.4,0.666667,0.608733
6,1.6,0.0,0.672781
7,1.6,0.333333,0.614476
8,1.6,0.666667,0.608595
9,0.6,0.0,0.715523


Accuracy increases up to 1.0, so we should probably explore further. L1 does not help so let's not look further here.

In [27]:
# creating a case dataframe
cases_barlow_df = X_tr.join(y_tr).query("subcohort == False")

cases_barlow_df = cases_barlow_df.assign(
    # setting events outside subcohort to start just before they occur
    start_time = lambda df: df["duration"] - 0.001,
    # adding appropriate weight
    weight = 1,
    subcohort = False
)

# creating a subcohort dataframe
subcohort_barlow_df = X_tr.join(y_tr).query("subcohort == True")

# sampling proportion of cohort
alpha = len(subcohort_barlow_df)/51272400
print(alpha)

subcohort_barlow_df = subcohort_barlow_df.assign(
        # if it is a case, the weight should be the same as the subcohort until close to the time of the event. 
        duration = lambda df: np.where(df["event"], df["duration"] - 0.001, df["duration"]), 
        # the events start from the origin
        start_time = 0, 
        event = False,
        weight = 1/alpha,
        subcohort = True
    )

case_subcohort_barlow_df = pd.concat([cases_barlow_df,subcohort_barlow_df]).drop(columns = "subcohort")

1.1604684001529088e-05


In [28]:
cph = CoxPHFitter(penalizer = 1.0)
cph.fit(case_subcohort_barlow_df, entry_col = "start_time", duration_col = "duration",event_col = "event",weights_col = "weight",robust = True)

<lifelines.CoxPHFitter: fitted with 5.1273e+07 total observations, 5.12724e+07 right-censored observations>

In [29]:
pickle.dump(cph, open('penalised_barlow_cox_wc.pkl', 'wb'))