# Barlow Cox PH model with PCA
Fitting Cox PH model with Barlow weights.

In [2]:
import numpy as np
import pandas as pd
from datetime import datetime

import pickle

from weighted_concordance import *

In [3]:
# pd.set_option('display.max_columns', None,'display.max_rows',20)

In [4]:
from lifelines import CoxPHFitter
from lifelines.utils import datetimes_to_durations

In [5]:
# importing data
X_tr = pd.read_csv("D:/compiled_data/X_tr.csv").set_index("csid")

y = pd.read_csv("D:/compiled_data/y.csv").set_index("csid")

Converting date-times from string format to datetime format.

In [6]:
def date_time_conversion(date):
    return(datetime.strptime(date,"%Y-%m-%dT%H:%M:%SZ"))

In [7]:
# Coverting the times into datetime format
y_tr = y.assign(
    event = y.ep_CKB0020_combined_ep,
    start_time = y.study_date.map(date_time_conversion),
    event_time = y.ep_CKB0020_combined_datedeveloped.map(date_time_conversion)
).drop(columns = y.columns)
y_tr["duration"] = datetimes_to_durations(y_tr["start_time"],y_tr["event_time"])[0]
y_tr["subcohort"] = y["subcohort"]
y_tr = y_tr.drop(columns = ["start_time","event_time"])

## Fitting Model

Model fails to converge when naively applied to the data:

In [9]:
# creating a case dataframe
cases_barlow_df = X_tr.join(y_tr).query("subcohort == False")

cases_barlow_df = cases_barlow_df.assign(
    # setting events outside subcohort to start just before they occur
    start_time = lambda df: df["duration"] - 0.001,
    # adding appropriate weight
    weight = 1,
    subcohort = False
)

# creating a subcohort dataframe
subcohort_barlow_df = X_tr.join(y_tr).query("subcohort == True")

# sampling proportion of cohort
alpha = len(subcohort_barlow_df)/512724/nfolds

subcohort_barlow_df = subcohort_barlow_df.assign(
        # if it is a case, the weight should be the same as the subcohort until close to the time of the event. 
        duration = lambda df: np.where(df["event"], df["duration"] - 0.001, df["duration"]), 
        # the events start from the origin
        start_time = 0, 
        event = False,
        weight = 1/alpha,
        subcohort = True
    )

case_subcohort_barlow_df = pd.concat([cases_barlow_df,subcohort_barlow_df]).drop(columns = "subcohort")

cph = CoxPHFitter()
cph.fit(case_subcohort_barlow_df, entry_col = "start_time", duration_col = "duration",event_col = "event",weights_col = "weight",robust = True)


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.



ConvergenceError: Convergence halted due to matrix inversion problems. Suspicion is high collinearity. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-modelMatrix is singular.

We use PCA to reduce dimension to remedy convergence issues. Number of components will be selected by cross-validation:

In [8]:
from sklearn.decomposition import PCA
from lifelines.utils import k_fold_cross_validation, concordance_index
from sklearn.model_selection import KFold

See "Unweighted Cox PH with PCA" file for more detailed code annotation.

In [10]:
cox_pca_scores = []
nfolds = 3

for i in range(1,20):
    
    print(i)
    
    kf = KFold(n_splits=nfolds)
    kf.get_n_splits(X_tr)

    scores = []
    
    for train_index, test_index in kf.split(X_tr):
        X_tr_train, X_tr_test = X_tr.iloc[train_index], X_tr.iloc[test_index]
        y_tr_train, y_tr_test = y_tr.iloc[train_index], y_tr.iloc[test_index]
            
        pca = PCA(n_components = i/20)
        X_tr_train_pca = pd.DataFrame(pca.fit_transform(X_tr_train))
        X_tr_train_pca.index = X_tr_train.index

        # creating a case dataframe
        cases_barlow_df = X_tr_train_pca.join(y_tr_train).query("subcohort == False")

        cases_barlow_df = cases_barlow_df.assign(
            # setting events outside subcohort to start just before they occur
            start_time = lambda df: df["duration"] - 0.001,
            # adding appropriate weight
            weight = 1,
            subcohort = False
        )
    
        # creating a subcohort dataframe
        subcohort_barlow_df = X_tr_train_pca.join(y_tr_train).query("subcohort == True")

        # sampling proportion of cohort
        alpha = len(subcohort_barlow_df)/51272400

        subcohort_barlow_df = subcohort_barlow_df.assign(
                # if it is a case, the weight should be the same as the subcohort until close to the time of the event. 
                duration = lambda df: np.where(df["event"], df["duration"] - 0.001, df["duration"]), 
                # the events start from the origin
                start_time = 0, 
                event = False,
                weight = 1/alpha,
                subcohort = True
            )

        case_subcohort_barlow_df = pd.concat([cases_barlow_df,subcohort_barlow_df]).drop(columns = "subcohort")

        cph = CoxPHFitter()
        cph.fit(case_subcohort_barlow_df, entry_col = "start_time", duration_col = "duration",event_col = "event",weights_col = "weight",robust = True)
        
        event_times = y_tr_test["duration"]
        event_observed = y_tr_test["event"]
        
        X_tr_pca_test = pca.transform(X_tr_test)
        
        test_preds = cph.predict_partial_hazard(X_tr_pca_test)
        scores.append(weighted_concordance(event_times,test_preds, event_observed, samp_fraction = 0.0011604684001529089/n_folds))
        
    cox_pca_scores.append(np.mean(scores))
    
print(cox_pca_scores)
print("Best variation percentage:",np.where(cox_pca_scores == np.max(cox_pca_scores))[0]/20)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17




18




19




[0.5742394904049702, 0.5742394904049702, 0.5767370397379626, 0.6855369962981764, 0.6932618525437464, 0.667465867904287, 0.6885650074266972, 0.7182822076640817, 0.6827459499933259, 0.7278981845257656, 0.7081896084157259, 0.7270545226169398, 0.7120239221671132, 0.68191957338689, 0.664612802973687, 0.5402186258364331, 0.65259073656468, 0.6164341591621124, 0.6615526731264826]
Best variation percentage: [0.45]


Fitting final model:

In [22]:
pca = PCA(n_components = 0.45)
X_tr_pca = pd.DataFrame(pca.fit_transform(X_tr))
X_tr_pca.index = X_tr.index

In [23]:
pickle.dump(pca, open('barlow_cox_pca_wc.pkl', 'wb'))

In [24]:
# creating a case dataframe
cases_barlow_df = X_tr_pca.join(y_tr).query("subcohort == False")

cases_barlow_df = cases_barlow_df.assign(
    # setting events outside subcohort to start just before they occur
    start_time = lambda df: df["duration"] - 0.001,
    # adding appropriate weight
    weight = 1,
    subcohort = False
)

# creating a subcohort dataframe
subcohort_barlow_df = X_tr_pca.join(y_tr).query("subcohort == True")

# sampling proportion of cohort
alpha = len(subcohort_barlow_df)/512724

subcohort_barlow_df = subcohort_barlow_df.assign(
        # if it is a case, the weight should be the same as the subcohort until close to the time of the event. 
        duration = lambda df: np.where(df["event"], df["duration"] - 0.001, df["duration"]), 
        # the events start from the origin
        start_time = 0, 
        event = False,
        weight = 1/alpha,
        subcohort = True
    )

case_subcohort_barlow_df = pd.concat([cases_barlow_df,subcohort_barlow_df]).drop(columns = "subcohort")

cph = CoxPHFitter()
cph.fit(case_subcohort_barlow_df, entry_col = "start_time", duration_col = "duration",event_col = "event",weights_col = "weight",robust = True)

<lifelines.CoxPHFitter: fitted with 513319 total observations, 512724 right-censored observations>

In [25]:
pickle.dump(cph, open('barlow_cox_wc.pkl', 'wb'))