# Barlow Cox PH model

In [18]:
import numpy as np
import pandas as pd
from datetime import datetime

import pickle

from weighted_concordance import *

In [19]:
# pd.set_option('display.max_columns', None,'display.max_rows',20)

In [20]:
from lifelines import CoxPHFitter
from lifelines.utils import datetimes_to_durations

In [21]:
# Read the feature matrix and the outcome table, each indexed by its "csid" column.
# NOTE(review): later cells slice X_tr and the y-derived frame with the same
# positional indices, which presumes both files list rows in the same order - confirm.
features_raw = pd.read_csv("D:/compiled_data/X_tr.csv")
X_tr = features_raw.set_index("csid")

outcomes_raw = pd.read_csv("D:/compiled_data/y.csv")
y = outcomes_raw.set_index("csid")

Converting date-times from string format to datetime format.

In [24]:
def date_time_conversion(date):
    """Parse a timestamp string such as "2020-01-02T03:04:05Z" into a datetime.

    Parameters
    ----------
    date : str
        Timestamp laid out as year-month-day, a literal "T", hour:minute:second
        and a trailing literal "Z".

    Returns
    -------
    datetime.datetime
        The parsed (timezone-naive) datetime.

    Raises
    ------
    ValueError
        If the string does not match the expected layout.
    """
    timestamp_layout = "%Y-%m-%dT%H:%M:%SZ"
    parsed = datetime.strptime(date, timestamp_layout)
    return parsed

In [None]:
# Converting the date strings into datetimes, then into follow-up durations
study_start = y["study_date"].map(date_time_conversion)
endpoint_date = y["ep_CKB0020_combined_datedeveloped"].map(date_time_conversion)

# Keep only the event flag and the subcohort flag from y, renaming the flag to "event".
y_tr = y[["ep_CKB0020_combined_ep", "subcohort"]].rename(
    columns = {"ep_CKB0020_combined_ep": "event"}
)

# datetimes_to_durations returns a pair; only its first element (the durations)
# is used here. The column is placed between "event" and "subcohort".
y_tr.insert(1, "duration", datetimes_to_durations(study_start, endpoint_date)[0])

## Fitting Model

The procedure is the same as building a Barlow data set, except that the weight given to the subcohort is varied rather than fixed. We still need to reduce the dimensionality through PCA.

See "Unweighted Cox PH with PCA" file for more detailed code annotation.

In [10]:
from sklearn.decomposition import PCA
from lifelines.utils import k_fold_cross_validation, concordance_index
from sklearn.model_selection import KFold

In [11]:
# Empty results table for the grid search, with one column each for the PCA
# variance proportion, the subcohort weight and the resulting score.
score_columns = ('PCA', 'Weight', 'score')
cox_adjusted_pca_scores = pd.DataFrame({column: [] for column in score_columns})
cox_adjusted_pca_scores

Unnamed: 0,PCA,Weight,score


In [13]:
# Grid search over the PCA explained-variance proportion (i/20) and the weight
# given to subcohort rows (j/4), scored by 3-fold cross-validated
# weighted_concordance on the held-out fold.

# sampling proportion of cohort
# NOTE(review): 1400 is a hard-coded denominator - confirm it is the full cohort size.
alpha = len(y_tr.query("subcohort == True"))/1400

# Weights run in steps of 0.25 from 0.25 up to 2*round(1/alpha) - 0.25.
n_weight_steps = round(1/alpha)*8

# Fourth positional argument forwarded to weighted_concordance.
# NOTE(review): its meaning is defined in the weighted_concordance module, not shown here.
concordance_param = 0.0011604684001529089

# Offset (in duration units) used to open/close risk intervals just before an event.
entry_offset = 0.001

# KFold without shuffling yields the same splits every time, so compute them once.
kf = KFold(n_splits=3)
folds = list(kf.split(X_tr))

score_records = []

for i in range(1,15):
    variance_kept = i/20

    # The PCA projection depends only on the variance level and the fold, not on
    # the subcohort weight, so it is fitted once per fold and reused for every weight.
    fold_data = []
    for train_index, test_index in folds:
        X_tr_train, X_tr_test = X_tr.iloc[train_index], X_tr.iloc[test_index]
        y_tr_train, y_tr_test = y_tr.iloc[train_index], y_tr.iloc[test_index]

        pca = PCA(n_components = variance_kept)
        X_tr_train_pca = pd.DataFrame(pca.fit_transform(X_tr_train), index = X_tr_train.index)
        X_tr_pca_test = pca.transform(X_tr_test)

        fold_data.append((X_tr_train_pca.join(y_tr_train), X_tr_pca_test, y_tr_test))

    for j in range(1, n_weight_steps):
        subcohort_weight = j/4

        print("Prop. variance:", variance_kept , "Weight:", subcohort_weight)

        scores = []

        for train_df, X_tr_pca_test, y_tr_test in fold_data:
            # Rows outside the subcohort: enter the risk set just before their
            # recorded duration, with weight 1. Their event flag is left untouched.
            cases_barlow_df = train_df.query("subcohort == False").assign(
                start_time = lambda df: df["duration"] - entry_offset,
                weight = 1,
            )

            # Subcohort rows: followed from the origin with the candidate weight and
            # always marked as non-events; rows flagged as events are cut off just
            # before the event time. `duration` must be assigned before `event`
            # because the lambda reads the original event flag.
            # NOTE(review): with this construction an event inside the subcohort
            # never appears as an event row - confirm that is intended.
            subcohort_barlow_df = train_df.query("subcohort == True").assign(
                duration = lambda df: np.where(df["event"], df["duration"] - entry_offset, df["duration"]),
                start_time = 0,
                event = False,
                weight = subcohort_weight,
            )

            case_subcohort_barlow_df = pd.concat([cases_barlow_df, subcohort_barlow_df]).drop(columns = "subcohort")

            cph = CoxPHFitter()
            cph.fit(case_subcohort_barlow_df, entry_col = "start_time", duration_col = "duration",event_col = "event",weights_col = "weight",robust = True)

            event_times = y_tr_test["duration"]
            event_observed = y_tr_test["event"]

            test_preds = cph.predict_partial_hazard(X_tr_pca_test)
            scores.append(weighted_concordance(event_times,test_preds, event_observed,concordance_param))

        score_records.append({
            'PCA' : variance_kept,
            'Weight': subcohort_weight,
            'score': np.mean(scores)
        })

        # Rebuilt from this run's records after every grid point, so partial results
        # survive an interrupted run and re-running the cell does not stack duplicate
        # rows. (DataFrame.append, used previously, was removed in pandas 2.0.)
        cox_adjusted_pca_scores = pd.DataFrame(score_records, columns = ['PCA', 'Weight', 'score'])
    

Prop. variance: 0.05 Weight: 0.25
Prop. variance: 0.05 Weight: 0.5
Prop. variance: 0.05 Weight: 0.75
Prop. variance: 0.05 Weight: 1.0
Prop. variance: 0.05 Weight: 1.25
Prop. variance: 0.05 Weight: 1.5
Prop. variance: 0.05 Weight: 1.75
Prop. variance: 0.05 Weight: 2.0
Prop. variance: 0.05 Weight: 2.25
Prop. variance: 0.05 Weight: 2.5
Prop. variance: 0.05 Weight: 2.75
Prop. variance: 0.05 Weight: 3.0
Prop. variance: 0.05 Weight: 3.25
Prop. variance: 0.05 Weight: 3.5
Prop. variance: 0.05 Weight: 3.75
Prop. variance: 0.1 Weight: 0.25
Prop. variance: 0.1 Weight: 0.5
Prop. variance: 0.1 Weight: 0.75
Prop. variance: 0.1 Weight: 1.0
Prop. variance: 0.1 Weight: 1.25
Prop. variance: 0.1 Weight: 1.5
Prop. variance: 0.1 Weight: 1.75
Prop. variance: 0.1 Weight: 2.0
Prop. variance: 0.1 Weight: 2.25
Prop. variance: 0.1 Weight: 2.5
Prop. variance: 0.1 Weight: 2.75
Prop. variance: 0.1 Weight: 3.0
Prop. variance: 0.1 Weight: 3.25
Prop. variance: 0.1 Weight: 3.5
Prop. variance: 0.1 Weight: 3.75
Prop. var

In [14]:
# Lay the scores out as a grid: one row per PCA variance proportion, one column per weight.
(
    cox_adjusted_pca_scores
    .drop_duplicates()
    .pivot(index = "PCA", columns = "Weight", values = "score")
)

Weight,0.25,0.50,0.75,1.00,1.25,1.50,1.75,2.00,2.25,2.50,2.75,3.00,3.25,3.50,3.75
PCA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0.05,0.574239,0.574239,0.574239,0.574239,0.574239,0.574239,0.574239,0.574239,0.574239,0.574239,0.574239,0.574239,0.574239,0.574239,0.574239
0.1,0.574239,0.574239,0.574239,0.574239,0.574239,0.574239,0.574239,0.574239,0.574239,0.574239,0.574239,0.574239,0.574239,0.574239,0.574239
0.15,0.576724,0.576729,0.576733,0.576733,0.576733,0.576733,0.576733,0.576733,0.576733,0.576733,0.576733,0.576733,0.576733,0.576733,0.576733
0.2,0.685623,0.685555,0.685515,0.685542,0.685551,0.685542,0.685533,0.685533,0.685533,0.685533,0.685533,0.685533,0.685533,0.685533,0.685533
0.25,0.692482,0.693285,0.693262,0.693258,0.693267,0.693271,0.693262,0.693262,0.693262,0.693253,0.693244,0.693244,0.693252,0.693266,0.69327
0.3,0.665716,0.666621,0.66664,0.666626,0.666622,0.666605,0.666605,0.666609,0.666613,0.666613,0.666613,0.666613,0.666613,0.667488,0.667488
0.35,0.68695,0.686879,0.68687,0.687704,0.687686,0.688548,0.688583,0.6886,0.688595,0.688595,0.688587,0.688583,0.688583,0.688574,0.688574
0.4,0.718406,0.718305,0.71823,0.718226,0.718225,0.718234,0.718261,0.718261,0.718257,0.718261,0.718274,0.718283,0.718283,0.718278,0.718278
0.45,0.680158,0.681081,0.68193,0.682778,0.682769,0.682746,0.682768,0.682776,0.682772,0.682776,0.682768,0.682772,0.682768,0.682768,0.682768
0.5,0.726418,0.727247,0.727203,0.727118,0.727105,0.72797,0.727939,0.727926,0.727926,0.727925,0.727907,0.727899,0.727925,0.727929,0.727925


In [15]:
# Rank the grid points by score, lowest first, so the best combination is at the bottom.
cox_adjusted_pca_scores.sort_values(by = "score", ascending = True)

Unnamed: 0,PCA,Weight,score
0,0.05,0.25,0.574239
29,0.10,3.75,0.574239
28,0.10,3.50,0.574239
27,0.10,3.25,0.574239
26,0.10,3.00,0.574239
...,...,...,...
143,0.50,2.25,0.727926
142,0.50,2.00,0.727926
148,0.50,3.50,0.727929
141,0.50,1.75,0.727939


A PCA explained-variance proportion of 0.5 with a subcohort weight of 1.5 produces the best result (score ≈ 0.72797).

Fitting final model:

In [26]:
# Refit the PCA on the full training matrix with n_components = 0.5 (the value
# selected above) and keep the original row index on the transformed frame.
pca = PCA(n_components = 0.5)
X_tr_pca = pd.DataFrame(pca.fit_transform(X_tr), index = X_tr.index)

In [27]:
# Persist the PCA fitted on the full training data.
# NOTE(review): this cell previously pickled `cph`, which at this point is only the
# model left over from the last cross-validation fold (the final model is fitted
# and saved further down). Given the "pca" filename, the fitted PCA is assumed to
# be the intended object - confirm against whatever loads this file.
with open('adjusted_cox_pca_wc.pkl', 'wb') as pca_file:
    pickle.dump(pca, pca_file)

In [28]:
# Build the weighted case/subcohort data set on the full training data and fit
# the final Cox model, using the same construction as in the grid search.
entry_offset = 0.001       # risk intervals are opened/closed this far before the event
subcohort_weight = 1.5     # weight chosen from the grid search

# PCA scores and outcomes side by side (joined on the index), computed once.
joined_adjusted_df = X_tr_pca.join(y_tr)

# Rows outside the subcohort enter the risk set just before their recorded
# duration and carry weight 1.
cases_adjusted_df = joined_adjusted_df.query("subcohort == False")
cases_adjusted_df = cases_adjusted_df.assign(
    start_time = lambda df: df["duration"] - entry_offset,
    weight = 1,
    subcohort = False,
)

# Subcohort rows are followed from the origin with the chosen weight and are always
# marked as non-events; rows flagged as events are cut off just before the event.
# `duration` is assigned before `event` because it reads the original event flag.
subcohort_adjusted_df = joined_adjusted_df.query("subcohort == True")
subcohort_adjusted_df = subcohort_adjusted_df.assign(
    duration = lambda df: np.where(df["event"], df["duration"] - entry_offset, df["duration"]),
    start_time = 0,
    event = False,
    weight = subcohort_weight,
    subcohort = True,
)

stacked_adjusted_df = pd.concat([cases_adjusted_df, subcohort_adjusted_df])
case_subcohort_adjusted_df = stacked_adjusted_df.drop(columns = "subcohort")

cph = CoxPHFitter()
cph.fit(
    case_subcohort_adjusted_df,
    entry_col = "start_time",
    duration_col = "duration",
    event_col = "event",
    weights_col = "weight",
    robust = True,
)

<lifelines.CoxPHFitter: fitted with 1487.5 total observations, 892.5 right-censored observations>

In [29]:
# Save the final fitted Cox model; the context manager closes (and flushes) the
# file handle, which the bare open(...) call never did.
with open('adjusted_cox_wc.pkl', 'wb') as model_file:
    pickle.dump(cph, model_file)