# Unweighted Cox PH with PCA

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime

import pickle

In [2]:
from weighted_concordance import *

In [3]:
pd.set_option('display.max_columns', None,'display.max_rows',20)

In [4]:
from lifelines import CoxPHFitter
from lifelines.utils import datetimes_to_durations, concordance_index

In [5]:
# importing data
X_tr = pd.read_csv("D:/compiled_data/X_tr.csv").set_index("csid")

y = pd.read_csv("D:/compiled_data/y.csv").set_index("csid")

In [8]:
# function converting the date-time from string format to datetime format
def date_time_conversion(date):
    return(datetime.strptime(date,"%Y-%m-%dT%H:%M:%SZ"))

In [9]:
# Coverting the times into datetime format
y_tr = y.assign(
    event = y.ep_CKB0020_combined_ep,
    start_time = y.study_date.map(date_time_conversion),
    event_time = y.ep_CKB0020_combined_datedeveloped.map(date_time_conversion)
).drop(columns = y.columns)
y_tr["duration"] = datetimes_to_durations(y_tr["start_time"],y_tr["event_time"])[0]
y_tr["subcohort"] = y["subcohort"]
y_tr = y_tr.drop(columns = ["start_time","event_time"])

Unnamed: 0_level_0,event,duration,subcohort
csid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-00229-1799093,1,1192.0,False
2022-00229-4744170,1,1690.0,False
2022-00229-1082517,1,2035.0,False
2022-00229-3397875,1,2530.0,False
2022-00229-4372175,1,2216.0,False
...,...,...,...
2022-00229-5565114,0,4495.0,True
2022-00229-4902412,0,3448.0,True
2022-00229-4823882,0,895.0,True
2022-00229-4439450,0,5084.0,True


## Fitting Model

So the model runs if we perform dimension reduction. We need to select a suitable number of PCs. Let us use grid search.

In [10]:
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA

In [11]:
cox_pca_scores = []

for i in range(1,20):
    
    kf = KFold(n_splits=3,shuffle = True)
    kf.get_n_splits(X_tr)

    scores = []
    
    for train_index, test_index in kf.split(X_tr):
        X_tr_train, X_tr_test = X_tr.iloc[train_index], X_tr.iloc[test_index]
        y_tr_train, y_tr_test = y_tr.iloc[train_index], y_tr.iloc[test_index]
            
        pca = PCA(n_components = i/20)
        X_tr_train_pca = pd.DataFrame(pca.fit_transform(X_tr_train))
        X_tr_train_pca.index = X_tr_train.index

        # creating a single case subcohort dataframe
        case_subcohort_df_pca = X_tr_train_pca.join(y_tr_train).drop(columns = "subcohort").drop_duplicates()
        
        cph = CoxPHFitter()
        cph.fit(case_subcohort_df_pca, duration_col = "duration", event_col = "event")
        
        event_times = y_tr_test["duration"]
        event_observed = y_tr_test["event"]
        
        X_tr_test_pca = pca.transform(X_tr_test)

        test_preds = cph.predict_partial_hazard(X_tr_test_pca)
        
        scores.append(weighted_concordance(event_times, test_preds, event_observed, samp_fraction = 
0.0011604684001529089))
        
    cox_pca_scores.append(np.mean(scores))
    
print(cox_pca_scores)
print("Best variation percentage:",np.where(cox_pca_scores == np.max(cox_pca_scores))[0]/20)

[0.6335359302482791, 0.633629499012922, 0.636121700224077, 0.6546302053616858, 0.6750458270385056, 0.6890874726815, 0.7004023914926195, 0.7029524953925922, 0.7138544774905538, 0.727865738026091, 0.7334345720299681, 0.7347440050039888, 0.7319790455829569, 0.7311722618193442, 0.7475329386542694, 0.740902906289684, 0.7309438337074164, 0.7312719394307271, 0.7357875363505096]
Best variation percentage: [0.7]


So it seems like 0.6 of the variance has the best concordance scores. The final model:

In [14]:
pca = PCA(n_components = 0.7)
X_tr_pca = pd.DataFrame(pca.fit_transform(X_tr))
X_tr_pca.index = X_tr.index

# creating a single case subcohort dataframe
case_subcohort_df_pca = X_tr_pca.join(y_tr).drop(columns = "subcohort").drop_duplicates()

# creating the model and fitting the data
cph = CoxPHFitter()
cph.fit(case_subcohort_df_pca, duration_col = "duration", event_col = "event")

<lifelines.CoxPHFitter: fitted with 1187 total observations, 593 right-censored observations>

In [15]:
# Saving Model
pickle.dump(cph, open('unweighted_cox_pca_wc.pkl', 'wb'))