# Unweighted Cox PH with PCA
Fitting the unweighted Cox PH model.

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime

import pickle

In [2]:
from weighted_concordance import *

In [3]:
pd.set_option('display.max_columns', None,'display.max_rows',20)

In [4]:
from lifelines import CoxPHFitter
from lifelines.utils import datetimes_to_durations, concordance_index

In [5]:
# importing data
X_tr = pd.read_csv("D:/compiled_data/X_tr.csv").set_index("csid")

y = pd.read_csv("D:/compiled_data/y.csv").set_index("csid")

Convert date-time from text format to datetime format:

In [6]:
def date_time_conversion(date):
    return(datetime.strptime(date,"%Y-%m-%dT%H:%M:%SZ"))

In [7]:
# Coverting the times into datetime format
y_tr = y.assign(
    event = y.ep_CKB0020_combined_ep,
    start_time = y.study_date.map(date_time_conversion),
    event_time = y.ep_CKB0020_combined_datedeveloped.map(date_time_conversion)
).drop(columns = y.columns)
y_tr["duration"] = datetimes_to_durations(y_tr["start_time"],y_tr["event_time"])[0]
y_tr["subcohort"] = y["subcohort"]
y_tr = y_tr.drop(columns = ["start_time","event_time"])

## Fitting Model

Running the model on the original dataset causes convergence error:

In [8]:
case_subcohort_df = X_tr.join(y_tr).drop(columns = "subcohort").drop_duplicates()

cph = CoxPHFitter()
cph.fit(case_subcohort_df, duration_col = "duration", event_col = "event")


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.



ConvergenceError: Convergence halted due to matrix inversion problems. Suspicion is high collinearity. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-modelMatrix is singular.

We use PCA to reduce dimension to remedy convergence issues. Number of components will be selected by cross-validation:

In [10]:
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA

In [15]:
cox_pca_scores = []
nfolds = 3

for i in range(1,20):
    
    
    # Configuring the folds on the data
    kf = KFold(n_splits=3,shuffle = True)
    kf.get_n_splits(X_tr)
    
    # Scores for the model on each fold
    scores = [] 
    
    # fitting the model on each fold
    for train_index, test_index in kf.split(X_tr):
        X_tr_train, X_tr_test = X_tr.iloc[train_index], X_tr.iloc[test_index]
        y_tr_train, y_tr_test = y_tr.iloc[train_index], y_tr.iloc[test_index]
            
        pca = PCA(n_components = i/20)
        X_tr_train_pca = pd.DataFrame(pca.fit_transform(X_tr_train))
        X_tr_train_pca.index = X_tr_train.index

        # creating a single case subcohort dataframe
        case_subcohort_df_pca = X_tr_train_pca.join(y_tr_train).drop(columns = "subcohort").drop_duplicates()
        
        # fitting model
        cph = CoxPHFitter()
        cph.fit(case_subcohort_df_pca, duration_col = "duration", event_col = "event")
        
        # evaluating performance using weighted concordance
        event_times = y_tr_test["duration"]
        event_observed = y_tr_test["event"]
        
        X_tr_test_pca = pca.transform(X_tr_test)

        test_preds = cph.predict_partial_hazard(X_tr_test_pca)
        
        scores.append(weighted_concordance(event_times, test_preds, event_observed, samp_fraction = 
0.0011604684001529089/nfolds))
        
    cox_pca_scores.append(np.mean(scores))
    
print(cox_pca_scores)
print("Best variation percentage:",np.where(cox_pca_scores == np.max(cox_pca_scores))[0]/20)

[0.6331830468305198, 0.6334525767640352, 0.6436833504274709, 0.6523320907856919, 0.6690591810745591, 0.6924871346572421, 0.7019508318677893, 0.7026587382127785, 0.7186831004733568, 0.7403933774769605, 0.7319112210617921, 0.7472116102446741, 0.7468810977582917, 0.7387952736994302, 0.7479126526509329, 0.7295796846268602, 0.7184389176569698, 0.7092828556812312, 0.7270595367179107]
Best variation percentage: [0.7]


So it seems like 0.7 of the variance has the best concordance scores. The final model:

In [11]:
pca = PCA(n_components = 0.7)
X_tr_pca = pd.DataFrame(pca.fit_transform(X_tr))
X_tr_pca.index = X_tr.index

In [12]:
# Saving Model
pickle.dump(pca, open('unweighted_cox_pca_wc.pkl', 'wb'))

In [13]:
# creating a single case subcohort dataframe
case_subcohort_df_pca = X_tr_pca.join(y_tr).drop(columns = "subcohort").drop_duplicates()

# creating the model and fitting the data
cph = CoxPHFitter()
cph.fit(case_subcohort_df_pca, duration_col = "duration", event_col = "event")

<lifelines.CoxPHFitter: fitted with 1187 total observations, 593 right-censored observations>

In [14]:
# Saving Model
pickle.dump(cph, open('unweighted_cox_wc.pkl', 'wb'))