# Self-Prentice Cox PH model with PCA
Fitting Cox PH model with Self-Prentice weights.

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime

import pickle
from weighted_concordance import *

In [2]:
# pd.set_option('display.max_columns', None,'display.max_rows',20)

In [3]:
from lifelines import CoxPHFitter
from lifelines.utils import datetimes_to_durations

In [4]:
# importing data
X_tr = pd.read_csv("D:/compiled_data/X_tr.csv").set_index("csid")

y = pd.read_csv("D:/compiled_data/y.csv").set_index("csid")

Converting date-times from strings to datetime format.

In [5]:
def date_time_conversion(date):
    return(datetime.strptime(date,"%Y-%m-%dT%H:%M:%SZ"))

In [6]:
# Coverting the times into datetime format
y_tr = y.assign(
    event = y.ep_CKB0020_combined_ep,
    start_time = y.study_date.map(date_time_conversion),
    event_time = y.ep_CKB0020_combined_datedeveloped.map(date_time_conversion)
).drop(columns = y.columns)
y_tr["duration"] = datetimes_to_durations(y_tr["start_time"],y_tr["event_time"])[0]
y_tr["subcohort"] = y["subcohort"]
y_tr = y_tr.drop(columns = ["start_time","event_time"])

## Fitting Model

Convergence fails if model naively applied to the data.

In [7]:
# creating a case-subcohort dataframe
case_subcohort_selfprentice_df = X_tr.join(y_tr)

case_subcohort_selfprentice_df["weight"] = np.where(case_subcohort_selfprentice_df["subcohort"],1,0.001)

case_subcohort_selfprentice_df = case_subcohort_selfprentice_df.drop(columns = "subcohort")


cph3 = CoxPHFitter()
cph3.fit(case_subcohort_selfprentice_df, duration_col = "duration",event_col = "event",weights_col = "weight",robust = True)


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.



ConvergenceError: Convergence halted due to matrix inversion problems. Suspicion is high collinearity. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-modelMatrix is singular.

We use PCA to reduce dimension to remedy convergence issues. Number of components will be selected by cross-validation:

In [12]:
from sklearn.decomposition import PCA
from lifelines.utils import k_fold_cross_validation, concordance_index
from sklearn.model_selection import KFold

More detailed annotation for cross validation code can be found in "Unweighted Cox PH with PCA" file

In [13]:
cox_pca_scores = []
nfolds = 3

for i in range(1,20):
    
    print(i)
    
    kf = KFold(n_splits=nfolds)
    kf.get_n_splits(X_tr)

    scores = []
    
    for train_index, test_index in kf.split(X_tr):
        X_tr_train, X_tr_test = X_tr.iloc[train_index], X_tr.iloc[test_index]
        y_tr_train, y_tr_test = y_tr.iloc[train_index], y_tr.iloc[test_index]
            
        pca = PCA(n_components = i/20)
        X_tr_train_pca = pd.DataFrame(pca.fit_transform(X_tr_train))
        X_tr_train_pca.index = X_tr_train.index

         # creating a case-subcohort dataframe
        case_subcohort_selfprentice_df = X_tr_train_pca.join(y_tr_train)

        case_subcohort_selfprentice_df["weight"] = np.where(case_subcohort_selfprentice_df["subcohort"],1,0.001)

        case_subcohort_selfprentice_df = case_subcohort_selfprentice_df.drop(columns = "subcohort")

        try:
            cph3 = CoxPHFitter()
            cph3.fit(case_subcohort_selfprentice_df, duration_col = "duration",event_col = "event",weights_col = "weight",robust = True)

            event_times = y_tr_test["duration"]
            event_observed = y_tr_test["event"]
            
            X_tr_pca_test = pca.transform(X_tr_test)

            test_preds = cph3.predict_partial_hazard(X_tr_pca_test)
            scores.append(weighted_concordance(event_times,test_preds, event_observed,0.0011604684001529089/nfolds))
        except:
            pass
        
    cox_pca_scores.append(np.mean(scores))
    
print(cox_pca_scores)
print("Best variation percentage:",np.where(cox_pca_scores == np.max(cox_pca_scores))[0]/20)

1
2
3
4
5
6
7
8
9
10
11
12
13




14


  scores = weights * exp(dot(X, beta))
  risk_phi_x = risk_phi_x + phi_x_i
  risk_phi_x_x = risk_phi_x_x + phi_x_x_i
  a1 = risk_phi_x_x * denom
  log_lik = log_lik + dot(x_death_sum, beta) + weighted_average * log(denom).sum()
  tie_phi_x = tie_phi_x + phi_x_i
  tie_phi_x_x = tie_phi_x_x + phi_x_x_i
  denom = 1.0 / (risk_phi - increasing_proportion * tie_phi)
  denom = 1.0 / (risk_phi - increasing_proportion * tie_phi)
  numer = risk_phi_x - multiply.outer(increasing_proportion, tie_phi_x)


15
16
17
18




19


  scores = weights * exp(dot(X, beta))
  risk_phi_x = risk_phi_x + phi_x_i
  risk_phi_x_x = risk_phi_x_x + phi_x_x_i
  a1 = risk_phi_x_x * denom
  log_lik = log_lik + dot(x_death_sum, beta) + weighted_average * log(denom).sum()
  denom = 1.0 / (risk_phi - increasing_proportion * tie_phi)
  denom = 1.0 / (risk_phi - increasing_proportion * tie_phi)
  numer = risk_phi_x - multiply.outer(increasing_proportion, tie_phi_x)
  scores = weights * exp(dot(X, beta))
  risk_phi_x = risk_phi_x + phi_x_i
  risk_phi_x_x = risk_phi_x_x + phi_x_x_i
  a1 = risk_phi_x_x * denom
  log_lik = log_lik + dot(x_death_sum, beta) + weighted_average * log(denom).sum()
  denom = 1.0 / (risk_phi - increasing_proportion * tie_phi)
  denom = 1.0 / (risk_phi - increasing_proportion * tie_phi)
  numer = risk_phi_x - multiply.outer(increasing_proportion, tie_phi_x)
  tie_phi_x = tie_phi_x + phi_x_i
  tie_phi_x_x = tie_phi_x_x + phi_x_x_i


[0.5742394904049702, 0.5742394904049702, 0.5726156571355673, 0.6611702575381145, 0.6588693956692813, 0.6426794844711677, 0.6504995822370834, 0.6640186192020541, 0.6246213681830853, 0.671520531773793, 0.6802285057328051, 0.6839915425189198, 0.7271349422245156, 0.6699488657128445, 0.6608008722730141, 0.5349941857848938, 0.6166034094909597, 0.6076228569875614, 0.8556430446194226]
Best variation percentage: [0.9]


Fitting final model:

In [31]:
pca = PCA(n_components = 0.895)
X_tr_pca = pd.DataFrame(pca.fit_transform(X_tr))
X_tr_pca.index = X_tr.index

In [32]:
pickle.dump(pca, open('selfprentice_cox_pca_wc.pkl', 'wb'))

In [33]:
# creating a case-subcohort dataframe
case_subcohort_selfprentice_df = X_tr_pca.join(y_tr)

case_subcohort_selfprentice_df["weight"] = np.where(case_subcohort_selfprentice_df["subcohort"],1,0.001)

case_subcohort_selfprentice_df = case_subcohort_selfprentice_df.drop(columns = "subcohort")


cph3 = CoxPHFitter()
cph3.fit(case_subcohort_selfprentice_df, duration_col = "duration",event_col = "event",weights_col = "weight",robust = True)

<lifelines.CoxPHFitter: fitted with 595.595 total observations, 593 right-censored observations>

0.895 is the closest value that works.

In [34]:
pickle.dump(cph3, open('selfprentice_cox_wc.pkl', 'wb'))