# Unweighted Cox penalised
Fitting an unweighted penalised Cox model to the data.

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime

import pickle

from weighted_concordance import *

In [2]:
pd.set_option('display.max_columns', None,'display.max_rows',20)

In [3]:
from lifelines import CoxPHFitter
from lifelines.utils import datetimes_to_durations

In [4]:
# importing data
X_tr = pd.read_csv("D:/compiled_data/X_tr.csv").set_index("csid")

y = pd.read_csv("D:/compiled_data/y.csv").set_index("csid")

Converting date-times from string format to datetime format.

In [5]:
def date_time_conversion(date):
    return(datetime.strptime(date,"%Y-%m-%dT%H:%M:%SZ"))

In [6]:
# Coverting the times into datetime format
y_tr = y.assign(
    event = y.ep_CKB0020_combined_ep,
    start_time = y.study_date.map(date_time_conversion),
    event_time = y.ep_CKB0020_combined_datedeveloped.map(date_time_conversion)
).drop(columns = y.columns)
y_tr["duration"] = datetimes_to_durations(y_tr["start_time"],y_tr["event_time"])[0]
y_tr["subcohort"] = y["subcohort"]
y_tr = y_tr.drop(columns = ["start_time","event_time"])

## Fitting Model

In [7]:
# creating a single case subcohort dataframe
case_subcohort_df = X_tr.join(y_tr).drop(columns = "subcohort").drop_duplicates()

In [8]:
# creating the model and fitting the data
cph = CoxPHFitter(penalizer = 0.5)
cph.fit(case_subcohort_df, duration_col = "duration", event_col = "event")

<lifelines.CoxPHFitter: fitted with 1187 total observations, 593 right-censored observations>

Unlike unregularised, dimension does not seem to have an issue. Now we need to select the regulariser using cross-validation.

See "Unweighted Cox PH with PCA" file for more detailed code annotation. Errors are suppressed as usually there is a successful analysis on at least one of the folds, so we ignore the failures.

In [9]:
from sklearn.model_selection import KFold
from lifelines.utils import concordance_index

Data frame for cross-validation results:

In [10]:
penalized_cox_scores = pd.DataFrame({
    'penalizer' : [],
    'l1_ratio': [],
    'score': []
})
penalized_cox_scores

Unnamed: 0,penalizer,l1_ratio,score


In [11]:
# for i in range(1,5):
#     for j in range(0,4):
#         print('penalizer =', i/5, 'l1_ratio =', j/3)
#         cph2 = CoxPHFitter(penalizer = i/5, l1_ratio = j/3)
#         cph2.fit(case_subcohort_df, duration_col = "duration", event_col = "event")
#         penalized_cox_scores = penalized_cox_scores.append({
#             'penalizer' : i/5,
#             'l1_ratio': j/3,
#             'score': np.mean(k_fold_cross_validation(cph2, case_subcohort_df, duration_col = "duration", event_col="event", k=5, scoring_method='concordance_index'))
#         }, ignore_index = True)
        
for i in range(1,10):
    for j in range(0,1):
        
        print('penalizer =', i/1000, 'l1_ratio =', j/3)
    
        kf = KFold(n_splits = 15)
        kf.get_n_splits(X_tr)

        scores = []

        for train_index, test_index in kf.split(X_tr):
            X_tr_train, X_tr_test = X_tr.iloc[train_index], X_tr.iloc[test_index]
            y_tr_train, y_tr_test = y_tr.iloc[train_index], y_tr.iloc[test_index]
            
            case_subcohort_train = X_tr.join(y_tr).drop(columns = "subcohort").drop_duplicates()

            cph = CoxPHFitter(penalizer = i/1000, l1_ratio = j/3)

            try:
                cph.fit(case_subcohort_train, duration_col = "duration", event_col = "event")

                event_times = y_tr_test["duration"]
                event_observed = y_tr_test["event"]

                test_preds = cph.predict_partial_hazard(X_tr_test)
                scores.append(weighted_concordance(event_times,test_preds, event_observed,0.0011604684001529089))
            except:
                pass
            
        penalized_cox_scores = penalized_cox_scores.append({
            'penalizer' : i/1000,
            'l1_ratio': j/3,
            'score': np.mean(scores)
        }, ignore_index = True)

penalizer = 0.001 l1_ratio = 0.0
penalizer = 0.002 l1_ratio = 0.0
penalizer = 0.003 l1_ratio = 0.0
penalizer = 0.004 l1_ratio = 0.0



>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://

penalizer = 0.005 l1_ratio = 0.0



>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://

penalizer = 0.006 l1_ratio = 0.0



>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://

penalizer = 0.007 l1_ratio = 0.0



>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://

penalizer = 0.008 l1_ratio = 0.0



>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://

penalizer = 0.009 l1_ratio = 0.0



>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://

In [12]:
penalized_cox_scores.drop_duplicates().pivot(index = "penalizer", columns = "l1_ratio", values = "score")

l1_ratio,0.0
penalizer,Unnamed: 1_level_1
0.001,0.722549
0.002,0.722166
0.003,0.721466
0.004,0.527226
0.005,0.57041
0.006,0.596694
0.007,0.617215
0.008,0.653573
0.009,0.673575


In [20]:
# for i in range(1,5):
#     for j in range(0,4):
#         print('penalizer =', i/5, 'l1_ratio =', j/3)
#         cph2 = CoxPHFitter(penalizer = i/5, l1_ratio = j/3)
#         cph2.fit(case_subcohort_df, duration_col = "duration", event_col = "event")
#         penalized_cox_scores = penalized_cox_scores.append({
#             'penalizer' : i/5,
#             'l1_ratio': j/3,
#             'score': np.mean(k_fold_cross_validation(cph2, case_subcohort_df, duration_col = "duration", event_col="event", k=5, scoring_method='concordance_index'))
#         }, ignore_index = True)
        
for i in range(1,10):
    for j in range(0,1):
        
        print('penalizer =', i/10000, 'l1_ratio =', j/3)
    
        kf = KFold(n_splits = 15)
        kf.get_n_splits(X_tr)

        scores = []

        for train_index, test_index in kf.split(X_tr):
            X_tr_train, X_tr_test = X_tr.iloc[train_index], X_tr.iloc[test_index]
            y_tr_train, y_tr_test = y_tr.iloc[train_index], y_tr.iloc[test_index]
            
            case_subcohort_train = X_tr.join(y_tr).drop(columns = "subcohort").drop_duplicates()

            cph = CoxPHFitter(penalizer = i/10000, l1_ratio = j/3)

            try:
                cph.fit(case_subcohort_train, duration_col = "duration", event_col = "event")

                event_times = y_tr_test["duration"]
                event_observed = y_tr_test["event"]

                test_preds = cph.predict_partial_hazard(X_tr_test)
                scores.append(concordance_index(event_times, -1*test_preds, event_observed))
            except:
                pass
            
        penalized_cox_scores = penalized_cox_scores.append({
            'penalizer' : i/10000,
            'l1_ratio': j/3,
            'score': np.mean(scores)
        }, ignore_index = True)

penalizer = 0.0001 l1_ratio = 0.0



>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://

penalizer = 0.0002 l1_ratio = 0.0



>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://

penalizer = 0.0003 l1_ratio = 0.0



>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://

penalizer = 0.0004 l1_ratio = 0.0



>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://

penalizer = 0.0005 l1_ratio = 0.0
penalizer = 0.0006 l1_ratio = 0.0
penalizer = 0.0007 l1_ratio = 0.0



>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://

penalizer = 0.0008 l1_ratio = 0.0



>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://

penalizer = 0.0009 l1_ratio = 0.0


In [21]:
penalized_cox_scores.query("penalizer <= 0.01").drop_duplicates().pivot(index = "penalizer", columns = "l1_ratio", values = "score")

l1_ratio,0.0
penalizer,Unnamed: 1_level_1
0.0001,0.684723
0.0002,0.685083
0.0003,0.68455
0.0004,0.681776
0.0005,0.716116
0.0006,0.716078
0.0007,0.688768
0.0008,0.687642
0.0009,0.715134


0.0005 is the best.

In [22]:
# creating a single case subcohort dataframe
case_subcohort_df = X_tr.join(y_tr).drop(columns = "subcohort").drop_duplicates()

In [23]:
# creating the model and fitting the data
cph = CoxPHFitter(penalizer = 0.0005)
cph.fit(case_subcohort_df, duration_col = "duration", event_col = "event")

<lifelines.CoxPHFitter: fitted with 1187 total observations, 593 right-censored observations>

In [24]:
# Saving Model
pickle.dump(cph, open('unweighted_cox_penalised_wc.pkl', 'wb'))