# Cox PH model

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime

import pickle
from weighted_concordance import weighted_concordance

In [2]:
# pd.set_option('display.max_columns', None,'display.max_rows',20)

In [3]:
from lifelines import CoxPHFitter
from lifelines.utils import datetimes_to_durations, k_fold_cross_validation, concordance_index

In [4]:
# importing data
# Both tables are read from the same directory and indexed by their "csid" column.
# DATA_DIR is machine-specific: change it here, once, when running elsewhere.
DATA_DIR = "D:/compiled_data"

X_tr = pd.read_csv(f"{DATA_DIR}/X_tr.csv").set_index("csid")

y = pd.read_csv(f"{DATA_DIR}/y.csv").set_index("csid")

Converting date-times from string format to datetime format.

In [5]:
def date_time_conversion(date):
    """Parse a timestamp string such as ``2020-01-02T03:04:05Z``.

    The trailing ``Z`` is matched as a literal character, so the returned
    ``datetime`` is naive (no tzinfo attached).

    Raises:
        ValueError: if ``date`` does not match the expected layout.
    """
    timestamp_layout = "%Y-%m-%dT%H:%M:%SZ"
    return datetime.strptime(date, timestamp_layout)

In [6]:
# Converting the times into datetime format
study_start = y["study_date"].map(date_time_conversion)
endpoint_date = y["ep_CKB0020_combined_datedeveloped"].map(date_time_conversion)

# Outcome frame used for modelling: event indicator, follow-up duration and
# subcohort flag. datetimes_to_durations returns a (durations, observed) pair;
# only the durations are kept (days under the lifelines default freq='D').
y_tr = (
    y[["ep_CKB0020_combined_ep", "subcohort"]]
    .rename(columns = {"ep_CKB0020_combined_ep": "event"})
    .assign(duration = datetimes_to_durations(study_start, endpoint_date)[0])
    .loc[:, ["event", "duration", "subcohort"]]
)

## Fitting Model

In [7]:
# Build the counting-process style frame (start_time, duration] used for the
# case-cohort fit: one block of rows for subjects outside the subcohort and
# one for subcohort members.

# Offset used to open a risk interval just before the recorded event time.
ENTRY_OFFSET = 0.001

# join features and outcomes once; both blocks are cut from the same frame
prentice_source_df = X_tr.join(y_tr)

# creating a case dataframe
cases_prentice_df = prentice_source_df.query("subcohort == False").assign(
    # setting events outside subcohort to start just before they occur
    start_time = lambda df: df["duration"] - ENTRY_OFFSET,
    # adding appropriate weight
    weight = 1
)

# creating a subcohort dataframe
# NOTE(review): every subcohort row is censored (event = False), and the case
# frame above only holds rows with subcohort == False, so a subcohort member
# with event == True contributes no event row here. Confirm this is intended.
subcohort_prentice_df = prentice_source_df.query("subcohort == True").assign(
    # if it is a case, follow-up is cut just before the time of the event
    duration = lambda df: np.where(df["event"], df["duration"] - ENTRY_OFFSET, df["duration"]),
    # the events start from the origin
    start_time = 0,
    event = False,
    weight = 1
)

# the subcohort flag was only needed for the split above, so it is dropped
case_subcohort_prentice_df = pd.concat([cases_prentice_df, subcohort_prentice_df]).drop(columns = "subcohort")

In [8]:
# Penalised Cox fit on the stacked case/subcohort frame, with delayed entry
# (start_time), per-row weights and robust (sandwich) standard errors.
cph2 = CoxPHFitter(penalizer = 0.5)
cph2.fit(
    case_subcohort_prentice_df,
    duration_col = "duration",
    event_col = "event",
    entry_col = "start_time",
    weights_col = "weight",
    robust = True,
)

<lifelines.CoxPHFitter: fitted with 1190 total observations, 595 right-censored observations>

Unlike the unregularised model, the regularised fit does not seem to have a problem with the dimension of the covariates. Now we need to select the regulariser (penalizer and L1 ratio) using cross-validation.

See the "Unweighted Cox PH with PCA" file for more detailed code annotation. Folds on which fitting or scoring fails are skipped: usually at least one fold succeeds, so the score for a grid point is the mean over the successful folds only.

In [9]:
from sklearn.model_selection import KFold

In [10]:
# Empty results table; the grid search adds one row per (penalizer, l1_ratio) pair.
score_columns = ("penalizer", "l1_ratio", "score")
cox_prentice_penalizer_scores = pd.DataFrame({column: [] for column in score_columns})
cox_prentice_penalizer_scores

Unnamed: 0,penalizer,l1_ratio,score


In [11]:
# Grid search: penalizer in {1.2, 1.4, ..., 2.0} x l1_ratio in {0, 1/3, 2/3}.
# Each grid point is scored by the mean weighted concordance over the folds on
# which both the fit and the scoring succeeded.
N_SPLITS = 15
# Offset used to open a risk interval just before the recorded event time.
ENTRY_OFFSET = 0.001
# Fourth argument passed to weighted_concordance.
# NOTE(review): presumably the subcohort sampling fraction - confirm, and
# ideally derive it from the data instead of hard-coding it.
CONCORDANCE_WEIGHT = 0.0011604684001529089

# KFold is used without shuffling, so the folds are identical for every grid
# point and a single splitter can be shared.
kf = KFold(n_splits = N_SPLITS)
grid_records = []

for i in range(6,11):
    for j in range(0,3):
        penalizer, l1_ratio = i/5, j/3
        print('penalizer =', penalizer, 'l1_ratio =', l1_ratio)

        scores = []
        failures = []

        for train_index, test_index in kf.split(X_tr):
            X_tr_train, X_tr_test = X_tr.iloc[train_index], X_tr.iloc[test_index]
            y_tr_train, y_tr_test = y_tr.iloc[train_index], y_tr.iloc[test_index]

            # join once; both blocks of the training frame are cut from it
            train_df = X_tr_train.join(y_tr_train)

            # creating a case dataframe
            cases_prentice_df = train_df.query("subcohort == False").assign(
                # setting events outside subcohort to start just before they occur
                start_time = lambda df: df["duration"] - ENTRY_OFFSET,
                # adding appropriate weight
                weight = 1
            )

            # creating a subcohort dataframe
            # NOTE(review): subcohort rows are all censored here, so a subcohort
            # member with event == True contributes no event row - confirm.
            subcohort_prentice_df = train_df.query("subcohort == True").assign(
                # if it is a case, follow-up is cut just before the time of the event
                duration = lambda df: np.where(df["event"], df["duration"] - ENTRY_OFFSET, df["duration"]),
                # the events start from the origin
                start_time = 0,
                event = False,
                weight = 1
            )

            case_subcohort_prentice_df = pd.concat([cases_prentice_df, subcohort_prentice_df]).drop(columns = "subcohort")

            cph = CoxPHFitter(penalizer = penalizer, l1_ratio = l1_ratio)
            try:
                cph.fit(case_subcohort_prentice_df, entry_col = "start_time", duration_col = "duration",event_col = "event",weights_col = "weight",robust = True)

                event_times = y_tr_test["duration"]
                event_observed = y_tr_test["event"]

                test_preds = cph.predict_partial_hazard(X_tr_test)
                scores.append(weighted_concordance(event_times, test_preds, event_observed, CONCORDANCE_WEIGHT))
            except Exception as err:
                # Best effort: a failing fold is skipped, but it is recorded so
                # the mean below is not silently based on fewer folds.
                # KeyboardInterrupt is deliberately not caught.
                failures.append(type(err).__name__)

        if failures:
            print('  skipped', len(failures), 'of', N_SPLITS, 'folds:', sorted(set(failures)))

        grid_records.append({
            'penalizer' : penalizer,
            'l1_ratio': l1_ratio,
            # NaN marks a grid point where every fold failed
            'score': np.mean(scores) if scores else np.nan
        })

# DataFrame.append was removed in pandas 2.0: build the new rows in one go and
# concatenate them onto the existing table (an empty table is left out so that
# it cannot influence the resulting dtypes).
grid_results = pd.DataFrame(grid_records, columns = ['penalizer', 'l1_ratio', 'score'])
cox_prentice_penalizer_scores = pd.concat(
    [frame for frame in (cox_prentice_penalizer_scores, grid_results) if not frame.empty],
    ignore_index = True
)

penalizer = 1.2 l1_ratio = 0.0



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.2 l1_ratio = 0.3333333333333333



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.2 l1_ratio = 0.6666666666666666



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.4 l1_ratio = 0.0



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.4 l1_ratio = 0.3333333333333333



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.4 l1_ratio = 0.6666666666666666



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.6 l1_ratio = 0.0



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.6 l1_ratio = 0.3333333333333333



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.6 l1_ratio = 0.6666666666666666



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.8 l1_ratio = 0.0



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.8 l1_ratio = 0.3333333333333333



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.8 l1_ratio = 0.6666666666666666



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 2.0 l1_ratio = 0.0



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 2.0 l1_ratio = 0.3333333333333333



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 2.0 l1_ratio = 0.6666666666666666



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

In [12]:
# Score table: one row per penalizer, one column per l1_ratio.
# Exact duplicate rows are removed first so the pivot keys are unique.
(
    cox_prentice_penalizer_scores
    .drop_duplicates()
    .pivot(index = "penalizer", columns = "l1_ratio", values = "score")
)

l1_ratio,0.000000,0.333333,0.666667
penalizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.2,0.674672,0.610188,0.608725
1.4,0.672952,0.614681,0.608669
1.6,0.673108,0.614539,0.608595
1.8,0.669236,0.611607,0.608467
2.0,0.664987,0.608534,0.608531


A non-zero L1 ratio again decreases performance. The best penalizer was 1.2, the smallest value tried, so we should probably do a finer search around 1.0–1.2.

In [15]:
# Reset the results table so the finer search starts from an empty frame.
score_columns = ("penalizer", "l1_ratio", "score")
cox_prentice_penalizer_scores = pd.DataFrame({column: [] for column in score_columns})
cox_prentice_penalizer_scores

Unnamed: 0,penalizer,l1_ratio,score


In [16]:
# Finer grid search: penalizer in {1.0, 1.1, ..., 1.6} with l1_ratio fixed at 0.
# Each grid point is scored by the mean weighted concordance over the folds on
# which both the fit and the scoring succeeded.
N_SPLITS = 15
# Offset used to open a risk interval just before the recorded event time.
ENTRY_OFFSET = 0.001
# Fourth argument passed to weighted_concordance.
# NOTE(review): presumably the subcohort sampling fraction - confirm, and
# ideally derive it from the data instead of hard-coding it.
CONCORDANCE_WEIGHT = 0.0011604684001529089

# KFold is used without shuffling, so the folds are identical for every grid
# point and a single splitter can be shared.
kf = KFold(n_splits = N_SPLITS)
grid_records = []

for i in range(10,17):
    for j in range(0,1):
        penalizer, l1_ratio = i/10, j/3
        print('penalizer =', penalizer, 'l1_ratio =', l1_ratio)

        scores = []
        failures = []

        for train_index, test_index in kf.split(X_tr):
            X_tr_train, X_tr_test = X_tr.iloc[train_index], X_tr.iloc[test_index]
            y_tr_train, y_tr_test = y_tr.iloc[train_index], y_tr.iloc[test_index]

            # join once; both blocks of the training frame are cut from it
            train_df = X_tr_train.join(y_tr_train)

            # creating a case dataframe
            cases_prentice_df = train_df.query("subcohort == False").assign(
                # setting events outside subcohort to start just before they occur
                start_time = lambda df: df["duration"] - ENTRY_OFFSET,
                # adding appropriate weight
                weight = 1
            )

            # creating a subcohort dataframe
            # NOTE(review): subcohort rows are all censored here, so a subcohort
            # member with event == True contributes no event row - confirm.
            subcohort_prentice_df = train_df.query("subcohort == True").assign(
                # if it is a case, follow-up is cut just before the time of the event
                duration = lambda df: np.where(df["event"], df["duration"] - ENTRY_OFFSET, df["duration"]),
                # the events start from the origin
                start_time = 0,
                event = False,
                weight = 1
            )

            case_subcohort_prentice_df = pd.concat([cases_prentice_df, subcohort_prentice_df]).drop(columns = "subcohort")

            cph = CoxPHFitter(penalizer = penalizer, l1_ratio = l1_ratio)
            try:
                cph.fit(case_subcohort_prentice_df, entry_col = "start_time", duration_col = "duration",event_col = "event",weights_col = "weight",robust = True)

                event_times = y_tr_test["duration"]
                event_observed = y_tr_test["event"]

                test_preds = cph.predict_partial_hazard(X_tr_test)
                scores.append(weighted_concordance(event_times, test_preds, event_observed, CONCORDANCE_WEIGHT))
            except Exception as err:
                # Best effort: a failing fold is skipped, but it is recorded so
                # the mean below is not silently based on fewer folds.
                # KeyboardInterrupt is deliberately not caught.
                failures.append(type(err).__name__)

        if failures:
            print('  skipped', len(failures), 'of', N_SPLITS, 'folds:', sorted(set(failures)))

        grid_records.append({
            'penalizer' : penalizer,
            'l1_ratio': l1_ratio,
            # NaN marks a grid point where every fold failed
            'score': np.mean(scores) if scores else np.nan
        })

# DataFrame.append was removed in pandas 2.0: build the new rows in one go and
# concatenate them onto the existing table (an empty table is left out so that
# it cannot influence the resulting dtypes).
grid_results = pd.DataFrame(grid_records, columns = ['penalizer', 'l1_ratio', 'score'])
cox_prentice_penalizer_scores = pd.concat(
    [frame for frame in (cox_prentice_penalizer_scores, grid_results) if not frame.empty],
    ignore_index = True
)

penalizer = 1.0 l1_ratio = 0.0



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.1 l1_ratio = 0.0



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.2 l1_ratio = 0.0



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.3 l1_ratio = 0.0



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.4 l1_ratio = 0.0



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.5 l1_ratio = 0.0



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.6 l1_ratio = 0.0



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

In [17]:
# Score table for the finer search: one row per penalizer, one column per l1_ratio.
# Exact duplicate rows are removed first so the pivot keys are unique.
(
    cox_prentice_penalizer_scores
    .drop_duplicates()
    .pivot(index = "penalizer", columns = "l1_ratio", values = "score")
)

l1_ratio,0.0
penalizer,Unnamed: 1_level_1
1.0,0.67514
1.1,0.674408
1.2,0.674672
1.3,0.673874
1.4,0.672952
1.5,0.672445
1.6,0.673108


In [20]:
# Grid search over the Cox penalizer (0.7, 0.8, 0.9) with l1_ratio = 0, scored by
# 15-fold cross-validation. For every grid point one row holding the mean fold
# score is appended to `cox_prentice_penalizer_scores`.
# NOTE: re-running this cell appends the same grid points again.

# Fourth positional argument handed to weighted_concordance.
# NOTE(review): presumably the subcohort sampling fraction - confirm against
# the weighted_concordance module.
WEIGHTED_CONCORDANCE_ARG = 0.0011604684001529089

for i in range(7,10):
    for j in range(0,1):
        penalizer = i/10
        l1_ratio = j/3

        print('penalizer =', penalizer, 'l1_ratio =', l1_ratio)

        kf = KFold(n_splits = 15)

        scores = []

        for fold, (train_index, test_index) in enumerate(kf.split(X_tr)):
            # X_tr and y_tr are sliced with the same positional indices.
            X_tr_train, X_tr_test = X_tr.iloc[train_index], X_tr.iloc[test_index]
            y_tr_train, y_tr_test = y_tr.iloc[train_index], y_tr.iloc[test_index]

            # Join covariates and outcomes once per fold; both frames below start from it.
            train_df = X_tr_train.join(y_tr_train)

            # creating a case dataframe
            cases_prentice_df = train_df.query("subcohort == False").assign(
                # setting events outside subcohort to start just before they occur
                start_time = lambda df: df["duration"] - 0.001,
                # adding appropriate weight
                weight = 1,
                subcohort = False
            )

            # creating a subcohort dataframe
            subcohort_prentice_df = train_df.query("subcohort == True").assign(
                # subcohort cases are followed until just before their event time;
                # this must be computed before `event` is overwritten below
                duration = lambda df: np.where(df["event"], df["duration"] - 0.001, df["duration"]),
                # the events start from the origin
                start_time = 0,
                event = False,
                weight = 1,
                subcohort = True
            )

            case_subcohort_prentice_df = pd.concat([cases_prentice_df,subcohort_prentice_df]).drop(columns = "subcohort")

            cph = CoxPHFitter(penalizer = penalizer, l1_ratio = l1_ratio)
            try:
                cph.fit(case_subcohort_prentice_df, entry_col = "start_time", duration_col = "duration",event_col = "event",weights_col = "weight",robust = True)

                event_times = y_tr_test["duration"]
                event_observed = y_tr_test["event"]

                test_preds = cph.predict_partial_hazard(X_tr_test)
                scores.append(weighted_concordance(event_times, test_preds, event_observed, WEIGHTED_CONCORDANCE_ARG))
            except Exception as err:
                # Best effort: a fold that cannot be fitted or scored is left out of
                # the mean, but it is reported instead of being dropped silently.
                # Catching Exception (not a bare except) keeps Ctrl-C working.
                print('skipping fold', fold, 'for penalizer =', penalizer, 'l1_ratio =', l1_ratio, ':', repr(err))

        # If every fold failed there is nothing to average; record NaN explicitly.
        mean_score = np.mean(scores) if scores else np.nan

        # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
        cox_prentice_penalizer_scores = pd.concat(
            [
                cox_prentice_penalizer_scores,
                pd.DataFrame([{
                    'penalizer' : penalizer,
                    'l1_ratio': l1_ratio,
                    'score': mean_score
                }])
            ],
            ignore_index = True
        )

penalizer = 0.7 l1_ratio = 0.0



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 0.8 l1_ratio = 0.0



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 0.9 l1_ratio = 0.0



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

In [25]:
# List every evaluated grid point, ordered from the lowest to the highest score.
cox_prentice_penalizer_scores.sort_values(by="score", ascending=True)

Unnamed: 0,penalizer,l1_ratio,score
11,0.9,0.0,0.672314
5,1.5,0.0,0.672445
8,0.8,0.0,0.672504
10,0.8,0.0,0.672504
9,0.7,0.0,0.67291
4,1.4,0.0,0.672952
6,1.6,0.0,0.673108
3,1.3,0.0,0.673874
1,1.1,0.0,0.674408
2,1.2,0.0,0.674672


A penalizer of 1.0 has the best result. (Ignore the last 0.7; it shouldn't be there.)

Fitting final model:

In [26]:
# Rows flagged subcohort == False: each one enters the risk set 0.001 time units
# before its recorded duration and carries weight 1.
cases_prentice_df = (
    X_tr.join(y_tr)
    .query("subcohort == False")
    .assign(
        start_time=lambda df: df["duration"] - 0.001,
        weight=1,
        subcohort=False,
    )
)

# Rows flagged subcohort == True: followed from time 0 and always marked as
# non-events. Where `event` was set, the duration is shortened by 0.001; that
# shortening is evaluated before `event` is overwritten with False.
subcohort_prentice_df = (
    X_tr.join(y_tr)
    .query("subcohort == True")
    .assign(
        duration=lambda df: np.where(df["event"], df["duration"] - 0.001, df["duration"]),
        start_time=0,
        event=False,
        weight=1,
        subcohort=True,
    )
)

# Stack both parts into the fitting frame; the subcohort flag is not a covariate.
case_subcohort_prentice_df = pd.concat(
    [cases_prentice_df, subcohort_prentice_df]
).drop(columns="subcohort")

In [27]:
# Final model, fitted on the full case-subcohort frame with penalizer 1.0.
cph2 = CoxPHFitter(penalizer=1.0)
cph2.fit(
    case_subcohort_prentice_df,
    duration_col="duration",
    event_col="event",
    entry_col="start_time",
    weights_col="weight",
    robust=True,
)

<lifelines.CoxPHFitter: fitted with 1190 total observations, 595 right-censored observations>

In [28]:
# Persist the fitted model. The context manager guarantees the file handle is
# flushed and closed even if pickling raises.
with open('penalised_prentice_cox_wc.pkl', 'wb') as model_file:
    pickle.dump(cph2, model_file)