# Self-Prentice Cox PH model, penalized
Fitting a penalized Cox PH model with Self-Prentice weights.

In [2]:
import numpy as np
import pandas as pd
from datetime import datetime

import pickle

from weighted_concordance import weighted_concordance

In [3]:
# pd.set_option('display.max_columns', None,'display.max_rows',20)

In [4]:
from lifelines import CoxPHFitter
from lifelines.utils import datetimes_to_durations, k_fold_cross_validation, concordance_index

In [5]:
# importing data
# Both inputs live in the same directory; keeping it in one constant means the
# notebook can be pointed at another copy of the data by editing a single line.
DATA_DIR = "D:/compiled_data"

# Covariate table, indexed by the csid column.
X_tr = pd.read_csv(f"{DATA_DIR}/X_tr.csv").set_index("csid")

# Outcome table (event indicator, dates, subcohort flag), indexed by the csid column.
y = pd.read_csv(f"{DATA_DIR}/y.csv").set_index("csid")

Converting date-times from string format to datetime format.

In [8]:
def date_time_conversion(date):
    """Parse a timestamp string such as "2020-01-31T08:15:00Z" into a datetime.

    The trailing "Z" is matched as a literal character, so the returned
    datetime carries no timezone information. A string that does not match
    the format makes datetime.strptime raise ValueError.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
    parsed = datetime.strptime(date, timestamp_format)
    return parsed

In [9]:
# Converting the date strings into datetimes, then into follow-up durations
study_start = y["study_date"].map(date_time_conversion)
event_date = y["ep_CKB0020_combined_datedeveloped"].map(date_time_conversion)

# datetimes_to_durations returns a tuple; only its first element (the durations) is used
durations = datetimes_to_durations(study_start, event_date)[0]

# Keep only the three columns the survival model needs: event, duration, subcohort
y_tr = (
    y[["ep_CKB0020_combined_ep", "subcohort"]]
    .rename(columns = {"ep_CKB0020_combined_ep": "event"})
    .assign(duration = durations)
    [["event", "duration", "subcohort"]]
)
y_tr

Unnamed: 0_level_0,event,duration,subcohort
csid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-00229-1799093,1,1192.0,False
2022-00229-4744170,1,1690.0,False
2022-00229-1082517,1,2035.0,False
2022-00229-3397875,1,2530.0,False
2022-00229-4372175,1,2216.0,False
...,...,...,...
2022-00229-5565114,0,4495.0,True
2022-00229-4902412,0,3448.0,True
2022-00229-4823882,0,895.0,True
2022-00229-4439450,0,5084.0,True


## Fitting Model

In [22]:
# Case-subcohort frame: covariates joined to the outcomes on the shared index.
# Rows flagged as subcohort get weight 1, every other row gets weight 0.001;
# the subcohort flag itself is then dropped so it is not used as a covariate.
case_subcohort_selfprentice_df = (
    X_tr.join(y_tr)
    .assign(weight = lambda frame: np.where(frame["subcohort"], 1, 0.001))
    .drop(columns = "subcohort")
)

In [23]:
# Penalised Cox PH fit on the weighted case-subcohort frame (penalizer = 1,
# l1_ratio left at the library default), with robust = True as requested here.
cph3 = CoxPHFitter(penalizer = 1)
cph3.fit(
    case_subcohort_selfprentice_df,
    duration_col = "duration",
    event_col = "event",
    weights_col = "weight",
    robust = True,
)

<lifelines.CoxPHFitter: fitted with 595.595 total observations, 593 right-censored observations>

Unlike the unregularised model, the penalised model does not seem to have a problem with the dimension of the data. Now we need to select the regulariser using cross-validation.

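See the "Unweighted Cox PH with PCA" file for more detailed code annotation. Exceptions raised while fitting or scoring a fold are caught and that fold is skipped: there is usually a successful analysis on at least one of the folds, so the score for each parameter setting is averaged over the successful folds only.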

In [10]:
from sklearn.decomposition import PCA
from lifelines.utils import k_fold_cross_validation, concordance_index
from sklearn.model_selection import KFold

In [11]:
# Empty results table; the cross-validation cells add one row per
# (penalizer, l1_ratio) combination.
score_columns = ('penalizer', 'l1_ratio', 'score')
cox_selfprentice_penalizer_scores = pd.DataFrame({column: [] for column in score_columns})
cox_selfprentice_penalizer_scores

Unnamed: 0,penalizer,l1_ratio,score


In [12]:

# Coarse grid search: penalizer in {1.2, 1.4, ..., 2.0} x l1_ratio in {0, 1/3, 2/3}.
# Each combination is scored by the mean weighted concordance over the folds
# that could be fitted and scored.
#
# NOTE(review): KFold is used without shuffling, so every fold is a contiguous
# range of rows. The y_tr preview suggests the rows are ordered by case status;
# if so the folds are not representative. Consider KFold(shuffle=True,
# random_state=...) -- not changed here because it would alter the reported scores.
kf = KFold(n_splits = 15)  # depends only on the number of rows, so build it once

grid_rows = []  # one summary record per (penalizer, l1_ratio) combination

for i in range(6,11):
    for j in range(0,3):
        penalizer, l1_ratio = i/5, j/3

        print('penalizer =', penalizer, 'l1_ratio =', l1_ratio)

        scores = []

        for fold, (train_index, test_index) in enumerate(kf.split(X_tr)):
            X_tr_train, X_tr_test = X_tr.iloc[train_index], X_tr.iloc[test_index]
            y_tr_train, y_tr_test = y_tr.iloc[train_index], y_tr.iloc[test_index]

            # Fold-local case-subcohort frame. A separate name is used so the
            # full-data case_subcohort_selfprentice_df is not overwritten by the loop.
            fold_train_df = (
                X_tr_train.join(y_tr_train)
                .assign(weight = lambda frame: np.where(frame["subcohort"], 1, 0.001))
                .drop(columns = "subcohort")
            )

            cph = CoxPHFitter(penalizer = penalizer, l1_ratio = l1_ratio)

            try:
                cph.fit(fold_train_df, duration_col = "duration",event_col = "event",weights_col = "weight",robust = True)

                event_times = y_tr_test["duration"]
                event_observed = y_tr_test["event"]

                test_preds = cph.predict_partial_hazard(X_tr_test)
                # NOTE(review): the last argument is an unexplained constant;
                # presumably a sampling weight -- confirm and give it a name.
                scores.append(weighted_concordance(event_times,test_preds, event_observed,0.0011604684001529089))
            except Exception as err:
                # Deliberate best-effort: a fold that cannot be fitted or scored is
                # skipped, but it is reported rather than silently ignored.
                # KeyboardInterrupt is no longer swallowed.
                print('  fold', fold, 'skipped:', type(err).__name__)

        grid_rows.append({
            'penalizer' : penalizer,
            'l1_ratio': l1_ratio,
            # NaN (without a RuntimeWarning) when every fold failed
            'score': np.mean(scores) if scores else np.nan
        })

# DataFrame.append was removed in pandas 2.0; add all new rows in one step.
# An empty existing table is replaced outright so the numeric dtypes are kept.
new_scores = pd.DataFrame(grid_rows)
if cox_selfprentice_penalizer_scores.empty:
    cox_selfprentice_penalizer_scores = new_scores
else:
    cox_selfprentice_penalizer_scores = pd.concat(
        [cox_selfprentice_penalizer_scores, new_scores], ignore_index = True
    )

penalizer = 1.2 l1_ratio = 0.0



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.2 l1_ratio = 0.3333333333333333



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.2 l1_ratio = 0.6666666666666666



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.4 l1_ratio = 0.0



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.4 l1_ratio = 0.3333333333333333



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.4 l1_ratio = 0.6666666666666666



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.6 l1_ratio = 0.0



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.6 l1_ratio = 0.3333333333333333



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.6 l1_ratio = 0.6666666666666666



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.8 l1_ratio = 0.0



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.8 l1_ratio = 0.3333333333333333



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.8 l1_ratio = 0.6666666666666666



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 2.0 l1_ratio = 0.0



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 2.0 l1_ratio = 0.3333333333333333



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 2.0 l1_ratio = 0.6666666666666666



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

In [14]:
# Score grid: one row per penalizer, one column per l1_ratio
cox_selfprentice_penalizer_scores.pivot(
    index = "penalizer",
    columns = "l1_ratio",
    values = "score",
)

l1_ratio,0.000000,0.333333,0.666667
penalizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.2,0.582681,0.582329,0.582329
1.4,0.582679,0.582329,0.582329
1.6,0.582672,0.582329,0.582329
1.8,0.582542,0.582329,0.582329
2.0,0.582206,0.582329,0.582329


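Adding an L1 component (l1_ratio > 0) gives a slightly lower score than the pure L2 penalty (l1_ratio = 0) for every penaliser except 2.0; note that the score is identical (0.582329) for every setting with l1_ratio > 0. The best score was obtained with penaliser 1.2 and l1_ratio 0, which is the lower edge of the grid, so let us check around this value with l1_ratio fixed at 0.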

In [17]:
# Reset the results table before the finer search; the following
# cross-validation cells add one row per (penalizer, l1_ratio) combination.
score_columns = ('penalizer', 'l1_ratio', 'score')
cox_selfprentice_penalizer_scores = pd.DataFrame({column: [] for column in score_columns})
cox_selfprentice_penalizer_scores

Unnamed: 0,penalizer,l1_ratio,score


In [18]:
# Finer search with a pure L2 penalty: penalizer in {1.0, 1.1, 1.2}, l1_ratio = 0.
# Each setting is scored by the mean weighted concordance over the folds that
# could be fitted and scored.
#
# NOTE(review): KFold is used without shuffling, so every fold is a contiguous
# range of rows. The y_tr preview suggests the rows are ordered by case status;
# if so the folds are not representative. Consider KFold(shuffle=True,
# random_state=...) -- not changed here because it would alter the reported scores.
kf = KFold(n_splits = 15)  # depends only on the number of rows, so build it once

grid_rows = []  # one summary record per (penalizer, l1_ratio) combination

for i in range(10,13):
    for j in range(0,1):
        penalizer, l1_ratio = i/10, j/3

        print('penalizer =', penalizer, 'l1_ratio =', l1_ratio)

        scores = []

        for fold, (train_index, test_index) in enumerate(kf.split(X_tr)):
            X_tr_train, X_tr_test = X_tr.iloc[train_index], X_tr.iloc[test_index]
            y_tr_train, y_tr_test = y_tr.iloc[train_index], y_tr.iloc[test_index]

            # Fold-local case-subcohort frame. A separate name is used so the
            # full-data case_subcohort_selfprentice_df is not overwritten by the loop.
            fold_train_df = (
                X_tr_train.join(y_tr_train)
                .assign(weight = lambda frame: np.where(frame["subcohort"], 1, 0.001))
                .drop(columns = "subcohort")
            )

            cph = CoxPHFitter(penalizer = penalizer, l1_ratio = l1_ratio)

            try:
                cph.fit(fold_train_df, duration_col = "duration",event_col = "event",weights_col = "weight",robust = True)

                event_times = y_tr_test["duration"]
                event_observed = y_tr_test["event"]

                test_preds = cph.predict_partial_hazard(X_tr_test)
                # NOTE(review): the last argument is an unexplained constant;
                # presumably a sampling weight -- confirm and give it a name.
                scores.append(weighted_concordance(event_times, test_preds, event_observed,0.0011604684001529089))
            except Exception as err:
                # Deliberate best-effort: a fold that cannot be fitted or scored is
                # skipped, but it is reported rather than silently ignored.
                # KeyboardInterrupt is no longer swallowed.
                print('  fold', fold, 'skipped:', type(err).__name__)

        grid_rows.append({
            'penalizer' : penalizer,
            'l1_ratio': l1_ratio,
            # NaN (without a RuntimeWarning) when every fold failed
            'score': np.mean(scores) if scores else np.nan
        })

# DataFrame.append was removed in pandas 2.0; add all new rows in one step.
# An empty existing table is replaced outright so the numeric dtypes are kept.
new_scores = pd.DataFrame(grid_rows)
if cox_selfprentice_penalizer_scores.empty:
    cox_selfprentice_penalizer_scores = new_scores
else:
    cox_selfprentice_penalizer_scores = pd.concat(
        [cox_selfprentice_penalizer_scores, new_scores], ignore_index = True
    )

penalizer = 1.0 l1_ratio = 0.0



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.1 l1_ratio = 0.0



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 1.2 l1_ratio = 0.0



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

In [20]:
# Extend the finer search downwards: penalizer in {0.7, 0.8, 0.9}, l1_ratio = 0.
# The new rows are added to the existing cox_selfprentice_penalizer_scores table.
#
# NOTE(review): KFold is used without shuffling, so every fold is a contiguous
# range of rows. The y_tr preview suggests the rows are ordered by case status;
# if so the folds are not representative. Consider KFold(shuffle=True,
# random_state=...) -- not changed here because it would alter the reported scores.
kf = KFold(n_splits = 15)  # depends only on the number of rows, so build it once

grid_rows = []  # one summary record per (penalizer, l1_ratio) combination

for i in range(7,10):
    for j in range(0,1):
        penalizer, l1_ratio = i/10, j/3

        print('penalizer =', penalizer, 'l1_ratio =', l1_ratio)

        scores = []

        for fold, (train_index, test_index) in enumerate(kf.split(X_tr)):
            X_tr_train, X_tr_test = X_tr.iloc[train_index], X_tr.iloc[test_index]
            y_tr_train, y_tr_test = y_tr.iloc[train_index], y_tr.iloc[test_index]

            # Fold-local case-subcohort frame. A separate name is used so the
            # full-data case_subcohort_selfprentice_df is not overwritten by the loop.
            fold_train_df = (
                X_tr_train.join(y_tr_train)
                .assign(weight = lambda frame: np.where(frame["subcohort"], 1, 0.001))
                .drop(columns = "subcohort")
            )

            cph = CoxPHFitter(penalizer = penalizer, l1_ratio = l1_ratio)

            try:
                cph.fit(fold_train_df, duration_col = "duration",event_col = "event",weights_col = "weight",robust = True)

                event_times = y_tr_test["duration"]
                event_observed = y_tr_test["event"]

                test_preds = cph.predict_partial_hazard(X_tr_test)
                # NOTE(review): the last argument is an unexplained constant;
                # presumably a sampling weight -- confirm and give it a name.
                scores.append(weighted_concordance(event_times, test_preds, event_observed,0.0011604684001529089))
            except Exception as err:
                # Deliberate best-effort: a fold that cannot be fitted or scored is
                # skipped, but it is reported rather than silently ignored.
                # KeyboardInterrupt is no longer swallowed.
                print('  fold', fold, 'skipped:', type(err).__name__)

        grid_rows.append({
            'penalizer' : penalizer,
            'l1_ratio': l1_ratio,
            # NaN (without a RuntimeWarning) when every fold failed
            'score': np.mean(scores) if scores else np.nan
        })

# DataFrame.append was removed in pandas 2.0; add all new rows in one step.
# An empty existing table is replaced outright so the numeric dtypes are kept.
new_scores = pd.DataFrame(grid_rows)
if cox_selfprentice_penalizer_scores.empty:
    cox_selfprentice_penalizer_scores = new_scores
else:
    cox_selfprentice_penalizer_scores = pd.concat(
        [cox_selfprentice_penalizer_scores, new_scores], ignore_index = True
    )

penalizer = 0.7 l1_ratio = 0.0



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 0.8 l1_ratio = 0.0



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

penalizer = 0.9 l1_ratio = 0.0



  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'asthma_now'].var())
>>> print(df.loc[~events, 'asthma_now'].var())

A very low variance means that the column asthma_now completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.


  return (X - mean) / std

>>> events = df['event'].astype(bool)
>>> print(df.loc[events, 'cancer_site_1.0'].var())
>>> print(df.loc[~events, 'cancer_site_1.0'].var())

A very low variance means that the column cancer_site_1.0

In [21]:
# Pure-L2 results with a valid score, ordered by penalizer
(
    cox_selfprentice_penalizer_scores
    .dropna()
    .query("l1_ratio == 0")
    .sort_values("penalizer")
)

Unnamed: 0,penalizer,l1_ratio,score
3,0.7,0.0,0.582363
4,0.8,0.0,0.582426
5,0.9,0.0,0.582625
0,1.0,0.0,0.582752
1,1.1,0.0,0.582744
2,1.2,0.0,0.582681


1.0 seems to be the best penaliser.

Fitting final model:

In [22]:
# Case-subcohort frame on the full data: covariates joined to the outcomes on
# the shared index. Rows flagged as subcohort get weight 1, every other row
# gets weight 0.001; the subcohort flag is then dropped so it is not a covariate.
case_subcohort_selfprentice_df = (
    X_tr.join(y_tr)
    .assign(weight = lambda frame: np.where(frame["subcohort"], 1, 0.001))
    .drop(columns = "subcohort")
)

In [23]:
# Final model: penalizer = 1 (l1_ratio left at the library default), fitted on
# the full weighted case-subcohort frame with robust = True.
cph3 = CoxPHFitter(penalizer = 1)
cph3.fit(
    case_subcohort_selfprentice_df,
    duration_col = "duration",
    event_col = "event",
    weights_col = "weight",
    robust = True,
)

<lifelines.CoxPHFitter: fitted with 595.595 total observations, 593 right-censored observations>

In [24]:
# Save the fitted final model. The context manager closes the file handle
# (and flushes the pickle to disk) even if pickle.dump raises.
with open('penalized_selfprentice_cox_wc.pkl', 'wb') as model_file:
    pickle.dump(cph3, model_file)