# Model fitting
Writing functions for fitting survival models to data.

In [12]:
import numpy as np
from scipy.stats import weibull_min # r weibull simulation
from scipy.stats import norm # for covariate simulation
from scipy.stats import gamma # for weibull shape parameter
from scipy.stats import bernoulli # for censoring
from scipy.stats import uniform
from scipy.stats.mstats import mquantiles
import pandas as pd

import warnings 
warnings.filterwarnings('ignore')

In [13]:
from data_simulation import *

For writing the functions, we start with a simulated case-subcohort and test dataset:

In [14]:
n_covariates = 10
sample = weibull_simple_linear_sim([1,1,1,1,1,1,1,1,1,1], 0.5, 1500, 0.7, pi = 0.5)

In [15]:
cases, subcohort, cohort, test = cch_splitter(sample)

In [16]:
cases.shape, subcohort.shape, cohort.shape, test.shape

((298, 16), (298, 16), (1000, 16), (500, 16))

In [17]:
cases.to_csv('cases.csv')
subcohort.to_csv('subcohort.csv')

We also need tests for when we only have categorical or only continuous variables.

In [18]:
sample_cont = weibull_simple_linear_sim([1,1,1,1,1,1,1,1,1,1], 0, 1500, 0.7, pi = 0.5)
sample_cont

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,y,end_censor,dropout,end_censor_time,time,event
0,-1.681362,-0.289119,-1.002503,0.743749,-1.350441,0.168640,-0.480428,0.563653,-0.663205,0.013717,2.546802,True,False,0.701635,0.701635,False
1,-0.035623,0.227009,0.123099,-0.148076,-0.803088,0.589035,-0.178951,0.263770,-0.369971,-0.536244,1.017491,True,False,0.701635,0.701635,False
2,0.323331,-0.574447,0.404777,0.925723,2.751700,-0.970148,-1.735850,-1.008092,1.625487,0.583077,0.740215,True,False,0.701635,0.701635,False
3,0.281171,0.373715,1.066568,-0.130397,0.445499,-1.780835,1.095087,0.193621,-1.725621,1.677702,0.959654,True,False,0.701635,0.701635,False
4,1.351294,0.303983,0.887563,-0.042948,-0.342005,0.102409,0.529656,0.306619,0.083053,-0.885592,0.734424,True,False,0.701635,0.701635,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,-0.182762,-1.075576,-0.218914,0.711948,1.677593,0.765391,0.051405,-0.138774,1.488633,0.201001,0.542671,False,False,0.542671,0.542671,True
1496,0.268991,-0.034585,-0.751944,-0.382930,1.658262,0.229947,-1.055611,2.854918,-0.657967,2.269326,0.400945,False,False,0.400945,0.400945,True
1497,0.128068,-0.321857,-0.060037,0.135277,1.977381,0.036908,-0.017118,-0.722662,-1.054160,0.202419,0.376949,False,False,0.376949,0.376949,True
1498,-2.559650,-0.704505,-1.638894,0.007730,0.465091,0.009633,1.057917,-0.257777,-2.736760,-0.066443,3.560886,True,False,0.701635,0.701635,False


In [19]:
cases_cont, subcohort_cont, cohort_cont, test_cont = cch_splitter(sample_cont)

In [20]:
sample_cat = weibull_simple_linear_sim([1,1,1,1,1,1,1,1,1,1], 1, 1500, 0.7, pi = 0.5)
sample_cat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,y,end_censor,dropout,end_censor_time,time,event
0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.521775,True,False,0.288249,0.288249,False
1,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.515405,True,False,0.288249,0.288249,False
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.358431,True,False,0.288249,0.288249,False
3,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.463839,True,False,0.288249,0.288249,False
4,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.241371,False,True,0.241371,0.020517,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.433931,True,False,0.288249,0.288249,False
1496,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.174807,False,False,0.174807,0.174807,True
1497,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.216197,False,False,0.216197,0.216197,True
1498,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.176569,False,True,0.176569,0.085627,False


In [37]:
cases_cat, subcohort_cat, cohort_cat, test_cat = cch_splitter(sample_cat)

## Cox mode, unweighted
First we look at the standard Cox model without adjusting for bias. When the model is not mispecified at all, the performance of the unweighted version may not be that different, however, if the model is mispecified, the results may be poorer.

The following function fits an unweighted Cox model:

In [75]:
from lifelines import CoxPHFitter

In [75]:
def fit_cox(cases, subcohort,n_covariates):
    # Fit a Cox PH model to case-subcohort data.
    
    # cases: cases dataframe
    # cohort: cohort dataframe
    # n_covariates: the number of covariates used in the simulation
    
    # creating a single case subcohort dataframe
    case_subcohort_df = pd.concat([cases,subcohort])
    # removing unnecessary columns and duplicate rows
    case_subcohort_df = case_subcohort_df.loc[case_subcohort_df.duplicated() == False,[i for i in range(0,n_covariates)]+["time", "event"]]
    
    # creating the model and fitting the data
    cph = CoxPHFitter()
    cph.fit(case_subcohort_df, duration_col = "time", event_col = "event")
    return(cph)

### Tests

In [76]:
cph = fit_cox(cases, subcohort,n_covariates)
cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'time'
event col,'event'
baseline estimation,breslow
number of observations,515
number of events observed,300
partial log-likelihood,-1490.37
time fit was run,2022-07-31 07:57:56 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
0,-1.5,0.22,0.09,-1.67,-1.33,0.19,0.26,0.0,-17.2,<0.005,217.78
1,-0.47,0.62,0.07,-0.61,-0.33,0.54,0.72,0.0,-6.63,<0.005,34.79
2,1.11,3.03,0.08,0.96,1.26,2.61,3.52,0.0,14.44,<0.005,154.53
3,-0.33,0.72,0.07,-0.46,-0.2,0.63,0.82,0.0,-5.01,<0.005,20.78
4,0.02,1.02,0.07,-0.11,0.14,0.89,1.15,0.0,0.24,0.81,0.3
5,0.76,2.14,0.12,0.52,1.0,1.68,2.72,0.0,6.2,<0.005,30.74
6,0.29,1.33,0.12,0.05,0.52,1.05,1.68,0.0,2.4,0.02,5.92
7,-0.14,0.87,0.12,-0.38,0.09,0.68,1.09,0.0,-1.2,0.23,2.13
8,0.89,2.43,0.13,0.64,1.14,1.9,3.12,0.0,7.03,<0.005,38.82
9,0.7,2.01,0.12,0.46,0.94,1.59,2.55,0.0,5.79,<0.005,27.09

0,1
Concordance,0.84
Partial AIC,3000.73
log-likelihood ratio test,527.37 on 10 df
-log2(p) of ll-ratio test,352.81


The function runs and returns values close to the simulated coefficients.

## Cox Model, Barlow Weights

To fit the model with Barlow weights, we split the data points corresponding to events into two parts. We use the case data to construct the interval at the event which has weight 1. Using the subcohort data, we construct the interval before the event that has weight $\frac{1}{\alpha}$. All non-events in the subcohort have weight $\frac{1}{\alpha}$ while in the risk set.

Function for changing data for Cox model with Barlow weights:

In [77]:
def barlow_trans(cases,subcohort, n_covariates, alpha):
    # Transform a case-subcohort data to the Barlow weight scheme.
    
    # cases: cases dataframe
    # subcohort: subcohort dataframe
    # n_covariates: number of covariates used in the simulation
    # alpha: the sampling proportion used for the subcohort
    
    
    # finding the order of magnitude of data to pick the appropriate size of each "instant". We use the largest event time for this.
    order = int(np.floor(np.log(max(cases["time"]))/np.log(10))) 
    
    cases = cases.assign(
        # setting events outside subcohort to start just before they occur
        start_time = lambda df: df["time"] - 10**-(- order + 5),
        # adding appropriate weight
        weight = 1,
        subcohort = False
    )
    # setting times < 0  to 0
    cases["start_time"] = np.where(cases["start_time"] < 0, 0, cases["start_time"]) 
    
    subcohort = subcohort.assign(
        # if it is a case, the weight should be the same as the subcohort until close to the time of the event. 
        time = lambda df: np.where(df["event"], df["time"] - 10**-(- order + 5), df["time"]), 
        # the events start from the origin
        start_time = 0, 
        event = False,
        weight = 1/alpha,
        subcohort = True
    )
    # drop any rows where the start time in cases is 0, this is equivaent to time < 0 in subcohort
    subcohort = subcohort.query('time > 0')

    return(pd.concat([cases,subcohort])[[i for i in range(0,n_covariates)]+["start_time","time", "event","weight","subcohort"]])

In [78]:
def fit_cox_barlow(cases, subcohort,n_covariates,len_cohort):
    # Fit a Cox PH model to case-subcohort transformed to have Barlow weights.
    
    # cases: cases dataframe
    # subcohort: subcohort dataframe
    # n_covariates: number of covariates used in the simulation

    case_subcohort_df = barlow_trans(cases,subcohort,n_covariates,len(subcohort)/len_cohort).drop(columns = "subcohort")
    
    # creating the model and fitting the data
    cph = CoxPHFitter()
    cph.fit(case_subcohort_df, entry_col = "start_time", duration_col = "time",event_col = "event",weights_col = "weight",robust = True)
    return(cph)

### Tests

First let us test that the `barlow_trans` works in the desired manner.

In [79]:
test_cases = pd.DataFrame(
    {0: [1,2,3,4],
     "time": [0.000001,0.000001,2,3],
     "event": [True,True,True,True]})
test_cases

Unnamed: 0,0,time,event
0,1,1e-06,True
1,2,1e-06,True
2,3,2.0,True
3,4,3.0,True


In [80]:
test_subcohort = pd.DataFrame(
    {0: [1,2,3,4],
     "time": [0.000001,2,3,4],
      "index": [1,3,4,5],
     "event": [True,True,True,False]}).set_index("index")
test_subcohort

Unnamed: 0_level_0,0,time,event
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,1e-06,True
3,2,2.0,True
4,3,3.0,True
5,4,4.0,False


For `test_cases`, we expect that all the `start_time`s should be `time`$-10^{-5}$, unless this would result in them being less than 0, like for 0 and 1, so these should be 0. `weight` should be 1, and `subcohort` should be `False`.

For `test_subcohort`, all the entries should have `start_time` $=0$. The first entry should be removed because the start time is already 0 in the case set so we do not need the duplication. Index 3 should have a `start_time` that is $-10^{-5}$ the `time`. For 4 and 5 there should be no other changes. Weight should be $1/\alpha$.

In [81]:
barlow_trans(test_cases,test_subcohort, 1, 1/5)

Unnamed: 0,0,start_time,time,event,weight,subcohort
0,1,0.0,1e-06,True,1.0,False
1,2,0.0,1e-06,True,1.0,False
2,3,1.99999,2.0,True,1.0,False
3,4,2.99999,3.0,True,1.0,False
3,2,0.0,1.99999,False,5.0,True
4,3,0.0,2.99999,False,5.0,True
5,4,0.0,4.0,False,5.0,True


Testing if the model runs:

In [82]:
cph2 = fit_cox_barlow(cases, subcohort,10,1000)

In [83]:
cph2.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'time'
event col,'event'
weights col,'weight'
entry col,'start_time'
robust variance,True
baseline estimation,breslow
number of observations,1300
number of events observed,300
partial log-likelihood,-1577.96

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
0,-1.87,0.15,0.13,-2.13,-1.61,0.12,0.2,0.0,-14.31,<0.005,151.81
1,-0.62,0.54,0.11,-0.84,-0.39,0.43,0.68,0.0,-5.4,<0.005,23.85
2,1.41,4.08,0.11,1.2,1.61,3.32,5.02,0.0,13.3,<0.005,131.63
3,-0.41,0.66,0.09,-0.59,-0.23,0.55,0.8,0.0,-4.38,<0.005,16.37
4,-0.07,0.94,0.09,-0.25,0.11,0.78,1.12,0.0,-0.72,0.47,1.09
5,1.29,3.63,0.19,0.92,1.65,2.52,5.22,0.0,6.93,<0.005,37.78
6,0.56,1.75,0.17,0.22,0.9,1.24,2.46,0.0,3.22,<0.005,9.62
7,-0.1,0.9,0.18,-0.45,0.25,0.64,1.28,0.0,-0.58,0.56,0.83
8,1.25,3.5,0.2,0.86,1.65,2.36,5.2,0.0,6.21,<0.005,30.84
9,0.91,2.47,0.18,0.56,1.25,1.75,3.49,0.0,5.14,<0.005,21.82

0,1
Concordance,0.50
Partial AIC,3175.91
log-likelihood ratio test,868.53 on 10 df
-log2(p) of ll-ratio test,596.04


Again the coefficients seem close enough to the simulated coefficients.

## Cox Model, Prentice Weights

To fit the model with Prentice weights, we split the data points corresponding to events into two parts. We use the case data to construct the interval at the event which has weight 1. Using the subcohort data, we construct the interval before the event that has weight $1$. All non-events in the subcohort have weight $1$ while in the risk set.

Function for changing data for Cox model with Prentice weights:

In [84]:
def prentice_trans(cases,subcohort,n_covariates,len_cohort):
    # Transform a case-subcohort dataset to a Prentice weighting scheme.
    
    # cases: cases dataframe
    # subcohort: subcohort dataframe
    # n_covariates: number of covariates used in the simulation
    
    # finding the order of magnitude of data to pick the appropriate size of each "instant". We use the largest event time for this.
    order = int(np.floor(np.log(max(cases["time"]))/np.log(10))) 
    
    
    cases = cases.assign(
        # rounding all of the 
#         time = round(cases["time"],- order + 5),
        # setting events outside subcohort to start just before they occur
        start_time = lambda df: df["time"] - 10**-(- order + 5),
        # adding appropriate weight
        weight = 1,
        subcohort = False
    )
    # setting times < 0  to 0
    cases["start_time"] = np.where(cases["start_time"] < 0, 0, cases["start_time"]) 
    
    subcohort = subcohort.assign(
        # if it is a case, the weight should be the same as the subcohort until close to the time of the event. 
        time = lambda df: np.where(df["event"], df["time"] - 10**-(- order + 5), df["time"]), 
        # the events start from the origin
        start_time = 0, 
        event = False,
        weight = 1,
        subcohort = True
    )
    # drop any rows where the start time in cases is 0, this is equivaent to time < 0 in subcohort
    subcohort = subcohort.query('time > 0')

    return(pd.concat([cases,subcohort])[[i for i in range(0,n_covariates)]+["start_time","time", "event","weight","subcohort"]])

In [85]:
def fit_cox_prentice(cases, subcohort,n_covariates):
    # Fit a Cox model with Prentice weights to case-subcohort data
    
    # cases: cases dataframe
    # subcohort: subcohort dataframe
    # n_covariates: number of covariates used in the simulation
    
    case_subcohort_df = prentice_trans(cases,subcohort,n_covariates,1000).drop(columns = "subcohort")
    
    # creating the model and fitting the data
    cph = CoxPHFitter()
    cph.fit(case_subcohort_df, entry_col = "start_time", duration_col = "time",event_col = "event",weights_col = "weight",robust = True)
    return(cph)

### Tests

For `test_cases`, we expect that all the `start_time`s should be `time`$-10^{-5}$, unless this would result in them being less than 0, like for 0 and 1, so these should be 0. `weight` should be 1, and `subcohort` should be `False`.

For `test_subcohort`, all the entries should have `start_time` $=0$. The first entry should be removed because the start time is already 0 in the case set so we do not need the duplication. Index 3 should have a `start_time` that is $-10^{-5}$ the `time`. For 4 and 5 there should be no other changes.`weight` should be 1.

In [86]:
prentice_trans(test_cases,test_subcohort, 1,1000)

Unnamed: 0,0,start_time,time,event,weight,subcohort
0,1,0.0,1e-06,True,1,False
1,2,0.0,1e-06,True,1,False
2,3,1.99999,2.0,True,1,False
3,4,2.99999,3.0,True,1,False
3,2,0.0,1.99999,False,1,True
4,3,0.0,2.99999,False,1,True
5,4,0.0,4.0,False,1,True


In [87]:
cph3 = fit_cox_prentice(cases,subcohort,n_covariates)

In [88]:
cph3.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'time'
event col,'event'
weights col,'weight'
entry col,'start_time'
robust variance,True
baseline estimation,breslow
number of observations,600
number of events observed,300
partial log-likelihood,-1225.31

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
0,-1.83,0.16,0.12,-2.05,-1.6,0.13,0.2,0.0,-15.75,<0.005,183.26
1,-0.6,0.55,0.1,-0.79,-0.41,0.45,0.66,0.0,-6.18,<0.005,30.51
2,1.38,3.96,0.08,1.21,1.54,3.37,4.66,0.0,16.52,<0.005,201.33
3,-0.4,0.67,0.07,-0.54,-0.25,0.58,0.78,0.0,-5.38,<0.005,23.66
4,-0.06,0.94,0.07,-0.21,0.08,0.81,1.08,0.0,-0.88,0.38,1.39
5,1.26,3.52,0.15,0.97,1.55,2.63,4.7,0.0,8.48,<0.005,55.27
6,0.55,1.73,0.15,0.25,0.84,1.29,2.31,0.0,3.67,<0.005,12.0
7,-0.11,0.9,0.16,-0.41,0.2,0.66,1.22,0.0,-0.7,0.48,1.05
8,1.22,3.38,0.17,0.89,1.55,2.43,4.7,0.0,7.22,<0.005,40.85
9,0.89,2.45,0.14,0.62,1.17,1.85,3.23,0.0,6.31,<0.005,31.76

0,1
Concordance,0.50
Partial AIC,2470.62
log-likelihood ratio test,853.16 on 10 df
-log2(p) of ll-ratio test,585.05


## Cox mode, Self and Prentice Weights


To fit the model with Self and Prentice weights, we split the data points corresponding to events into two parts. Case data has weight $0$ in the pseudo-partial likelihood. The Cox fitter does not support settingn weight to 0, so let the weight be extremely small. Subcohort data has weight $1$.

Function for changing data for Cox model with Prentice weights:

In [89]:
def self_prentice_trans(cases,subcohort,n_covariates,len_cohort):
    # Transform a case-subcohort dataset to a Self-Prentice weighting scheme.
    
    # cases: cases dataframe
    # subcohort: subcohort dataframe
    # n_covariates: number of covariates used in the simulation
    
    # finding the order of magnitude of data to pick the appropriate size of each "instant". We use the largest event time for this.
    order = int(np.floor(np.log(max(cases["time"]))/np.log(10))) 
    
    # removing the cases that are in the subcohort from the cases data frame
    cases = cases[~cases.index.isin(subcohort.index)]
    # Adding the non-subcohort case weights
    cases["weight"] = 10**(-order - 5)
    cases["subcohort"] = False
    
    subcohort = subcohort.assign(
        weight = 1,
        subcohort = True
    )

    return(pd.concat([cases,subcohort])[[i for i in range(0,n_covariates)]+["time", "event","weight","subcohort"]])

In [90]:
def fit_cox_self_prentice(cases, subcohort,n_covariates):
    # Fit a Cox model with Self-Prentice weights to case-subcohort the data
    
    # cases: cases dataframe
    # subcohort: subcohort dataframe
    # n_covariates: number of covariates used in the simulation
    
    case_subcohort_df = self_prentice_trans(cases,subcohort,n_covariates,1000).drop(columns = "subcohort")
    
    # creating the model and fitting the data
    cph = CoxPHFitter()
    cph.fit(case_subcohort_df, duration_col = "time",event_col = "event",weights_col = "weight",robust = True)
    return(cph)

### Tests

`weight`s for the cases not in the subcohort should be very close to 0 and for subcohort close to 1. Any cases that are in the subcohort should not have duplicates.

In [91]:
self_prentice_trans(test_cases,test_subcohort, 1,1000)

Unnamed: 0,0,time,event,weight,subcohort
0,1,1e-06,True,1e-05,False
2,3,2.0,True,1e-05,False
1,1,1e-06,True,1.0,True
3,2,2.0,True,1.0,True
4,3,3.0,True,1.0,True
5,4,4.0,False,1.0,True


In [92]:
cph4 = fit_cox_self_prentice(cases,subcohort,10)

In [93]:
cph4.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'time'
event col,'event'
weights col,'weight'
robust variance,True
baseline estimation,breslow
number of observations,300.022
number of events observed,85.0215
partial log-likelihood,-354.09
time fit was run,2022-07-31 07:57:57 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
0,-1.82,0.16,0.16,-2.13,-1.5,0.12,0.22,0.0,-11.36,<0.005,96.86
1,-0.47,0.62,0.14,-0.74,-0.2,0.48,0.82,0.0,-3.41,<0.005,10.61
2,1.18,3.24,0.14,0.9,1.46,2.45,4.29,0.0,8.27,<0.005,52.78
3,-0.39,0.68,0.13,-0.64,-0.15,0.53,0.86,0.0,-3.11,<0.005,9.06
4,-0.19,0.83,0.13,-0.45,0.07,0.64,1.07,0.0,-1.44,0.15,2.73
5,0.94,2.56,0.24,0.48,1.4,1.62,4.06,0.0,4.0,<0.005,13.96
6,0.5,1.65,0.22,0.07,0.93,1.07,2.53,0.0,2.27,0.02,5.41
7,-0.2,0.82,0.23,-0.65,0.25,0.52,1.28,0.0,-0.87,0.38,1.38
8,1.05,2.86,0.23,0.59,1.51,1.81,4.51,0.0,4.51,<0.005,17.23
9,0.85,2.35,0.23,0.41,1.3,1.51,3.67,0.0,3.77,<0.005,12.58

0,1
Concordance,0.83
Partial AIC,728.18
log-likelihood ratio test,229.82 on 10 df
-log2(p) of ll-ratio test,142.94


## Penalised Cox Regression,Barlow weights

For simplicity, consider L1, L2 and 0.5 L1 weight in the penality function. We use k-fold cross validation to find the optimal $\alpha$. We need to adapt the inbuilt `k_fold_cross_validation` function to accomodate changing the dataset for the time dependent weights.

Importing a function for using k-fold cross validation adapted from the Lifelines package:

In [94]:
from cox_k_fold import cox_k_fold

In [95]:
 cox_k_fold(
    cph2, cases, subcohort,10,1000, prentice_trans, "time", event_col="event", k=5, scoring_method="log_likelihood", fitter_kwargs={"weights_col": "weight", "robust": True}
)

[-0.729862254812239,
 -0.4340706654078084,
 -0.43481065649952044,
 -0.479719912808315,
 -0.41543919817466113]

In [96]:
def fit_pen_cox_barlow(cases, subcohort,n_covariates, len_cohort, l1_ratio = 0, penalizer_show = False):
    # Fit a penalized Cox model to case-subcohort data transformed to have Barlow weights.
    
    # choosing the penaliser via cross validation
    avg_score = []
    for penalizer in range(0,20):
        score = cox_k_fold(CoxPHFitter(penalizer = penalizer/10),cases, subcohort,n_covariates, len_cohort, barlow_trans,"time", event_col="event", k=5, scoring_method="log_likelihood", fitter_kwargs={"weights_col": "weight", "robust": True})
        avg_score.append(np.mean(score))
    penalizer = int(np.where(avg_score == max(avg_score))[0])/10
    
    # creating the model and fitting the data
    cph = CoxPHFitter(penalizer = penalizer,l1_ratio = l1_ratio)
    case_subcohort_df = barlow_trans(cases,subcohort,n_covariates, len_cohort).drop(columns = "subcohort")
    cph.fit(case_subcohort_df, entry_col = "start_time", duration_col = "time",event_col = "event",weights_col = "weight",robust = True)
    if penalizer_show:
        return(cph, penalizer)
    else:
        return(cph)

In [97]:
cph5, penalizer = fit_pen_cox_barlow(cases,subcohort,10,1000, l1_ratio = 0, penalizer_show = True)
print(penalizer)

0.0


In [98]:
cph5.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'time'
event col,'event'
weights col,'weight'
entry col,'start_time'
robust variance,True
baseline estimation,breslow
number of observations,300.3
number of events observed,300
partial log-likelihood,-28.36

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
0,-1.0,0.37,0.38,-1.74,-0.26,0.17,0.77,0.0,-2.66,0.01,6.99
1,-0.3,0.74,0.44,-1.17,0.57,0.31,1.77,0.0,-0.68,0.5,1.01
2,0.78,2.19,0.36,0.08,1.49,1.08,4.42,0.0,2.18,0.03,5.1
3,-0.21,0.81,0.39,-0.97,0.55,0.38,1.74,0.0,-0.54,0.59,0.76
4,0.01,1.01,0.33,-0.64,0.67,0.53,1.95,0.0,0.04,0.97,0.05
5,0.67,1.95,0.7,-0.7,2.04,0.49,7.69,0.0,0.95,0.34,1.56
6,0.3,1.35,0.75,-1.17,1.77,0.31,5.89,0.0,0.4,0.69,0.54
7,-0.3,0.74,0.77,-1.82,1.22,0.16,3.37,0.0,-0.39,0.7,0.52
8,0.55,1.74,0.76,-0.93,2.04,0.39,7.7,0.0,0.73,0.46,1.11
9,0.61,1.84,0.69,-0.75,1.97,0.47,7.15,0.0,0.88,0.38,1.41

0,1
Concordance,0.50
Partial AIC,76.72
log-likelihood ratio test,75.42 on 10 df
-log2(p) of ll-ratio test,37.88


## Penalised Cox Model, Prentice Weights

In [99]:
def fit_pen_cox_prentice(cases, subcohort,n_covariates,len_cohort, l1_ratio = 0, penalizer_show = False):
    # Fitting penalized Cox model to case-sucbohort data with Prentice weights.
    
    # choosing the penaliser via k-fold cross validation
    avg_score = []
    for penalizer in range(0,20):
        score = cox_k_fold(CoxPHFitter(penalizer = penalizer/10),cases, subcohort,n_covariates,len_cohort, prentice_trans,"time", event_col="event", k=5, scoring_method="log_likelihood", fitter_kwargs={"weights_col": "weight", "robust": True})
        avg_score.append(np.mean(score))
    penalizer = int(np.where(avg_score == max(avg_score))[0])/10
    
    # creating the model and fitting the data
    cph = CoxPHFitter(penalizer = penalizer,l1_ratio = l1_ratio)
    case_subcohort_df = prentice_trans(cases,subcohort,n_covariates,len_cohort).drop(columns = "subcohort")
    cph.fit(case_subcohort_df, entry_col = "start_time", duration_col = "time",event_col = "event",weights_col = "weight",robust = True)
    if penalizer_show:
        return(cph, penalizer)
    else:
        return(cph)

In [100]:
cph6, penalizer = fit_pen_cox_prentice(cases,subcohort,10,1000, l1_ratio = 0,penalizer_show = True)
print(penalizer)

0.0


In [101]:
cph6.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'time'
event col,'event'
weights col,'weight'
entry col,'start_time'
robust variance,True
baseline estimation,breslow
number of observations,600
number of events observed,300
partial log-likelihood,-1225.31

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
0,-1.83,0.16,0.12,-2.05,-1.6,0.13,0.2,0.0,-15.75,<0.005,183.26
1,-0.6,0.55,0.1,-0.79,-0.41,0.45,0.66,0.0,-6.18,<0.005,30.51
2,1.38,3.96,0.08,1.21,1.54,3.37,4.66,0.0,16.52,<0.005,201.33
3,-0.4,0.67,0.07,-0.54,-0.25,0.58,0.78,0.0,-5.38,<0.005,23.66
4,-0.06,0.94,0.07,-0.21,0.08,0.81,1.08,0.0,-0.88,0.38,1.39
5,1.26,3.52,0.15,0.97,1.55,2.63,4.7,0.0,8.48,<0.005,55.27
6,0.55,1.73,0.15,0.25,0.84,1.29,2.31,0.0,3.67,<0.005,12.0
7,-0.11,0.9,0.16,-0.41,0.2,0.66,1.22,0.0,-0.7,0.48,1.05
8,1.22,3.38,0.17,0.89,1.55,2.43,4.7,0.0,7.22,<0.005,40.85
9,0.89,2.45,0.14,0.62,1.17,1.85,3.23,0.0,6.31,<0.005,31.76

0,1
Concordance,0.50
Partial AIC,2470.62
log-likelihood ratio test,853.16 on 10 df
-log2(p) of ll-ratio test,585.05


## Penalised Cox Regression, Self and Prentice 

In [102]:
def fit_pen_cox_self_prentice(cases, subcohort,n_covariates,len_cohort, l1_ratio = 0, penalizer_show = False):
    # Fits penalized Cox model to case-subcohort data with Self-Prentice weights
    
    # choosing the penaliser via k-fold cross-validation
    avg_score = []
    for penalizer in range(0,20):
        score = cox_k_fold(CoxPHFitter(penalizer = penalizer/10),cases, subcohort,n_covariates,len_cohort, self_prentice_trans,"time", event_col="event", k=5, scoring_method="log_likelihood", fitter_kwargs={"weights_col": "weight", "robust": True})
        avg_score.append(np.mean(score))
    penalizer = int(np.where(avg_score == max(avg_score))[0])/10
    
    # creating the model and fitting the data
    cph = CoxPHFitter(penalizer = penalizer,l1_ratio = l1_ratio)
    case_subcohort_df = self_prentice_trans(cases,subcohort,n_covariates,len_cohort).drop(columns = "subcohort")
    cph.fit(case_subcohort_df, duration_col = "time",event_col = "event",weights_col = "weight",robust = True)
    if penalizer_show:
        return(cph, penalizer)
    else:
        return(cph)

In [103]:
cph7, penalizer = fit_pen_cox_self_prentice(cases,subcohort,10,1000, l1_ratio = 0,penalizer_show = True)
print(penalizer)

0.0


In [104]:
cph7.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'time'
event col,'event'
weights col,'weight'
robust variance,True
baseline estimation,breslow
number of observations,300.022
number of events observed,85.0215
partial log-likelihood,-354.09
time fit was run,2022-07-31 07:58:44 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
0,-1.82,0.16,0.16,-2.13,-1.5,0.12,0.22,0.0,-11.36,<0.005,96.86
1,-0.47,0.62,0.14,-0.74,-0.2,0.48,0.82,0.0,-3.41,<0.005,10.61
2,1.18,3.24,0.14,0.9,1.46,2.45,4.29,0.0,8.27,<0.005,52.78
3,-0.39,0.68,0.13,-0.64,-0.15,0.53,0.86,0.0,-3.11,<0.005,9.06
4,-0.19,0.83,0.13,-0.45,0.07,0.64,1.07,0.0,-1.44,0.15,2.73
5,0.94,2.56,0.24,0.48,1.4,1.62,4.06,0.0,4.0,<0.005,13.96
6,0.5,1.65,0.22,0.07,0.93,1.07,2.53,0.0,2.27,0.02,5.41
7,-0.2,0.82,0.23,-0.65,0.25,0.52,1.28,0.0,-0.87,0.38,1.38
8,1.05,2.86,0.23,0.59,1.51,1.81,4.51,0.0,4.51,<0.005,17.23
9,0.85,2.35,0.23,0.41,1.3,1.51,3.67,0.0,3.77,<0.005,12.58

0,1
Concordance,0.83
Partial AIC,728.18
log-likelihood ratio test,229.82 on 10 df
-log2(p) of ll-ratio test,142.94


## Decision tree, random over sampler

In [105]:
from imblearn.over_sampling import RandomOverSampler

In [27]:
from sksurv.tree import SurvivalTree
from sksurv.util import Surv

We set the "class" to event, because controls are undersampled. We want there to be similar to in the cohort, so we want the number of cases simply to be $n_\text{cases}$, and the number of controls to be $n_\text{cohort} - n_\text{controls}$.

In [107]:
def ros_tree(cases,subcohort,n_covariates):
    # Fit a decision tree to case-subcohort dataset using random-oversampling
    
    # creating case-subcohort data frame and removing duplicate entries of cases
    case_subcohort = pd.concat([cases,subcohort])
    case_subcohort = case_subcohort.drop(columns = 'subcohort').drop_duplicates()
    
    # oversampled data set
    # "covariates"
    X = case_subcohort[[i for i in range(0,n_covariates)]+['time']]
    # "classes" to be oversampled. Here, cases
    y = case_subcohort["event"]
    ros = RandomOverSampler(sampling_strategy = {True: len(cases), False: len(cohort) - len(cases)})
    X_resampled, y_resampled = ros.fit_resample(X, y)
    
    # matrix of covariates
    X_train = X_resampled[range(0,n_covariates)]
    # (event,time) response array
    y_train = Surv().from_arrays(y_resampled, X_resampled['time'])
    
    # fitting the tree
    tree = SurvivalTree()
    tree.fit(X_train, y_train)
    
    return(tree)

In [108]:
tree = ros_tree(cases,subcohort,n_covariates)

### SMOTENC

In [22]:
from imblearn.over_sampling import SMOTENC

In [25]:
def smotenc_tree(cases,subcohort,n_covariates):
    # Fit a decision tree with to case-subcohort data oversampled using SMOTENC
    
    case_subcohort = pd.concat([cases,subcohort])
    case_subcohort = case_subcohort.drop_duplicates()

    # oversampled data set
    # "covariates"
    X = case_subcohort[[i for i in range(0,n_covariates)]+['time']]
    # "classes" to be oversampled. Here, cases
    y = case_subcohort["event"]
    categorical_features = list(np.where([sum(~(cases[i].isin([0,1]))) == 0 for i in range(0,n_covariates)])[0])
    smote_nc = SMOTENC(categorical_features=categorical_features)
    X_resampled, y_resampled = smote_nc.fit_resample(X, y)
    
    # matrix of covariates
    X_train = X_resampled[range(0,n_covariates)]
    # (event,time) response array
    y_train = Surv().from_arrays(y_resampled, X_resampled['time'])
    
    # fitting the tree
    tree = SurvivalTree()
    tree.fit(X_train, y_train)
    
    return(tree)

In [35]:
tree2 = smotenc_tree(cases,subcohort,n_covariates)

In [38]:
tree2 = smotenc_tree(cases_cat,subcohort_cat,n_covariates)

### SMOTE

In [29]:
from imblearn.over_sampling import SMOTE

In [30]:
def smote_tree(cases,subcohort,n_covariates):
    # Fit a decision tree to case-subcohort data oversampled using SMOTE
    
    case_subcohort = pd.concat([cases,subcohort])
    case_subcohort = case_subcohort.drop_duplicates()

    # oversampled data set
    # "covariates"
    X = case_subcohort[[i for i in range(0,n_covariates)]+['time']]
    # "classes" to be oversampled. Here, cases
    y = case_subcohort["event"]
    smote = SMOTE()
    X_resampled, y_resampled = smote.fit_resample(X, y)
    
    # matrix of covariates
    X_train = X_resampled[range(0,n_covariates)]
    # (event,time) response array
    y_train = Surv().from_arrays(y_resampled, X_resampled['time'])
    
    # fitting the tree
    tree = SurvivalTree()
    tree.fit(X_train, y_train)
    
    return(tree)

In [31]:
tree3 = smote_tree(cases_cont,subcohort_cont,n_covariates)

## Random survival forest, Naive random over-sampling

In [115]:
from imblearn.over_sampling import RandomOverSampler

In [47]:
from sksurv.ensemble import RandomSurvivalForest

We set the "class" to event, because controls are undersampled. We want there to be similar to in the cohort, so we want the number of cases simply to be $n_\text{cases}$, and the number of controls to be $n_\text{cohort} - n_\text{controls}$.

Function fitting random oversampled random survival forest.

In [117]:
def ros_rsf(cases,subcohort,n_covariates):
    # Fit a random survival forest to case-subcohort data with random oversampling
    
    # creating case-subcohort data frame and removing duplicate entries of cases
    case_subcohort = pd.concat([cases,subcohort])
    case_subcohort = case_subcohort.drop(columns = 'subcohort').drop_duplicates()
    
    # oversampled data set
    # "covariates"
    X = case_subcohort[[i for i in range(0,n_covariates)]+['time']]
    # "classes" to be oversampled. Here, cases
    y = case_subcohort["event"]
    ros = RandomOverSampler(sampling_strategy = {True: len(cases), False: len(cohort) - len(cases)})
    X_resampled, y_resampled = ros.fit_resample(X, y)
    
    # matrix of covariates
    X_train = X_resampled[range(0,n_covariates)]
    # (event,time) response array
    y_train = Surv().from_arrays(y_resampled, X_resampled['time'])
    
    # fitting the random survival forest
    rsf = RandomSurvivalForest(n_estimators=1000)
    rsf.fit(X_train, y_train)
    
    return(rsf)

In [118]:
rsf = ros_rsf(cases,subcohort,n_covariates)

### SMOTENC

In [119]:
from imblearn.over_sampling import SMOTENC

In [120]:
def smotenc_rsf(cases,subcohort,n_covariates):
    # Fit a random survival forest oversampled with SMOTENC
    
    case_subcohort = pd.concat([cases,subcohort])
    case_subcohort = case_subcohort.drop(columns = 'subcohort').drop_duplicates()

    # oversampled data set
    # "covariates"
    X = case_subcohort[[i for i in range(0,n_covariates)]+['time']]
    # "classes" to be oversampled. Here, cases
    y = case_subcohort["event"]
    categorical_features = list(np.where([sum(~(cases[i].isin([0,1]))) == 0 for i in range(0,10)])[0])
    smote_nc = SMOTENC(categorical_features=categorical_features)
    X_resampled, y_resampled = smote_nc.fit_resample(X, y)
    
    # matrix of covariates
    X_train = X_resampled[range(0,n_covariates)]
    # (event,time) response array
    y_train = Surv().from_arrays(y_resampled, X_resampled['time'])
    
    # fitting the random survival forest
    rsf = RandomSurvivalForest(n_estimators=1000)
    rsf.fit(X_train, y_train)
    
    return(rsf)

In [121]:
rsf2 = smotenc_rsf(cases,subcohort,n_covariates)

### SMOTE

In [45]:
def smote_rsf(cases,subcohort,n_covariates):
    # Fit a random survival forest oversampled with SMOTE
    
    case_subcohort = pd.concat([cases,subcohort])
    case_subcohort = case_subcohort.drop_duplicates()

    # oversampled data set
    # "covariates"
    X = case_subcohort[[i for i in range(0,n_covariates)]+['time']]
    # "classes" to be oversampled. Here, cases
    y = case_subcohort["event"]
    smote = SMOTE()
    X_resampled, y_resampled = smote.fit_resample(X, y)
    
    # matrix of covariates
    X_train = X_resampled[range(0,n_covariates)]
    # (event,time) response array
    y_train = Surv().from_arrays(y_resampled, X_resampled['time'])
    
    # fitting the random survival forest
    rsf = RandomSurvivalForest(n_estimators=1000)
    rsf.fit(X_train, y_train)
    
    return(rsf)

In [48]:
rsf2 = smote_rsf(cases_cont,subcohort_cont,n_covariates)