# Linear Model Fits
Fitting the models for simple linear model

In [1]:
import numpy as np
from scipy.stats import weibull_min # r weibull simulation
from scipy.stats import norm # for covariate simulation
from scipy.stats import gamma # for weibull shape parameter
from scipy.stats import bernoulli # for censoring
from scipy.stats import uniform
from scipy.stats.mstats import mquantiles
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
from data_simulation import *

For writing the functions, we start with a simulated case-subcohort and test dataset:

In [3]:
n_covariates = 10
sample = weibull_simple_linear_sim([1,1,1,1,1,1,1,1,1,1], 0.5, 1500, 0.7, pi = 0.5)
sample

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,y,end_censor,dropout,end_censor_time,time,event
0,1.196476,0.191833,-0.073135,0.757535,0.061021,1.0,1.0,0.0,0.0,1.0,0.300343,False,False,0.300343,0.300343,True
1,0.589900,-1.433206,0.065517,0.710613,-0.154888,1.0,0.0,0.0,0.0,0.0,0.925423,True,False,0.425730,0.425730,False
2,1.962799,1.096334,0.070341,-0.014216,0.274551,0.0,0.0,1.0,1.0,1.0,0.267360,False,False,0.267360,0.267360,True
3,0.833765,3.390531,1.831505,-0.176279,-2.001757,0.0,1.0,1.0,0.0,1.0,0.236180,False,True,0.236180,0.098400,False
4,-1.096859,-0.663814,0.520606,-1.222916,0.353255,0.0,1.0,1.0,0.0,1.0,0.651060,True,False,0.425730,0.425730,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,-0.502243,0.249911,0.212943,1.415288,-0.840517,0.0,0.0,0.0,1.0,1.0,0.628848,True,False,0.425730,0.425730,False
1496,-1.876473,1.443938,-0.412877,0.444524,-0.583017,0.0,0.0,1.0,0.0,0.0,1.021437,True,False,0.425730,0.425730,False
1497,-0.400989,0.171076,0.208079,0.950630,0.404573,0.0,0.0,1.0,1.0,1.0,0.446388,True,False,0.425730,0.425730,False
1498,-1.047828,1.184693,-0.405803,0.479497,0.592869,1.0,0.0,1.0,0.0,1.0,0.466816,True,False,0.425730,0.425730,False


In [4]:
cases, subcohort, cohort, test = cch_splitter(sample)

In [5]:
cases.shape, subcohort.shape, cohort.shape, test.shape

((300, 16), (300, 16), (1000, 16), (500, 16))

## Cox mode, unweighted
First we look at the standard Cox model without adjusting for bias. When the model is not mispecified at all, the performance of the unweighted version may not be that different, however, if the model is mispecified, the results may be poorer.

The following function fits an unweighted Cox model for the Weibull simple linear case.

In [6]:
from lifelines import CoxPHFitter

def fit_cox(cases, subcohort,n_covariates):
    # creating a single case subcohort dataframe
    case_subcohort_df = pd.concat([cases,subcohort])
    # removing unnecessary columns and duplicate rows
    case_subcohort_df = case_subcohort_df.loc[case_subcohort_df.duplicated() == False,[i for i in range(0,n_covariates)]+["time", "event"]]
    
    # creating the model and fitting the data
    cph = CoxPHFitter()
    cph.fit(case_subcohort_df, duration_col = "time", event_col = "event")
    return(cph)

In [7]:
cph = fit_cox(cases, subcohort,n_covariates)
cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'time'
event col,'event'
baseline estimation,breslow
number of observations,516
number of events observed,300
partial log-likelihood,-1503.32
time fit was run,2022-07-31 12:15:09 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
0,0.83,2.3,0.07,0.7,0.96,2.02,2.62,0.0,12.56,<0.005,117.84
1,0.77,2.16,0.08,0.62,0.92,1.86,2.5,0.0,10.17,<0.005,78.32
2,0.82,2.28,0.07,0.68,0.97,1.98,2.63,0.0,11.3,<0.005,95.97
3,0.78,2.17,0.07,0.65,0.9,1.91,2.47,0.0,11.82,<0.005,104.64
4,0.84,2.32,0.07,0.69,0.99,2.0,2.68,0.0,11.28,<0.005,95.67
5,0.83,2.3,0.13,0.58,1.08,1.78,2.96,0.0,6.45,<0.005,33.03
6,1.0,2.73,0.13,0.75,1.26,2.11,3.52,0.0,7.72,<0.005,46.24
7,0.69,1.99,0.12,0.45,0.93,1.57,2.54,0.0,5.63,<0.005,25.7
8,0.94,2.57,0.13,0.69,1.19,2.0,3.3,0.0,7.38,<0.005,42.49
9,0.74,2.1,0.13,0.49,0.99,1.64,2.69,0.0,5.87,<0.005,27.78

0,1
Concordance,0.83
Partial AIC,3026.64
log-likelihood ratio test,485.26 on 10 df
-log2(p) of ll-ratio test,322.91


In [8]:
from lifelines.utils import concordance_index
from sksurv.util import Surv

def concordance_score(n_covariates,test,model,censored = False,lifelines = False):
    
    X_test = test[range(0,n_covariates)]
    
    if censored == False:
        event_times = test["y"]
        event_observed = test["event"]
    else:
        event_times = test["time"]
        event_observed = test["event"]
    
    if lifelines:
        # test predictions
        test_preds = model.predict_partial_hazard(X_test)
        score = concordance_index(event_times, -1*test_preds, event_observed)
    else:
        y_test = Surv().from_arrays(event_observed,event_times)
        score = model.score(X_test,y_test)
    return(score)

In [9]:
concordance_score(n_covariates,test,cph,lifelines = True)

0.8707086899972778

In [10]:
int_brier_score(cases,subcohort,test,cph,lifelines = True)

NameError: name 'int_brier_score' is not defined

## Weighted Cox Model
Now we fit a Cox model using weighting methods:

### Barlow weights

To fit the model with Barlow weights, we split the data points corresponding to events into two parts. We use the case data to construct the interval at the event which has weight 1. Using the subcohort data, we construct the interval before the event that has weight $\frac{1}{\alpha}$. All non-events in the subcohort have weight $\frac{1}{\alpha}$ while in the risk set.

Function for changing data for Cox model with Barlow weights:

In [None]:
def barlow_trans(cases,subcohort, n_covariates, alpha = len(subcohort)/len(cohort)):
    # cases: cases dataframe
    # subcohort: subcohort dataframe
    # n_covariates: number of covariates used in the simulation
    # alpha: the sampling proportion used for the subcohort
    
    
    # finding the order of magnitude of data to pick the appropriate size of each "instant". We use the largest event time for this.
    order = int(np.floor(np.log(max(cases["time"]))/np.log(10))) 
    
    
    cases = cases.assign(
        # setting events outside subcohort to start just before they occur
        start_time = lambda df: df["time"] - 10**-(- order + 5),
        # adding appropriate weight
        weight = 1,
        subcohort = False
    )
    # setting times < 0  to 0
    cases["start_time"] = np.where(cases["start_time"] < 0, 0, cases["start_time"]) 
    
    subcohort = subcohort.assign(
        # if it is a case, the weight should be the same as the subcohort until close to the time of the event. 
        time = lambda df: np.where(df["event"], df["time"] - 10**-(- order + 5), df["time"]), 
        # the events start from the origin
        start_time = 0, 
        event = False,
        weight = 1/alpha,
        subcohort = True
    )
    # drop any rows where the start time in cases is 0.
    if len(np.where(cases["start_time"] == 0)) == 0:
        subcohort.drop(np.where(cases["start_time"] == 0))

    return(pd.concat([cases,subcohort])[[i for i in range(0,n_covariates)]+["start_time","time", "event","weight","subcohort"]])

In [None]:
def fit_cox_barlow(cases, subcohort,n_covariates):
    case_subcohort_df = barlow_trans(cases,subcohort,n_covariates,len(subcohort)/len(cohort)).drop(columns = "subcohort")
    
    # creating the model and fitting the data
    cph = CoxPHFitter()
    cph.fit(case_subcohort_df, entry_col = "start_time", duration_col = "time",event_col = "event",weights_col = "weight",robust = True)
    return(cph)

In [None]:
cph2 = fit_cox_barlow(cases,subcohort,n_covariates)

In [None]:
cph2.print_summary()

In [None]:
test_preds = cph2.predict_partial_hazard(test[range(0,10)])
test_preds
event_times = test["time"]
event_observed = test["event"]
event_times, event_observed
from lifelines.utils import concordance_index
concordance_index(event_times, -test_preds, event_observed)

In [None]:
int_brier_score(cases,subcohort,test,cph2,lifelines = True)

### Prentice

To fit the model with Prentice weights, we split the data points corresponding to events into two parts. We use the case data to construct the interval at the event which has weight 1. Using the subcohort data, we construct the interval before the event that has weight $1$. All non-events in the subcohort have weight $1$ while in the risk set.

Function for changing data for Cox model with Prentice weights:

In [None]:
def prentice_trans(cases,subcohort,n_covariates):
    # finding the order of magnitude of data to pick the appropriate size of each "instant". We use the largest event time for this.
    order = int(np.floor(np.log(max(cases["time"]))/np.log(10))) 
    
    
    cases = cases.assign(
        # rounding all of the 
#         time = round(cases["time"],- order + 5),
        # setting events outside subcohort to start just before they occur
        start_time = lambda df: df["time"] - 10**-(- order + 5),
        # adding appropriate weight
        weight = 1,
        subcohort = False
    )
    #filtering out readings with negative start times
    cases = cases.query("start_time > 0") 
    
    subcohort = subcohort.assign(
        # if it is a case, the weight should be the same as the subcohort until close to the time of the event. 
        time = lambda df: np.where(df["event"], df["time"] - 10**-(- order + 5), df["time"]), 
        # the events start from the origin
        start_time = 0, 
        event = False,
        weight = 1,
        subcohort = True
    ) 

    return(pd.concat([cases,subcohort])[[i for i in range(0,n_covariates)]+["start_time","time", "event","weight","subcohort"]])

In [None]:
def fit_cox_prentice(cases, subcohort,n_covariates):
    case_subcohort_df = prentice_trans(cases,subcohort,n_covariates).drop(columns = "subcohort")
    
    # creating the model and fitting the data
    cph = CoxPHFitter()
    cph.fit(case_subcohort_df, entry_col = "start_time", duration_col = "time",event_col = "event",weights_col = "weight",robust = True)
    return(cph)

In [None]:
cph3 = fit_cox_prentice(cases,subcohort,n_covariates)

In [None]:
cph3.print_summary()

In [None]:
test_preds = cph3.predict_partial_hazard(test[range(0,10)])
test_preds
event_times = test["time"]
event_observed = test["event"]
event_times, event_observed
from lifelines.utils import concordance_index
concordance_index(event_times, -test_preds, event_observed)

In [None]:
int_brier_score(cases,subcohort,test,cph3,lifelines = True)

### Self and Prentice

To fit the model with Self and Prentice weights, we split the data points corresponding to events into two parts. Case data has weight $0$ in the pseudo-partial likelihood. The Cox fitter does not support settingn weight to 0, so let the weight be extremely small. Subcohort data has weight $1$.

Function for changing data for Cox model with Prentice weights:

In [None]:
def self_prentice_trans(cases,subcohort,n_covariates):
    # finding the order of magnitude of data to pick the appropriate size of each "instant". We use the largest event time for this.
    order = int(np.floor(np.log(max(cases["time"]))/np.log(10))) 
    
    # removing the cases that are in the subcohort from the cases data frame
    cases = cases[~cases.index.isin(subcohort.index)]
    # Adding the non-subcohort case weights
    cases["weight"] = 10**(-order - 5)
    cases["subcohort"] = False
    
    subcohort = subcohort.assign(
        weight = 1,
        subcohort = True
    )

    return(pd.concat([cases,subcohort])[[i for i in range(0,n_covariates)]+["time", "event","weight","subcohort"]])

In [None]:
self_prentice_trans(cases,subcohort,n_covariates)

In [None]:
def fit_cox_self_prentice(cases, subcohort):
    case_subcohort_df = self_prentice_trans(cases,subcohort,n_covariates).drop(columns = "subcohort")
    
    # creating the model and fitting the data
    cph = CoxPHFitter()
    cph.fit(case_subcohort_df, duration_col = "time",event_col = "event",weights_col = "weight",robust = True)
    return(cph)

In [None]:
cph4 = fit_cox_self_prentice(cases,subcohort)

In [None]:
cph4.print_summary()

In [None]:
test_preds = cph4.predict_partial_hazard(test[range(0,10)])
test_preds
event_times = test["time"]
event_observed = test["event"]
event_times, event_observed
from lifelines.utils import concordance_index
concordance_index(event_times, -test_preds, event_observed)

In [None]:
int_brier_score(cases,subcohort,test,cph4,lifelines = True)

## Penalised Cox Regression

For simplicity, consider L1, L2 and 0.5 L1 weight in the penality function. We use k-fold cross validation to find the optimal $\alpha$. We need to adapt the inbuilt `k_fold_cross_validation` function to accomodate changing the dataset for the time dependent weights.

In [None]:
from cox_k_fold import cox_k_fold

In [None]:
 cox_k_fold(
    cph2, cases, subcohort,n_covariates, barlow_trans, "time", event_col="event", k=5, scoring_method="log_likelihood", fitter_kwargs={"weights_col": "weight", "robust": True}
)

In [None]:
def fit_pen_cox_barlow(cases, subcohort,n_covariates, l1_ratio = 0, penalizer_show = False):
    # choosing the penaliser
    avg_score = []
    for penalizer in range(0,20):
        score = cox_k_fold(CoxPHFitter(penalizer = penalizer/10),cases, subcohort,n_covariates, barlow_trans,"time", event_col="event", k=5, scoring_method="log_likelihood", fitter_kwargs={"weights_col": "weight", "robust": True})
        avg_score.append(np.mean(score))
    penalizer = int(np.where(avg_score == max(avg_score))[0])/10
    
    # creating the model and fitting the data
    cph = CoxPHFitter(penalizer = penalizer,l1_ratio = l1_ratio)
    case_subcohort_df = barlow_trans(cases,subcohort,n_covariates).drop(columns = "subcohort")
    cph.fit(case_subcohort_df, entry_col = "start_time", duration_col = "time",event_col = "event",weights_col = "weight",robust = True)
    if penalizer_show:
        return(cph, penalizer)
    else:
        return(cph)

In [None]:
cph5, penalizer = fit_pen_cox_barlow(cases,subcohort,n_covariates, l1_ratio = 0, penalizer_show = True)
print(penalizer)

In [None]:
cph5.print_summary()

In [None]:
test_preds = cph5.predict_partial_hazard(test[range(0,10)])
test_preds
event_times = test["time"]
event_observed = test["event"]
event_times, event_observed
from lifelines.utils import concordance_index
concordance_index(event_times, -test_preds, event_observed)

In [None]:
int_brier_score(cases,subcohort,test,cph5,lifelines = True)

In [None]:
def fit_pen_cox_prentice(cases, subcohort,n_covariates, l1_ratio = 0, penalizer_show = False):
    # choosing the penaliser
    avg_score = []
    for penalizer in range(0,20):
        score = cox_k_fold(CoxPHFitter(penalizer = penalizer/10),cases, subcohort,n_covariates, prentice_trans,"time", event_col="event", k=5, scoring_method="log_likelihood", fitter_kwargs={"weights_col": "weight", "robust": True})
        avg_score.append(np.mean(score))
    penalizer = int(np.where(avg_score == max(avg_score))[0])/10
    
    # creating the model and fitting the data
    cph = CoxPHFitter(penalizer = penalizer,l1_ratio = l1_ratio)
    case_subcohort_df = prentice_trans(cases,subcohort,n_covariates).drop(columns = "subcohort")
    cph.fit(case_subcohort_df, entry_col = "start_time", duration_col = "time",event_col = "event",weights_col = "weight",robust = True)
    if penalizer_show:
        return(cph, penalizer)
    else:
        return(cph)

In [None]:
cph6, penalizer = fit_pen_cox_prentice(cases,subcohort,n_covariates, l1_ratio = 0,penalizer_show = True)
print(penalizer)

In [None]:
cph6.print_summary()

In [None]:
test_preds = cph6.predict_partial_hazard(test[range(0,10)])
test_preds
event_times = test["time"]
event_observed = test["event"]
event_times, event_observed
from lifelines.utils import concordance_index
concordance_index(event_times, -test_preds, event_observed)

In [None]:
int_brier_score(cases,subcohort,test,cph6,lifelines = True)

In [None]:
def fit_pen_cox_self_prentice(cases, subcohort,n_covariates, l1_ratio = 0, penalizer_show = False):
    # choosing the penaliser
    avg_score = []
    for penalizer in range(0,20):
        score = cox_k_fold(CoxPHFitter(penalizer = penalizer/10),cases, subcohort,n_covariates, self_prentice_trans,"time", event_col="event", k=5, scoring_method="log_likelihood", fitter_kwargs={"weights_col": "weight", "robust": True})
        avg_score.append(np.mean(score))
    penalizer = int(np.where(avg_score == max(avg_score))[0])/10
    
    # creating the model and fitting the data
    cph = CoxPHFitter(penalizer = penalizer,l1_ratio = l1_ratio)
    case_subcohort_df = self_prentice_trans(cases,subcohort,n_covariates).drop(columns = "subcohort")
    cph.fit(case_subcohort_df, duration_col = "time",event_col = "event",weights_col = "weight",robust = True)
    if penalizer_show:
        return(cph, penalizer)
    else:
        return(cph)

In [None]:
cph7, penalizer = fit_pen_cox_self_prentice(cases,subcohort,n_covariates, l1_ratio = 0,penalizer_show = True)
print(penalizer)

In [None]:
cph7.print_summary()

In [None]:
test_preds = cph7.predict_partial_hazard(test[range(0,10)])
test_preds
event_times = test["time"]
event_observed = test["event"]
event_times, event_observed
from lifelines.utils import concordance_index
concordance_index(event_times, -test_preds, event_observed)

In [None]:
def int_brier_score(cases,subcohort,test,n_covariates,model,lifelines = False):
    # First we get a copy of the training data to estimate the censoring distribution
    # creating case-subcohort data frame and removing duplicate entries of cases
    case_subcohort = pd.concat([cases,subcohort])
    case_subcohort = case_subcohort.drop(columns = 'subcohort').drop_duplicates()

    # oversampled data set
    # "covariates"
    X = case_subcohort[[i for i in range(0,n_covariates)]+['time']]
    # "classes" to be oversampled. Here, cases
    y = case_subcohort["event"]
    ros = RandomOverSampler(sampling_strategy = {True: len(cases), False: len(cohort) - len(cases)})
    X_resampled, y_resampled = ros.fit_resample(X, y)
    y_train = Surv().from_arrays(y_resampled, X_resampled['time'])
    
    if lifelines == False:
        # survival function predictions
        survs = model.predict_survival_function(X_test)

        # times at which to evaluate survival function
        times = np.arange(min(model.event_times_),max(model.event_times_),(max(model.event_times_) - min(model.event_times_))/100)

        preds = np.asarray([[fn(t) for t in times] for fn in survs])

        score = integrated_brier_score(y_train, y_test, preds, times)
    else:
        # survival function predictions
        survs = model.predict_survival_function(X_test)

        # times at which to evaluate survival function
        times = survs.index[np.where((survs.index < max(test['time'])) & (survs.index > min(test['time']))) ]

        preds = np.array(survs.iloc[np.where((survs.index < max(test['time'])) & (survs.index > min(test['time']))) ]).transpose()

        score = integrated_brier_score(y_train, y_test, preds, times)
        
    return(score)
    

In [None]:
int_brier_score(cases,subcohort,test,cph7,lifelines = True)

## Decision tree

In [None]:
from sksurv.tree import SurvivalTree
from sksurv.util import Surv

### No weighting

In [None]:
def unweighted_tree(cases,subcohort,n_covariates):
    # creating case-subcohort data frame and removing duplicate entries of cases
    case_subcohort = pd.concat([cases,subcohort])
    case_subcohort = case_subcohort.drop(columns = 'subcohort').drop_duplicates()
    
    # matrix of covariates
    X_train = case_subcohort[range(0,n_covariates)]
    # (event,time) response array
    y_train = Surv().from_arrays(case_subcohort['event'], case_subcohort['time'])
    
    # fitting the tree
    tree = SurvivalTree()
    tree.fit(X_train, y_train)
    
    return(tree)

In [None]:
tree = unweighted_tree(cases,subcohort,n_covariates)

In [None]:
X_test = test[range(0,10)]
X_test

In [None]:
y_test = Surv().from_dataframe('event','time',test)

In [None]:
tree.score(X_test,y_test)

### Random over sampler

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
def ros_tree(cases,subcohort,n_covariates):
    # creating case-subcohort data frame and removing duplicate entries of cases
    case_subcohort = pd.concat([cases,subcohort])
    case_subcohort = case_subcohort.drop(columns = 'subcohort').drop_duplicates()
    
    # oversampled data set
    # "covariates"
    X = case_subcohort[[i for i in range(0,n_covariates)]+['time']]
    # "classes" to be oversampled. Here, cases
    y = case_subcohort["event"]
    ros = RandomOverSampler(sampling_strategy = {True: len(cases), False: len(cohort) - len(cases)})
    X_resampled, y_resampled = ros.fit_resample(X, y)
    
    # matrix of covariates
    X_train = X_resampled[range(0,n_covariates)]
    # (event,time) response array
    y_train = Surv().from_arrays(y_resampled, X_resampled['time'])
    
    # fitting the tree
    tree = SurvivalTree()
    tree.fit(X_train, y_train)
    
    return(tree)

In [None]:
tree = ros_tree(cases,subcohort,n_covariates)

In [None]:
X_test = test[range(0,10)]
X_test

In [None]:
y_test = Surv().from_dataframe('event','time',test)

In [None]:
tree.score(X_test,y_test)

In [None]:
int_brier_score(cases,subcohort,test,tree)

### SMOTENC

In [None]:
from imblearn.over_sampling import SMOTENC

In [None]:
def smotenc_tree(cases,subcohort,n_covariates):
    case_subcohort = pd.concat([cases,subcohort])
    case_subcohort = case_subcohort.drop(columns = 'subcohort').drop_duplicates()

    # oversampled data set
    # "covariates"
    X = case_subcohort[[i for i in range(0,n_covariates)]+['time']]
    # "classes" to be oversampled. Here, cases
    y = case_subcohort["event"]
    categorical_features = list(np.where([sum(~(cases[i].isin([0,1]))) == 0 for i in range(0,n_covariates)])[0])
    smote_nc = SMOTENC(categorical_features=categorical_features)
    X_resampled, y_resampled = smote_nc.fit_resample(X, y)
    
    # matrix of covariates
    X_train = X_resampled[range(0,n_covariates)]
    # (event,time) response array
    y_train = Surv().from_arrays(y_resampled, X_resampled['time'])
    
    # fitting the tree
    tree = SurvivalTree()
    tree.fit(X_train, y_train)
    
    return(tree)

In [None]:
tree2 = smotenc_tree(cases,subcohort,n_covariates)

In [None]:
tree2.score(X_test,y_test)

In [None]:
int_brier_score(cases,subcohort,test,tree2)

## Balanced Survival Forest

In [11]:
from sksurv.ensemble import RandomSurvivalForest

In [12]:
def unweighted_rsf(cases,subcohort,n_covariates):
    # creating case-subcohort data frame and removing duplicate entries of cases
    case_subcohort = pd.concat([cases,subcohort])
    case_subcohort = case_subcohort.drop(columns = 'subcohort').drop_duplicates()
    
    # matrix of covariates
    X_train = case_subcohort[range(0,n_covariates)]
    # (event,time) response array
    y_train = Surv().from_arrays(case_subcohort['event'], case_subcohort['time'])
    
    # fitting the random survival forest
    rsf = RandomSurvivalForest(n_estimators=1000)
    rsf.fit(X_train, y_train)
    
    return(rsf)

In [14]:
rsf = unweighted_rsf(cases,subcohort,n_covariates)

KeyError: "['subcohort'] not found in axis"

### Naive random over-sampling

In [None]:
from imblearn.over_sampling import RandomOverSampler

We set the "class" to event, because controls are undersampled. We want there to be similar to in the cohort, so we want the number of cases simply to be $n_\text{cases}$, and the number of controls to be $n_\text{cohort} - n_\text{controls}$.

Function fitting random oversampled random survival forest.

In [None]:
X_test = test[range(0,n_covariates)]
X_test

In [None]:
y_test = Surv().from_dataframe('event','time',test)

In [None]:
rsf.score(X_test,y_test)

So concordance not quite as good here.

In [None]:
from sksurv.metrics import integrated_brier_score

In [None]:
def int_brier_score(cases,subcohort,test,model):
    # First we get a copy of the training data to estimate the censoring distribution
    # creating case-subcohort data frame and removing duplicate entries of cases
    case_subcohort = pd.concat([cases,subcohort])
    case_subcohort = case_subcohort.drop(columns = 'subcohort').drop_duplicates()

    # oversampled data set
    # "covariates"
    X = case_subcohort[[i for i in range(0,n_covariates)]+['time']]
    # "classes" to be oversampled. Here, cases
    y = case_subcohort["event"]
    ros = RandomOverSampler(sampling_strategy = {True: len(cases), False: len(cohort) - len(cases)})
    X_resampled, y_resampled = ros.fit_resample(X, y)
    y_train = Surv().from_arrays(y_resampled, X_resampled['time'])
    
    # survival function predictions
    survs = model.predict_survival_function(X_test)
    
    # times at which to evaluate survival function
    times = np.arange(min(model.event_times_),max(model.event_times_),(max(model.event_times_) - min(model.event_times_))/100)
    
    preds = np.asarray([[fn(t) for t in times] for fn in survs])
    
    score = integrated_brier_score(y_train, y_test, preds, times)
    
    return(score)
    

In [None]:
int_brier_score(cases,subcohort,test,rsf)

### SMOTENC

In [None]:
from imblearn.over_sampling import SMOTENC

In [None]:
def smotenc_rsf(cases,subcohort,n_covariates):
    case_subcohort = pd.concat([cases,subcohort])
    case_subcohort = case_subcohort.drop(columns = 'subcohort').drop_duplicates()

    # oversampled data set
    # "covariates"
    X = case_subcohort[[i for i in range(0,n_covariates)]+['time']]
    # "classes" to be oversampled. Here, cases
    y = case_subcohort["event"]
    categorical_features = list(np.where([sum(~(cases[i].isin([0,1]))) == 0 for i in range(0,10)])[0])
    smote_nc = SMOTENC(categorical_features=categorical_features)
    X_resampled, y_resampled = smote_nc.fit_resample(X, y)
    
    # matrix of covariates
    X_train = X_resampled[range(0,n_covariates)]
    # (event,time) response array
    y_train = Surv().from_arrays(y_resampled, X_resampled['time'])
    
    # fitting the random survival forest
    rsf = RandomSurvivalForest(n_estimators=1000)
    rsf.fit(X_train, y_train)
    
    return(rsf)

In [None]:
rsf2 = smotenc_rsf(cases,subcohort,n_covariates)

In [None]:
rsf2.score(X_test,y_test)

In [None]:
int_brier_score(cases,subcohort,test,rsf2)