# Model fitting

In [1]:
import numpy as np
from scipy.stats import weibull_min # r weibull simulation
from scipy.stats import norm # for covariate simulation
from scipy.stats import gamma # for weibull shape parameter
from scipy.stats import bernoulli # for censoring
from scipy.stats import uniform
from scipy.stats.mstats import mquantiles
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [3]:
from data_simulation import *

## Cox model, unweighted
First we look at the standard Cox model without adjusting for bias. When the model is not misspecified at all, the performance of the unweighted version may not be that different. However, if the model is misspecified, the results may be poorer.

The following function fits an unweighted Cox model:

In [4]:
from lifelines import CoxPHFitter

def fit_cox(cases, subcohort,n_covariates):
    """Fit an unweighted Cox proportional hazards model to case-subcohort data.

    cases: cases dataframe
    subcohort: subcohort dataframe
    n_covariates: the number of covariates used in the simulation; the
        covariate columns are expected to be labelled 0 .. n_covariates - 1

    Returns the fitted CoxPHFitter.
    """
    # Stack cases and subcohort into one frame. Rows that appear in both
    # (identical across every column) are kept only once.
    stacked = pd.concat([cases, subcohort])
    first_occurrence = ~stacked.duplicated()

    # Only the covariates plus the duration and event indicator go to the model.
    model_columns = list(range(n_covariates)) + ["time", "event"]
    model_frame = stacked.loc[first_occurrence, model_columns]

    model = CoxPHFitter()
    model.fit(model_frame, duration_col="time", event_col="event")
    return model

## Weighted Cox Model
Now we fit a Cox model using weighting methods:

To fit the model with Barlow weights, we split each data point corresponding to an event into two parts. From the case data we construct a short interval ending at the event time, which has weight 1. From the subcohort data we construct the interval before the event, which has weight $\frac{1}{\alpha}$, where $\alpha$ is the subcohort sampling proportion. All non-events in the subcohort have weight $\frac{1}{\alpha}$ while they are in the risk set.

Function for changing data for Cox model with Barlow weights:

In [261]:
def barlow_trans(cases, subcohort, n_covariates, alpha=None):
    """Reshape case-subcohort data into start/stop intervals with Barlow weights.

    Every row of `cases` becomes a short interval ending at its event time
    with weight 1. Every row of `subcohort` becomes an interval starting at 0
    with weight 1/alpha and no event; for subcohort rows that are events the
    interval stops one "instant" before the event time, so it does not
    overlap the weight-1 interval built from `cases`.

    cases: cases dataframe (needs "time" and the covariate columns)
    subcohort: subcohort dataframe (needs "time", "event" and the covariates)
    n_covariates: number of covariates used in the simulation; the covariate
        columns are expected to be labelled 0 .. n_covariates - 1
    alpha: the sampling proportion used for the subcohort. When omitted it is
        computed at call time as len(subcohort) / len(cohort), using the
        `subcohort` argument and a module-level `cohort` dataframe.

    Returns a dataframe with the covariate columns followed by "start_time",
    "time", "event", "weight" and "subcohort".

    Raises ValueError if alpha cannot be determined or is not positive, if
    `cases` is empty, or if the largest event time is not positive.
    """
    if alpha is None:
        # Resolved per call rather than in the signature: a default written
        # in the signature is evaluated once, when the function is defined.
        try:
            alpha = len(subcohort) / len(cohort)
        except NameError as err:
            raise ValueError(
                "alpha was not given and no global `cohort` is defined to derive it from"
            ) from err
    if alpha <= 0:
        raise ValueError("alpha must be a positive sampling proportion")

    if len(cases) == 0:
        raise ValueError("cases must contain at least one row")
    largest_time = cases["time"].max()
    if not largest_time > 0:
        raise ValueError("the largest event time in cases must be positive")

    # The length of an "instant" is five orders of magnitude below the largest
    # event time. log10 is used directly because log(x) / log(10) may round
    # just below an integer when x is an exact power of ten.
    order = int(np.floor(np.log10(largest_time)))
    instant = 10.0 ** (order - 5)

    cases = cases.assign(
        # events start just before they occur; never before the origin
        start_time=lambda df: (df["time"] - instant).clip(lower=0),
        weight=1,
        subcohort=False,
    )

    # Remember which subcohort rows are events before the flag is overwritten.
    was_event = subcohort["event"].to_numpy(dtype=bool)
    subcohort = subcohort.assign(
        # an event keeps the subcohort weight until just before it occurs
        time=lambda df: np.where(df["event"], df["time"] - instant, df["time"]),
        # subcohort intervals start from the origin
        start_time=0,
        event=False,
        weight=1 / alpha,
        subcohort=True,
    )

    # An event within one instant of the origin has no time left before it
    # (its case interval already starts at 0), so its pre-event subcohort
    # interval would stop at or before 0. Drop those rows.
    no_time_before_event = was_event & (subcohort["time"].to_numpy() <= 0)
    subcohort = subcohort.loc[~no_time_before_event]

    columns = list(range(n_covariates)) + [
        "start_time", "time", "event", "weight", "subcohort",
    ]
    return pd.concat([cases, subcohort])[columns]