In [1]:
import numpy as np
import pandas as pd
from lifelines import CoxPHFitter
from scipy.stats import invgauss

### Explore the hazard ratio (HR) that would be obtained in a real-world cohort

In [2]:
def rinvg(theta_0, delta, N):
    """Draw N random variates from an inverse-Gaussian distribution.

    The draw is delegated to ``scipy.stats.invgauss`` with
    ``scale = 2 * delta**2`` and shape ``mu = delta / (sqrt(theta_0) * scale)``.

    Parameters
    ----------
    theta_0, delta : float
        Parameters from which the scipy shape and scale are derived.
    N : int
        Number of variates to draw.

    Returns
    -------
    numpy.ndarray
        Array of N samples.
    """
    scale = 2 * delta**2
    shape_mu = delta / (np.sqrt(theta_0) * scale)
    return invgauss.rvs(shape_mu, loc=0, scale=scale, size=N)

In [3]:
def calc_lambda_0(p0, theta_0, delta, rate, a = 0, frailty ="gamma"):
    """Return the reference baseline hazard ``lambda_0_ref`` for a given ``p0``.

    Parameters
    ----------
    p0 : float
        Target probability; enters the formulas through ``-log(1 - p0)``.
    theta_0, delta : float
        Frailty distribution parameters.
    rate : float
        Multiplicative growth factor of the hazard per unit time.
    a : float, optional
        Power parameter used only by the non-gamma branch (must be non-zero
        there, since the formula takes a ``1/a`` power).
    frailty : str, optional
        ``"gamma"`` selects the gamma formula; any other value selects the
        power-family formula parameterised by ``a``.

    Returns
    -------
    float
        The reference baseline hazard.
    """
    log_rate = np.log(rate)
    neg_log_surv = -np.log(1 - p0)

    if frailty != "gamma":
        # Power-family branch: invert the transform at -log(1 - p0).
        base = a * neg_log_surv / delta + theta_0**a
        shifted = base**(1/a) - theta_0
        return shifted / (rate - 1) * log_rate

    # Gamma branch.
    ratio = (np.exp(neg_log_surv / delta) - 1) / (rate - 1)
    return theta_0 * ratio * log_rate

In [4]:
def generate_event(lambda_0, rate, ln_r, N):
    """Sample the waiting time to the next event and the hazard at that time.

    Draws N uniforms and inverts the cumulative hazard of a hazard function
    of the form ``lambda_0 * rate**t`` (with ``ln_r = log(rate)``).

    Parameters
    ----------
    lambda_0 : float or numpy.ndarray
        Current hazard(s); broadcast against the N uniform draws.
    rate : float
        Multiplicative hazard growth per unit time.
    ln_r : float
        Natural log of ``rate`` (passed in to avoid recomputation).
    N : int
        Number of uniform draws.

    Returns
    -------
    tuple
        ``(t, lambda_t)``: waiting times and the hazard evaluated at ``t``.
    """
    uniform = np.random.rand(N)
    scaled_log = ln_r * np.log(1 - uniform) / lambda_0
    waiting = np.log(1 - scaled_log) / ln_r
    return waiting, lambda_0 * rate**waiting

def generate_event_d(lambda_0, theta_0, delta, rate, a, N, start, d):
    """Simulate successive event times until every individual has passed ``start``.

    Each round draws one more inter-event time per individual with
    ``generate_event`` (feeding back the updated hazard) and appends the
    cumulative event time as a new column. Rounds continue while the smallest
    cumulative time is still below ``start``.

    Parameters
    ----------
    lambda_0 : float or numpy.ndarray
        Initial hazard per individual.
    theta_0, delta, a :
        Accepted for signature compatibility; not used in this function.
    rate : float
        Multiplicative hazard growth per unit time.
    N : int
        Number of individuals.
    start : float
        Time every individual's latest event must reach before stopping.
    d : int
        Initial number of event columns to allocate. The buffer now grows
        automatically if more rounds are needed, instead of failing with an
        IndexError once ``d`` rounds are exhausted.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(N, n_rounds)`` holding cumulative event times.
    """
    ln_r = np.log(rate)
    hazard = lambda_0
    elapsed = np.zeros(N)
    event_times = np.empty((N, max(int(d), 1)))
    n_rounds = 0
    earliest = 0
    while earliest < start:
        if n_rounds == event_times.shape[1]:
            # Out of columns: double the capacity rather than overflow.
            event_times = np.concatenate(
                [event_times, np.empty_like(event_times)], axis=1
            )
        gap, hazard = generate_event(hazard, rate, ln_r, N)
        elapsed += gap
        event_times[:, n_rounds] = elapsed
        earliest = elapsed.min()
        n_rounds += 1
    return event_times[:, :n_rounds]

def generate_cohort_dataset(lambda_0_ref, theta_0, delta, rate, start, end, N, d, frailty):
    """Simulate one cohort and return its time-to-event dataset.

    Individual baseline hazards are ``lambda_0_ref`` times a random frailty
    (gamma when ``frailty == "gamma"``, otherwise inverse-Gaussian via
    ``rinvg``). Event sequences come from ``generate_event_d``.

    Parameters
    ----------
    lambda_0_ref : float
        Reference baseline hazard multiplied by each frailty.
    theta_0, delta : float
        Frailty distribution parameters.
    rate : float
        Multiplicative hazard growth per unit time.
    start, end : float
        Start and end of the observation window.
    N : int
        Number of individuals.
    d : int
        Event-column capacity forwarded to ``generate_event_d``.
    frailty : str
        ``"gamma"`` or anything else (treated as inverse-Gaussian).

    Returns
    -------
    pandas.DataFrame
        Columns ``T`` (time from ``start`` to the first event after ``start``;
        not clipped to the window length), ``event`` (whether that time falls
        inside the window) and ``history`` (whether the first event occurred
        before ``start``).
    """
    if frailty == "gamma":
        frailties = np.random.gamma(theta_0, 1/delta, N)
        a = 0
    else:  # inverse-Gaussian
        frailties = rinvg(theta_0, delta, N)
        a = 0.5

    lambda_0_list = frailties * lambda_0_ref

    t_seq = generate_event_d(lambda_0_list, theta_0, delta, rate, a, N, start, d)
    observation = end - start

    # A prior event exists when the first simulated event precedes `start`.
    history = t_seq.min(axis=1) < start

    # Mask events at or before `start` with +inf so the row-wise minimum picks
    # the first event after `start`. A finite sentinel (previously 100) would
    # cap genuine times above it and could be mistaken for an event whenever
    # the observation window is longer than the sentinel.
    t_obs = t_seq - start
    t_obs = np.where(t_obs > 0, t_obs, np.inf)
    first_after_start = t_obs.min(axis=1)
    event = first_after_start < observation

    return pd.DataFrame({"T": first_after_start, "event": event, "history": history})

In [5]:
def cohort_study(lambda_0_ref, theta_0, delta, rate, start, end, N, d, frailty):
    """Simulate one cohort and fit a Cox model on the prior-event indicator.

    Parameters are forwarded unchanged to ``generate_cohort_dataset``.

    Returns
    -------
    tuple
        ``(beta, var_beta, prior, event_0, event_1)`` where ``beta`` is the
        fitted Cox coefficient of ``history``, ``var_beta`` its variance,
        ``prior`` the proportion with a prior event, and ``event_0`` /
        ``event_1`` the event proportions among individuals without / with a
        prior event (NaN when the corresponding group is empty).
    """
    df = generate_cohort_dataset(lambda_0_ref, theta_0, delta, rate, start, end, N, d, frailty)
    prior = df["history"].mean()

    # Reindex so that both groups are always present, in a fixed order; an
    # empty group would otherwise shorten the returned tuple.
    events = (
        df.groupby("history")["event"]
        .mean()
        .reindex([False, True])
        .to_numpy()
    )

    cph = CoxPHFitter()
    cph.fit(df[["T", 'event', 'history']], duration_col = 'T', event_col = 'event')

    # params_ is indexed by covariate name; use .iloc for positional access
    # (integer keys via [] on a label-indexed Series are deprecated in pandas).
    beta = cph.params_.iloc[0]
    beta_var = cph.variance_matrix_.values[0, 0]

    return beta, beta_var, prior, *events

In [6]:
def simulate_cohort(lambda_0_ref, theta_0, delta, rate, start, end, N, d, frailty, Nsim):
    """Run ``cohort_study`` ``Nsim`` times and stack the results.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(Nsim, 5)``; each row holds the five values returned
        by one call to ``cohort_study``.
    """
    results = np.empty((Nsim, 5))
    # Each `row` is a view into `results`, so assigning into it fills the table.
    for row in results:
        row[:] = cohort_study(
            lambda_0_ref, theta_0, delta, rate, start, end, N, d, frailty
        )
    return results

In [7]:
def calc_summary(results):
    """Summarise a table of simulation results into a rounded Series.

    Parameters
    ----------
    results : numpy.ndarray
        One row per simulation; column 0 is the coefficient, column 1 its
        variance, columns 2 onward are outcome proportions.

    Returns
    -------
    pandas.Series
        Rounded to two decimals, with entries:
        beta               mean coefficient of prior event
        beta2.5, beta97.5  95% CI of beta
        HR                 hazard ratio of prior event, exp(beta)
        2.5%, 97.5%        95% CI of HR
        history            proportion of individuals with prior events
        event_0            proportion of events among individuals without prior events
        event_1            proportion of events among individuals with prior events
    """
    labels = ["beta", "beta2.5", "beta97.5", "HR", "2.5%", "97.5%", "history", "event_0", "event_1"]

    n_sims = results.shape[0]
    column_means = results.mean(axis = 0)
    beta_hat = column_means[0]

    # Standard error combines the spread of beta across simulations
    # (numpy's default population variance) with the mean per-fit variance.
    between_var = results[:,0].var()
    std_err = np.sqrt((between_var + column_means[1])/n_sims )

    lower, upper = beta_hat + std_err * np.array([-1.959964, 1.959964])
    hazard_ratios = np.exp( np.array([ beta_hat, lower, upper ]) )
    outcome_means = results[:,2:].mean(axis = 0)

    values = [beta_hat, lower, upper, *hazard_ratios, *outcome_means]
    return pd.Series(values, index = labels).round(2)

In [8]:
# Reproducibility: seed numpy's global RNG, which the simulation draws from.
SEED = 42
np.random.seed(SEED)

# Simulation size
N = 8000      # individuals per simulated cohort
Nsim = 100    # number of simulated cohorts
d = 100  # set an arbitrarily large integer to ensure that every individual enters the cohort

# setting
rate = 1.5**0.1
RR = 2.5
start = 1
end = 5
# initial risk
p0 = 0.03

# parameters
theta_0 = 0.67
delta   = 0.67

# frailty: "gamma", or "invG" for the inverse-Gaussian alternative
frailty = "gamma"
# `a` follows from the frailty choice: 0 for gamma, 0.5 for inverse-Gaussian
a = 0 if frailty == "gamma" else 0.5


In [9]:
# Calibrate the reference baseline hazard with the SAME frailty model that the
# simulation uses; a hard-coded "gamma" here would silently mis-calibrate the
# run whenever `frailty` is switched to the inverse-Gaussian setting.
lambda_0 = calc_lambda_0(p0, theta_0, delta, rate, a = a, frailty = frailty)
results = simulate_cohort(lambda_0_ref = lambda_0, theta_0 = theta_0, delta = delta, rate = rate,
                           start = start, end = end, N = N, d = d, frailty = frailty,  Nsim = Nsim)
res = calc_summary(results)
res

beta        0.94
beta2.5     0.90
beta97.5    0.97
HR          2.55
2.5%        2.46
97.5%       2.64
history     0.03
event_0     0.11
event_1     0.27
dtype: float64