# Weibull Simple linear predictor 2

In [396]:
import numpy as np
from scipy.stats import weibull_min # r weibull simulation
from scipy.stats import norm # for covariate simulation
from scipy.stats import gamma # for weibull shape parameter
from scipy.stats import bernoulli # for censoring
from scipy.stats import uniform
from scipy.stats.mstats import mquantiles
import pandas as pd

## Simulated data

In [397]:
def weibull_simple_linear_sim(n_beta, prop_cat, obs, censor_prop, show_beta = False, sigma = 1):
    betas = norm.rvs(scale = sigma, size = n_beta) #generates the coefficients
    X_norm = norm.rvs(size = obs*int(np.floor(n_beta*(1-prop_cat)))).reshape((obs,int(np.floor(n_beta*(1-prop_cat))))) # matrix of normal covariates
    X_cat = bernoulli.rvs(p = 0.5,size = obs*int(np.ceil(n_beta*(prop_cat)))).reshape((obs,int(np.ceil(n_beta*(prop_cat))))) # matrix of categorical) covariates
    X = np.hstack([X_norm,X_cat])
    
    c = uniform.rvs(size = 1, loc = 0.5, scale = 4.5) # shape parameter of weibull
    
    lin_pred = np.matmul(X,betas) # linear predictor
    
    sim_data = pd.DataFrame(X)
    sim_data["y"] = weibull_min.rvs(c, scale = np.exp(-lin_pred/c)) # simulating survival times from weibull distribution
    
    dropout_prop = uniform.rvs(size = 1, scale = 0.5)*censor_prop # proportion that will be censored by dropping out of the study
    sim_data["dropout"] = bernoulli.rvs(size = obs, p = dropout_prop) == 1 #indicator for subject droppoing out
    sim_data["dropout_time"] = np.where(sim_data["dropout"], uniform.rvs(scale = sim_data["y"]), sim_data["y"])
    
    max_time = float(mquantiles(sim_data["dropout_time"], prob = 1 - censor_prop)) # quantile above which we censor
    sim_data = sim_data.assign(
        end_censor = lambda df: df["dropout_time"].map(lambda dropout_time: dropout_time > max_time), # indicator for censoring because of study ending
        time = lambda df: df["dropout_time"].map(lambda dropout_time: min(dropout_time,max_time)), # censoring any times above max time
        event = lambda df: ~(df["dropout"] | df["end_censor"])
    )
    
    if show_beta:
        print(betas)
        return(sim_data)
    else:
        return(sim_data)

Testing the function:

In [398]:
weibull_simple_linear_sim(10, 0.5, 10, 0.8, show_beta = True, sigma = 1)

[ 0.99335905 -0.91175497 -0.97261741  0.80215821 -0.24793277 -0.25628624
  0.45011885  0.18505204 -1.24288921  1.81387447]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,y,dropout,dropout_time,end_censor,time,event
0,-2.119236,-0.844431,-0.746485,0.942714,-0.157153,1.0,0.0,0.0,0.0,1.0,0.645877,False,0.645877,True,0.552398,False
1,-0.098515,0.907997,-0.925666,0.103603,0.730367,1.0,0.0,1.0,1.0,0.0,0.478949,False,0.478949,False,0.478949,True
2,0.009567,1.025939,-0.634794,0.310362,0.057909,0.0,1.0,1.0,1.0,0.0,1.248158,True,1.080439,True,0.552398,False
3,-0.014837,1.161171,-0.827063,-1.043067,-0.787809,0.0,0.0,0.0,0.0,0.0,1.176903,False,1.176903,True,0.552398,False
4,0.685399,-1.161378,-1.618598,0.540781,1.613435,1.0,1.0,0.0,0.0,1.0,0.44494,False,0.44494,False,0.44494,True
5,-1.293535,2.053723,0.763244,-0.726033,0.116093,1.0,0.0,1.0,0.0,1.0,1.944332,False,1.944332,True,0.552398,False
6,1.584488,0.584718,-0.629296,1.290437,-1.36848,1.0,0.0,1.0,1.0,0.0,0.870396,True,0.774606,True,0.552398,False
7,0.904383,1.002728,-1.461946,1.854734,-0.578809,0.0,0.0,1.0,1.0,0.0,0.697251,False,0.697251,True,0.552398,False
8,-0.063827,-0.008873,-0.560671,-0.19232,-0.035407,1.0,0.0,0.0,1.0,1.0,0.958788,False,0.958788,True,0.552398,False
9,0.211496,0.48344,-0.947376,-0.176751,0.462666,0.0,1.0,1.0,1.0,1.0,0.671288,False,0.671288,True,0.552398,False


## Test analysis

### Case-subcohort

To test, sample a dataset from the sampler:

In [399]:
sample = weibull_simple_linear_sim(10, 0.5, 1500, 0.6, show_beta = True, sigma = 1)
sample

[ 1.56146171 -1.40294708 -0.95545337  1.5660094   0.50543421 -0.64846066
 -0.74779592  0.22656381  0.49238391  0.56893492]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,y,dropout,dropout_time,end_censor,time,event
0,0.401737,-2.040680,-1.712033,0.150991,-0.010425,1.0,1.0,1.0,1.0,1.0,0.008579,False,0.008579,False,0.008579,True
1,-0.054105,2.300240,-0.492668,-0.135268,1.889308,0.0,1.0,1.0,0.0,0.0,24.939105,False,24.939105,True,0.254026,False
2,-1.197646,0.197013,-0.707396,-1.243677,-0.084562,1.0,1.0,1.0,1.0,1.0,41.696365,False,41.696365,True,0.254026,False
3,0.061536,1.143535,-0.082242,0.789187,0.438659,0.0,0.0,1.0,1.0,0.0,0.081426,False,0.081426,False,0.081426,True
4,1.310500,-0.432920,1.388366,-0.949266,-0.035949,1.0,0.0,1.0,0.0,1.0,3.143562,False,3.143562,True,0.254026,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,0.382094,-0.349913,0.039085,0.368830,0.449423,1.0,1.0,1.0,0.0,1.0,0.065636,False,0.065636,False,0.065636,True
1496,-0.117285,-1.499173,-0.385979,1.788696,-0.579649,0.0,0.0,0.0,1.0,1.0,0.000989,False,0.000989,False,0.000989,True
1497,0.539230,-0.694668,-0.337374,0.422725,-0.744344,1.0,1.0,1.0,0.0,1.0,0.235945,True,0.087281,False,0.087281,False
1498,-0.329189,0.469401,0.853344,-0.840725,1.369359,0.0,0.0,1.0,0.0,1.0,29.392769,True,9.161548,True,0.254026,False


Function for splitting data samples:

In [400]:
def cch_splitter(sample):
    cohort = sample.iloc[0:int(round(2*len(sample)/3))] # subsetting the cohort
    cases = cohort[cohort['event'] == True] # subsetting cases in the cohort
    subcohort = cohort.sample(n = len(cases))
    
    test = sample.iloc[int(np.round(2*len(sample)/3)):len(sample)] # subsetting the test set
    
    return(cases, subcohort, cohort, test)

Function for changing data for Cox model with Barlow weights:

In [401]:
def barlow_trans(cases,subcohort):
    order = int(np.floor(np.log(max(cases["time"]))/np.log(10))) # deciding what the granularity will be
    cases = cases.assign(
        time = round(cases["time"],- order + 5),
        start_time = lambda df: df["time"] - 10**-(- order + 5), # setting events outside subcohort to start just before they occur
        subcohort = False # showing that these are the cases chosen outside of the subcohort
    )
    cases = cases.query("start_time > 0") #filtering out readings with negative start times
    
    subcohort = subcohort.assign(
        time = lambda df: np.where(df["event"], df["time"] - 10**-(- order + 5), df["time"]), # if it is a case, the weight should be the same as the subcohort until close to the time of the event. 
        start_time = 0, # the events start from the origin
        event = False,
        subcohort = True
    ) 

    return(pd.concat([cases,subcohort]))
    

Splitting the test data:

In [402]:
cases, subcohort, cohort, test = cch_splitter(sample)

In [403]:
case_subcohort = barlow_trans(cases,subcohort)

## Cox PH

In [404]:
from lifelines import CoxPHFitter

In [405]:
 cph = CoxPHFitter(baseline_estimation_method='spline')

In [406]:
cph = CoxPHFitter()

In [407]:
case_subcohort_df = case_subcohort[[i for i in range(0,10)]+["start_time", "time", "event"]]
case_subcohort_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,start_time,time,event
0,0.401737,-2.040680,-1.712033,0.150991,-0.010425,1.0,1.0,1.0,1.0,1.0,0.008578,0.008579,True
3,0.061536,1.143535,-0.082242,0.789187,0.438659,0.0,0.0,1.0,1.0,0.0,0.081425,0.081426,True
7,-1.480458,-1.511955,-0.902503,1.353365,0.362763,1.0,0.0,1.0,0.0,1.0,0.016059,0.016060,True
10,1.034789,-2.061877,-1.227543,-0.251175,-1.391121,1.0,0.0,0.0,0.0,1.0,0.001029,0.001030,True
14,1.121648,-0.658126,0.307816,0.345568,0.785112,1.0,1.0,0.0,0.0,1.0,0.000062,0.000063,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
618,-2.216204,-0.401926,0.880416,-0.834288,-1.292830,0.0,1.0,0.0,0.0,0.0,0.000000,0.254026,False
755,0.667518,2.066603,1.235534,0.926106,-0.191390,0.0,0.0,0.0,0.0,1.0,0.000000,0.254026,False
979,-0.852963,-0.115235,-1.649511,1.198011,-0.706702,1.0,1.0,0.0,0.0,1.0,0.000000,0.254026,False
217,-0.026961,-0.871687,0.588966,-0.050204,-1.649098,1.0,1.0,1.0,0.0,0.0,0.000000,0.254026,False


In [408]:
case_subcohort[["start_time","time","event"]]

Unnamed: 0,start_time,time,event
0,0.008578,0.008579,True
3,0.081425,0.081426,True
7,0.016059,0.016060,True
10,0.001029,0.001030,True
14,0.000062,0.000063,True
...,...,...,...
618,0.000000,0.254026,False
755,0.000000,0.254026,False
979,0.000000,0.254026,False
217,0.000000,0.254026,False


In [409]:
samp_fraction = len(subcohort)/len(cohort)
samp_fraction

0.326

In [410]:
case_subcohort_df["weights"] = np.where(case_subcohort["subcohort"],1/samp_fraction,1)
case_subcohort_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  case_subcohort_df["weights"] = np.where(case_subcohort["subcohort"],1/samp_fraction,1)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,start_time,time,event,weights
0,0.401737,-2.040680,-1.712033,0.150991,-0.010425,1.0,1.0,1.0,1.0,1.0,0.008578,0.008579,True,1.000000
3,0.061536,1.143535,-0.082242,0.789187,0.438659,0.0,0.0,1.0,1.0,0.0,0.081425,0.081426,True,1.000000
7,-1.480458,-1.511955,-0.902503,1.353365,0.362763,1.0,0.0,1.0,0.0,1.0,0.016059,0.016060,True,1.000000
10,1.034789,-2.061877,-1.227543,-0.251175,-1.391121,1.0,0.0,0.0,0.0,1.0,0.001029,0.001030,True,1.000000
14,1.121648,-0.658126,0.307816,0.345568,0.785112,1.0,1.0,0.0,0.0,1.0,0.000062,0.000063,True,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
618,-2.216204,-0.401926,0.880416,-0.834288,-1.292830,0.0,1.0,0.0,0.0,0.0,0.000000,0.254026,False,3.067485
755,0.667518,2.066603,1.235534,0.926106,-0.191390,0.0,0.0,0.0,0.0,1.0,0.000000,0.254026,False,3.067485
979,-0.852963,-0.115235,-1.649511,1.198011,-0.706702,1.0,1.0,0.0,0.0,1.0,0.000000,0.254026,False,3.067485
217,-0.026961,-0.871687,0.588966,-0.050204,-1.649098,1.0,1.0,1.0,0.0,0.0,0.000000,0.254026,False,3.067485


In [411]:
len(case_subcohort)

652

In [412]:
cph.fit(case_subcohort_df, entry_col = "start_time", duration_col = "time",event_col = "event",weights_col = "weights",robust = True)

<lifelines.CoxPHFitter: fitted with 1326 total observations, 1000 right-censored observations>

In [418]:
cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'time'
event col,'event'
weights col,'weights'
entry col,'start_time'
robust variance,True
baseline estimation,breslow
number of observations,1326
number of events observed,326
partial log-likelihood,-1620.06

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
0,1.9,6.7,0.11,1.68,2.12,5.37,8.36,0.0,16.88,<0.005,209.9
1,-1.71,0.18,0.13,-1.96,-1.46,0.14,0.23,0.0,-13.41,<0.005,133.72
2,-1.23,0.29,0.09,-1.41,-1.05,0.24,0.35,0.0,-13.5,<0.005,135.63
3,1.92,6.85,0.12,1.69,2.16,5.42,8.66,0.0,16.1,<0.005,191.39
4,0.61,1.84,0.09,0.44,0.78,1.55,2.18,0.0,6.97,<0.005,38.18
5,-0.63,0.53,0.16,-0.95,-0.3,0.39,0.74,0.0,-3.8,<0.005,12.76
6,-0.79,0.46,0.16,-1.11,-0.46,0.33,0.63,0.0,-4.78,<0.005,19.12
7,0.82,2.26,0.16,0.5,1.14,1.64,3.12,0.0,5.01,<0.005,20.78
8,0.35,1.41,0.16,0.03,0.66,1.03,1.93,0.0,2.17,0.03,5.06
9,0.42,1.53,0.16,0.11,0.73,1.12,2.08,0.0,2.69,0.01,7.12

0,1
Concordance,0.50
Partial AIC,3260.11
log-likelihood ratio test,1111.68 on 10 df
-log2(p) of ll-ratio test,770.01


In [423]:
test_preds = cph.predict_partial_hazard(test[range(0,10)])
test_preds

1000      0.039620
1001      9.774609
1002      0.064054
1003    153.490925
1004      0.056916
           ...    
1495      1.427778
1496    139.208917
1497      2.950806
1498      0.023504
1499      0.061803
Length: 500, dtype: float64

In [428]:
event_times = test["time"]
event_observed = test["event"]
event_times, event_observed

(1000    0.254026
 1001    0.028862
 1002    0.254026
 1003    0.002142
 1004    0.254026
           ...   
 1495    0.065636
 1496    0.000989
 1497    0.087281
 1498    0.254026
 1499    0.254026
 Name: time, Length: 500, dtype: float64,
 1000    False
 1001     True
 1002    False
 1003     True
 1004    False
         ...  
 1495     True
 1496     True
 1497    False
 1498    False
 1499    False
 Name: event, Length: 500, dtype: bool)

In [429]:
test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,y,dropout,dropout_time,end_censor,time,event
1000,-0.691998,2.368224,-0.625954,1.054698,1.505078,1.0,0.0,0.0,1.0,1.0,4.160831,False,4.160831,True,0.254026,False
1001,0.644966,-1.534516,0.588573,0.025510,0.039152,0.0,0.0,1.0,0.0,0.0,0.028862,False,0.028862,False,0.028862,True
1002,-1.116994,-0.645984,0.103880,0.456469,-0.193211,1.0,0.0,0.0,0.0,0.0,15.503424,False,15.503424,True,0.254026,False
1003,1.854373,-1.293095,1.516850,1.285330,0.126523,0.0,0.0,0.0,1.0,0.0,0.002142,False,0.002142,False,0.002142,True
1004,1.019901,-0.555794,0.048500,-1.904063,-1.499900,1.0,0.0,1.0,0.0,1.0,0.998057,False,0.998057,True,0.254026,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,0.382094,-0.349913,0.039085,0.368830,0.449423,1.0,1.0,1.0,0.0,1.0,0.065636,False,0.065636,False,0.065636,True
1496,-0.117285,-1.499173,-0.385979,1.788696,-0.579649,0.0,0.0,0.0,1.0,1.0,0.000989,False,0.000989,False,0.000989,True
1497,0.539230,-0.694668,-0.337374,0.422725,-0.744344,1.0,1.0,1.0,0.0,1.0,0.235945,True,0.087281,False,0.087281,False
1498,-0.329189,0.469401,0.853344,-0.840725,1.369359,0.0,0.0,1.0,0.0,1.0,29.392769,True,9.161548,True,0.254026,False


In [430]:
from lifelines.utils import concordance_index

In [433]:
concordance_index(event_times, -test_preds, event_observed)

0.922162750612356

## Regularised

In [451]:

cph.fit(case_subcohort_df, entry_col = "start_time", duration_col = "time",event_col = "event",weights_col = "weights",robust = True)
cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'time'
event col,'event'
weights col,'weights'
entry col,'start_time'
penalizer,0.05
l1 ratio,1
robust variance,True
baseline estimation,breslow
number of observations,1326

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
0,1.31,3.7,0.08,1.16,1.46,3.18,4.31,0.0,16.97,<0.005,212.12
1,-1.11,0.33,0.09,-1.29,-0.94,0.28,0.39,0.0,-12.5,<0.005,116.67
2,-0.77,0.46,0.07,-0.9,-0.64,0.4,0.53,0.0,-11.41,<0.005,97.8
3,1.29,3.62,0.09,1.12,1.46,3.06,4.29,0.0,14.99,<0.005,166.35
4,0.37,1.44,0.08,0.21,0.52,1.24,1.68,0.0,4.69,<0.005,18.5
5,-0.16,0.85,0.14,-0.44,0.12,0.64,1.13,0.0,-1.12,0.26,1.92
6,-0.37,0.69,0.14,-0.65,-0.09,0.52,0.91,0.0,-2.61,0.01,6.78
7,0.46,1.58,0.14,0.19,0.73,1.2,2.07,0.0,3.3,<0.005,10.02
8,0.0,1.0,0.0,-0.0,0.0,1.0,1.0,0.0,0.13,0.90,0.15
9,0.14,1.15,0.14,-0.13,0.41,0.88,1.51,0.0,1.02,0.31,1.7

0,1
Concordance,0.50
Partial AIC,3713.66
log-likelihood ratio test,658.13 on 10 df
-log2(p) of ll-ratio test,445.86


In [454]:
test_preds = cph.predict_partial_hazard(test[range(0,10)])
concordance_index(event_times, -test_preds, event_observed)

0.9155704738576915

In [None]:
cph.fit(case_subcohort_df, entry_col = "start_time", duration_col = "time",event_col = "event",weights_col = "weights",robust = True)
cph.print_summary()