# Single-level model

In this notebook, we relax both the random sampling assumption and the perfect testing assumption to create the final single-level model.

As in the Imperfect Testing model, test accuracy parameter priors are calibrated with data from the [Joint PHE Porton Down & University of Oxford SARS-CoV-2 test development and validation cell](https://www.ox.ac.uk/sites/files/oxford/media_wysiwyg/UK%20evaluation_PHE%20Porton%20Down%20%20University%20of%20Oxford_final.pdf). 

In [1]:
import time
import arviz
import numpy as np
import pandas as pd
import stan
import nest_asyncio
nest_asyncio.apply()

In [2]:
# Simulation settings
P = 10        # number of places (activity settings)
N = 30_000    # number of simulated individuals
SEED = 1729   # seed for NumPy's global random number generator

In [3]:
# Ground truth: every "true" parameter is drawn from a Beta distribution.
# The draws share one seeded random stream, so their order must not change.
np.random.seed(SEED)
true_transmission_rate = np.random.beta(a=2, b=10, size=P)
true_occurrence_rate = np.random.beta(a=2, b=10, size=P)
base_rate = np.random.beta(a=2, b=10, size=1)

# Testing rates, stacked as [tested | infected, tested | not-infected]
t_i = np.random.beta(a=8, b=2, size=1)       # Prob(tested | infected)
t_not_i = np.random.beta(a=2, b=20, size=1)  # Prob(tested | not-infected)
true_gamma = np.stack([t_i, t_not_i])

# Test accuracy, stacked as [sensitivity, specificity]
test_sensitivity = np.random.beta(a=4, b=3, size=1)   # True positive rate
test_specificity = np.random.beta(a=50, b=2, size=1)  # True negative rate
true_lambda = np.stack([test_sensitivity, test_specificity])

In [4]:
# Simulate data
# For each place: an occurrence indicator (O*) and a transmission indicator (T*).
# Transmission can only happen where the activity occurred.
data = {}
for p in range(P):
    attended = np.random.binomial(1, true_occurrence_rate[p], N)
    infected_here = np.random.binomial(1, true_transmission_rate[p], N)
    data[f'O{p+1}'] = attended
    data[f'T{p+1}'] = attended * infected_here

# T0: transmission from the base rate, independent of any place
data['T0'] = np.random.binomial(1, base_rate, N)
X = pd.DataFrame(data)

# An individual is infected (y = 1) when at least one T* column is set
is_transmission_col = X.columns.str.startswith('T')
z = X.loc[:, is_transmission_col].sum(axis=1)
y = (z > 0).astype(int)

# Resampling using testing probabilities conditional on infection status
tested_if_infected = np.random.binomial(1, true_gamma[0], N)
tested_if_not_infected = np.random.binomial(1, true_gamma[1], N)
tested = y*tested_if_infected + (1-y)*tested_if_not_infected
was_tested = tested == 1
y = y[was_tested]
X_survey = X[tested == 0].reset_index()
X = X[was_tested].reset_index()

# From here on N counts tested individuals and NA counts untested (survey) ones
N, NA = X.shape[0], X_survey.shape[0]

# Introducing false positives and negatives
true_positive = np.random.binomial(1, true_lambda[0], N)
false_positive = np.random.binomial(1, (1-true_lambda[1]), N)
y = y*true_positive + (1-y)*false_positive

# Only the occurrence columns are kept as observed data
X = X.loc[:, X.columns.str.startswith('O')]
X_survey = X_survey.loc[:, X_survey.columns.str.startswith('O')]
X.info()
X_survey.info()

INFO:numexpr.utils:NumExpr defaulting to 8 threads.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8193 entries, 0 to 8192
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   O1      8193 non-null   int64
 1   O2      8193 non-null   int64
 2   O3      8193 non-null   int64
 3   O4      8193 non-null   int64
 4   O5      8193 non-null   int64
 5   O6      8193 non-null   int64
 6   O7      8193 non-null   int64
 7   O8      8193 non-null   int64
 8   O9      8193 non-null   int64
 9   O10     8193 non-null   int64
dtypes: int64(10)
memory usage: 640.2 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21807 entries, 0 to 21806
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   O1      21807 non-null  int64
 1   O2      21807 non-null  int64
 2   O3      21807 non-null  int64
 3   O4      21807 non-null  int64
 4   O5      21807 non-null  int64
 5   O6      21807 non-null  int64
 6   O7      21807 non-null  int64
 7   O8 

In [6]:
# Antigen test accuracy: mean and std. error of the TP and TN rates.
# These values parameterise the strong priors on the test accuracy.
mean_tp, se_tp = 0.73000, 0.04133  # True positive
mean_tn, se_tn = 0.99680, 0.00066  # True negative

# Stacked as [TP, TN]
mean_rates = np.array((mean_tp, mean_tn))
se_rates = np.array((se_tp, se_tn))

In [5]:
# Stan program for the single-level model, kept as a plain string.
# Tested individuals contribute (occurrences, test result); untested (survey)
# individuals contribute occurrences only. Test sensitivity/specificity get
# Beta priors moment-matched to the supplied means and standard errors.
model_code = """
data {
  int<lower=0> N;                            // number of test observations
  int<lower=0> NA;                           // number of survey observations
  int<lower=0> P;                            // number of places
  int<lower=0, upper=1> X[N,P];              // activity occurrences of tested individuals
  int<lower=0, upper=1> y[N];                // transmission (tested positive)
  int<lower=0, upper=1> survey[NA,P];        // activity occurrences of surveyed individuals
  real<lower=0, upper=1> mean_lambda[2];     // mean TP and TN test rates (for strong priors on tests)
  real<lower=0> se_lambda[2];                // standard error of test rates (for strong priors on tests)
}
parameters {
  real<lower=0, upper=1> theta[P];           // transmission rates
  real<lower=0, upper=1> rho;                // underlying risk
  real<lower=0, upper=1> gamma[2];           // Testing rates, given infected status [T|I , T|!I]
  real<lower=0, upper=1> lambda[2];          // True positive and true negative rates of tests [TP,TN]
}
model {
  // Precomputation
  real log1m_theta[P];
  real log1m_rho;
  real log_gamma[2];
  real log1m_gamma[2];
  real log_lambda[2];
  real log1m_lambda[2];
  real a_lambda[2];
  real b_lambda[2];
  
  for (p in 1:P) {
    log1m_theta[p] = log1m(theta[p]);
  }
  for(i in 1:2){
    log_gamma[i] = log(gamma[i]);
    log1m_gamma[i] = log1m(gamma[i]);
    log_lambda[i] = log(lambda[i]);
    log1m_lambda[i] = log1m(lambda[i]);
    
    a_lambda[i] = (((1-mean_lambda[i])/se_lambda[i]^2)-(1/mean_lambda[i]))*(mean_lambda[i]^2);
    b_lambda[i] = a_lambda[i]*((1/mean_lambda[i])-1);
  }
  
  log1m_rho = log1m(rho);
  
  // Priors
  theta ~ uniform(0, 1);
  rho ~ uniform(0, 1);
  gamma ~ uniform(0, 1);
  lambda[1] ~ beta(a_lambda[1], b_lambda[1]);
  lambda[2] ~ beta(a_lambda[2], b_lambda[2]);

  // Likelihood
  for (n in 1:NA) {
    real s = 0.0;
    for (p in 1:P) {
      if (survey[n,p] == 1) {
        s += log1m_theta[p];
      }
    }
    s += log1m_rho;
    target += log_sum_exp((log1m_exp(s)+log1m_gamma[1]), (s+log1m_gamma[2]));
  }
  
  for (n in 1:N) {
    real s = 0.0;
    for (p in 1:P) {
      if (X[n,p] == 1) {
        s += log1m_theta[p];
      }
    }
    s += log1m_rho;
    
    if (y[n] == 1) {
      target += log_sum_exp((log1m_exp(s) + log_gamma[1] + log_lambda[1]), (s + log_gamma[2] + log1m_lambda[2]));
    } 
    else {
      target += log_sum_exp((s + log_gamma[2] + log_lambda[2]), (log1m_exp(s) + log_gamma[1] + log1m_lambda[1]));
    }
  }
}
"""

INFO:pystan:COMPILING THE C++ CODE FOR MODEL tt_mod_a03bc91f50c930c152bd3c864e9830b7 NOW.


In [6]:
# Assemble the inputs declared in the Stan `data` block, then build the model
model_data = {
    'N': N,
    'NA': NA,
    'P': P,
    'X': X.to_numpy(),
    'survey': X_survey.to_numpy(),
    'y': y.to_numpy(),
    'mean_lambda': mean_rates,
    'se_lambda': se_rates,
}
posterior = stan.build(program_code=model_code, data=model_data, random_seed=1)

Building... This may take some time.
Done.


In [7]:
# NUTS sampler settings
nuts_samples = 1000  # sampling iterations per chain
nuts_burnin = 500    # warmup iterations per chain
nuts_chains = 16     # number of chains
sampler_settings = {
    'num_samples': nuts_samples,
    'num_warmup': nuts_burnin,
    'num_chains': nuts_chains,
}

# Draw from the posterior and report the wall-clock time
start = time.time()
fit = posterior.sample(**sampler_settings)
print(f"{time.time() - start:.02f} seconds elapsed")

Sampling...
     0/24000 [>---------------------------]   0%  1 sec/0     
     1/24000 [>---------------------------]   0%  1 sec/27513 
     1/24000 [>---------------------------]   0% 3 secs/52929 
     1/24000 [>---------------------------]   0% 4 secs/80477 
     1/24000 [>---------------------------]   0% 5 secs/108067
     1/24000 [>---------------------------]   0% 6 secs/134210
     1/24000 [>---------------------------]   0% 7 secs/158981
     1/24000 [>---------------------------]   0% 8 secs/184337
     1/24000 [>---------------------------]   0% 9 secs/209075
     1/24000 [>---------------------------]   0% 10 secs/237038
     1/24000 [>---------------------------]   0% 11 secs/262076
     1/24000 [>---------------------------]   0% 13 secs/291337
     1/24000 [>---------------------------]   0% 14 secs/318410
     1/24000 [>---------------------------]   0% 15 secs/344222
     1/24000 [>---------------------------]   0% 16 secs/370871
     1/24000 [>----------------------

189.34 seconds elapsed


In [None]:
# Trace plots of the sampled parameters, for a visual check of the fit
trace_figsize = (8, 6)
arviz.plot_trace(fit, figsize=trace_figsize);

In [16]:
# Summarise the posterior draws.
# The draws go into their own variable so `fit` keeps pointing at the sampler
# result: this cell can then be re-run, and cells that expect the fit object
# (such as the trace plot) still work afterwards.
fit_df = fit.to_frame()
print("Inference on fitted model:")
fit_df.describe().T

Inference on fitted model:


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
parameters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
lp__,16000.0,-6244.083799,2.600639,-6255.38116,-6245.592456,-6243.674758,-6242.175793,-6239.241258
accept_stat__,16000.0,0.940213,0.06522104,0.538875,0.915272,0.960406,0.986669,1.0
stepsize__,16000.0,0.361898,8.265869e-14,0.361898,0.361898,0.361898,0.361898,0.361898
treedepth__,16000.0,3.458,0.504234,2.0,3.0,3.0,4.0,4.0
n_leapfrog__,16000.0,12.108,3.964507,3.0,7.0,15.0,15.0,31.0
divergent__,16000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
energy__,16000.0,6249.594005,3.572676,6240.867771,6247.054267,6249.27171,6251.676799,6264.177631
theta.1,16000.0,0.181797,0.03058368,0.089707,0.162426,0.181668,0.202333,0.284292
theta.2,16000.0,0.190226,0.01966616,0.128776,0.177035,0.190416,0.20335,0.246257
theta.3,16000.0,0.126042,0.03039697,0.034283,0.105261,0.126503,0.146348,0.221498


In [None]:
# Print the simulated "true" parameters for comparison with the posterior summary
ground_truth = (
    ('Setting-specific transmission rates: ', true_transmission_rate),
    ('Base rate: ', base_rate),
    ('Testing rates: ', true_gamma),
    ('True positive and negative rates: ', true_lambda),
)

print('Ground truth:')
for label, value in ground_truth:
    print(label, value)