# Simulate an example data set

In this notebook, we simulate an example data set for testing purposes. It models systolic blood pressure (SBP) using the predictors age group, BMI, gender, and medication status (meds).

In [1]:
# print the Julia version and platform details of this session
versioninfo()

Julia Version 1.4.1
Commit 381693d3df* (2020-04-14 17:20 UTC)
Platform Info:
  OS: macOS (x86_64-apple-darwin18.7.0)
  CPU: Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-8.0.1 (ORCJIT, haswell)
Environment:
  JULIA_NUM_THREADS = 4


In [2]:
# load needed packages
using CSV, DataFrames, Distributions, LinearAlgebra, Random, StatsBase, WiSER

In [3]:
# Simulate longitudinal systolic blood pressure (sbp) data for m individuals.
# Every draw below consumes the seeded global RNG, so the order of the random
# calls determines the generated data; do not reorder them.
Random.seed!(123)
# dimensions
m  = 500            # number of individuals
ns = rand(9:11, m)  # numbers of observations per individual
p  = 5              # number of fixed effects, including intercept
q  = 2              # number of random effects, including intercept
l  = 4              # number of WS variance covariates, including intercept
# true parameter values
βtrue = [100.0; 15.0; 10.0; 0.25; -10.0] # intercept, agegroup, gender, bmi, meds
τtrue = [-2.5; 1.5; -0.5; 0.0]           # intercept, agegroup, meds, bmi
Σγ    = Matrix(Diagonal([1.0; 0.0]))     # only the random intercept has nonzero variance
δγω   = [0.0; 0.0; zeros(q - 2) ./ 10]
σω    = [0.0]
Σγω   = [Σγ δγω; δγω' σω]
# Σγω is singular with these values, hence check = false (no error is thrown
# when the matrix is not positive definite)
Lγω   = cholesky(Symmetric(Σγω), check = false).L
Lγ    = Lγω[1:q, 1:q]
lγω   = Lγω[q + 1, 1:q]
lω    = Lγω[q + 1, q + 1]
# generate data
γω = Vector{Float64}(undef, q + 1)
z  = similar(γω) # hold vector of iid std normal
df = DataFrame(id = String[], sbp = Float64[], agegroup = Float64[],
    gender = Int64[], bmi = Float64[], meds = Float64[])
for i in 1:m
    ni = ns[i]
    # covariates: agegroup, gender and meds are constant within an individual,
    # bmi varies from observation to observation
    agegroup = rand(1:3)
    gender   = rand(Bernoulli(0.5))
    meds     = rand(Bernoulli(0.2))
    bmi      = rand(Normal(25, 1.2), ni)
    # fixed-effects design: intercept, agegroup, gender, bmi, meds
    X = Matrix{Float64}(undef, ni, p)
    X[:, 1] .= 1
    X[:, 2] .= agegroup
    X[:, 3] .= gender
    X[:, 4] .= bmi
    X[:, 5] .= meds
    # random-effects design: intercept, bmi
    Z = Matrix{Float64}(undef, ni, q)
    Z[:, 1] .= 1
    Z[:, 2] .= bmi
    # within-subject variance design: intercept, agegroup, meds, bmi
    W = Matrix{Float64}(undef, ni, l)
    W[:, 1] .= 1
    W[:, 2] .= agegroup
    W[:, 3] .= meds
    W[:, 4] .= bmi
    # generate random effects: γω = Lγω * z
    mul!(γω, Lγω, rand!(Normal(), z))
    # generate y: mean μy, independent errors with variances vy
    μy = X * βtrue + Z * γω[1:q]
    @views vy = exp.(W * τtrue .+ dot(γω[1:q], lγω) .+ γω[end])
    y = rand(MvNormal(μy, Diagonal(vy)))
    # print one individual's variances, means and responses as a spot check
    if i == 8
        @show vy
        @show μy
        @show y
    end
    # Build the individual's rows from typed columns. This avoids going through
    # a mixed String/Float64 matrix (element type Any) and the positional
    # matrix constructor of DataFrame.
    tempdf = DataFrame(
        id       = fill(string(i), ni),
        sbp      = y,
        agegroup = fill(Float64(agegroup), ni),
        gender   = fill(Int(gender), ni),
        bmi      = bmi,
        meds     = fill(Float64(meds), ni),
    )
    append!(df, tempdf)
end
# standardized bmi, computed over all observations
df[!, :bmi_std] = zscore(df[!, :bmi])
df

vy = [7.38905609893065, 7.38905609893065, 7.38905609893065, 7.38905609893065, 7.38905609893065, 7.38905609893065, 7.38905609893065, 7.38905609893065, 7.38905609893065]
μy = [160.73568614876524, 161.12726738169906, 160.7523165946845, 160.7897198908203, 160.3457789407866, 160.27668438054704, 160.90340388121535, 160.5821039860477, 161.0410901880801]
y = [155.92040542556265, 164.06032803785934, 160.3399467983824, 157.3128981314478, 163.2894577026046, 159.60558405082082, 159.88272843704567, 165.7592300539918, 163.30987783782464]


Unnamed: 0_level_0,id,sbp,agegroup,gender,bmi,meds,bmi_std
Unnamed: 0_level_1,String,Float64,Float64,Int64,Float64,Float64,Float64
1,1,159.586,3.0,1,23.1336,0.0,-1.57733
2,1,161.849,3.0,1,26.5885,0.0,1.29927
3,1,160.484,3.0,1,24.8428,0.0,-0.154204
4,1,161.134,3.0,1,24.9289,0.0,-0.0825105
5,1,165.443,3.0,1,24.8057,0.0,-0.185105
6,1,160.053,3.0,1,24.1583,0.0,-0.72415
7,1,162.1,3.0,1,25.2543,0.0,0.188379
8,1,163.153,3.0,1,24.3951,0.0,-0.527037
9,1,166.675,3.0,1,26.1514,0.0,0.935336
10,2,130.765,1.0,1,22.6263,0.0,-1.99977


In [4]:
# add observation weights: one value drawn from 1.0:5.0 per id, joined back onto
# every row of that id
df = leftjoin(
    df,
    combine(groupby(df, :id), :id => (_ -> rand(1.0:5.0)) => :obswt),
    on = :id,
)
# replace the numeric gender and meds codes with string labels
df[!, :gender] = ifelse.(df[!, :gender] .== 1, "Male", "Female");
df[!, :meds] = ifelse.(df[!, :meds] .== 1, "OnMeds", "NoMeds");
describe(df)

Unnamed: 0_level_0,variable,mean,min,median,max,nunique,nmissing,eltype
Unnamed: 0_level_1,Symbol,Union…,Any,Union…,Any,Union…,Union…,Type
1,id,,1,,99,500.0,,String
2,sbp,140.365,109.865,139.44,171.545,,,Float64
3,agegroup,2.05388,1.0,2.0,3.0,,,Float64
4,gender,,Female,,Male,2.0,,String
5,bmi,25.028,20.7168,25.0226,29.6457,,,Float64
6,meds,,NoMeds,,OnMeds,2.0,,String
7,bmi_std,-4.95579e-16,-3.58964,-0.00450848,3.84477,,,Float64
8,obswt,3.12213,1.0,3.0,5.0,,0.0,"Union{Missing, Float64}"


Form a `WSVarLmmModel` object and fit the WiSER model.

In [5]:
# Three formulas, all with response sbp. Judging from the fitted output, the terms
# of the first formula are reported as the β (fixed-effects) coefficients and the
# terms of the third formula as the τ coefficients; the second (1 + bmi_std)
# presumably specifies the random effects — confirm against the WiSER docs.
# :id is the column that identifies which individual each observation belongs to.
# NOTE: sbp was simulated from raw bmi, whereas the model uses the standardized
# bmi_std, so the intercept and bmi coefficients are on a different scale than βtrue.
vlmm = WSVarLmmModel(
    @formula(sbp ~ 1 + agegroup + gender + bmi_std + meds), 
    @formula(sbp ~ 1 + bmi_std), 
    @formula(sbp ~ 1 + agegroup + meds + bmi_std),
    :id, df);

In [6]:
# fit the model in place; the solver log and the coefficient table are printed below
WiSER.fit!(vlmm)


******************************************************************************
This program contains Ipopt, a library for large-scale nonlinear optimization.
 Ipopt is released as open source code under the Eclipse Public License (EPL).
         For more information visit http://projects.coin-or.org/Ipopt
******************************************************************************

run = 1, ‖Δβ‖ = 0.037311, ‖Δτ‖ = 0.166678, ‖ΔL‖ = 0.100999, status = Optimal, time(s) = 0.336550
run = 2, ‖Δβ‖ = 0.005220, ‖Δτ‖ = 0.006748, ‖ΔL‖ = 0.048735, status = Optimal, time(s) = 0.206969



Within-subject variance estimation by robust regression (WiSER)
Number of individuals/clusters: 500
Total observations: 5011

Fixed-effects parameters:
───────────────────────────────────────────────────────────
                     Estimate  Std. Error       Z  Pr(>|Z|)
───────────────────────────────────────────────────────────
β1: (Intercept)   106.308       0.14384    739.07    <1e-99
β2: agegroup       14.9844      0.0633245  236.63    <1e-99
β3: gender: Male   10.0749      0.100279   100.47    <1e-99
β4: bmi_std         0.296424    0.0139071   21.31    <1e-99
β5: meds: OnMeds  -10.1107      0.122918   -82.26    <1e-99
τ1: (Intercept)    -2.5212      0.393792    -6.40    <1e-9
τ2: agegroup        1.50759     0.135456    11.13    <1e-28
τ3: meds: OnMeds   -0.435225    0.0621076   -7.01    <1e-11
τ4: bmi_std         0.0052695   0.0224039    0.24    0.8140
───────────────────────────────────────────────────────────
Random effects covariance matrix Σγ:
 "γ1: (Intercept)"  1.00196    

In [7]:
# save the simulated data set, including bmi_std and obswt, to sbp.csv
CSV.write("sbp.csv", df)

"sbp.csv"