# Benchmarks for the Negative Binomial Autoregressive (NBAR) Model

In [1]:
using Revise
using DataFrames, Random, GLM, GLMCopula
using ForwardDiff, Test, LinearAlgebra
using LinearAlgebra: BlasReal, copytri!
using ToeplitzMatrices
using BenchmarkTools

## 8 BLAS threads, single-threaded loglikelihood

In [2]:
# Benchmark configuration 1: let BLAS use 8 threads.
BLAS.set_num_threads(8)
# Show how many Julia threads this session was started with (the cell output below).
Threads.nthreads()

1

In [5]:
p_fixed = 3    # number of fixed effects, including intercept

# true parameter values
# Seed the global RNG so that the draw of βtrue below is reproducible.
Random.seed!(12345)
# Regression coefficients: p_fixed independent draws from Uniform(-2, 2).
βtrue = rand(Uniform(-2, 2), p_fixed)
# `r` parameter handed to NegativeBinomial(r, p) when the responses are simulated.
rtrue = 10.0
# One-element vector; its entry scales V to form the true Γ.
σ2true = [0.5]
# One-element vector; its entry is the ratio ρ passed to get_V.
ρtrue = [0.5]

"""
    get_V(ρ, n)

Return the `n × n` symmetric Toeplitz matrix whose first column is
`1, ρ, ρ², …, ρ^(n-1)`, i.e. entry `(i, j)` equals `ρ^|i - j|`.

The first column is built by repeated multiplication by `ρ` into a `Float64`
vector. `n` must be at least 1, because the first entry is always written.
"""
function get_V(ρ, n)
    firstcol = zeros(n)
    firstcol[1] = 1.0
    # each entry is the previous one multiplied by ρ
    for k in 1:(n - 1)
        firstcol[k + 1] = firstcol[k] * ρ
    end
    return ToeplitzMatrices.SymmetricToeplitz(firstcol)
end

# simulation parameters
samplesize = 10000   # number of individuals (one NBCopulaARObs each)

st = time()          # wall-clock start; not referenced again in the cells shown here
currentind = 1       # not referenced again in the cells shown here
d = NegativeBinomial()   # default-parameter instance; supplies the type D and is passed to each observation
link = LogLink()
D = typeof(d)
Link = typeof(link)
T = Float64

# one observation object per individual, filled in by the loop below
gcs = Vector{NBCopulaARObs{T, D, Link}}(undef, samplesize)

ni = 25 #  number of observations per individual
V = get_V(ρtrue[1], ni)

# true Gamma
Γ = σ2true[1] * V

# for reproducibility, simulate all the design matrices up front
Random.seed!(12345)
X_samplesize = [randn(ni, p_fixed - 1) for _ in 1:samplesize]

for i in 1:samplesize
    # design matrix: intercept column followed by the pre-drawn covariates
    X = [ones(ni) X_samplesize[i]]
    η = X * βtrue
    μ = exp.(η)                      # inverse of the log link
    # success probabilities chosen so that NegativeBinomial(rtrue, p) has mean μ
    p = rtrue ./ (μ .+ rtrue)
    # marginal distributions, one per observation of this individual
    vecd = [NegativeBinomial(rtrue, pj) for pj in p]
    nonmixed_multivariate_dist = NonMixedMultivariateDistribution(vecd, Γ)
    # simulate a single response vector y
    y = Vector{Float64}(undef, ni)
    # NOTE(review): `res` is presumably a work buffer filled by `rand` — confirm against GLMCopula
    res = Vector{Float64}(undef, ni)
    rand(nonmixed_multivariate_dist, y, res)
    gcs[i] = NBCopulaARObs(y, X, d, link)
end

# form model
gcm = NBCopulaARModel(gcs);

In [4]:
# Warm-up fit: this first call also pays Julia compilation cost, so it is not the reported timing.
@time GLMCopula.fit!(gcm, maxBlockIter = 20, tol=1e-8);

Initializing NegBin r to Poisson regression values
initializing β using Newton's Algorithm under Independence Assumption
initializing σ2 and ρ using method of moments
initializing r using Newton update
initializing variance parameters in CS model using mom
Converging when tol ≤ 1.0e-8 (max block iter = 20)

******************************************************************************
This program contains Ipopt, a library for large-scale nonlinear optimization.
 Ipopt is released as open source code under the Eclipse Public License (EPL).
         For more information visit https://github.com/coin-or/Ipopt
******************************************************************************

Block iter 1 r = 9.63, logl = -659876.38, tol = 659876.3844181958
Block iter 2 r = 9.87, logl = -659849.9, tol = 4.013178849649874e-5
Block iter 3 r = 9.89, logl = -659840.08, tol = 1.4888447019632147e-5
Block iter 4 r = 9.91, logl = -659840.95, tol = 1.3245809250537448e-6
Block iter 5 r = 9.96, logl = -

In [6]:
# Repeat the same fit so that @time measures an already-compiled run.
@time GLMCopula.fit!(gcm, maxBlockIter = 20, tol=1e-8);

Initializing NegBin r to Poisson regression values
initializing β using Newton's Algorithm under Independence Assumption
initializing σ2 and ρ using method of moments
initializing r using Newton update
initializing variance parameters in CS model using mom
Converging when tol ≤ 1.0e-8 (max block iter = 20)
Block iter 1 r = 9.63, logl = -659876.38, tol = 659876.3844181958
Block iter 2 r = 9.87, logl = -659849.9, tol = 4.013178849649874e-5
Block iter 3 r = 9.89, logl = -659840.08, tol = 1.4888447019632147e-5
Block iter 4 r = 9.91, logl = -659840.95, tol = 1.3245809250537448e-6
Block iter 5 r = 9.96, logl = -659840.73, tol = 3.3603836102156463e-7
Block iter 6 r = 9.95, logl = -659839.72, tol = 1.5373802639043203e-6
Block iter 7 r = 9.94, logl = -659839.91, tol = 2.9393202737438934e-7
Block iter 8 r = 9.92, logl = -659840.0, tol = 1.312235713428575e-7
Block iter 9 r = 9.97, logl = -659840.8, tol = 1.218456117301108e-6
Block iter 10 r = 9.96, logl = -659839.64, tol = 1.7524068759646176e-6
B

**Conclusion:** After precompilation, NBAR takes ~85 seconds with 8 BLAS threads and single-threaded loglikelihood evaluations.

## 1 BLAS thread, 8-thread loglikelihood

In [2]:
# Benchmark configuration 2: restrict BLAS to a single thread.
BLAS.set_num_threads(1)
# Show how many Julia threads this session was started with (the cell output below).
Threads.nthreads()

8

In [7]:
p_fixed = 3    # number of fixed effects, including intercept

# true parameter values
# Seed the global RNG so that the draw of βtrue below is reproducible.
Random.seed!(12345)
# Regression coefficients: p_fixed independent draws from Uniform(-2, 2).
βtrue = rand(Uniform(-2, 2), p_fixed)
# `r` parameter handed to NegativeBinomial(r, p) when the responses are simulated.
rtrue = 10.0
# One-element vector; its entry scales V to form the true Γ.
σ2true = [0.5]
# One-element vector; its entry is the ratio ρ passed to get_V.
ρtrue = [0.5]

"""
    get_V(ρ, n)

Return the `n × n` symmetric Toeplitz matrix whose first column is
`1, ρ, ρ², …, ρ^(n-1)`, i.e. entry `(i, j)` equals `ρ^|i - j|`.

The first column is built by repeated multiplication by `ρ` into a `Float64`
vector. `n` must be at least 1, because the first entry is always written.
"""
function get_V(ρ, n)
    firstcol = zeros(n)
    firstcol[1] = 1.0
    # each entry is the previous one multiplied by ρ
    for k in 1:(n - 1)
        firstcol[k + 1] = firstcol[k] * ρ
    end
    return ToeplitzMatrices.SymmetricToeplitz(firstcol)
end

# simulation parameters
samplesize = 10000   # number of individuals (one NBCopulaARObs each)

st = time()          # wall-clock start; not referenced again in the cells shown here
currentind = 1       # not referenced again in the cells shown here
d = NegativeBinomial()   # default-parameter instance; supplies the type D and is passed to each observation
link = LogLink()
D = typeof(d)
Link = typeof(link)
T = Float64

# one observation object per individual, filled in by the loop below
gcs = Vector{NBCopulaARObs{T, D, Link}}(undef, samplesize)

ni = 25 #  number of observations per individual
V = get_V(ρtrue[1], ni)

# true Gamma
Γ = σ2true[1] * V

# for reproducibility, simulate all the design matrices up front
Random.seed!(12345)
X_samplesize = [randn(ni, p_fixed - 1) for _ in 1:samplesize]

for i in 1:samplesize
    # design matrix: intercept column followed by the pre-drawn covariates
    X = [ones(ni) X_samplesize[i]]
    η = X * βtrue
    μ = exp.(η)                      # inverse of the log link
    # success probabilities chosen so that NegativeBinomial(rtrue, p) has mean μ
    p = rtrue ./ (μ .+ rtrue)
    # marginal distributions, one per observation of this individual
    vecd = [NegativeBinomial(rtrue, pj) for pj in p]
    nonmixed_multivariate_dist = NonMixedMultivariateDistribution(vecd, Γ)
    # simulate a single response vector y
    y = Vector{Float64}(undef, ni)
    # NOTE(review): `res` is presumably a work buffer filled by `rand` — confirm against GLMCopula
    res = Vector{Float64}(undef, ni)
    rand(nonmixed_multivariate_dist, y, res)
    gcs[i] = NBCopulaARObs(y, X, d, link)
end

# form model
gcm = NBCopulaARModel(gcs);

In [8]:
# Timed fit with 1 BLAS thread and 8 Julia threads (see the thread setup cell above).
@time GLMCopula.fit!(gcm, maxBlockIter = 20, tol=1e-8);

Initializing NegBin r to Poisson regression values
initializing β using Newton's Algorithm under Independence Assumption
initializing σ2 and ρ using method of moments
initializing r using Newton update
initializing variance parameters in CS model using mom
Converging when tol ≤ 1.0e-8 (max block iter = 20)
Block iter 1 r = 9.63, logl = -659876.38, tol = 659876.3844181958
Block iter 2 r = 9.87, logl = -659849.9, tol = 4.0131788494028864e-5
Block iter 3 r = 9.89, logl = -659840.08, tol = 1.4888447022102083e-5
Block iter 4 r = 9.91, logl = -659840.95, tol = 1.3245809299937661e-6
Block iter 5 r = 9.96, logl = -659840.73, tol = 3.3603836649086505e-7
Block iter 6 r = 9.95, logl = -659839.72, tol = 1.5373802619636005e-6
Block iter 7 r = 9.94, logl = -659839.91, tol = 2.939320238458003e-7
Block iter 8 r = 9.92, logl = -659840.0, tol = 1.3122357416572785e-7
Block iter 9 r = 9.97, logl = -659840.8, tol = 1.2184561255932867e-6
Block iter 10 r = 9.96, logl = -659839.64, tol = 1.7524068854917753e-6

In [6]:
# Repeat the same fit so that @time measures an already-compiled run.
@time GLMCopula.fit!(gcm, maxBlockIter = 20, tol=1e-8);

Initializing NegBin r to Poisson regression values
initializing β using Newton's Algorithm under Independence Assumption
initializing σ2 and ρ using method of moments
initializing r using Newton update
initializing variance parameters in CS model using mom
Converging when tol ≤ 1.0e-8 (max block iter = 20)
Block iter 1 r = 9.63, logl = -659876.38, tol = 659876.3844181958
Block iter 2 r = 9.87, logl = -659849.9, tol = 4.0131788494028864e-5
Block iter 3 r = 9.89, logl = -659840.08, tol = 1.4888447022102083e-5
Block iter 4 r = 9.91, logl = -659840.95, tol = 1.3245809299937661e-6
Block iter 5 r = 9.96, logl = -659840.73, tol = 3.3603836649086505e-7
Block iter 6 r = 9.95, logl = -659839.72, tol = 1.5373802619636005e-6
Block iter 7 r = 9.94, logl = -659839.91, tol = 2.939320238458003e-7
Block iter 8 r = 9.92, logl = -659840.0, tol = 1.3122357416572785e-7
Block iter 9 r = 9.97, logl = -659840.8, tol = 1.2184561255932867e-6
Block iter 10 r = 9.96, logl = -659839.64, tol = 1.7524068854917753e-6

**Conclusion:** After precompilation, NBAR takes ~25 seconds with single-threaded BLAS and 8-thread loglikelihood evaluations.

# Profile code

In [None]:
using ProfileView
# Profile with BLAS restricted to a single thread.
BLAS.set_num_threads(1)

# The fit is profiled twice with identical arguments; presumably the second
# trace is the one to inspect, free of compilation overhead — confirm.
# No `tol` keyword is passed here, so fit! uses whatever its default tolerance is.
@profview GLMCopula.fit!(gcm, maxBlockIter = 20);
@profview GLMCopula.fit!(gcm, maxBlockIter = 20);