# Autoregressive covariance structure 

Next we try to parameterize covariance Gamma only using two parameters rho and sigma2 as in the AR(1) structure.

1. First we need to modify the structure itself
2. Second modify the loglikelihood function 
3. Third modify gradient and hessian with respect to beta
4. To-do: ADD gradient and hessian with respect to AR(1) parameterization

In [1]:
using DataFrames, Random, GLM, GLMCopula, Test
using LinearAlgebra, BenchmarkTools

Random.seed!(1234)

# sample size
N = 1000
# observations per subject
n = 5
ρ = 0.1
σ2 = 0.2

V = zeros(n, n) # will store the AR(1) structure without sigma2

mean = 5

dist = Poisson

V = get_AR_cov(n, ρ, σ2, V)

# true Gamma
Γ = σ2 * V

5×5 Array{Float64,2}:
 0.2     0.02    0.002  0.0002  2.0e-5
 0.02    0.2     0.02   0.002   0.0002
 0.002   0.02    0.2    0.02    0.002
 0.0002  0.002   0.02   0.2     0.02
 2.0e-5  0.0002  0.002  0.02    0.2

In [2]:
vecd = [dist(mean) for i in 1:n]
nonmixed_multivariate_dist = NonMixedMultivariateDistribution(vecd, Γ)

Y_Nsample = simulate_nobs_independent_vectors(nonmixed_multivariate_dist, N)

1000-element Array{Array{Float64,1},1}:
 [3.0, 8.0, 7.0, 4.0, 2.0]
 [2.0, 6.0, 6.0, 6.0, 3.0]
 [3.0, 5.0, 5.0, 10.0, 3.0]
 [5.0, 6.0, 3.0, 5.0, 2.0]
 [10.0, 10.0, 1.0, 2.0, 2.0]
 [5.0, 5.0, 4.0, 5.0, 4.0]
 [1.0, 4.0, 6.0, 5.0, 5.0]
 [7.0, 8.0, 6.0, 4.0, 9.0]
 [5.0, 3.0, 4.0, 4.0, 4.0]
 [5.0, 5.0, 6.0, 4.0, 7.0]
 [3.0, 7.0, 6.0, 3.0, 6.0]
 [6.0, 3.0, 3.0, 8.0, 5.0]
 [3.0, 4.0, 7.0, 3.0, 4.0]
 ⋮
 [1.0, 6.0, 1.0, 0.0, 5.0]
 [7.0, 7.0, 7.0, 4.0, 8.0]
 [7.0, 1.0, 4.0, 8.0, 7.0]
 [2.0, 6.0, 7.0, 1.0, 7.0]
 [3.0, 6.0, 6.0, 2.0, 6.0]
 [4.0, 6.0, 15.0, 3.0, 8.0]
 [4.0, 9.0, 5.0, 1.0, 4.0]
 [6.0, 5.0, 2.0, 4.0, 3.0]
 [7.0, 7.0, 10.0, 3.0, 5.0]
 [2.0, 5.0, 2.0, 7.0, 1.0]
 [4.0, 2.0, 5.0, 4.0, 12.0]
 [3.0, 3.0, 5.0, 9.0, 4.0]

In [3]:
Random.seed!(1234)

d = Poisson()
link = LogLink()
D = typeof(d)
Link = typeof(link)
T = Float64
gcs = Vector{GLMCopulaARObs{T, D, Link}}(undef, N)

for i in 1:N
    y = Float64.(Y_Nsample[i])
    X = ones(n, 1)
    gcs[i] = GLMCopulaARObs(y, X, d, link)
end

gcm = GLMCopulaARModel(gcs);

In [4]:
initialize_model!(gcm)
@show gcm.β
@show exp.(gcm.β);

initializing β using Newton's Algorithm under Independence Assumption
1 0.0 -11363.432404320156 3999
2 -11363.432404320156 -11363.432404320156 3999
gcm.β = [1.6278670479024675]
exp.(gcm.β) = [5.093]


In [5]:
gc = gcm.data[1]
β  = gcm.β

n_i  = length(gc.y)

5

In [6]:
update_res!(gc, β)
standardize_res!(gc)

In [7]:
@test gc.η == gc.X*β                         # systematic linear component
@test gc.μ == exp.(gc.η)                     # mu = ginverse of XB = mean component for GLM
@test gc.varμ == exp.(gc.η)                  # variance of the GLM response as a function of mean mu
@test gc.res ≈ (gc.y - gc.μ)./sqrt.(gc.varμ) # standardized residual for GLM outcome

[32m[1mTest Passed[22m[39m

# loglikelihood 

## Loglikelihood for observation i = 1, j in [1, n_1]
$$\mathcal{L}(\mathbf{\beta})_1 =  - \ln \Big[1\! +\! \frac{1}{2}tr(\mathbf{\Gamma_{1}})\Big] +
\ln \Big\{1\!+\!\frac{1}{2}\mathbf{r_1}(\mathbf{\beta})^t \mathbf{\Gamma_1} \mathbf{r_1}(\mathbf{\beta})\Big\} +  \sum_{j=1}^{n_1}y_{1j}log(\mu_{1j}(\mathbf{\beta})) - \mu_{1j}(\mathbf{\beta})$$

In [8]:
Γ_est = σ2 * V
trace_gamma = tr(Γ_est)

term1 = -log(1 + 0.5 * trace_gamma)
@show term1;

term1 = -0.4054651081081644


In [9]:
term1 = -log(1 + 0.5 * n_i * σ2)

-0.4054651081081644

In [10]:
term2 = log(1 + σ2 * 0.5 * transpose(gc.res) * V * gc.res)

0.4267297849640152

In [11]:
function poisson_density(y, μ)
    logl = 0.0
    for j in 1:length(y)
        logl += y[j] * log(μ[j]) - μ[j] - log(factorial(y[j]))
    end
    logl
end

term3 = poisson_density(gc.y, gc.μ)

-11.18891559428739

In [12]:
logl_gc1 = term1 + term2 + term3

-11.16765091743154

In [13]:
@test loglikelihood!(gc, β, ρ, σ2, true, true) ≈ -11.16765091743154
@test gc.∇β ≈ [-1.6506528461688563] # i get this value hard coded below
@test gc.Hβ ≈ [-21.589097278447113] # i get this value hard coded below

[32m[1mTest Passed[22m[39m

# A Closer Look at the Gradient for observation i=1

$$\begin{eqnarray*}
\nabla_\beta &=& \sum_{i=1}^n \sum_j \nabla \ln f_{ij}(y_{ij} \mid \mathbf{\beta}) + \sum_{i=1}^n
\frac{\nabla \mathbf{r_i(\mathbf{\beta})}\mathbf{\Gamma_i}\mathbf{r_i(\mathbf{\beta})}}{1+\frac{1}{2}\mathbf{r_i}(\mathbf{\beta})^t \mathbf{\Gamma_i} \mathbf{r_i(\mathbf{\beta})}}
\end{eqnarray*}
$$

The gradient is made of two terms. The first is from the GLM component loglikelihood that corresponds to the Logistic Regression density. The second part is specific to our copula model. We start with Term 1 for observation 1:

$$\begin{eqnarray*}
    \text{Term 1} &=& \sum_{j=1}^{n_1} \frac{(y_{1j}-\mu_{1j}) \mu_{1j}'(\eta_{1j})}{\sigma_{1j}^2} \mathbf{x}_{1j}
\end{eqnarray*}
$$

We will check if the field $\mu_{1j}'$ or `mueta` from the GLM.jl package matches our theoretical value

In [14]:
function poisson_gradient(y, X, dμ, σ2, μ)
    grad = zeros(size(X, 2))
    for j in 1:length(y)
        grad += (y[j] - μ[j]) * dμ[j]/σ2[j] * X[j, :]
    end
    grad
end

# check if glm gradient is right
term1_gradient = poisson_gradient(gc.y, gc.X, gc.dμ, gc.varμ, gc.μ)

1-element Array{Float64,1}:
 -1.4649999999999999

In [15]:
term1_grad_fctn = GLMCopula.glm_gradient(gc, β, 1.0)
@test term1_gradient == term1_grad_fctn

[32m[1mTest Passed[22m[39m

In [16]:
update_res!(gc, β)
standardize_res!(gc)
std_res_differential!(gc)


$$\begin{eqnarray*}
\text{Term 2} &=& \sum_{i=1}^n
\frac{\nabla \mathbf{r_i(\mathbf{\beta})}\mathbf{\Gamma_i}\mathbf{r_i(\mathbf{\beta})}}{1+\frac{1}{2}\mathbf{r_i}(\mathbf{\beta})^t \mathbf{\Gamma_i} \mathbf{r_i(\mathbf{\beta})}}
\end{eqnarray*}
$$

In [17]:
grad_t2_numerator = transpose(gc.∇resβ) * Γ_est * gc.res       # new term ∇resβ^t * Γ * res
@show grad_t2_numerator

quadratic_form = transpose(gc.res) * Γ_est * gc.res
@show quadratic_form 

grad_t2_denominator = inv(1 + 0.5 * quadratic_form)
@show grad_t2_denominator

gradient_term2 = grad_t2_numerator * grad_t2_denominator

grad_t2_numerator = [-0.2844644518790497]
quadratic_form = 1.0644771437580993
grad_t2_denominator = 0.6526398815124835


1-element Array{Float64,1}:
 -0.18565284616885655

In [18]:
gradient_hard_code = term1_gradient + gradient_term2

1-element Array{Float64,1}:
 -1.6506528461688563

## Hessian

$$\text{Term 1 }= - \sum_{j=1}^{n_1} \frac{[\mu_{1j}'(\eta_{1j})]^2}{\sigma_{1j}^2} \mathbf{x}_{1j} \mathbf{x}_{1j}^T = - \mathbf{X_1}^T \mathbf{W_{21}} \mathbf{X_1}. $$

In [19]:
hess_term1 = -transpose(gc.X) * Diagonal(gc.w2) * gc.X

1×1 Array{Float64,2}:
 -25.465

## Term 2: Copula Model Specific

The hessian of our model specific component is the partial of this second term in the gradient:


$$\begin{eqnarray*}
\nabla_\beta \text{Term 2} &=& \sum_{i=1}^n
\frac{\nabla \mathbf{r_i(\mathbf{\beta})}\mathbf{\Gamma_i}\mathbf{r_i(\mathbf{\beta})}}{1+\frac{1}{2}\mathbf{r_i}(\mathbf{\beta})^t \mathbf{\Gamma_i} \mathbf{r_i(\mathbf{\beta})}}
\end{eqnarray*}
$$

$$ H_\beta \text{Term 2} = - \frac{[\nabla \mathbf{r_1}(\mathbf{\beta})^t\mathbf{\Gamma_1} \mathbf{r_1}(\mathbf{\beta})]
[\nabla \mathbf{r_1}(\mathbf{\beta})^t\mathbf{\Gamma_1} \mathbf{r_1}(\mathbf{\beta})]^t}
{\Big[1+\frac{1}{2}\mathbf{r_1}(\mathbf{\beta})^t \mathbf{\Gamma_1} \mathbf{r_1}(\mathbf{\beta})\Big]^2}$$

We notice the quantity we need to first form this matrix: 

\begin{eqnarray*}
&  & \nabla \mathbf{r_1}(\mathbf{\beta})^t\mathbf{\Gamma_1} \mathbf{r_1}(\mathbf{\beta}) =
\sum_{k=1}^m \mathbf\Sigma[k] * \nabla \mathbf{r_1}(\mathbf{\beta})^t * \mathbf{V_1}[k] * \mathbf{r_1}(\mathbf{\beta})
\end{eqnarray*}

In [20]:
update_res!(gc, β)
standardize_res!(gc)

In [21]:
hess_term2 = -inv(1 + 0.5 * quadratic_form)^2 * (transpose(gc.∇resβ) * Γ_est * gc.res) * transpose(transpose(gc.∇resβ) * Γ_est * gc.res)

1×1 Array{Float64,2}:
 -0.03446697929059711

We will add the additional term in the approximate hessian that is not the second derivative. 

\begin{eqnarray*}
&=& - \sum_{i=1}^n \mathbf{X_i}^T \mathbf{W_{2i}} \mathbf{X_i} -\sum_{i=1}^n\frac{[\nabla \mathbf{r_i}(\mathbf{\beta})^t\mathbf{\Gamma_i} \mathbf{r_i}(\mathbf{\beta})]
[\nabla \mathbf{r_i}(\mathbf{\beta})^t\mathbf{\Gamma_i} \mathbf{r_i}(\mathbf{\beta})]^t}
{\Big[1+\frac{1}{2}\mathbf{r_i}(\mathbf{\beta})^t \mathbf{\Gamma_i} \mathbf{r_i}(\mathbf{\beta})\Big]^2} +  \sum_{i=1}^n\frac{\nabla \mathbf{r_i}(\mathbf{\beta})^t\mathbf{\Gamma_i} \nabla \mathbf{r_i}(\mathbf{\beta})}{1+\frac{1}{2}\mathbf{r_i}(\mathbf{\beta})^t \mathbf{\Gamma_i} \mathbf{r_i}(\mathbf{\beta})}
\end{eqnarray*}


In [22]:
## additional hessian term

hess_term3 = inv(1 + 0.5 * quadratic_form) * transpose(gc.∇resβ) * Γ_est * gc.∇resβ

1×1 Array{Float64,2}:
 3.910369700843483

In [23]:
hessian_gc1 = hess_term1 + hess_term2 + hess_term3

1×1 Array{Float64,2}:
 -21.589097278447113

## TO DO: check hessian term2 

1. loglikelihood gradient and hessian is ok 
2. add gradient and hessian wrt to AR(1)


## scratch work below is for first observation gradient hessian wrt rho and sigma2 forAR1

In [24]:
fill!(gcm.ρ, 1.0)
fill!(gcm.σ2, 1.0)

1-element Array{Float64,1}:
 1.0

In [25]:
gc = gcm.data[1]
gc2 = deepcopy(gc)
gc3 = deepcopy(gc)
gc4 = deepcopy(gc);

In [26]:
n = 5
ρ = 0.1
σ2 = 0.2
V = zeros(n, n)
@benchmark GLMCopula.get_AR_cov($n, $ρ, $σ2, $V)

BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     40.348 ns (0.00% GC)
  median time:      41.330 ns (0.00% GC)
  mean time:        43.900 ns (0.00% GC)
  maximum time:     173.709 ns (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     987

In [27]:
## gradient wrt rho 
n = length(gc.y)
storage_n = zeros(n)

Vi = get_AR_cov(n, ρ, σ2, V)

5×5 Array{Float64,2}:
 1.0     0.1    0.01  0.001  0.0001
 0.1     1.0    0.1   0.01   0.001
 0.01    0.1    1.0   0.1    0.01
 0.001   0.01   0.1   1.0    0.1
 0.0001  0.001  0.01  0.1    1.0

In [28]:
function get_∇ARV(n, ρ, σ2, ∇ARV)
    @inbounds for i in 1:n
        ∇ARV[i, i] = 0.0
        @inbounds for j in i+1:n
            ∇ARV[i, j] = (j-i)* ρ^(j-i-1)
            ∇ARV[j, i] = ∇ARV[i, j]
        end
    end
    ∇ARV
end

∇ARV = zeros(n, n)

∇ARVi = get_∇ARV(n, ρ, σ2, ∇ARV)

5×5 Array{Float64,2}:
 0.0    1.0   0.2  0.03  0.004
 1.0    0.0   1.0  0.2   0.03
 0.2    1.0   0.0  1.0   0.2
 0.03   0.2   1.0  0.0   1.0
 0.004  0.03  0.2  1.0   0.0

In [29]:
update_res!(gc, gcm.β)

standardize_res!(gc)

## gradient wrt to rho

In [30]:
grad_rho_gc1 = inv(1 + 0.5 * σ2 * transpose(gc.res) * Vi * gc.res) * 0.5 * σ2 * transpose(gc.res) * ∇ARVi * gc.res

-0.05210434528647787

In [31]:
function get_∇2ARV(n, ρ, σ2, ∇ARV)
    @inbounds for i in 1:n
        ∇ARV[i, i] = 0.0
        @inbounds for j in i+1:n
            ∇ARV[i, j] = (j-i)*(j-i-1)* ρ^(j-i-2)
            ∇ARV[j, i] = ∇ARV[i, j]
        end
    end
    ∇ARV
end

∇2ARV = zeros(n, n)

∇2ARVi = get_∇2ARV(n, ρ, σ2, ∇2ARV)

5×5 Array{Float64,2}:
 0.0   0.0  2.0  0.6  0.12
 0.0   0.0  0.0  2.0  0.6
 2.0   0.0  0.0  0.0  2.0
 0.6   2.0  0.0  0.0  0.0
 0.12  0.6  2.0  0.0  0.0

##  hessian wrt to rho

In [32]:
hess_rho_gc1_t1 = inv(1 + 0.5 * σ2 * transpose(gc.res) * Vi * gc.res) * 0.5 * σ2 * transpose(gc.res) * ∇2ARVi * gc.res

-0.7529639280856892

In [33]:
hess_rho_gc1_t2 = -inv(1 + 0.5 * σ2 * transpose(gc.res) * Vi * gc.res)^2 * 0.5 * σ2 * transpose(gc.res) * ∇ARVi * gc.res

0.03400537373405244

In [34]:
hessian_rho_gc1 = hess_rho_gc1_t1 + hess_rho_gc1_t2

-0.7189585543516368

## gradient wrt to sigma2

In [35]:
grad_sigma2_gc1_t1 = -0.5 * n * inv(1 + 0.5 * n * σ2) 

-1.6666666666666665

In [36]:
grad_sigma2_gc1_t2 = inv(1 + 0.5 * σ2 * transpose(gc.res) * Vi * gc.res) * 0.5 * transpose(gc.res) * Vi * gc.res

1.736800592437582

In [37]:
grad_sigma2_gc1 = grad_sigma2_gc1_t1 + grad_sigma2_gc1_t2

0.07013392577091548

## hessian wrt to sigma2

In [38]:
hess_sigma2_gc1_t1 = 0.25 * n^2 * inv(1 + 0.5 * n * σ2)^2

2.7777777777777777

In [39]:
hess_sigma2_gc1_t2 = -inv(1 + 0.5 * σ2 * transpose(gc.res) * Vi * gc.res)^2 * (0.5 * transpose(gc.res) * Vi * gc.res)^2

-3.0164762978915354

In [40]:
hess_sigma2_gc1 = hess_sigma2_gc1_t1 + hess_sigma2_gc1_t2

-0.2386985201137577