In [1]:
import numpy as np
import pandas as pd
import numpy.random as rgt
from conquer import conquer
from scipy.stats import norm, t
import matplotlib.pyplot as plt
import time

### Estimation and inference in a heterogeneous model

Let $z=(z_1, \ldots, z_p)^T \sim N(0, \Sigma)$ with $\Sigma = (0.5^{|j-k|})_{1\leq j, k \leq p}$. Generate independent data vectors $\{(y_i , x_i) \}_{i=1}^n$ from the model 
$$
    y_i = \beta_0 + \langle x_i, \beta \rangle +  x_{i1}  \varepsilon_i \quad {\rm with } \quad  x_i = (2\Phi(z_1), z_2, \ldots, z_p)^T, 
$$
where $\Phi$ is the standard normal distribution function, $\beta_0=4$, $\beta= (0,1,\ldots, 1)^T \in \mathbb R^p$, and the $\varepsilon_i$'s are iid $N(0,1)$ variables that are independent of the $x_i$'s.

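Consider two quantile levels: $\tau=0.5$ and $\tau=0.8$. Since $x_{i1} > 0$, the conditional $\tau$-quantile of $y_i$ given $x_i$ is $\beta_0 + \langle x_i, \beta \rangle + \Phi^{-1}(\tau)\, x_{i1}$. Because $\Phi^{-1}(0.5)=0$, of these two levels the effect of $x_{i1}$ is present only at $\tau=0.8$.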

In [2]:
def cov_generate(std, corr=0.5):
    """Build a covariance matrix with geometrically decaying correlation.

    Entry ``(j, k)`` of the result is ``std[j] * std[k] * corr**|j - k|``.

    Parameters
    ----------
    std : array-like of shape (p,)
        Per-coordinate standard deviations.
    corr : float, default 0.5
        Base of the geometric decay; the correlation between coordinates
        ``j`` and ``k`` is ``corr**|j - k|``.

    Returns
    -------
    numpy.ndarray of shape (p, p)
        The covariance matrix.
    """
    std = np.asarray(std, dtype=float)
    idx = np.arange(len(std))
    # |j - k| for every pair (j, k), obtained by broadcasting instead of a
    # Python-level double loop.
    lag = np.abs(idx[:, None] - idx[None, :])
    return np.outer(std, std) * np.power(float(corr), lag)
        
# Seed NumPy's legacy global RNG (the one behind the `rgt.*` draws in this
# notebook) so the Monte Carlo results are reproducible under
# "Restart Kernel -> Run All".
# NOTE(review): presumably conquer's bootstrap draws from the same global RNG;
# confirm against the conquer source.
SEED = 2023
rgt.seed(SEED)

n = 2000  # sample size of each simulated data set
p = 10    # number of covariates
# Mean vector and covariance (unit variances, correlation 0.5**|j-k|) of z.
mu, Sig = np.zeros(p), cov_generate(np.ones(p))
# Slope vector beta = (0, 1, ..., 1)^T: the first covariate has no mean effect.
beta = np.ones(p)
beta[0] = 0

### Case 1: $\tau=0.5$.
The conditional median of $y_i$ given $x_i$ is $Q_{0.5}(y_i | x_i) = 4 + x_{i2} + \cdots + x_{ip}$.

In [3]:
# Monte Carlo study at the median: coverage and width of four confidence
# intervals (rows: Normal, MB-Percentile, MB-Pivotal, MB-Normal) for each slope.
tau = 0.5
B = 200                          # number of Monte Carlo replications
ci_cover = np.zeros([4, p])      # running count of intervals containing beta
ci_width = np.empty([B, 4, p])   # interval widths for every replication
for b in range(B):
    # Draw z ~ N(mu, Sig), then replace the first column by 2*Phi(z_1).
    X = rgt.multivariate_normal(mean=mu, cov=Sig, size=n)
    X[:,0] = 2*norm.cdf(X[:,0])
    # Intercept 4 plus linear signal plus noise scaled by the first covariate.
    Y = 4 + X.dot(beta) +  X[:,0]*rgt.normal(0,1,size=n)

    sqr = conquer(X, Y)
    # Only the intervals are needed; the point estimates (element 0) are unused.
    # The bootstrap call stays first so the sequence of random draws is the
    # same as before.
    boot_ci = sqr.mb_ci(tau)[1]
    norm_ci = sqr.norm_ci(tau)[1]

    # Stack the normal-based interval on top of the three bootstrap intervals.
    ci = np.concatenate([norm_ci[None,:,:], boot_ci], axis=0)

    # Index 0 along the coefficient axis is skipped so the remaining p entries
    # line up with `beta` (presumably index 0 is the intercept -- confirm).
    lower, upper = ci[:, 1:, 0], ci[:, 1:, 1]
    # `beta` broadcasts across the four methods, so no per-method loop is needed.
    ci_cover += (beta >= lower) & (beta <= upper)
    # Computed once per replication rather than once per method.
    ci_width[b] = upper - lower

In [4]:
# Empirical coverage: share of the B replications in which each interval
# contained the target coefficient (rows = CI method, columns = coefficient 1..p).
ci_methods = ["Normal", "MB-Percentile", "MB-Pivotal", "MB-Normal"]
coef_labels = pd.Index(np.arange(1, p + 1), dtype=int)
cover = pd.DataFrame(ci_cover / B, index=ci_methods, columns=coef_labels)
cover

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
Normal,0.935,0.955,0.95,0.955,0.95,0.94,0.95,0.94,0.935,0.955
MB-Percentile,0.93,0.955,0.945,0.95,0.96,0.935,0.955,0.92,0.94,0.95
MB-Pivotal,0.945,0.975,0.955,0.97,0.965,0.955,0.955,0.96,0.955,0.96
MB-Normal,0.945,0.97,0.955,0.965,0.96,0.955,0.955,0.96,0.955,0.955


In [5]:
# Interval width averaged over the replications (axis 0 of `ci_width`),
# labelled with the same columns as the coverage table.
mean_width = ci_width.mean(axis=0)
width = pd.DataFrame(
    mean_width,
    index=["Normal", "MB-Percentile", "MB-Pivotal", "MB-Normal"],
    columns=cover.columns,
)
width

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
Normal,0.172011,0.069555,0.070326,0.070781,0.070488,0.070313,0.070754,0.07035,0.070395,0.063069
MB-Percentile,0.173128,0.072658,0.073631,0.07392,0.073767,0.07386,0.074214,0.073823,0.073949,0.066115
MB-Pivotal,0.173128,0.072658,0.073631,0.07392,0.073767,0.07386,0.074214,0.073823,0.073949,0.066115
MB-Normal,0.174207,0.073126,0.073835,0.074093,0.074046,0.074058,0.074604,0.073997,0.074258,0.066494


### Case 2: $\tau=0.8$. 
In this case, the conditional $0.8$-quantile of $y_i$ given $x_i$ is $Q_{0.8}(y_i | x_i) = 4 + \Phi^{-1}(0.8) x_{i1} + x_{i2} + \cdots + x_{ip}$.

In [6]:
# Monte Carlo study at tau = 0.8: coverage and width of four confidence
# intervals (rows: Normal, MB-Percentile, MB-Pivotal, MB-Normal) for each slope.
tau = 0.8
# Target of inference at this level: same as `beta`, except that the first
# coefficient is the standard normal tau-quantile.
true_beta = np.copy(beta)
true_beta[0] = norm.ppf(tau)

B = 200                          # number of Monte Carlo replications
ci_cover = np.zeros([4, p])      # running count of intervals containing true_beta
ci_width = np.empty([B, 4, p])   # interval widths for every replication
for b in range(B):
    # Draw z ~ N(mu, Sig), then replace the first column by 2*Phi(z_1).
    X = rgt.multivariate_normal(mean=mu, cov=Sig, size=n)
    X[:,0] = 2*norm.cdf(X[:,0])
    # The data are generated with `beta` (first entry 0), not `true_beta`:
    # the first covariate enters only through the noise scale.
    Y = 4 + X.dot(beta) + X[:,0]*rgt.normal(0,1,size=n)

    sqr = conquer(X, Y)
    # Only the intervals are needed; the point estimates (element 0) are unused.
    # The bootstrap call stays first so the sequence of random draws is the
    # same as before.
    boot_ci = sqr.mb_ci(tau)[1]
    norm_ci = sqr.norm_ci(tau)[1]

    # Stack the normal-based interval on top of the three bootstrap intervals.
    ci = np.concatenate([norm_ci[None,:,:], boot_ci], axis=0)

    # Index 0 along the coefficient axis is skipped so the remaining p entries
    # line up with `true_beta` (presumably index 0 is the intercept -- confirm).
    lower, upper = ci[:, 1:, 0], ci[:, 1:, 1]
    # `true_beta` broadcasts across the four methods, so no per-method loop is needed.
    ci_cover += (true_beta >= lower) & (true_beta <= upper)
    # Computed once per replication rather than once per method.
    ci_width[b] = upper - lower

In [7]:
# Empirical coverage: share of the B replications in which each interval
# contained the target coefficient (rows = CI method, columns = coefficient 1..p).
ci_methods = ["Normal", "MB-Percentile", "MB-Pivotal", "MB-Normal"]
coef_labels = pd.Index(np.arange(1, p + 1), dtype=int)
cover = pd.DataFrame(ci_cover / B, index=ci_methods, columns=coef_labels)
cover

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
Normal,0.885,0.965,0.975,0.965,0.975,0.99,0.94,0.955,0.965,0.945
MB-Percentile,0.91,0.955,0.98,0.96,0.965,0.97,0.93,0.95,0.965,0.93
MB-Pivotal,0.92,0.98,0.97,0.97,0.975,0.995,0.965,0.97,0.975,0.945
MB-Normal,0.91,0.975,0.98,0.97,0.98,0.99,0.95,0.955,0.97,0.95


In [8]:
# Interval width averaged over the replications (axis 0 of `ci_width`),
# labelled with the same columns as the coverage table.
mean_width = ci_width.mean(axis=0)
width = pd.DataFrame(
    mean_width,
    index=["Normal", "MB-Percentile", "MB-Pivotal", "MB-Normal"],
    columns=cover.columns,
)
width

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
Normal,0.195253,0.078257,0.079341,0.079067,0.079218,0.079149,0.079295,0.079556,0.07905,0.070479
MB-Percentile,0.197057,0.081827,0.083692,0.08278,0.083089,0.082717,0.083329,0.083807,0.083222,0.074082
MB-Pivotal,0.197057,0.081827,0.083692,0.08278,0.083089,0.082717,0.083329,0.083807,0.083222,0.074082
MB-Normal,0.198263,0.082147,0.083818,0.083286,0.083352,0.083001,0.083748,0.084028,0.083525,0.074458
