In [1]:
import os
import sys

# Directory containing the local `conquer` module. Set the CONQUER_PATH
# environment variable to point at your own checkout; the original hard-coded
# location is kept as the fallback so existing setups keep working.
CONQUER_PATH = os.environ.get('CONQUER_PATH', '/Users/wez243/Dropbox/git/Conquer/code')
# Guarded append: re-running this cell must not stack duplicate sys.path entries.
if CONQUER_PATH not in sys.path:
    sys.path.append(CONQUER_PATH)
from conquer import conquer

In [2]:
import numpy as np
import pandas as pd
import numpy.random as rgt
from scipy.stats import norm, t
import matplotlib.pyplot as plt
import time

### Estimation and inference in a heterogeneous model

Let $z=(z_1, \ldots, z_p)^T \sim N(0, \Sigma)$ with $\Sigma = (0.5^{|j-k|})_{1\leq j, k \leq p}$. Generate independent data vectors $\{(y_i , x_i) \}_{i=1}^n$ from the model 
$$
    y_i = \beta_0 + \langle x_i, \beta \rangle +  x_{i1}  \varepsilon_i \quad {\rm with } \quad  x_i \sim (2\Phi(z_1), z_2, \ldots, z_p)^T, 
$$
where $\beta_0=4$, $\beta= (0,1,\ldots, 1)^T \in \mathbb R^p$, and $\varepsilon_i$'s are iid $N(0,1)$ variables that are independent of $x_i$'s.

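Consider two quantile levels: $\tau=0.5$ and $\tau=0.8$. Since $x_{i1} > 0$, the conditional $\tau$-quantile of $y_i$ given $x_i$ is $\beta_0 + \Phi^{-1}(\tau)\, x_{i1} + \langle x_i, \beta \rangle$. Because $\Phi^{-1}(0.5)=0$, the effect of $x_{i1}$ is only present for $\tau=0.8$.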

In [3]:
def cov_generate(std, corr=0.5):
    """Build a covariance matrix with geometrically decaying correlation.

    Entry ``(j, k)`` of the result is ``std[j] * std[k] * corr ** |j - k|``.

    Parameters
    ----------
    std : sequence of float, length p
        Per-coordinate scale factors (standard deviations).
    corr : float, default 0.5
        Base of the geometric decay; coordinates ``j`` and ``k`` get
        correlation ``corr ** |j - k|``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(p, p)``.
    """
    p = len(std)
    scale = np.asarray(std, dtype=float)
    idx = np.arange(p)
    # |j - k| for every pair (j, k), obtained by broadcasting a column against a row.
    lag = np.abs(idx[:, None] - idx[None, :])
    return np.outer(scale, scale) * np.power(corr, lag)
        
# Simulation settings shared by the cells below.
n = 2000  # number of observations drawn per replication
p = 10    # dimension of the covariate vector

# Seed NumPy's legacy global generator so that the draws made through `rgt`
# in the later cells are reproducible under Restart & Run All.
# NOTE(review): whether conquer's bootstrap also draws from this global
# generator is not visible here -- confirm before relying on exact replication.
rgt.seed(2021)

# Mean vector and covariance matrix of the latent Gaussian vector (unit scales).
mu, Sig = np.zeros(p), cov_generate(np.ones(p))

# Slope vector used to generate Y: zero on the first covariate, one elsewhere.
beta = np.ones(p)
beta[0] = 0

### Case 1: $\tau=0.5$.
The conditional median of $y_i$ given $x_i$ is $Q_{0.5}(y_i | x_i) = 4 + x_{i2} + \cdots + x_{ip}$.

In [4]:
tau = 0.5
B = 200  # number of Monte Carlo replications

# Row order of both accumulators follows `ci` below: the interval from
# norm_ci first, then the three intervals returned by mb_ci.
ci_cover = np.zeros([4, p])     # running count of replications whose CI contains beta
ci_width = np.empty([B, 4, p])  # CI width per replication, method and coefficient
for b in range(B):
    # Draw the design; the first covariate is mapped through 2*Phi(.).
    X = rgt.multivariate_normal(mean=mu, cov=Sig, size=n)
    X[:,0] = 2*norm.cdf(X[:,0])
    # Noise is scaled by the first covariate.
    Y = 4 + X.dot(beta) + X[:,0]*rgt.normal(0,1,size=n)

    sqr = conquer(X, Y)
    mb_beta, boot_ci = sqr.mb_ci(tau)
    sqr_beta, norm_ci = sqr.norm_ci(tau)

    # Stack all intervals along a leading "method" axis.
    ci = np.concatenate([norm_ci[None,:,:], boot_ci], axis=0)

    # Index 0 on the coefficient axis is skipped -- presumably the intercept;
    # TODO confirm against conquer's output layout.
    lower, upper = ci[:,1:,0], ci[:,1:,1]
    # One vectorized update for all methods; the width is stored once per
    # replication instead of being rewritten inside a per-method loop.
    ci_cover += (beta >= lower) & (beta <= upper)
    ci_width[b] = upper - lower

In [5]:
# Empirical coverage: coverage counts divided by the number of replications B.
# Rows are the CI methods, columns are the coefficient positions 1..p.
cover = pd.DataFrame(
    ci_cover / B,
    index=["Normal", "MB-Percentile", "MB-Pivotal", "MB-Normal"],
    columns=pd.Index(np.arange(1, p + 1), dtype=int),
)
cover

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
Normal,0.95,0.955,0.94,0.905,0.945,0.97,0.935,0.945,0.965,0.94
MB-Percentile,0.955,0.96,0.935,0.915,0.935,0.955,0.94,0.935,0.965,0.94
MB-Pivotal,0.94,0.975,0.97,0.93,0.965,0.98,0.955,0.955,0.965,0.975
MB-Normal,0.95,0.96,0.955,0.92,0.955,0.985,0.945,0.95,0.965,0.965


In [6]:
# Interval width averaged over the replication axis (axis 0 of ci_width),
# labelled with the same columns as the coverage table.
width = pd.DataFrame(
    ci_width.mean(axis=0),
    index=["Normal", "MB-Percentile", "MB-Pivotal", "MB-Normal"],
    columns=cover.columns,
)
width

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
Normal,0.172411,0.069897,0.071146,0.070931,0.07085,0.070569,0.070566,0.070667,0.071004,0.063587
MB-Percentile,0.17352,0.073289,0.074489,0.074076,0.074103,0.073841,0.073596,0.07364,0.074461,0.066143
MB-Pivotal,0.17352,0.073289,0.074489,0.074076,0.074103,0.073841,0.073596,0.07364,0.074461,0.066143
MB-Normal,0.174374,0.07365,0.07486,0.074636,0.074559,0.074306,0.074122,0.073983,0.074627,0.066618


### Case 2: $\tau=0.8$. 
In this case, the conditional $0.8$-quantile of $y_i$ given $x_i$ is $Q_{0.8}(y_i | x_i) = 4 + \Phi^{-1}(0.8) x_{i1} + x_{i2} + \cdots + x_{ip}$.

In [7]:
tau = 0.8
# Target coefficients at this quantile level: same as beta, except that the
# first covariate picks up the standard normal tau-quantile.
true_beta = np.copy(beta)
true_beta[0] = norm.ppf(tau)

B = 200  # number of Monte Carlo replications

# Row order of both accumulators follows `ci` below: the interval from
# norm_ci first, then the three intervals returned by mb_ci.
ci_cover = np.zeros([4, p])     # running count of replications whose CI contains true_beta
ci_width = np.empty([B, 4, p])  # CI width per replication, method and coefficient
for b in range(B):
    # Draw the design; the first covariate is mapped through 2*Phi(.).
    X = rgt.multivariate_normal(mean=mu, cov=Sig, size=n)
    X[:,0] = 2*norm.cdf(X[:,0])
    # Data are generated with beta (not true_beta); noise is scaled by the first covariate.
    Y = 4 + X.dot(beta) + X[:,0]*rgt.normal(0,1,size=n)

    sqr = conquer(X, Y)
    mb_beta, boot_ci = sqr.mb_ci(tau)
    sqr_beta, norm_ci = sqr.norm_ci(tau)

    # Stack all intervals along a leading "method" axis.
    ci = np.concatenate([norm_ci[None,:,:], boot_ci], axis=0)

    # Index 0 on the coefficient axis is skipped -- presumably the intercept;
    # TODO confirm against conquer's output layout.
    lower, upper = ci[:,1:,0], ci[:,1:,1]
    # One vectorized update for all methods; the width is stored once per
    # replication instead of being rewritten inside a per-method loop.
    ci_cover += (true_beta >= lower) & (true_beta <= upper)
    ci_width[b] = upper - lower

In [8]:
# Empirical coverage: coverage counts divided by the number of replications B.
# Rows are the CI methods, columns are the coefficient positions 1..p.
cover = pd.DataFrame(
    ci_cover / B,
    index=["Normal", "MB-Percentile", "MB-Pivotal", "MB-Normal"],
    columns=pd.Index(np.arange(1, p + 1), dtype=int),
)
cover

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
Normal,0.91,0.965,0.975,0.945,0.94,0.94,0.925,0.94,0.95,0.96
MB-Percentile,0.895,0.945,0.955,0.97,0.945,0.95,0.91,0.94,0.945,0.97
MB-Pivotal,0.925,0.975,0.97,0.97,0.96,0.97,0.945,0.945,0.975,0.975
MB-Normal,0.92,0.96,0.965,0.965,0.945,0.955,0.945,0.95,0.96,0.975


In [9]:
# Interval width averaged over the replication axis (axis 0 of ci_width),
# labelled with the same columns as the coverage table.
width = pd.DataFrame(
    ci_width.mean(axis=0),
    index=["Normal", "MB-Percentile", "MB-Pivotal", "MB-Normal"],
    columns=cover.columns,
)
width

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
Normal,0.198481,0.079711,0.080168,0.080042,0.080732,0.079573,0.079769,0.079021,0.079924,0.070433
MB-Percentile,0.197687,0.083145,0.083804,0.08386,0.084067,0.082929,0.083597,0.083203,0.082771,0.07379
MB-Pivotal,0.197687,0.083145,0.083804,0.08386,0.084067,0.082929,0.083597,0.083203,0.082771,0.07379
MB-Normal,0.199265,0.083623,0.08421,0.084242,0.08443,0.083126,0.083872,0.083362,0.083145,0.074102
