In [1]:
import time
import pandas as pd
import numpy as np
import numpy.random as rgt
from scipy.stats import norm, t

from conquer.linear import low_dim
# Seed NumPy's global random state so every simulated data set below is reproducible.
rgt.seed(42)

# Number of Monte Carlo replications run by each simulation loop in this notebook.
M = 500 

# Homoscedastic model

In [2]:
# Homoscedastic design: n observations, p covariates, coefficients are random signs (+1/-1).
n, p = 8000, 400
itcp, beta = 4, 1*np.ones(p)*(2*rgt.binomial(1, 1/2, p) - 1)
tau, t_df = 0.75, 2

# tau-quantile of the t noise; subtracting it centres the noise so that its
# tau-quantile is zero. It does not change across replications, so compute it once.
noise_quantile = t.ppf(tau, t_df)

runtime = []
itcp_se, coef_se = [], []
for m in range(M):
    X = rgt.normal(0, 1.5, size=(n,p))
    Y = itcp + X @ beta + rgt.standard_t(t_df, n) - noise_quantile

    # perf_counter() is a monotonic, high-resolution clock intended for measuring
    # elapsed time; time.time() can jump when the system clock is adjusted.
    tic = time.perf_counter()
    model = low_dim(X, Y).fit(tau=tau)
    runtime.append(time.perf_counter() - tic)

    # Squared estimation errors: first entry of model['beta'] is compared with the
    # intercept, the remaining entries with the slope coefficients.
    itcp_se.append((model['beta'][0] - itcp)**2)
    coef_se.append(np.sum((model['beta'][1:] - beta)**2))

# One-row summary: mean and standard deviation of the squared errors over the
# M replications, plus the average fitting time in seconds.
out = pd.DataFrame({'MSE (itcp)': np.mean(itcp_se),
                    'std (itcp)': np.std(itcp_se),
                    'MSE (coef)': np.mean(coef_se),
                    'std (coef)': np.std(coef_se),
                    'Runtime': np.mean(runtime)}, index=['conquer'])
out

Unnamed: 0,MSE (itcp),std (itcp),MSE (coef),std (coef),Runtime
conquer,0.001858,0.001746,0.076207,0.005759,0.199185


### Construction of confidence intervals

In [3]:
# Smaller design for the confidence-interval experiment; slopes are random signs.
n, p = 500, 20
mask = 2*rgt.binomial(1, 1/2, p) - 1
itcp, beta = 4, 1*np.ones(p)*mask
tau, t_df = 0.75, 2

# Row labels of the result tables, in the order the intervals are collected below.
ci_names = ["Normal", "MB-Percentile", "MB-Pivotal", "MB-Normal"]
ci_cover = np.zeros([4, p])
ci_width = np.empty([M, 4, p])
for m in range(M):
    X = rgt.normal(0, 1.5, size=(n,p))
    Y = itcp + X@beta + rgt.standard_t(t_df, n) - t.ppf(tau, t_df)

    model = low_dim(X, Y)
    sol1 = model.norm_ci(tau)
    sol2 = model.mb_ci(tau)

    # One (lower, upper) array per interval type, ordered like ci_names.
    bounds = (sol1['normal_ci'], sol2['percentile_ci'],
              sol2['pivotal_ci'], sol2['normal_ci'])
    for row, ci in enumerate(bounds):
        # Row 0 of each interval array is skipped so that only the p slope
        # coefficients are compared with beta.
        lo, hi = ci[1:,0], ci[1:,1]
        ci_cover[row,:] += (beta >= lo) & (beta <= hi)
        ci_width[m,row,:] = hi - lo

# Empirical coverage frequency per coefficient (columns labelled 1..p).
cover = pd.DataFrame(ci_cover/M, index=ci_names,
                     columns=pd.Index(np.arange(1, p + 1), dtype=int))
cover

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
Normal,0.956,0.972,0.982,0.97,0.958,0.968,0.962,0.964,0.964,0.974,0.97,0.98,0.97,0.96,0.974,0.954,0.976,0.936,0.956,0.96
MB-Percentile,0.964,0.984,0.964,0.95,0.968,0.964,0.954,0.96,0.962,0.968,0.964,0.978,0.97,0.95,0.966,0.958,0.968,0.936,0.94,0.964
MB-Pivotal,0.914,0.944,0.936,0.928,0.926,0.934,0.946,0.926,0.936,0.938,0.942,0.942,0.936,0.932,0.948,0.94,0.938,0.916,0.92,0.926
MB-Normal,0.956,0.974,0.962,0.962,0.956,0.96,0.96,0.964,0.952,0.968,0.962,0.982,0.966,0.95,0.974,0.952,0.974,0.934,0.942,0.954


In [4]:
# Interval widths averaged over the M replications, one row per interval type,
# using the same column labels as the coverage table.
width = pd.DataFrame(
    ci_width.mean(axis=0),
    index=["Normal", "MB-Percentile", "MB-Pivotal", "MB-Normal"],
    columns=cover.columns,
)
width

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
Normal,0.257232,0.256364,0.2601,0.25904,0.26281,0.262288,0.259765,0.261542,0.260884,0.259548,0.26067,0.260475,0.257223,0.259515,0.261979,0.257357,0.25841,0.256217,0.255783,0.255602
MB-Percentile,0.225682,0.224754,0.225801,0.225659,0.226853,0.226255,0.223725,0.226813,0.226576,0.226612,0.224329,0.225485,0.225495,0.225787,0.22756,0.224438,0.225008,0.223986,0.224594,0.225444
MB-Pivotal,0.225682,0.224754,0.225801,0.225659,0.226853,0.226255,0.223725,0.226813,0.226576,0.226612,0.224329,0.225485,0.225495,0.225787,0.22756,0.224438,0.225008,0.223986,0.224594,0.225444
MB-Normal,0.229029,0.227963,0.228481,0.229065,0.23122,0.230484,0.228093,0.230915,0.230312,0.229835,0.228292,0.229615,0.22906,0.229212,0.230733,0.228168,0.228701,0.227866,0.227855,0.229004


# Heteroscedastic model

Let $z=(z_1, \ldots, z_p)^T \sim N(0, \Sigma)$ with $\Sigma = (0.5^{|j-k|})_{1\leq j, k \leq p}$ and $z_0 \sim {\rm Unif}(0,2)$ be independent. Generate independent data vectors $\{(y_i , x_i) \}_{i=1}^n$ from the model 
$$
    y_i =  \varepsilon_i x_{i1}  +  x_{i2} + \cdots + x_{ip}   \quad {\rm with } \ \  x_i = (x_{i1}, \ldots, x_{ip})^T \sim (z_0, z_2, \ldots, z_p)^T,
$$
where $\varepsilon_i$'s are iid $N(0,1)$ variables that are independent of $x_i$'s.

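Consider two quantile levels: $\tau=0.5$ and $\tau=0.8$. Since $x_{i1} \geq 0$, the conditional $\tau$-quantile of $y_i$ given $x_i$ is $\Phi^{-1}(\tau) x_{i1} + x_{i2} + \cdots + x_{ip}$. Because $\Phi^{-1}(0.5) = 0$, the effect of $x_{i1}$ vanishes at $\tau=0.5$ and is only present for $\tau=0.8$.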

In [5]:
def cov_generate(std, corr=0.5):
    """Build a covariance matrix with entries ``std[j] * std[k] * corr**|j - k|``.

    Parameters
    ----------
    std : sequence of float
        Per-coordinate scale factors; its length determines the dimension p.
    corr : float, default 0.5
        Base that is raised to the power ``|j - k|`` for entry (j, k).

    Returns
    -------
    numpy.ndarray
        Array of shape (p, p).
    """
    p = len(std)
    idx = np.arange(p)
    # Matrix of absolute index differences |j - k|, built by broadcasting.
    lag = np.abs(idx[:, None] - idx[None, :]).astype(float)
    decay = np.power(corr * np.ones_like(lag), lag)
    return np.outer(std, std) * decay

# Heteroscedastic design: zero-mean Gaussian covariates with covariance
# 0.5**|j-k| and unit scales.
n, p = 2000, 10
mu = np.zeros(p)
Sig = cov_generate(np.ones(p), 0.5)
# Coefficient vector used to generate Y: zero for the first covariate, one elsewhere.
beta = np.concatenate(([0.0], np.ones(p - 1)))

### Case 1: $\tau=0.5$.
The conditional median of $y_i$ given $x_i$ is $Q_{0.5}(y_i | x_i) =  x_{i2} + \cdots + x_{ip}$.

In [6]:
tau = 0.5
# Row labels of the result tables, in the order the intervals are collected below.
ci_names = ["Normal", "MB-Percentile", "MB-Pivotal", "MB-Normal"]
ci_cover = np.zeros([4, p])
ci_width = np.empty([M, 4, p])
for m in range(M):
    X = rgt.multivariate_normal(mean=mu, cov=Sig, size=n)
    # The first covariate is replaced by Unif(0, 2) draws and also scales the noise.
    X[:,0] = rgt.uniform(0, 2, size=n)
    Y = X@beta + X[:,0]*rgt.normal(0,1,size=n)

    model = low_dim(X, Y, intercept=False)
    sol1 = model.norm_ci(tau)
    sol2 = model.mb_ci(tau)

    # One (lower, upper) array per interval type, ordered like ci_names.
    # The model is built with intercept=False, so every row is compared with beta.
    bounds = (sol1['normal_ci'], sol2['percentile_ci'],
              sol2['pivotal_ci'], sol2['normal_ci'])
    for row, ci in enumerate(bounds):
        lo, hi = ci[:,0], ci[:,1]
        ci_cover[row,:] += (beta >= lo) & (beta <= hi)
        ci_width[m,row,:] = hi - lo

# Empirical coverage frequency per coefficient (columns labelled 1..p).
cover = pd.DataFrame(ci_cover/M, index=ci_names,
                     columns=pd.Index(np.arange(1, p + 1), dtype=int))
cover

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
Normal,0.95,0.96,0.954,0.946,0.964,0.95,0.96,0.946,0.94,0.944
MB-Percentile,0.94,0.944,0.936,0.944,0.95,0.928,0.942,0.942,0.932,0.942
MB-Pivotal,0.93,0.968,0.96,0.968,0.972,0.96,0.968,0.962,0.956,0.956
MB-Normal,0.944,0.968,0.956,0.964,0.972,0.954,0.962,0.966,0.948,0.954


In [7]:
# Interval widths averaged over the M replications, one row per interval type,
# using the same column labels as the coverage table.
width = pd.DataFrame(
    ci_width.mean(axis=0),
    index=["Normal", "MB-Percentile", "MB-Pivotal", "MB-Normal"],
    columns=cover.columns,
)
width

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
Normal,0.124639,0.062723,0.070083,0.070554,0.070516,0.070514,0.07028,0.07047,0.070151,0.063217
MB-Percentile,0.120284,0.06471,0.072364,0.072847,0.072851,0.072331,0.072572,0.072618,0.071924,0.064974
MB-Pivotal,0.120284,0.06471,0.072364,0.072847,0.072851,0.072331,0.072572,0.072618,0.071924,0.064974
MB-Normal,0.123069,0.065882,0.073729,0.074109,0.074163,0.073518,0.073834,0.073873,0.073341,0.06611


### Case 2: $\tau=0.8$. 
In this case, the conditional $0.8$-quantile of $y_i$ given $x_i$ is $Q_{0.8}(y_i | x_i) =   \Phi^{-1}(0.8) x_{i1} + x_{i2} + \cdots + x_{ip}$.

In [8]:
tau = 0.8
# Target coefficients at this quantile level: same as beta except that the first
# entry becomes the standard normal tau-quantile.
true_beta = beta.copy()
true_beta[0] = norm.ppf(tau)

# Row labels of the result tables, in the order the intervals are collected below.
ci_names = ["Normal", "MB-Percentile", "MB-Pivotal", "MB-Normal"]
ci_cover = np.zeros([4, p])
ci_width = np.empty([M, 4, p])
for m in range(M):
    X = rgt.multivariate_normal(mean=mu, cov=Sig, size=n)
    # The first covariate is replaced by Unif(0, 2) draws and also scales the noise.
    X[:,0] = rgt.uniform(0, 2, size=n)
    Y = X@beta + X[:,0]*rgt.normal(0,1,size=n)

    model = low_dim(X, Y, intercept=False)
    sol1 = model.norm_ci(tau)
    sol2 = model.mb_ci(tau)

    # One (lower, upper) array per interval type, ordered like ci_names.
    # Coverage is checked against true_beta, not the data-generating beta.
    bounds = (sol1['normal_ci'], sol2['percentile_ci'],
              sol2['pivotal_ci'], sol2['normal_ci'])
    for row, ci in enumerate(bounds):
        lo, hi = ci[:,0], ci[:,1]
        ci_cover[row,:] += (true_beta >= lo) & (true_beta <= hi)
        ci_width[m,row,:] = hi - lo

# Empirical coverage frequency per coefficient (columns labelled 1..p).
cover = pd.DataFrame(ci_cover/M, index=ci_names,
                     columns=pd.Index(np.arange(1, p + 1), dtype=int))

# Interval widths averaged over the M replications, labelled like the coverage table.
width = pd.DataFrame(ci_width.mean(axis=0), index=ci_names, columns=cover.columns)

In [9]:
# Display the empirical coverage table (ci_cover / M) computed in the previous cell.
cover

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
Normal,0.946,0.96,0.95,0.962,0.966,0.966,0.962,0.966,0.968,0.96
MB-Percentile,0.948,0.95,0.944,0.958,0.956,0.956,0.956,0.96,0.966,0.954
MB-Pivotal,0.928,0.97,0.968,0.964,0.966,0.974,0.982,0.974,0.972,0.976
MB-Normal,0.95,0.966,0.968,0.966,0.97,0.97,0.976,0.972,0.974,0.974


In [10]:
# Display the average confidence-interval widths computed alongside the coverage table.
width

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
Normal,0.143916,0.065354,0.072771,0.072959,0.072172,0.072553,0.072384,0.072357,0.07319,0.065104
MB-Percentile,0.138397,0.067852,0.075306,0.075343,0.074802,0.075333,0.075051,0.075192,0.075521,0.06716
MB-Pivotal,0.138397,0.067852,0.075306,0.075343,0.074802,0.075333,0.075051,0.075192,0.075521,0.06716
MB-Normal,0.141345,0.068888,0.076665,0.076569,0.075947,0.076416,0.076144,0.076233,0.076892,0.068287
