#' One-sample gaussianity test in admixture models, using the Bordes & Vandekerkhove estimation method
#'
#' Perform the hypothesis test to decide whether the unknown mixture component is gaussian, assuming that the known component
#' has support on the real line (R). A non-gaussian known component can be handled through the usual transformation by its cdf.
#' Recall that an admixture model has probability density function (pdf) l = p*f + (1-p)*g, where g is a known pdf and l is
#' observed (f and p are unknown). The unknown parameters are estimated by the optimization procedure of
#' Bordes & Vandekerkhove (2010), which requires the unknown mixture component to have a symmetric density.
#'
#' @param sample1 Observed sample with mixture distribution given by l = p*f + (1-p)*g, where f and p are unknown and g is known.
#' @param comp.dist List with two elements corresponding to the component distributions involved in the admixture model. Unknown
#' elements must be specified as 'NULL' objects. For instance if 'f' is unknown: list(f = NULL, g = 'norm').
#' @param comp.param List with two elements corresponding to the parameters of the component distributions, each element being a list
#'                   itself. The names used in these lists must match the argument names of the corresponding R distribution functions.
#'                   Unknown elements must be specified as 'NULL' objects (e.g. if 'f' is unknown: list(f = NULL, g = list(mean = 0, sd = 1))).
#' @param K Number of coefficients considered for the polynomial basis expansion.
#' @param lambda Rate at which the normalization factor is set in the penalization rule for model selection (in (0,1/2)). See 'Details' below.
#' @param conf.level The confidence level, defaulting to 95 percent. Equals 1-alpha, where alpha is the level of the test (type-I error).
#' @param support Support of the densities under consideration, useful to choose the polynomial orthonormal basis. One of 'Real',
#' 'Integer', 'Positive', or 'Bounded.continuous'.
#'
#' @details See the paper 'False Discovery Rate model Gaussianity test' (Pommeret & Vandekerkhove, 2017). The test expands the
#'          densities in an orthonormal polynomial basis and compares the empirical coefficients to those of the gaussian null
#'          model; the number of coefficients actually used for testing is chosen by a penalized selection rule (scaling n^lambda,
#'          penalty k*log(n)), cf Equations (13) and (14), p.5 of the paper.
#'
#' @return A list of 7 elements, containing: 1) the confidence level of the test; 2) the rejection decision; 3) the p-value of the
#'         test; 4) the test statistic; 5) the variance-covariance matrix of the test statistic; 6) the selected rank for testing;
#'         and 7) a list of the estimates (unknown component weight 'p', shift location parameter 'mu' and standard deviation 's'
#'         of the symmetric unknown distribution).
#'
#' @examples
#' ####### Under the null hypothesis H0.
#' ## Parameters of the gaussian distribution to be tested:
#' list.comp <- list(f = "norm", g = "norm")
#' list.param <- list(f = c(mean = 2, sd = 0.5),
#' g = c(mean = 0, sd = 1))
#' ## Simulate and plot the data at hand:
#' obs.data <- rsimmix(n = 150, unknownComp_weight = 0.9, comp.dist = list.comp,
#' comp.param = list.param)[['mixt.data']]
#' plot(density(obs.data))
#' ## Perform the test:
#' list.comp <- list(f = NULL, g = "norm")
#' list.param <- list(f = NULL, g = c(mean = 0, sd = 1))
#' gaussianity_test(sample1 = obs.data, comp.dist = list.comp, comp.param = list.param,
#' K = 3, lambda = 0.1, conf.level = 0.95, support = 'Real')
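#'
#' ####### Under the alternative hypothesis H1 (illustrative sketch).
#' ## A minimal sketch, assuming 'rsimmix' handles a uniform unknown component: the
#' ## uniform density is symmetric (as required by Bordes & Vandekerkhove) but non-gaussian.
#' list.comp <- list(f = "unif", g = "norm")
#' list.param <- list(f = c(min = 1, max = 3),
#'                    g = c(mean = 0, sd = 1))
#' obs.data <- rsimmix(n = 150, unknownComp_weight = 0.9, comp.dist = list.comp,
#'                     comp.param = list.param)[['mixt.data']]
#' ## Perform the test (H0 should now tend to be rejected):
#' list.comp <- list(f = NULL, g = "norm")
#' list.param <- list(f = NULL, g = c(mean = 0, sd = 1))
#' gaussianity_test(sample1 = obs.data, comp.dist = list.comp, comp.param = list.param,
#'                  K = 3, lambda = 0.1, conf.level = 0.95, support = 'Real')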
#'
#' @author Xavier Milhaud <xavier.milhaud.research@gmail.com>
#' @export
gaussianity_test <- function(sample1, comp.dist, comp.param, K = 3, lambda = 0.2, conf.level = 0.95,
support = c('Real','Integer','Positive','Bounded.continuous'))
{
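  ## Sanity checks: both lists must describe exactly two components; the NULL distribution
  ## entry of the unknown component is then replaced by 'norm' (the gaussian null model),
  ## and its NULL parameters by NA (they are estimated below):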
  if ( (length(comp.dist) != 2) || (length(comp.param) != 2) ) stop("Arguments 'comp.dist' and/or 'comp.param' were not correctly specified")
  if ( any(sapply(comp.dist, is.null)) || any(sapply(comp.param, is.null)) ) {
    comp.dist[which(sapply(comp.dist, is.null))] <- "norm"
    comp.param[which(sapply(comp.param, is.null))] <- NA
  }
  ## Extract the density and random-generation functions of the known component, and bind
  ## them locally so that the expressions parsed below can evaluate them:
  comp.dens <- sapply(X = paste0("d", comp.dist), FUN = get, mode = "function")
  assign(x = names(comp.dens)[2], value = comp.dens[[2]])
  comp.sim <- sapply(X = paste0("r", comp.dist), FUN = get, mode = "function")
  assign(x = names(comp.sim)[2], value = comp.sim[[2]])
  ## Build the expressions used later to evaluate the density of the known component and to simulate from it:
  expr.dens <- paste(names(comp.dens)[2],"(x,", paste(names(comp.param[[2]]), "=", comp.param[[2]], sep = "", collapse = ","), ")", sep="")
  expr.sim <- paste(names(comp.sim)[2],"(n=1000000,", paste(names(comp.param[[2]]), "=", comp.param[[2]], sep = "", collapse = ","), ")", sep="")
  ## Monte-Carlo approximation of the first and second-order moments of the known component
  ## (a single simulated sample is used for both moments, for consistency):
  sim.knownComp <- eval(parse(text = expr.sim))
  m1_knownComp <- mean(sim.knownComp)
  m2_knownComp <- mean(sim.knownComp^2)
##-------- Split data sample -----------##
  ## Random sampling (p.6 of Pommeret & Vandekerkhove): create subsamples to get uncorrelated estimators of the different parameters.
n <- length(sample1)
blocksize <- n %/% 3 # block size
values <- stats::runif(n) # simulate random values
  rang <- rank(values)                # associate a rank to each observation
block <- (rang-1) %/% blocksize + 1 # associate to each individual a block number
block <- as.factor(block)
data.coef <- sample1[block == 1]
n.coef <- length(data.coef) # estimation of empirical moments
data.p <- sample1[block == 2]
  n.p <- length(data.p)               # used to estimate the component weight 'p'
data.loc <- sample1[block == 3]
n.loc <- length(data.loc) # estimation of localisation parameter 'mu' of the symmetric unknown density
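  ## These three disjoint blocks yield (asymptotically) uncorrelated estimators of the basis
  ## coefficients, of the weight 'p' and of the location 'mu'.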
##-------- Estimation of parameters and corresponding variances -----------##
## Focus on parameters (weight, localization and variance), consider independent subsamples of the original data:
hat_p <- BVdk_estimParam(data = data.p, method = "L-BFGS-B", comp.dist = comp.dist, comp.param = comp.param)[1]
hat_loc <- BVdk_estimParam(data = data.loc, method = "L-BFGS-B", comp.dist = comp.dist, comp.param = comp.param)[2]
## Plug-in method to estimate the variance:
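  ## The unknown density f is recovered by inverting the mixture relation, i.e. f = (l - (1-p)*g)/p,
  ## with l replaced by a kernel density estimate of the observations and negative values truncated
  ## to zero; its second-order central moment then provides the variance estimate 'hat_s2'.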
kernelDensity_est_obs <- stats::density(sample1)
integrand_totweight <- function(x) {
w <- (1/hat_p) * stats::approxfun(kernelDensity_est_obs)(x) - ((1-hat_p)/hat_p) * eval(parse(text = expr.dens))
return( w * (w > 0) )
}
integrand_unknownComp <- function(x) {
w <- (1/hat_p) * stats::approxfun(kernelDensity_est_obs)(x) - ((1-hat_p)/hat_p) * eval(parse(text = expr.dens))
return( (x-hat_loc)^2 * (w * (w > 0)) / total_weight )
}
total_weight <- try(stats::integrate(integrand_totweight, lower = min(kernelDensity_est_obs$x), upper = max(kernelDensity_est_obs$x))$value, silent = TRUE)
hat_s2 <- try(stats::integrate(integrand_unknownComp, lower = min(kernelDensity_est_obs$x), upper = max(kernelDensity_est_obs$x))$value, silent = TRUE)
if (inherits(x = total_weight, what = "try-error", which = FALSE)) {
total_weight <- cubature::cubintegrate(integrand_totweight,
lower = min(kernelDensity_est_obs$x),
upper = max(kernelDensity_est_obs$x),
method = "pcubature")$integral
}
if (inherits(x = hat_s2, what = "try-error", which = FALSE)) {
hat_s2 <- cubature::cubintegrate(integrand_unknownComp,
lower = min(kernelDensity_est_obs$x),
upper = max(kernelDensity_est_obs$x),
method = "pcubature")$integral
}
#hat_s2 <- (1/hat_p) * ( mean(sample1^2) - ((1-hat_p) * m2_knownComp) ) - (1/hat_p^2) * (mean(sample1)-(1-hat_p)*m1_knownComp)^2
## Then on the variances of the estimators: semiparametric estimation (time-consuming), based on results by Bordes & Vandekerkhove (2010)
varCov <- BVdk_varCov_estimators(data = sample1, loc = hat_loc, p = hat_p, comp.dist = comp.dist, comp.param = comp.param)
var.hat_p <- varCov[["var_pEstim"]]
var.hat_loc <- varCov[["var_muEstim"]]
## Estimation of the variance of the variance estimator:
#var.hat_s2 <- BVdk_ML_varCov_estimators(data = sample1, hat_w = hat_p, hat_loc = hat_loc, hat_var = hat_s2,
# comp.dist = comp.dist, comp.param = comp.param)
##-------- Compute test statistics with data 'data.coef' -----------##
  stat.R <- matrix(data = NA, nrow = K, ncol = n.coef)
  ## Plug-in (estimation of parameters 'mu' and 's') to deduce the coefficients in the Hermite polynomial orthonormal basis:
coef.unknown <- unlist(lapply(X = orthoBasis_coef(data = stats::rnorm(100000, hat_loc, sqrt(hat_s2)), supp = support, degree = K, m = 3, other = NULL), FUN = mean))
coef.known <- unlist(lapply(X = orthoBasis_coef(data = eval(parse(text = expr.sim)), supp = support, degree = K, m = 3, other = NULL), FUN = mean))
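  ## 'coef.unknown' approximates (by Monte-Carlo) the basis coefficients of the gaussian null model
  ## N(hat_loc, hat_s2), while 'coef.known' approximates those of the known component g.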
  ## Cf definition of R_kn below formula (12), p.5:
poly_basis <- poly_orthonormal_basis(support = support, deg = K, x = data.coef, m = 3)
var.coef <- numeric(length = K)
for (i in 1:K) {
mixt.coefs <- orthopolynom::polynomial.values(poly_basis, data.coef)[[i+1]] / sqrt(factorial(i))
var.coef[i] <- stats::var(mixt.coefs)
stat.R[i, ] <- mixt.coefs - hat_p * (coef.unknown[i]-coef.known[i]) - coef.known[i]
mixt.coefs <- NULL
}
  ## Mean (at each order) of the differences between the 'theoretical' gaussian coefficients (hat_loc, hat_s) in the orthonormal basis and the coefficients computed from the data:
  statistics.R <- rowMeans(stat.R)
##-------- Scaling (with variance) of the test statistic -----------##
var.R <- matrix(data = NA, nrow = K, ncol = K)
## Introduce the correction factor for the adjustment of the variance, given the initial split of the sample:
w.coef <- sqrt( (n.p * n.loc) / (n.coef + n.p + n.loc)^2 )
w.p <- sqrt( (n.coef * n.loc) / (n.coef + n.p + n.loc)^2 )
w.loc <- sqrt( (n.p * n.coef) / (n.coef + n.p + n.loc)^2 )
## FIXME: this formula has to be checked...
  var.R[1,1] <- w.coef * var.coef[1] + w.p * var.hat_p * (hat_loc - coef.known[1])^2 + w.loc * var.hat_loc * hat_p^2
  var.R[2,2] <- w.coef * var.coef[2] + w.p * var.hat_p * ((1/2)*(hat_s2+hat_loc^2-1) - coef.known[1])^2 + w.loc * var.hat_loc * hat_p^2 * hat_loc^2
  var.R[3,3] <- w.coef * var.coef[3] + w.p * var.hat_p * (1/36) * (3*hat_loc*hat_s2 + hat_loc^2 - 6*hat_loc)^2 +
                w.loc * var.hat_loc * (1/36) * (hat_p*(3*hat_s2+hat_loc-6))^2
##-------- Compute the test statistics at different orders -----------##
test.statistic <- numeric(length = K)
val <- 0
for (k in 1:K) {
## Equations (13) and (14) p.5 (except that we already scaled here, cf explanation top of p.6):
test.statistic[k] <- val + n^lambda * statistics.R[k]^2 * var.R[k,k]^(-1) - log(n)
val <- test.statistic[k]
}
  ## Select the right order (optimal number of coefficients needed in the expansion):
selected.index <- which.max(test.statistic)
## Then remove previously introduced penalty in the selection rule to get back to the test statistic (Equation (14) p.5):
final.stat <- test.statistic[selected.index] + selected.index * log(n)
p.value <- 1 - stats::pchisq(final.stat, 1)
  ## If the test statistic is greater than the quantile of interest, reject the null hypothesis (otherwise do not reject):
rej <- FALSE
if (final.stat > stats::qchisq(conf.level,1)) { rej <- TRUE }
list(confidence_level = conf.level, rejection_rule = rej, p_value = p.value, test.stat = final.stat,
var.stat = var.R, rank = selected.index, estimates = list(p = hat_p, mu = hat_loc, s = sqrt(hat_s2)))
}