Skip to content

Commit

Permalink
improved efficiency when spcov_type is none
Browse files Browse the repository at this point in the history
  • Loading branch information
michaeldumelle committed Mar 1, 2024
1 parent 20d067f commit a1a52bc
Show file tree
Hide file tree
Showing 9 changed files with 142 additions and 7 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
* Improved efficiency of handling random effects in big data models fit using `splm(..., local)` and `spglm(..., local)`.
* Changed `Matrix::rankMatrix(X, method = "tolNorm2")` to `Matrix::rankMatrix(X, method = "qr")` to enhance stability when determining linear independence in `X`, the design matrix of explanatory variables.
* Replaced an error message with a warning message when `X` has perfect collinearities (i.e., is not full rank).
* Improved efficiency when `spcov_type` is `"none"` [(#15)](https://github.com/USEPA/spmodel/issues/15)
* Minor documentation updates

## Bug Fixes

Expand Down
2 changes: 1 addition & 1 deletion R/cov_betahat_adjust.R
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ cov_betahat_adjust <- function(invcov_betahat_list, betahat_list,
if (is.null(cov_betahat_list)) {
var_adjust <- "none"
warning(
"At least one partition's inverse covariance matrix is singular. Redjusting using var_adjust = \"none\".",
"At least one partition's inverse covariance matrix is singular. Readjusting using var_adjust = \"none\".",
call. = FALSE
)
} else {
Expand Down
6 changes: 5 additions & 1 deletion R/cov_estimate_gloglik.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,11 @@ cov_estimate_gloglik_splm <- function(data_object, formula, spcov_initial, estme
if (data_object$anisotropy) {
dist_matrix_list <- NULL
} else {
dist_matrix_list <- lapply(data_object$obdata_list, function(x) spdist(x, data_object$xcoord, data_object$ycoord))
if (inherits(spcov_initial, "none") && is.null(data_object$randcov_initial)) {
dist_matrix_list <- NULL
} else {
dist_matrix_list <- lapply(data_object$obdata_list, function(x) spdist(x, data_object$xcoord, data_object$ycoord))
}
}

if (is.null(data_object$randcov_initial)) {
Expand Down
12 changes: 12 additions & 0 deletions R/cov_initial_search.R
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,18 @@ cov_initial_search.none <- function(spcov_initial_NA, estmethod, data_object,
randcov_initial_NA = NULL, esv_dotlist, ...) {
# find ols sample variance
s2 <- data_object$s2

# exit if no random effects
if (is.null(randcov_initial_NA)) {
spcov_initial_NA$initial["ie"] <- s2
best_params <- list(
spcov_initial_val = spcov_initial_NA, randcov_initial_val = NULL, esv = NULL,
dist_vector = NULL, residual_vector2 = NULL
)
return(best_params)
}

# do other stuff
ns2 <- 1.2 * s2

# find sets of starting values
Expand Down
103 changes: 103 additions & 0 deletions R/get_model_stats.R
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,109 @@ get_model_stats_splm <- function(cov_est_object, data_object, estmethod) {
)
}

get_model_stats_splm_iid <- function(cov_est_object, data_object, estmethod) {


X <- do.call("rbind", data_object$X_list)
y <- do.call("rbind", data_object$y_list)
qr_val <- qr(X)
R_val <- qr.R(qr_val)
s2 <- cov_est_object$spcov_params_val[["ie"]]
cor_betahat <- chol2inv(chol(crossprod(R_val, R_val)))
cov_betahat <- s2 * cor_betahat
betahat <- as.numeric(backsolve(R_val, qr.qty(qr_val, y)))
names(betahat) <- colnames(data_object$X_list[[1]])
cov_betahat <- as.matrix(cov_betahat)
rownames(cov_betahat) <- colnames(data_object$X_list[[1]])
colnames(cov_betahat) <- colnames(data_object$X_list[[1]])
fitted <- X %*% betahat
resids <- y - fitted


# return coefficients
coefficients <- get_coefficients(betahat, cov_est_object$spcov_params_val, cov_est_object$randcov_params_val)

# return fitted
fitted <- list(
response = as.numeric(fitted),
spcov = list(de = as.numeric(rep(0, length(y))), ie = as.numeric(resids)),
randcov = NULL
)


# return hat values
hatvalues <- diag(X %*% tcrossprod(cor_betahat, X))
# return residuals
residuals <- list(
response = as.numeric(resids),
pearson = 1 / sqrt(s2) * resids
)
residuals$standardized <- residuals$pearson / sqrt(1 - hatvalues)

# return cooks distance
cooks_distance <- get_cooks_distance(residuals, hatvalues, data_object$p)

# reorder relevant quantities
## fitted
fitted$response <- fitted$response[order(data_object$order)]
names(fitted$response) <- data_object$observed_index
fitted$spcov$de <- fitted$spcov$de[order(data_object$order)]
names(fitted$spcov$de) <- data_object$observed_index
fitted$spcov$ie <- fitted$spcov$ie[order(data_object$order)]
names(fitted$spcov$ie) <- data_object$observed_index
hatvalues <- hatvalues[order(data_object$order)]
names(hatvalues) <- data_object$observed_index
residuals$response <- residuals$response[order(data_object$order)]
names(residuals$response) <- data_object$observed_index
residuals$pearson <- residuals$pearson[order(data_object$order)]
names(residuals$pearson) <- data_object$observed_index
residuals$standardized <- residuals$standardized[order(data_object$order)]
names(residuals$standardized) <- data_object$observed_index
cooks_distance <- cooks_distance[order(data_object$order)]
names(cooks_distance) <- data_object$observed_index

# return variance covariance matrices
vcov <- get_vcov(cov_betahat)

# return deviance
deviance <- as.numeric(crossprod(residuals$pearson, residuals$pearson))
muhat <- mean(y)
pearson_null <- 1 / sqrt(s2) * (y - muhat)
deviance_null <- as.numeric(crossprod(pearson_null, pearson_null))
pseudoR2 <- as.numeric(1 - deviance / deviance_null)

# set null model R2 equal to zero (no covariates)
if (length(labels(terms(data_object$formula))) == 0) {
pseudoR2 <- 0
}


## empirical semivariogram
if (estmethod == "sv-wls") {
esv_val <- cov_est_object$esv
} else {
esv_val <- NULL
}


# npar
p_theta_spcov <- length(cov_est_object$is_known$spcov) - sum(cov_est_object$is_known$spcov)
p_theta_randcov <- length(cov_est_object$is_known$randcov) - sum(cov_est_object$is_known$randcov)
npar <- p_theta_spcov + p_theta_randcov

list(
coefficients = coefficients,
fitted = fitted,
hatvalues = hatvalues,
residuals = residuals,
cooks_distance = cooks_distance,
vcov = vcov,
deviance = deviance,
pseudoR2 = pseudoR2,
npar = npar
)
}


get_model_stats_spautor <- function(cov_est_object, data_object, estmethod) {
# cov_est_object$randcov_params_val is NULL if not added so won't affect downstream calculations
Expand Down
14 changes: 13 additions & 1 deletion R/splm.R
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@
#' to \code{TRUE} or \code{FALSE} based on the sample size (the number of
#' non-missing observations in \code{data}) -- if the sample size exceeds 5,000,
#' \code{local} is set to \code{TRUE}. Otherwise it is set to \code{FALSE}.
#' \code{local} is also set to \code{FALSE} when \code{spcov_type} is \code{"none"}
#' and there are no random effects specified via \code{random}.
#' If \code{FALSE}, no big data approximation is implemented.
#' If a list is provided, the following arguments detail the big
#' data approximation:
Expand Down Expand Up @@ -289,6 +291,11 @@ splm <- function(formula, data, spcov_type, xcoord, ycoord, spcov_initial, estme
partition_factor <- NULL
}

# set local explicitly to FALSE if iid
if (inherits(spcov_initial, "none") && is.null(random)) {
local <- FALSE
}

if (missing(local)) {
local <- NULL
}
Expand Down Expand Up @@ -331,7 +338,12 @@ splm <- function(formula, data, spcov_type, xcoord, ycoord, spcov_initial, estme



model_stats <- get_model_stats_splm(cov_est_object, data_object, estmethod)
if (inherits(cov_est_object$spcov_params_val, "none") && is.null(random)) {
model_stats <- get_model_stats_splm_iid(cov_est_object, data_object, estmethod)
} else {
model_stats <- get_model_stats_splm(cov_est_object, data_object, estmethod)
}


# parallel cluster if necessary
if (data_object$parallel) {
Expand Down
2 changes: 2 additions & 0 deletions man/splm.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions tests/testthat/test-extras-spautor.R
Original file line number Diff line number Diff line change
Expand Up @@ -578,7 +578,7 @@ if (test_local) {
M <- seq_len(NROW(exdata_poly))
expect_error(spautor(y ~ x, exdata_poly, "car", W = W, row_st = FALSE, M = M))
expect_error(spautor(y ~ x, exdata_poly, "car", partition_factor = ~ group + subgroup))
expect_error(spautor(y ~ as.factor(x) + group, exdata_poly, "car"))
expect_error(suppressWarnings(spautor(y ~ as.factor(x) + group, exdata_poly, "car")))
exdata_poly3 <- exdata_poly
exdata_poly3$y <- as.character(exdata_poly3$y)
expect_error(spautor(y ~ x, exdata_poly3, "car"))
Expand All @@ -592,7 +592,7 @@ if (test_local) {
expect_error(spautor(y ~ x, exdata_poly, "car", estmethod = "xyz"))
exdata_poly4 <- exdata_poly
exdata_poly4$x2 <- exdata_poly4$x
expect_error(spautor(y ~ x + x2, exdata_poly4, "car"))
expect_error(suppressWarnings(spautor(y ~ x + x2, exdata_poly4, "car")))
exdata_poly4$x[1] <- NA
expect_error(spautor(y ~ x, exdata_poly4, "car"))
})
Expand Down
4 changes: 2 additions & 2 deletions tests/testthat/test-extras-splm.R
Original file line number Diff line number Diff line change
Expand Up @@ -941,7 +941,7 @@ if (test_local) {
exdata2 <- exdata
exdata2[1, "xcoord"] <- NA
expect_error(splm(y ~ x, exdata2, "exponential", xcoord, ycoord))
expect_error(splm(y ~ as.factor(x) + group, exdata, "exponential", xcoord, ycoord))
expect_error(suppressWarnings(splm(y ~ as.factor(x) + group, exdata, "exponential", xcoord, ycoord)))
expect_error(splm(y ~ x, exdata, "exponential", xcoord = ycoord), NA) # changing to ycoord2 works
expect_error(splm(y ~ x, exdata, "exponential", ycoord = xcoord))
expect_error(splm(y ~ x, exdata, "xyz", xcoord, ycoord))
Expand Down Expand Up @@ -972,7 +972,7 @@ if (test_local) {
expect_error(splm(y ~ x, exdata, "exponential", xcoord, "xyz"))
exdata4 <- exdata
exdata4$x2 <- exdata4$x
expect_error(splm(y ~ x + x2, exdata4, "exponential", xcoord, ycoord))
expect_error(suppressWarnings(splm(y ~ x + x2, exdata4, "exponential", xcoord, ycoord)))
exdata4$x[1] <- NA
expect_error(splm(y ~ x, exdata4, "exponential", xcoord, ycoord))

Expand Down

0 comments on commit a1a52bc

Please sign in to comment.