improved efficiency when spcov_type is none

USEPA · Mar 1, 2024 · a1a52bc · a1a52bc
1 parent 20d067f
commit a1a52bc
Show file tree

Hide file tree

Showing 9 changed files with 142 additions and 7 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -3,6 +3,8 @@
 * Improved efficiency of handling random effects in big data models fit using `splm(..., local)` and `spglm(..., local)`.
 * Changed `Matrix::rankMatrix(X, method = "tolNorm2")` to `Matrix::rankMatrix(X, method = "qr")` to enhance stability when determining linear independence in `X`, the design matrix of explanatory variables.
 * Replaced an error message with a warning message when `X` has perfect collinearities (i.e., is not full rank).
+* Improved efficiency when `spcov_type` is `"none"` [(#15)](https://github.com/USEPA/spmodel/issues/15)
+* Minor documentation updates
 
 ## Bug Fixes
 

diff --git a/R/cov_betahat_adjust.R b/R/cov_betahat_adjust.R
@@ -40,7 +40,7 @@ cov_betahat_adjust <- function(invcov_betahat_list, betahat_list,
     if (is.null(cov_betahat_list)) {
       var_adjust <- "none"
       warning(
-        "At least one partition's inverse covariance matrix is singular. Redjusting using var_adjust = \"none\".",
+        "At least one partition's inverse covariance matrix is singular. Readjusting using var_adjust = \"none\".",
         call. = FALSE
       )
     } else {

diff --git a/R/cov_estimate_gloglik.R b/R/cov_estimate_gloglik.R
@@ -19,7 +19,11 @@ cov_estimate_gloglik_splm <- function(data_object, formula, spcov_initial, estme
   if (data_object$anisotropy) {
     dist_matrix_list <- NULL
   } else {
-    dist_matrix_list <- lapply(data_object$obdata_list, function(x) spdist(x, data_object$xcoord, data_object$ycoord))
+    if (inherits(spcov_initial, "none") && is.null(data_object$randcov_initial)) {
+      dist_matrix_list <- NULL
+    } else {
+      dist_matrix_list <- lapply(data_object$obdata_list, function(x) spdist(x, data_object$xcoord, data_object$ycoord))
+    }
   }
 
   if (is.null(data_object$randcov_initial)) {

diff --git a/R/cov_initial_search.R b/R/cov_initial_search.R
@@ -312,6 +312,18 @@ cov_initial_search.none <- function(spcov_initial_NA, estmethod, data_object,
                                     randcov_initial_NA = NULL, esv_dotlist, ...) {
   # find ols sample variance
   s2 <- data_object$s2
+
+  # exit if no random effects
+  if (is.null(randcov_initial_NA)) {
+    spcov_initial_NA$initial["ie"] <- s2
+    best_params <- list(
+      spcov_initial_val = spcov_initial_NA, randcov_initial_val = NULL, esv = NULL,
+      dist_vector = NULL, residual_vector2 = NULL
+    )
+    return(best_params)
+  }
+
+  # do other stuff
   ns2 <- 1.2 * s2
 
   # find sets of starting values

diff --git a/R/get_model_stats.R b/R/get_model_stats.R
@@ -156,6 +156,109 @@ get_model_stats_splm <- function(cov_est_object, data_object, estmethod) {
   )
 }
 
+get_model_stats_splm_iid <- function(cov_est_object, data_object, estmethod) {
+
+
+  X <- do.call("rbind", data_object$X_list)
+  y <- do.call("rbind", data_object$y_list)
+  qr_val <- qr(X)
+  R_val <- qr.R(qr_val)
+  s2 <- cov_est_object$spcov_params_val[["ie"]]
+  cor_betahat <- chol2inv(chol(crossprod(R_val, R_val)))
+  cov_betahat <- s2 * cor_betahat
+  betahat <- as.numeric(backsolve(R_val, qr.qty(qr_val, y)))
+  names(betahat) <- colnames(data_object$X_list[[1]])
+  cov_betahat <- as.matrix(cov_betahat)
+  rownames(cov_betahat) <- colnames(data_object$X_list[[1]])
+  colnames(cov_betahat) <- colnames(data_object$X_list[[1]])
+  fitted <- X %*% betahat
+  resids <- y - fitted
+
+
+  # return coefficients
+  coefficients <- get_coefficients(betahat, cov_est_object$spcov_params_val, cov_est_object$randcov_params_val)
+
+  # return fitted
+  fitted <- list(
+    response = as.numeric(fitted),
+    spcov = list(de = as.numeric(rep(0, length(y))), ie = as.numeric(resids)),
+    randcov = NULL
+  )
+
+
+  # return hat values
+  hatvalues <- diag(X %*% tcrossprod(cor_betahat, X))
+  # return residuals
+  residuals <- list(
+    response = as.numeric(resids),
+    pearson = 1 / sqrt(s2) * resids
+  )
+  residuals$standardized <- residuals$pearson / sqrt(1 - hatvalues)
+
+  # return cooks distance
+  cooks_distance <- get_cooks_distance(residuals, hatvalues, data_object$p)
+
+  # reorder relevant quantities
+  ## fitted
+  fitted$response <- fitted$response[order(data_object$order)]
+  names(fitted$response) <- data_object$observed_index
+  fitted$spcov$de <- fitted$spcov$de[order(data_object$order)]
+  names(fitted$spcov$de) <- data_object$observed_index
+  fitted$spcov$ie <- fitted$spcov$ie[order(data_object$order)]
+  names(fitted$spcov$ie) <- data_object$observed_index
+  hatvalues <- hatvalues[order(data_object$order)]
+  names(hatvalues) <- data_object$observed_index
+  residuals$response <- residuals$response[order(data_object$order)]
+  names(residuals$response) <- data_object$observed_index
+  residuals$pearson <- residuals$pearson[order(data_object$order)]
+  names(residuals$pearson) <- data_object$observed_index
+  residuals$standardized <- residuals$standardized[order(data_object$order)]
+  names(residuals$standardized) <- data_object$observed_index
+  cooks_distance <- cooks_distance[order(data_object$order)]
+  names(cooks_distance) <- data_object$observed_index
+
+  # return variance covariance matrices
+  vcov <- get_vcov(cov_betahat)
+
+  # return deviance
+  deviance <- as.numeric(crossprod(residuals$pearson, residuals$pearson))
+  muhat <- mean(y)
+  pearson_null <- 1 / sqrt(s2) * (y - muhat)
+  deviance_null <- as.numeric(crossprod(pearson_null, pearson_null))
+  pseudoR2 <- as.numeric(1 - deviance / deviance_null)
+
+  # set null model R2 equal to zero (no covariates)
+  if (length(labels(terms(data_object$formula))) == 0) {
+    pseudoR2 <- 0
+  }
+
+
+  ## empirical semivariogram
+  if (estmethod == "sv-wls") {
+    esv_val <- cov_est_object$esv
+  } else {
+    esv_val <- NULL
+  }
+
+
+  # npar
+  p_theta_spcov <- length(cov_est_object$is_known$spcov) - sum(cov_est_object$is_known$spcov)
+  p_theta_randcov <- length(cov_est_object$is_known$randcov) - sum(cov_est_object$is_known$randcov)
+  npar <- p_theta_spcov + p_theta_randcov
+
+  list(
+    coefficients = coefficients,
+    fitted = fitted,
+    hatvalues = hatvalues,
+    residuals = residuals,
+    cooks_distance = cooks_distance,
+    vcov = vcov,
+    deviance = deviance,
+    pseudoR2 = pseudoR2,
+    npar = npar
+  )
+}
+
 
 get_model_stats_spautor <- function(cov_est_object, data_object, estmethod) {
   # cov_est_object$randcov_params_val is NULL if not added so won't affect downstream calculations

diff --git a/R/splm.R b/R/splm.R
@@ -74,6 +74,8 @@
 #'   to \code{TRUE} or \code{FALSE} based on the sample size (the number of
 #'   non-missing observations in \code{data}) -- if the sample size exceeds 5,000,
 #'   \code{local} is set to \code{TRUE}. Otherwise it is set to \code{FALSE}.
+#'   \code{local} is also set to \code{FALSE} when \code{spcov_type} is \code{"none"}
+#'   and there are no random effects specified via \code{random}.
 #'   If \code{FALSE}, no big data approximation is implemented.
 #'   If a list is provided, the following arguments detail the big
 #'   data approximation:
@@ -289,6 +291,11 @@ splm <- function(formula, data, spcov_type, xcoord, ycoord, spcov_initial, estme
     partition_factor <- NULL
   }
 
+  # set local explicitly to FALSE if iid
+  if (inherits(spcov_initial, "none") && is.null(random)) {
+    local <- FALSE
+  }
+
   if (missing(local)) {
     local <- NULL
   }
@@ -331,7 +338,12 @@ splm <- function(formula, data, spcov_type, xcoord, ycoord, spcov_initial, estme
 
 
 
-  model_stats <- get_model_stats_splm(cov_est_object, data_object, estmethod)
+  if (inherits(cov_est_object$spcov_params_val, "none") && is.null(random)) {
+    model_stats <- get_model_stats_splm_iid(cov_est_object, data_object, estmethod)
+  } else {
+    model_stats <- get_model_stats_splm(cov_est_object, data_object, estmethod)
+  }
+
 
   # parallel cluster if necessary
   if (data_object$parallel) {

diff --git a/man/splm.Rd b/man/splm.Rd
diff --git a/tests/testthat/test-extras-spautor.R b/tests/testthat/test-extras-spautor.R
@@ -578,7 +578,7 @@ if (test_local) {
     M <- seq_len(NROW(exdata_poly))
     expect_error(spautor(y ~ x, exdata_poly, "car", W = W, row_st = FALSE, M = M))
     expect_error(spautor(y ~ x, exdata_poly, "car", partition_factor = ~ group + subgroup))
-    expect_error(spautor(y ~ as.factor(x) + group, exdata_poly, "car"))
+    expect_error(suppressWarnings(spautor(y ~ as.factor(x) + group, exdata_poly, "car")))
     exdata_poly3 <- exdata_poly
     exdata_poly3$y <- as.character(exdata_poly3$y)
     expect_error(spautor(y ~ x, exdata_poly3, "car"))
@@ -592,7 +592,7 @@ if (test_local) {
     expect_error(spautor(y ~ x, exdata_poly, "car", estmethod = "xyz"))
     exdata_poly4 <- exdata_poly
     exdata_poly4$x2 <- exdata_poly4$x
-    expect_error(spautor(y ~ x + x2, exdata_poly4, "car"))
+    expect_error(suppressWarnings(spautor(y ~ x + x2, exdata_poly4, "car")))
     exdata_poly4$x[1] <- NA
     expect_error(spautor(y ~ x, exdata_poly4, "car"))
   })

diff --git a/tests/testthat/test-extras-splm.R b/tests/testthat/test-extras-splm.R
@@ -941,7 +941,7 @@ if (test_local) {
     exdata2 <- exdata
     exdata2[1, "xcoord"] <- NA
     expect_error(splm(y ~ x, exdata2, "exponential", xcoord, ycoord))
-    expect_error(splm(y ~ as.factor(x) + group, exdata, "exponential", xcoord, ycoord))
+    expect_error(suppressWarnings(splm(y ~ as.factor(x) + group, exdata, "exponential", xcoord, ycoord)))
     expect_error(splm(y ~ x, exdata, "exponential", xcoord = ycoord), NA) # changing to ycoord2 works
     expect_error(splm(y ~ x, exdata, "exponential", ycoord = xcoord))
     expect_error(splm(y ~ x, exdata, "xyz", xcoord, ycoord))
@@ -972,7 +972,7 @@ if (test_local) {
     expect_error(splm(y ~ x, exdata, "exponential", xcoord, "xyz"))
     exdata4 <- exdata
     exdata4$x2 <- exdata4$x
-    expect_error(splm(y ~ x + x2, exdata4, "exponential", xcoord, ycoord))
+    expect_error(suppressWarnings(splm(y ~ x + x2, exdata4, "exponential", xcoord, ycoord)))
     exdata4$x[1] <- NA
     expect_error(splm(y ~ x, exdata4, "exponential", xcoord, ycoord))