StatFunGen · xueweic · Apr 17, 2025 · Apr 17, 2025 · Apr 17, 2025 · Apr 17, 2025
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -28,9 +28,10 @@ Suggests:
     testthat (>= 3.0.0),
     knitr,
     rmarkdown,
+    bookdown,
     ashr,
     MASS
-VignetteBuilder: knitr
+VignetteBuilder: knitr, bookdown
 Roxygen: list(markdown = TRUE)
 Config/testthat/edition: 3
 License: MIT + file LICENSE
diff --git a/R/colocboost.R b/R/colocboost.R
@@ -191,15 +191,9 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
   # - check individual level data
   if (!is.null(X) & !is.null(Y)) {
     # --- check input
-    if (is.data.frame(X)) {
-      X <- as.matrix(X)
-    }
-    if (is.data.frame(Y)) {
-      Y <- as.matrix(Y)
-    }
-    if (is.matrix(X)) {
-      X <- list(X)
-    }
+    if (is.data.frame(X))  X <- as.matrix(X)
+    if (is.data.frame(Y)) Y <- as.matrix(Y)
+    if (is.matrix(X))  X <- list(X)
     if (is.atomic(Y) && !is.list(Y)) {
       Y <- as.matrix(Y)
       if (ncol(Y) == 1) {
@@ -216,23 +210,17 @@ colocboost <- function(X = NULL, Y = NULL, # individual data
       }
     } else {
       Y <- lapply(1:length(Y), function(ii) {
-        if (is.null(dict_YX)) {
-          idx <- ii
-        } else {
-          idx <- dict_YX[ii, 2]
-        }
-        n <- nrow(X[[idx]])
         y <- Y[[ii]]
         y <- as.matrix(y)
+        n <- length(y)
         if (nrow(y) == n) {
           return(y)
         } else if (ncol(y) == n) {
           return(t(y))
-        } else {
-          stop("X and Y do not have the same sample size!")
-        }
+        } 
       })
     }
+
     # --- check if variables in individual data
     p.ind <- unique(sapply(X, ncol))
     if (length(p.ind) != 1) {

diff --git a/R/colocboost_init.R b/R/colocboost_init.R
@@ -68,69 +68,18 @@ colocboost_init_data <- function(X, Y, dict_YX,
   flag <- 1
   # if individual: X, Y
   if (!is.null(X) & !is.null(Y)) {
-    drop_lowfreq <- c()
-    dict_YX_final <- dict_YX
-    for (ij in 1:length(X)) {
-      index <- which(dict_YX == ij)
-      dict_YX_final[index] <- index[1]
-    }
+
+    ind_formated <- process_individual_data(
+      X, Y, dict_YX, target_variants = keep_variable_names,
+      intercept = intercept, standardize = standardize
+    )
     for (i in 1:length(Y)) {
-      tmp <- list(
-        "X" = NULL,
-        "Y" = scale(Y[[i]], center = intercept, scale = standardize),
-        "N" = length(Y[[i]]),
-        "variable_miss" = NULL
-      )
-      x_tmp <- X[[dict_YX[i]]]
-      change_x <- if (dict_YX_final[i] == i) TRUE else FALSE
-      # - if sample different
-      if (nrow(x_tmp) != length(Y[[i]])) {
-        change_x <- TRUE
-        ind_id_Y <- rownames(Y[[i]])
-        ind_id_X <- rownames(x_tmp)
-        if (is.null(ind_id_X) | is.null(ind_id_Y)) {
-          stop("Please provide the sample index of X and Y, since they do not have the same samples!")
-        } else {
-          pos <- match(ind_id_Y, ind_id_X)
-          x_tmp <- x_tmp[pos, , drop = FALSE]
-          if (sum(is.na(pos)) != 0) {
-            tmp$Y <- tmp$Y[-which(is.na(pos))]
-          }
-        }
-      }
-      # - if missing X
-      variable.name <- keep_variables[[dict_YX[i]]]
-      if (length(variable.name) != length(keep_variable_names)) {
-        x_extend <- matrix(0,
-          nrow = nrow(x_tmp), ncol = length(keep_variable_names),
-          dimnames = list(rownames(x_tmp), keep_variable_names)
-        )
-        variable.tmp <- intersect(keep_variable_names, variable.name)
-        pos <- match(variable.tmp, keep_variable_names)
-        tmp$variable_miss <- setdiff(1:length(keep_variable_names), pos)
-        poss <- match(variable.tmp, variable.name)
-        x_extend[, pos] <- x_tmp[, poss]
-        x_tmp <- x_extend
-      }
-      if (change_x) {
-        dict_YX_final[i] == i
-        if (!intercept & !standardize) {
-          x_stand <- x_tmp
-        } else {
-          x_stand <- Rfast::standardise(x_tmp, center = intercept, scale = standardize)
-        }
-        x_stand[which(is.na(x_stand))] <- 0
-        tmp$X <- x_stand
-      }
-      cb_data$data[[flag]] <- tmp
+      cb_data$data[[flag]] <- ind_formated$result[[i]]
       names(cb_data$data)[flag] <- paste0("ind_outcome_", i)
       flag <- flag + 1
     }
-    cb_data$dict <- c(dict_YX_final)
-    ind_idx <- max(dict_YX)
-  } else {
-    ind_idx <- 0
-  }
+    cb_data$dict <- c(ind_formated$new_dict)
+  } 
   n_ind <- flag - 1
   # if summary: XtX XtY, YtY
   if (!is.null(Z) & !is.null(LD)) {
@@ -649,3 +598,104 @@ process_sumstat <- function(Z, N, Var_y, SeBhat, ld_matrices, variant_lists, dic
     original_dict = dict
   ))
 }
+
+#' process individual level input format
+#' @noRd
+process_individual_data <- function(X, Y, dict_YX, target_variants,
+                                    intercept = TRUE,
+                                    standardize = TRUE) {
+
+  # Step 0: Check if sample IDs match between Y and corresponding X
+  for (i in 1:length(Y)) {
+    current_matrix_type <- dict_YX[i]
+    # If row counts match, we assume samples are in the same order
+    if (nrow(Y[[i]]) != nrow(X[[current_matrix_type]])) {
+      # Row counts don't match, so check rownames
+      ind_id_Y <- rownames(Y[[i]])
+      ind_id_X <- rownames(X[[current_matrix_type]])
+      if (is.null(ind_id_X) || is.null(ind_id_Y)) {
+        stop("Please provide the sample index of X and Y, since they do not have the same samples!")
+      }
+      # Find matching samples
+      pos <- match(ind_id_Y, ind_id_X)
+      if (sum(!is.na(pos)) == 0) {
+        stop("No samples in Y match any samples in the corresponding X matrix!")
+      }
+    }
+  }
+
+  # Step 1: Update dictionary to handle duplicates samples
+  sample_lists <- lapply(Y, rownames)
+  new_dict <- 1:length(dict_YX)
+  # For each pair of Y indices
+  for (i in 1:(length(dict_YX)-1)) {
+    for (j in (i+1):length(dict_YX)) {
+      # Check if they map to the same X matrix AND have the same samples
+      if (dict_YX[i] == dict_YX[j] && identical(sample_lists[[i]], sample_lists[[j]])) {
+        # If same matrix and same samples, use the smaller index
+        new_dict[j] <- new_dict[i]
+      }
+    }
+  }
+
+  # Step 2: Create result list
+  result <- list()
+
+  for (i in 1:length(Y)) {
+    tmp <- list(
+      "X" = NULL,
+      "Y" = NULL,
+      "N" = NULL,
+      "variable_miss" = NULL
+    )
+    matrix_type <- dict_YX[i]
+    # Get the appropriate matrix from X list
+    current_X <- X[[matrix_type]]
+    current_Y <- Y[[i]]
+    # Check if we need to match samples or if we can use as-is
+    if (nrow(current_Y) == nrow(current_X)) {
+      # Same number of rows, assume same order
+      matched_X <- current_X
+      matched_Y <- scale(current_Y, center = intercept, scale = standardize)
+    } else {
+      # Different number of rows, find matching samples
+      overlap_samples <- intersect(rownames(current_Y), rownames(current_X))
+      matched_X <- current_X[match(overlap_samples, rownames(current_X)), , drop = FALSE]
+      matched_Y <- current_Y[match(overlap_samples, rownames(current_Y)), , drop = FALSE]
+      matched_Y <- scale(matched_Y, center = intercept, scale = standardize)
+    }
+    tmp$Y <- matched_Y
+    tmp$N <- length(matched_Y)
+
+    # - if missing X
+    variable.name <- colnames(matched_X)
+    if (length(variable.name) != length(target_variants)) {
+      x_extend <- matrix(0, nrow = nrow(matched_X), ncol = length(target_variants),
+                         dimnames = list(rownames(matched_X), target_variants) )
+      variable.tmp <- intersect(target_variants, variable.name)
+      pos <- match(variable.tmp, target_variants)
+      tmp$variable_miss <- setdiff(1:length(target_variants), pos)
+      poss <- match(variable.tmp, variable.name)
+      x_extend[, pos] <- matched_X[, poss]
+      matched_X <- x_extend
+    }
+    if (new_dict[i] == i) {
+      if (!intercept & !standardize) {
+        x_stand <- matched_X
+      } else {
+        x_stand <- Rfast::standardise(matched_X, center = intercept, scale = standardize)
+      }
+      x_stand[which(is.na(x_stand))] <- 0
+      tmp$X <- x_stand
+    }
+
+    # Create components for each list
+    result[[i]] <- tmp
+  }
+  # Return results with the unified dictionary
+  return(list(
+    result = result,
+    new_dict = new_dict,
+    original_dict = dict_YX
+  ))
+}
diff --git a/README.md b/README.md
@@ -8,9 +8,10 @@ This R package implements ColocBoost---motivated and designed for colocalization
 
 ## Quick Start
 
-### CRAN Installation
+### CRAN Installation (available soon)
 Install released versions from CRAN (Linux, macOS and Windows)
 
+
 ```r
 install.packages("colocboost")
 ```

diff --git a/man/colocboost.Rd b/man/colocboost.Rd
diff --git a/man/colocboost_plot.Rd b/man/colocboost_plot.Rd
diff --git a/man/get_cormat.Rd b/man/get_cormat.Rd
diff --git a/man/get_cos.Rd b/man/get_cos.Rd
diff --git a/man/get_cos_summary.Rd b/man/get_cos_summary.Rd