Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

enable returning the covariance matrix for GxE tests #94

Open
wants to merge 2 commits into
base: devel
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 37 additions & 33 deletions R/assocTestSingle.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ setGeneric("assocTestSingle", function(gdsobj, ...) standardGeneric("assocTestSi
setMethod("assocTestSingle",
"SeqVarIterator",
function(gdsobj, null.model, test=c("Score", "Score.SPA", "BinomiRare", "CMP"),
recalc.pval.thresh=0.05, fast.score.SE=FALSE, GxE=NULL,
recalc.pval.thresh=0.05, fast.score.SE=FALSE, GxE=NULL, GxE.return.cov=FALSE,
geno.coding=c("additive", "dominant", "recessive"),
sparse=TRUE, imputed=FALSE, male.diploid=TRUE, genome.build=c("hg19", "hg38"),
sparse=TRUE, imputed=FALSE, male.diploid=TRUE, genome.build=c("hg19", "hg38"),
BPPARAM=bpparam(), verbose=TRUE) {
test <- match.arg(test)
geno.coding <- match.arg(geno.coding)
Expand All @@ -19,27 +19,27 @@ setMethod("assocTestSingle",
null.model <- .updateNullModelFormat(null.model)

# check that the provided null model is compatible with the requested test
.checkNullModelTestSingle(null.model = null.model, test = test,
.checkNullModelTestSingle(null.model = null.model, test = test,
recalc.pval.thresh = recalc.pval.thresh, fast.score.SE = fast.score.SE, GxE = GxE)

# coerce null.model if necessary
if (sparse) null.model <- .nullModelAsMatrix(null.model)

# filter samples to match null model
sample.index <- .setFilterNullModel(gdsobj, null.model, verbose=verbose)

# get sex for calculating allele freq
sex <- validateSex(gdsobj)[sample.index]

if (!is.null(GxE)) GxE <- .modelMatrixColumns(null.model, GxE)

# check ploidy
if (SeqVarTools:::.ploidy(gdsobj) == 1) male.diploid <- FALSE

# results
#n.iter <- length(variantFilter(gdsobj))
#set.messages <- ceiling(n.iter / 100) # max messages = 100

if(verbose) message('Using ', bpnworkers(BPPARAM), ' CPU cores')

i <- 1
Expand All @@ -52,24 +52,25 @@ setMethod("assocTestSingle",
if (!iterate) {
return(NULL)
}

var.info <- variantInfo(gdsobj, alleles=FALSE, expanded=TRUE)

if (!imputed) {
geno <- expandedAltDosage(gdsobj, use.names=FALSE, sparse=sparse)[sample.index,,drop=FALSE]
} else {
geno <- imputedDosage(gdsobj, use.names=FALSE)[sample.index,,drop=FALSE]
}

chr <- chromWithPAR(gdsobj, genome.build=genome.build)

return(list(var.info=var.info, geno=geno, chr=chr))
}

res <- bpiterate(ITER, .testGenoBlockSingle, BPPARAM=BPPARAM,
sex=sex, null.model=null.model, test=test,
recalc.pval.thresh=recalc.pval.thresh,
fast.score.SE=fast.score.SE, GxE=GxE,
recalc.pval.thresh=recalc.pval.thresh,
fast.score.SE=fast.score.SE,
GxE=GxE, GxE.return.cov=GxE.return.cov,
geno.coding=geno.coding,
sparse=sparse, imputed=imputed,
male.diploid=male.diploid)
Expand All @@ -82,7 +83,7 @@ setMethod("assocTestSingle",
setMethod("assocTestSingle",
"GenotypeIterator",
function(gdsobj, null.model, test=c("Score", "Score.SPA", "BinomiRare", "CMP"),
recalc.pval.thresh=0.05, GxE=NULL,
recalc.pval.thresh=0.05, GxE=NULL, GxE.return.cov=FALSE,
geno.coding=c("additive", "dominant", "recessive"),
male.diploid=TRUE, BPPARAM=bpparam(), verbose=TRUE) {
test <- match.arg(test)
Expand All @@ -92,23 +93,23 @@ setMethod("assocTestSingle",
null.model <- .updateNullModelFormat(null.model)

# check that the provided null model is compatible with the requested test
.checkNullModelTestSingle(null.model = null.model, test = test,
.checkNullModelTestSingle(null.model = null.model, test = test,
recalc.pval.thresh = recalc.pval.thresh, fast.score.SE = FALSE, GxE = GxE)

# filter samples to match null model
sample.index <- .sampleIndexNullModel(gdsobj, null.model)

# get sex for calculating allele freq
sex <- validateSex(gdsobj)[sample.index]

if (!is.null(GxE)) GxE <- .modelMatrixColumns(null.model, GxE)

# results
# n.iter <- length(snpFilter(gdsobj))
# set.messages <- ceiling(n.iter / 100) # max messages = 100

if(verbose) message('Using ', bpnworkers(BPPARAM), ' CPU cores')

i <- 1
ITER <- function() {
iterate <- TRUE
Expand All @@ -119,19 +120,20 @@ setMethod("assocTestSingle",
if (!iterate) {
return(NULL)
}

var.info <- variantInfo(gdsobj)

geno <- getGenotypeSelection(gdsobj, scan=sample.index, order="selection",
transpose=TRUE, use.names=FALSE, drop=FALSE)

return(list(var.info=var.info, geno=geno, chr=var.info$chr))
}

res <- bpiterate(ITER, .testGenoBlockSingle, BPPARAM=BPPARAM,
sex=sex, null.model=null.model, test=test,
recalc.pval.thresh=recalc.pval.thresh,
fast.score.SE=FALSE, GxE=GxE,
recalc.pval.thresh=recalc.pval.thresh,
fast.score.SE=FALSE,
GxE=GxE, GxE.return.cov=GxE.return.cov,
geno.coding=geno.coding,
sparse=FALSE, imputed=FALSE,
male.diploid=male.diploid)
Expand All @@ -143,41 +145,43 @@ setMethod("assocTestSingle",

# function to process a block of genotype data
.testGenoBlockSingle <- function(x, sex, null.model, test,
recalc.pval.thresh, fast.score.SE, GxE,
recalc.pval.thresh, fast.score.SE,
GxE, GxE.return.cov,
geno.coding,
sparse, imputed, male.diploid, ...) {

# for BinomiRare and CMP, restrict to variants where the alternate allele is minor
if (test %in% c("BinomiRare", "CMP")) {
AF.max <- 0.5
} else {
AF.max <- 1
}

x <- .prepGenoBlock(x, AF.max=AF.max, geno.coding=geno.coding, imputed=imputed,
sex=sex, male.diploid=male.diploid)
var.info <- x$var.info
n.obs <- x$n.obs
freq <- x$freq
geno <- x$geno
rm(x)

# mean impute missing values
if (any(n.obs < nrow(geno))) {
geno <- .meanImpute(geno, freq$freq)
}

# do the test
if (ncol(geno) == 0){
res.i <- NULL
} else {
assoc <- testGenoSingleVar(null.model, G=geno, E=GxE, test=test,
recalc.pval.thresh=recalc.pval.thresh,
fast.score.SE=fast.score.SE)

fast.score.SE=fast.score.SE,
GxE.return.cov=GxE.return.cov)

res.i <- cbind(var.info, n.obs, freq, assoc)
}

#if (verbose & n.iter > 1 & i %% set.messages == 0) {
# message(paste("Iteration", i , "of", n.iter, "completed"))
#}
Expand Down
24 changes: 13 additions & 11 deletions man/assocTestSingle.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,15 @@
\S4method{assocTestSingle}{SeqVarIterator}(gdsobj, null.model,
test=c("Score", "Score.SPA", "BinomiRare", "CMP"),
recalc.pval.thresh=0.05, fast.score.SE=FALSE,
GxE=NULL,
GxE=NULL, GxE.return.cov=FALSE,
geno.coding=c("additive", "dominant", "recessive"),
sparse=TRUE, imputed=FALSE,
male.diploid=TRUE, genome.build=c("hg19", "hg38"),
BPPARAM=bpparam(), verbose=TRUE)
\S4method{assocTestSingle}{GenotypeIterator}(gdsobj, null.model,
test=c("Score", "Score.SPA", "BinomiRare", "CMP"),
recalc.pval.thresh=0.05, GxE=NULL,
recalc.pval.thresh=0.05,
GxE=NULL, GxE.return.cov=FALSE,
geno.coding=c("additive", "dominant", "recessive"),
male.diploid=TRUE, BPPARAM=bpparam(), verbose=TRUE)
}
Expand All @@ -33,6 +34,7 @@
\item{recalc.pval.thresh}{If test is not "Score", recalculate p-values using the specified `test` for variants with a Score p-value below this threshold; return the score p-value for all other variants.}
\item{fast.score.SE}{Logical indicator of whether to use the fast approximation of the score standard error for testing variant association. When \code{FALSE} (default), the true score SE is calculated. When \code{TRUE}, the fast score SE approximation from SAIGE is used. This option can only be used with a null model fit with \code{\link{fitNullModelFastScore}} or updated with \code{\link{nullModelFastScore}}. See 'Details' for further information.}
\item{GxE}{A vector of character strings specifying the names of the variables for which a genotype interaction term should be included.If \code{GxE} is not \code{NULL}, \code{test} is ignored and Wald tests of interaction are performed. If \code{GxE} is \code{NULL} (default) no genotype interactions are included. See 'Details' for further information.}
\item{GxE.return.cov}{Logical indicator of whether the estimated covariance matrix of the effect size estimates (betas) for the genotype and genotype interaction terms should be returned; the default is FALSE. Warning: setting to TRUE can greatly increase output size.}
%\item{ivar.return.betaCov}{Logical indicator of whether the estimated covariance matrix of the effect size estimates (betas) for the genotype and genotype interaction terms should be returned; the default is FALSE.}
\item{geno.coding}{Whether genotypes should be coded as "additive" (0, 1, or 2 copies of the effect allele), "recessive" (1=homozygous for the effect allele, 0 otherwise), or "dominant" (1=heterozygous or homozygous for the effect allele, 0 for no effect allele). For recessive coding on sex chromosomes, males are coded as 1 if they are hemizygous for the effect allele.}
\item{sparse}{Logical indicator of whether to read genotypes as sparse Matrix objects; the default is \code{TRUE}. Set this to \code{FALSE} if the alternate allele dosage of the genotypes in the test are not expected to be mostly 0.}
Expand All @@ -47,15 +49,15 @@
\code{assocTestSingle} uses the \code{\link{BiocParallel}} package to process iterator chunks in parallel. See the \code{\link{BiocParallel}} documentation for more information on the default behaviour of \code{\link{bpparam}} and how to register different parallel backends. If serial execution is desired, set \code{BPPARAM=BiocParallel::SerialParam()}. Note that parallel execution requires more RAM than serial execution.

All samples included in \code{null model} will be included in the association test, even if a different set of samples is present in the current filter for \code{gdsobj}.

The effect size estimate is for each copy of the alternate allele (when \code{gdsobj} is a \code{\link{SeqVarIterator}} object) or the "A" allele (when \code{gdsobj} is a \code{\link{GenotypeIterator}} object). We refer to this as the "effect allele" in the rest of the documentation. For multiallelic variants in \code{\link{SeqVarIterator}} objects, each alternate (or "A") allele is tested separately.
%When \code{impute.geno} is TRUE, sporadic missing genotype values are mean imputed using the minor allele frequency (MAF) calculated on all other samples at that SNP. When \code{impute.geno} is FALSE, samples with missing values for all of the SNP genotypes in the current SNP block are removed from the analysis for the block; this may significantly slow down computation time because many pre-computed matrices need to be re-computed each time the sample set changes. Also note: when \code{impute.geno} is FALSE, sporadic missingness for a sample inside of a SNP block will lead to an error.

Sporadic missing genotype values are mean imputed using the allele frequency calculated on all other samples at that variant.

Monomorphic variants (including variants where every sample is a heterozygote) are omitted from the results.
The input \code{GxE} can be used to perform GxE tests. Multiple interaction variables may be specified, but all interaction variables specified must have been included as covariates in fitting the null model with \code{fitNullModel}. When performing GxE analyses, \code{assocTestSingle} will report two tests: (1) the joint Wald test of all genotype interaction terms in the model (this is the test for any genotype interaction effect), and (2) the joint Wald test of the genotype term along with all of the genotype interaction terms (this is the test for any genetic effect). Individual genotype interaction terms can be tested by creating test statistics from the reported effect size estimates and their standard errors (Note: when \code{GxE} contains a single continuous or binary covariate, this test is the same as the test for any genotype interaction effect mentioned above). %In order to test more complex hypotheses regarding subsets of multiple genotype interaction terms, \code{ivar.return.betaCov} can be used to retrieve the estimated covariance matrix of the effect size estimates.

The input \code{GxE} can be used to perform GxE tests. Multiple interaction variables may be specified, but all interaction variables specified must have been included as covariates in fitting the null model with \code{fitNullModel}. When performing GxE analyses, \code{assocTestSingle} will report two tests: (1) the joint Wald test of all genotype interaction terms in the model (this is the test for any genotype interaction effect), and (2) the joint Wald test of the genotype term along with all of the genotype interaction terms (this is the test for any genetic effect). Individual genotype interaction terms can be tested by creating test statistics from the reported effect size estimates and their standard errors (Note: when \code{GxE} contains a single continuous or binary covariate, this test is the same as the test for any genotype interaction effect mentioned above). In order to test more complex hypotheses regarding subsets of multiple genotype interaction terms, \code{GxE.return.cov} can be used to retrieve the estimated covariance matrix of the effect size estimates.

The saddle point approximation (SPA), run by using \code{test = "Score.SPA"}, implements the method described by Dey et al. (2017), which was extended to mixed models by Zhou et al. (2018) in their SAIGE software. SPA provides better calibration of p-values when fitting mixed models with a binomial family for a sample with an imbalanced case to control ratio.

Expand Down Expand Up @@ -89,7 +91,7 @@
\item{PVE}{An approximation of the proportion of phenotype variance explained}
% If \code{test} is \code{"Wald"} and \code{GxE} is \code{NULL}:
% \item{Est}{The effect size estimate for each additional copy of the effect allele}
% \item{Est.SE}{The estimated standard error of the effect size estimate}
% \item{Est.SE}{The estimated standard error of the effect size estimate}
% \item{Wald.Stat}{The Wald Z test statistic}
% \item{Wald.pval}{The Wald p-value}
If \code{test} is \code{"Score.SPA"}:
Expand All @@ -100,17 +102,17 @@
\item{Est.G:env}{The effect size estimate for the genotype*env interaction term. There will be as many of these terms as there are interaction variables, and "env" will be replaced with the variable name.}
\item{SE.G}{The estimated standard error of the genotype term effect size estimate}
\item{SE.G:env}{The estimated standard error of the genotype*env effect size estimate. There will be as many of these terms as there are interaction variables, and "env" will be replaced with the variable name.}
\item{GxE.Stat}{The Wald Z test statistic for the test of all genotype interaction terms. When there is only one genotype interaction term, this is the test statistic for that term.}
\item{GxE.Stat}{The Wald chi-square test statistic for the test of all genotype interaction terms. When there is only one genotype interaction term, this is the test statistic for that term.}
\item{GxE.pval}{The Wald p-value for the test of all genotype interaction terms; i.e. the test of any genotype interaction effect}
\item{Joint.Stat}{The Wald Z test statistic for the joint test of the genotype term and all of the genotype interaction terms}
\item{Joint.Stat}{The Wald chi-square test statistic for the joint test of the genotype term and all of the genotype interaction terms}
\item{Joint.pval}{The Wald p-value for the joint test of the genotype term and all of the genotype interaction terms; i.e. the test of any genotype effect}
If \code{test} is \code{"BinomiRare" or "CMP"}:
\item{n.carrier}{Number of individuals with at least one copy of the effect allele}
\item{n.D.carrier}{Number of cases with at least one copy of the effect allele}
\item{pval}{p-value}
\item{mid.pval}{mid-p-value}
%When \code{GxE} is not \code{NULL}, if \code{ivar.return.betaCov} is \code{TRUE}, then the output is a list with two elements. The first, "results", is the data.frame described above. The second, "betaCov", is a list with length equal to the number of rows of "results", where each element of the list is the covariance matrix of the effect size estimates (betas) for the genotype and genotype interaction terms.

When \code{GxE} is not \code{NULL}, if \code{GxE.return.cov} is \code{TRUE}, then the output is a list with two elements. The first, "results", is the data.frame described above. The second, "GxEcovMatList", is a list with length equal to the number of rows of "results", where each element of the list is the covariance matrix of the effect size estimates (betas) for the genotype and genotype interaction terms.
}

\references{
Expand Down Expand Up @@ -157,7 +159,7 @@ nullmod <- fitNullModel(iterator, outcome="outcome", covars="sex")
# run the association test
assoc <- assocTestSingle(iterator, nullmod,
BPPARAM=BiocParallel::SerialParam())

# use fast score SE for a null model with a covariance matrix
seqResetFilter(seqData)
grm <- SNPRelate::snpgdsGRM(seqData, verbose=FALSE)
Expand Down