Permalink
Browse files

A couple of bug fixes to getCoverageInRange. Addition of PACKAGE argu…

…ment to all .Call functions. Addition of some more documentation.
  • Loading branch information...
1 parent aa63c40 commit 6fff2f1d0e6445a25cd2fece2b2c900c72d05872 James Bullard committed May 20, 2012
Showing with 166 additions and 44 deletions.
  1. +5 −3 NAMESPACE
  2. +46 −16 R/cmpH5Utils.R
  3. +58 −5 R/pacBioIO.R
  4. +3 −3 R/plsH5Utils.R
  5. +23 −0 man/getQualityValue.Rd
  6. +1 −0 man/getRefPath.Rd
  7. +11 −1 man/getUnrolledTemplateSpan.Rd
  8. +7 −16 src/pacBioIO.c
  9. +12 −0 tests/testall.R
View
@@ -70,7 +70,6 @@ export(getReadLength,
PacBioPlsH5,
getPulseEvents,
getBaseEvents,
- getHoleNumbers,
getBaselineSigma,
getBasecalls,
getDeletionQV,
@@ -102,7 +101,8 @@ export(getReadLength,
getBarcodeLabels,
getBarcodeScores,
getStartFrame,
- getCumulativeAdvanceTime)
+ getCumulativeAdvanceTime,
+ restrictCmpH5)
exportMethods(getSNR,
show,
@@ -124,5 +124,7 @@ exportMethods(getSNR,
getSubstitutionTag,
getWidthInFrames,
getStartFrame,
- getNumEvent)
+ getNumEvent,
+ getReadScore,
+ getHoleNumbers)
View
@@ -61,6 +61,10 @@ getRefNames <- function(cmpH5, idx = seq.int(1, nrow(cmpH5))) {
cmpH5$refName[idx]
}
+setMethod("getHoleNumbers", "PacBioCmpH5", function(h5Obj, idx = 1:nrow(h5Obj)) {
+ h5Obj$holeNumber[idx]
+})
+
getAccuracy <- function(cmpH5, idx = seq.int(1, nrow(cmpH5))) {
1 - rowSums(alnIndex(cmpH5)[idx, c("nMisMatches", "nInsertions", "nDeletions")])/
getReadLength(cmpH5, idx)
@@ -185,6 +189,13 @@ getFrameByBaselineSNR <- function(cmpH5, plsH5s, fx = function(x) median(x, na.r
})
}
+setMethod("getReadScore", "PacBioCmpH5", function(h5Obj, plsH5s) {
+ doWithPlsAndCmp(h5Obj, plsH5s, function(cmp, pls, idx) {
+ mtch <- match(cmp$holeNumber[idx], getBaseEvents(pls)[,"holeNumber"])
+ getReadScore(pls)[mtch]
+ })
+})
+
.narrowRegions <- function(regTable) {
regTable <- regTable[order(regTable$holeNumber),]
hqregs <- regTable$type == "HQRegion"
@@ -204,16 +215,14 @@ getFrameByBaselineSNR <- function(cmpH5, plsH5s, fx = function(x) median(x, na.r
return(regTable)
}
-getMoleculeIndex <- function(cmpH5) {
- factor(interaction(cmpH5$movieName, cmpH5$holeNumber, drop = T))
+getMoleculeIndex <- function(cmpH5, idx = seq.int(1, nrow(cmpH5))) {
+ p <- paste(cmpH5$movieName[idx], cmpH5$holeNumber[idx], sep = ".")
+ factor(p, levels = p[!duplicated(p)])
}
getUnrolledTemplateSpan <- function(cmpH5, idx = 1:nrow(cmpH5), unique = TRUE) {
- x <- tapply(getTemplateSpan(cmpH5)[idx], mi <- getMoleculeIndex(cmpH5)[idx], sum)
- if (! unique) {
- x <- rep(x, tapply(idx, mi, length))
- }
- return(x)
+ x <- tapply(getTemplateSpan(cmpH5, idx), m <- getMoleculeIndex(cmpH5, idx), sum)
+ as.integer(if (! unique) x[match(m, names(x))] else x)
}
#############################################################################
@@ -329,7 +338,7 @@ getReadsInRange <- function(cmpH5, refSeq, refStart = 1, refEnd = NA, idx = 1:nr
ranges <- alnIndex(cmpH5)[seq.int(refBlock[1], refBlock[2]), c("tStart", "tEnd", "nBackRead", "nOverlap")]
if (is.na(refEnd)) refEnd <- getRefLength(cmpH5, refSeq)
idxs <- .Call("PBR_indices_within_range", ranges[,1], ranges[,2], ranges[,3], ranges[,4],
- as.integer(refStart), as.integer(refEnd))
+ as.integer(refStart), as.integer(refEnd), PACKAGE = "pbh5")
## XXX: This shouldn't happen, but it has.
idxs[length(idxs)] <- ifelse(idxs[length(idxs)] >= refBlock[2], refBlock[2] - 1,
idxs[length(idxs)])
@@ -345,9 +354,9 @@ getCoverageInRange <- function (cmpH5, refSeq, refStart = 1, refEnd = NA, idx =
w <- sort(intersect(idx, seq.int(refBlock[1], refBlock[2])))
ranges <- alnIndex(cmpH5)[w, c("tStart", "tEnd", "nBackRead", "nOverlap")]
if (is.na(refEnd)) refEnd <- getRefLength(cmpH5, refSeq)
- if (nrow(ranges) == 0) return(integer(refEnd - refStart + 1))
+ if (nrow(ranges) == 0) { print("ranges empty!"); return(integer(refEnd - refStart + 1)) }
.Call("PBR_coverage_within_range", ranges[, 1], ranges[,2], ranges[, 3], ranges[, 4],
- as.integer(refStart), as.integer(refEnd))
+ as.integer(refStart), as.integer(refEnd), PACKAGE = "pbh5")
}
#############################################################################
@@ -356,7 +365,7 @@ getCoverageInRange <- function (cmpH5, refSeq, refStart = 1, refEnd = NA, idx =
##
#############################################################################
templatePositionFromAln <- function(alnLst, strand, start, end) {
- .Call("PBR_get_template_position", alnLst, strand, start, end)
+ .Call("PBR_get_template_position", alnLst, strand, start, end, PACKAGE = "pbh5")
}
.reverse <- function(x, strand) {
@@ -453,15 +462,15 @@ getCallsForReads <- function(rawAlns, refStart, refEnd, strands, starts, ends) {
stopifnot(length(rawAlns) == length(starts) && length(rawAlns) == length(strands) &&
length(rawAlns) == length(ends))
.Call("PBR_compute_consensus", rawAlns, as.integer(refStart), as.integer(refEnd),
- as.integer(strands), as.integer(starts), as.integer(ends))
+ as.integer(strands), as.integer(starts), as.integer(ends), PACKAGE = "pbh5")
}
getConsensusForReads <- function(rawAlns, refStart, refEnd, strands, starts, ends, fast = TRUE) {
if (fast) {
stopifnot(length(rawAlns) == length(starts) && length(rawAlns) == length(strands) &&
length(rawAlns) == length(ends))
.Call("PBR_compute_consensus2", rawAlns, as.integer(refStart), as.integer(refEnd),
- as.integer(strands), as.integer(starts), as.integer(ends))
+ as.integer(strands), as.integer(starts), as.integer(ends), PACKAGE = "pbh5")
} else {
computeConsensus(getCallsForReads(rawAlns, refStart, refEnd, strands, starts, ends))
}
@@ -554,6 +563,17 @@ computeConsensus <- function(calls) {
## Access and Manipulation
##
#############################################################################
+getAlignmentsWithFeatures <- function(cmpH5, idx = 1:nrow(cmpH5), fxs = getIPD, collapse = FALSE) {
+ if (is.list(fxs)) {
+ .getAlignmentsWithFeaturesList(cmpH5, idx, fxs, collapse)
+ } else {
+ alns <- getAlignments(cmpH5, idx)
+ dta <- do.call(c, fxs(cmpH5, idx))
+ idx <- rep(idx, sapply(alns, nrow))
+ data.frame(do.call(rbind, alns), idx, dta, row.names = NULL, stringsAsFactors = FALSE)
+ }
+}
+
.getAlignmentsWithFeaturesAsDataFrame <- function(cmpH5, idx, features) {
alns <- getAlignments(cmpH5, idx)
ds <- lapply(features, function(fx) {
@@ -564,8 +584,8 @@ computeConsensus <- function(calls) {
data.frame(alns, idx, ds, row.names = NULL, stringsAsFactors = FALSE)
}
-getAlignmentsWithFeatures <- function(cmpH5, idx = seq.int(1, nrow(cmpH5)),
- fxs = list('ipd' = getIPD), collapse = FALSE) {
+.getAlignmentsWithFeaturesList <- function(cmpH5, idx = seq.int(1, nrow(cmpH5)),
+ fxs = list('ipd' = getIPD), collapse = FALSE) {
if (collapse) {
dta <- .getAlignmentsWithFeaturesAsDataFrame(cmpH5, idx, fxs)
} else {
@@ -585,9 +605,11 @@ getAlignmentsWithFeatures <- function(cmpH5, idx = seq.int(1, nrow(cmpH5)),
return(dta)
}
+
+
#############################################################################
##
-## Barcode Stuff
+## Barcode
##
#############################################################################
getBarcodeLabels <- function(cmpH5, idx = 1:nrow(cmpH5)) {
@@ -596,3 +618,11 @@ getBarcodeLabels <- function(cmpH5, idx = 1:nrow(cmpH5)) {
getBarcodeScores <- function(cmpH5, idx = 1:nrow(cmpH5)) {
getH5Dataset(cmpH5, "AlnInfo/barcodeScore")[idx]
}
+
+#############################################################################
+##
+## Kinetics Analyses
+##
+#############################################################################
+
+## On hold till I get clarity on the datasets.
View
@@ -121,6 +121,14 @@ setGeneric("getSNR", function(h5Obj, ...) {
standardGeneric("getSNR")
})
+setGeneric("getReadScore", function(h5Obj, ...) {
+ standardGeneric("getReadScore")
+})
+
+setGeneric("getHoleNumbers", function(h5Obj, ...) {
+ standardGeneric("getHoleNumbers")
+})
+
## The QualityValues are in both basH5 and cmpH5 files.
setGeneric("getQualityValue", function(h5Obj, ...) {
standardGeneric("getQualityValue")
@@ -188,7 +196,7 @@ PacBioCmpH5 <- function(fileName) {
}
AlnGroup <- .initMap("AlnGroup")
RefGroup <- .initMap("RefGroup", c("ID", "Path", "RefInfoID"))
- RefInfo <- .initMap("RefInfo", c("ID", "Name", "FullName", "Length", "MD5"))
+ RefInfo <- .initMap("RefInfo", c("ID", "FullName", "Length", "MD5"))
MovieInfo <- .initMap("MovieInfo", c("ID", "Name", "Exp", "Run", "FrameRate"))
## add some things to these data structures.
@@ -207,7 +215,6 @@ PacBioCmpH5 <- function(fileName) {
} else {
isSorted <- FALSE
}
-
RefGroup <- merge(RefGroup, RefInfo, by.x = "RefInfoID", by.y = "ID")
RefGroup$Name <- gsub("^/", "", RefGroup$Path)
@@ -249,6 +256,12 @@ PacBioCmpH5 <- function(fileName) {
obj@AlnIndex[, "tStart"] <- as.integer(obj@AlnIndex[, "tStart"] + 1)
obj@AlnIndex[, "rStart"] <- as.integer(obj@AlnIndex[, "rStart"] + 1)
obj@AlnIndex[, "offsetBegin"] <- as.integer(obj@AlnIndex[, "offsetBegin"] + 1)
+
+ ## Deterministically clean up all the garbage we created in building up this
+ ## PacBioCmpH5 object, before returning.
+ unneeded <- setdiff(ls(), c("obj"))
+ rm(list=unneeded)
+ gc()
return(obj)
}
@@ -306,9 +319,9 @@ getRefMD5 <- function(cmpH5, refSeq) {
setMethod("show", "PacBioCmpH5", function(object) {
callNextMethod(object)
cat("N Alignments:", nrow(alnIndex(object)), "\n")
+ cat("N Bases:", format(sum(getReadLength(object)), digits = 4), "\n")
cat("N ReadGroups:", nrow(alnGroup(object)), "\n")
cat("N RefSeqs:", nrow(refGroup(object)), "\n")
- cat("N Bases:", sum(getReadLength(object)), "\n")
})
setMethod("summary", "PacBioCmpH5", function(object) {
@@ -494,6 +507,40 @@ colnames(.bMap) <- c("read", "reference")
}
}
+## XXX : IRanges has a generic called 'restrict' -- my preferred name
+## -- and I don't want to deal with overwriting it and load order,
+## etc.
+restrictCmpH5 <- function(cmpH5, mask = rep(TRUE, nrow(cmpH5))) {
+ ## A mask is used instead of the more common permutation because I
+ ## want to preserve order.
+ if (any(! is.finite(mask)) || length(mask) != nrow(cmpH5)) {
+ stop("mask must not contain missing values and, nrow(cmpH5) == length(mask), must hold.")
+ }
+ nalnIdx <- cmpH5@AlnIndex[mask, ]
+ nrefGroup <- refGroup(cmpH5)
+
+ ntbl <- table(factor(nalnIdx[,"refName"],
+ levels = nrefGroup$Name[order(nrefGroup$offsetBegin)]))
+
+ ## The key thing to be careful about is that the offsets aren't in
+ ## any particular order.
+ csum <- cumsum(ntbl)
+ noffsets <- cbind(c(1, csum[-length(csum)] + 1), csum)
+ rownames(noffsets) <- names(ntbl)
+ nrefGroup[match(rownames(noffsets),
+ nrefGroup$Name), c("offsetBegin", "offsetEnd")] <- noffsets[,c(1,2)]
+ ## And you can lose references.
+ nrefGroup <- nrefGroup[nrefGroup[,"offsetEnd"]-nrefGroup[,"offsetBegin"] > 0,]
+
+ cmpH5@AlnIndex <- nalnIdx
+ cmpH5@RefGroup <- nrefGroup
+
+ ## the other thing that I could consider doing is consider fixing up
+ ## the nBack* columns, but it just reverts to linear search (it
+ ## should unless there are bugs.)
+ return(cmpH5)
+}
+
getIPD <- function(cmpH5, idx = seq.int(1, nrow(cmpH5)), unit = c("seconds", "frames")) {
.getFrameBasedDataset(cmpH5, idx, "IPD", unit, naValue = 0xffff)
}
@@ -520,7 +567,10 @@ getCumulativeAdvanceTime <- function(cmpH5, idx = seq.int(1, nrow(cmpH5)), unit
mapply(getIPD(cmpH5, idx, unit), getPulseWidth(cmpH5, idx, unit), FUN = function(ipd, pw) {
v <- ipd + pw
v <- cumsum(ifelse(is.na(v), 0, v))
- v[is.na(ipd)] <- NA
+ ## it actually makes a difference if you use pw versus ipd;
+ ## unfortunately, the first IPD is sometimes NA - this is a bug in
+ ## loadPulses.
+ v[is.na(pw)] <- NA
v
}, SIMPLIFY = FALSE)
}
@@ -643,9 +693,12 @@ PacBioPlsH5 <- function(fileName) {
getPulseEvents <- function(plsH5) plsH5@pulseEvents
getBaseEvents <- function(basH5) basH5@baseEvents
getCCSEvents <- function(basH5) basH5@ccsBaseEvents
-getHoleNumbers <- function(basH5) basH5@baseEvents[, "holeNumber"]
getCCSHoleNumbers <- function(basH5) basH5@ccsBaseEvents[, "holeNumber"]
+setMethod("getHoleNumbers", "PacBioBasH5", function(h5Obj) {
+ h5Obj@baseEvents[, "holeNumber"]
+})
+
getBaselineSigma <- function(plsH5) {
getH5Dataset(plsH5, "PulseData/PulseCalls/ZMW/BaselineSigma")
}
View
@@ -73,9 +73,9 @@ setMethod("getNumEvent", "PacBioBasH5", function(h5Obj) {
getH5Dataset(h5Obj, "PulseData/BasCalls/ZMW/NumEvent")[]
})
-getReadScore <- function(basH5) {
- getH5Dataset(basH5, "PulseData/BaseCalls/ZMWMetrics/ReadScore")[]
-}
+setMethod("getReadScore", "PacBioBasH5", function(h5Obj) {
+ getH5Dataset(h5Obj, "PulseData/BaseCalls/ZMWMetrics/ReadScore")[]
+})
setMethod("getSNR", "PacBioBasH5", function(h5Obj) {
snr <- getH5Dataset(h5Obj, "PulseData/BaseCalls/ZMWMetrics/HQRegionSNR")[]
View
@@ -30,6 +30,13 @@
}
\usage{
getQualityValue(h5Obj, ...)
+ getInsertionQV(h5Obj, ...)
+ getDeletionQV(h5Obj, ...)
+ getMergeQV(h5Obj, ...)
+ getSubstitutionQV(h5Obj, ...)
+
+ getDeletionTag(h5Obj, ...)
+ getSubstitutionTag(h5Obj, ...)
}
\arguments{
\item{h5Obj}{
@@ -44,6 +51,7 @@
A list of vectors of quality values.
}
\examples{
+ gc()
require(pbh5)
cmpH5 <- PacBioCmpH5(system.file("h5_files", "aligned_reads.cmp.h5",
package = "pbh5"))
@@ -53,4 +61,19 @@
length(values[[1]])
head(values[[1]])
}
+\details{
+ Quality values are provided in the standard \emph{phred} scaling,
+ \deqn{ QV = -10 \log_{10} p_{err} }
+ so that a larger QV values indicates a smaller probability of the
+ given error type.
+
+ Deletion QVs correspond to the the probability of a deletion event
+ occuring \emph{before} the indicated sequence position, and the
+ DeletionTag value will be an uppercase ASCII-encoded DNA base (`A',
+ `C', `T', `G') or else `N' if the primary pipeline did not identify
+ a "most-likely" deleted base.
+
+ Quality values are calculated in the primary analysis pipeline based
+ on pulse metrics.
+}
\keyword{datasets}
View
@@ -23,6 +23,7 @@
character representing the absolute path to the reference group.
}
\examples{
+ gc()
require(pbh5)
cmpH5 <- PacBioCmpH5(system.file("h5_files", "aligned_reads.cmp.h5", package = "pbh5"))
getRefPath(cmpH5, 1) == getRefPath(cmpH5, "ref000001")
@@ -9,7 +9,7 @@
'getUnrolledTemplateSpan' commputes the unrolled template span.
}
\usage{
-getUnrolledTemplateSpan(cmpH5, idx = 1:nrow(cmpH5), unique = TRUE)
+ getUnrolledTemplateSpan(cmpH5, idx = 1:nrow(cmpH5), unique = TRUE)
}
\arguments{
\item{cmpH5}{
@@ -20,13 +20,23 @@ getUnrolledTemplateSpan(cmpH5, idx = 1:nrow(cmpH5), unique = TRUE)
}
\item{unique}{
Boolean statement whether the unrolled template span is unique.
+ }
}
+\details{
+ The getUnrolledTemplateSpan is based on the getMoleculeIndex function
+ which returns a vector of length(idx) of type factor designating the
+ molecule's ID. the vector is ordered as idx, therefore the default
+ behavior of \code{idx = 1:nrow(cmpH5)} means that the vector is as the
+ cmpH5 object -- this is only distinct from other functions in that
+ there are repeats in the vector, hence the \code{unique = TRUE}
+ argument of getUnrolledTemplateSpan.
}
\value{
'getUnrolledTemplateSpan' returns an array with molecule index and the
unrolled span length.
}
\examples{
+ gc()
require(pbh5)
cmpH5 <- PacBioCmpH5(system.file("h5_files", "aligned_reads.cmp.h5",
package = "pbh5"))
Oops, something went wrong.

0 comments on commit 6fff2f1

Please sign in to comment.