Example dataset and formatting updates

esebesty · Jul 7, 2020 · 49fe913 · 49fe913
1 parent 58099c2
commit 49fe913
Show file tree

Hide file tree

Showing 14 changed files with 100 additions and 83 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -2,7 +2,7 @@ Package: SplicingFactory
 Type: Package
 Title: Splicing Diversity Analysis for Transcriptome Data
 biocViews: Transcriptomics, RNASeq, DifferentialSplicing, AlternativeSplicing, TranscriptomeVariant
-Version: 0.99.3
+Version: 0.99.4
 Authors@R: c(
     person("Peter A.", "Szikora", role = c("aut", "cre"), email = "peter.andras.szikora@gmail.com"),
     person("Endre", "Sebestyen", role = "aut", email = "sebestyen.endre@med.semmelweis-univ.hu", comment = c(ORCID = "0000-0001-5470-2161")))

diff --git a/NEWS.md b/NEWS.md
@@ -1,18 +1,23 @@
-# SplicingFactory 0.99.3
+# SplicingFactory 0.99.4 (dev)
+
+* Code and documentation formatting corrections.
+* Updates to example dataset.
+
+# SplicingFactory 0.99.3 (dev)
 
 * SummarizedExperiment input type updated in calculate_diversity, 
   new argument: SE_assay.
 
 * Documentation, vignette updated.
 
-# SplicingFactory 0.99.2
+# SplicingFactory 0.99.2 (dev)
 
 * SummarizedExperiment input type instead of ExpressionSet.
 
-# SplicingFactory 0.99.1
+# SplicingFactory 0.99.1 (dev)
 
 * Correction: unnecessary file removed.
 
 # SplicingFactory 0.99.0 (dev)
 
-* Submitted to Bioconductor
+* Submitted to Bioconductor.
diff --git a/R/calculate_difference.R b/R/calculate_difference.R
@@ -44,57 +44,57 @@
 #' # sample categories
 #' samples <- c(rep('Healthy', 4), rep('Pathogenic', 4))
 #'
-#' # To calculate the difference of splicing diversity changes between the 'Healthy'
-#' # and 'Pathogenic' condition together with the significance values, using mean
-#' # and Wilcoxon rank sum test, use:
+#' # To calculate the difference of splicing diversity changes between the
+#' # 'Healthy' and 'Pathogenic' conditions together with the significance
+#' # values, using mean and Wilcoxon rank sum test, use:
 #' calculate_difference(x, samples, control = 'Healthy', method = 'mean', test = 'wilcoxon')
-calculate_difference <- function(x, samples, control, method = "mean", test = "wilcoxon", randomizations = 100, 
+calculate_difference <- function(x, samples, control, method = "mean", test = "wilcoxon", randomizations = 100,
     ...) {
-    if (!is(x, "data.frame")) 
+    if (!is(x, "data.frame"))
         stop("Input data type is not supported! Please use `?calculate_difference`
-         to see the possible arguments and details.", 
+         to see the possible arguments and details.",
             call. = FALSE)
-    if (ncol(x) - 1 != length(samples)) 
+    if (ncol(x) - 1 != length(samples))
         stop("The number of columns in the data.frame is not equal to the number of
-         samples defined in the samples argument.", 
+         samples defined in the samples argument.",
             call. = FALSE)
-    if (length(levels(as.factor(samples))) > 2) 
+    if (length(levels(as.factor(samples))) > 2)
         stop("The number of conditions are higher than two. Please use exactly two
-         different sample conditions, e.g. healthy and pathogenic.", 
+         different sample conditions, e.g. healthy and pathogenic.",
             call. = FALSE)
-    if (length(levels(as.factor(samples))) < 2) 
+    if (length(levels(as.factor(samples))) < 2)
         stop("The number of conditions are smaller than two. Please use exactly two
-         different sample conditions, e.g. healthy and pathogenic.", 
+         different sample conditions, e.g. healthy and pathogenic.",
             call. = FALSE)
-    if (!(control %in% samples)) 
+    if (!(control %in% samples))
         stop("This control sample type cannot be found in your samples.")
-    if (!(method %in% c("mean", "median"))) 
+    if (!(method %in% c("mean", "median")))
         stop("Invalid method. Please use `?calculate_diversity` to see the possible
-         arguments and details.", 
+         arguments and details.",
             call. = FALSE)
-    if (!(test %in% c("wilcoxon", "shuffle"))) 
+    if (!(test %in% c("wilcoxon", "shuffle")))
         stop("Invalid test method. Please use `?calculate_diversity` to see the
-         possible arguments and details.", 
+         possible arguments and details.",
             call. = FALSE)
     if (test == "wilcoxon") {
-        if (randomizations != 100) 
+        if (randomizations != 100)
             message("Note: The 'randomizations' argument is an option for label shuffling,
-              it won't have any effect on the Wilcoxon rank sum test.", 
+              it won't have any effect on the Wilcoxon rank sum test.",
                 call. = FALSE)
-        if (length(grep(unique(samples)[1], samples)) < 3 | 
-            length(grep(unique(samples)[2], samples)) < 3 | 
-            length(samples) < 8) 
+        if (length(grep(unique(samples)[1], samples)) < 3 |
+            length(grep(unique(samples)[2], samples)) < 3 |
+            length(samples) < 8)
             warning("Low sample size. Wilcoxon rank sum test requires at least
       three samples in a given category and at least 8 samples overall for a
-              theoretical p-value smaller than 0.05.", 
+              theoretical p-value smaller than 0.05.",
                 call. = FALSE)
     }
     if (test == "shuffle") {
-        if (length(samples) <= 5) 
+        if (length(samples) <= 5)
             warning("Low sample size, not enough samples for label shuffling!", call. = FALSE)
-        if (length(samples) > 5 & length(samples) < 10) 
+        if (length(samples) > 5 & length(samples) < 10)
             warning("Low sample size, label shuffling might not give informative and
-              correct results.", 
+              correct results.",
                 call. = FALSE)
     }
     x$cond_1 <- apply(x[grep(unique(samples)[1], samples) + 1], 1, function(x) sum(!is.na(x)))

diff --git a/R/diversity_functions.R b/R/diversity_functions.R
@@ -17,7 +17,7 @@ calculate_entropy <- function(x, norm = TRUE) {
     if (sum(x) != 0 & length(x) > 1) {
         x <- x/sum(x)
         x_log = ifelse(is.finite(log(x, base = 2)), log(x, base = 2), 0)
-        
+
         if (norm == FALSE) {
             x = -sum(x * x_log)
         }
@@ -44,15 +44,15 @@ calculate_entropy <- function(x, norm = TRUE) {
 #' @details
 #' The function calculates a Laplace entropy value as part of different
 #' diversity calculations. Given a vector of transcript-level expression values
-#' of a gene, this function characterize the diversity of splicing isoforms for a
-#' gene. If there only one single transcript, the resulted index will be NaN, as
-#' diversity cannot be calculated. If the expression of the given gene is 0, the
-#' diversity index will be NA.
+#' of a gene, this function characterizes the diversity of splicing isoforms
+#' for a gene. If there is only a single transcript, the resulting index will
+#' be NaN, as diversity cannot be calculated. If the expression of the given
+#' gene is 0, the diversity index will be NA.
 calculate_laplace_entropy <- function(x, norm = TRUE) {
     if (sum(x) != 0 & length(x) > 1) {
         x <- (x + 1)/sum(x + 1)
         x_log = ifelse(is.finite(log(x, base = 2)), log(x, base = 2), 0)
-        
+
         if (norm == FALSE) {
             x = -sum(x * x_log)
         }
@@ -76,10 +76,10 @@ calculate_laplace_entropy <- function(x, norm = TRUE) {
 #' @details
 #' The function calculates a Gini coefficient as part of different
 #' diversity calculations. Given a vector of transcript-level expression values
-#' of a gene, this function characterize the diversity of splicing isoforms for a
-#' gene. If there only one single transcript, the resulted index will be NaN, as
-#' diversity cannot be calculated. If the expression of the given gene is 0, the
-#' diversity index will be NA.
+#' of a gene, this function characterizes the diversity of splicing isoforms
+#' for a gene. If there is only a single transcript, the resulting index will
+#' be NaN, as diversity cannot be calculated. If the expression of the given
+#' gene is 0, the diversity index will be NA.
 calculate_gini <- function(x) {
     if (sum(x) != 0 & length(x) > 1) {
         x <- sort(x)
@@ -101,10 +101,10 @@ calculate_gini <- function(x) {
 #' @details
 #' The function calculates a Simpson index as part of different
 #' diversity calculations. Given a vector of transcript-level expression values
-#' of a gene, this function characterize the diversity of splicing isoforms for a
-#' gene. If there only one single transcript, the resulted index will be NaN, as
-#' diversity cannot be calculated. If the expression of the given gene is 0, the
-#' diversity index will be NA.
+#' of a gene, this function characterizes the diversity of splicing isoforms
+#' for a gene. If there is only a single transcript, the resulting index will
+#' be NaN, as diversity cannot be calculated. If the expression of the given
+#' gene is 0, the diversity index will be NA.
 calculate_simpson <- function(x) {
     if (sum(x) != 0 & length(x) > 1) {
         x <- x/sum(x)
@@ -125,10 +125,10 @@ calculate_simpson <- function(x) {
 #' @details
 #' The function calculates an inverse Simpson index as part of different
 #' diversity calculations. Given a vector of transcript-level expression values
-#' of a gene, this function characterize the diversity of splicing isoforms for a
-#' gene. If there only one single transcript, the resulted index will be NaN, as
-#' diversity cannot be calculated. If the expression of the given gene is 0, the
-#' diversity index will be NA.
+#' of a gene, this function characterizes the diversity of splicing isoforms
+#' for a gene. If there is only a single transcript, the resulting index will
+#' be NaN, as diversity cannot be calculated. If the expression of the given
+#' gene is 0, the diversity index will be NA.
 calculate_inverse_simpson <- function(x) {
     if (sum(x) != 0 & length(x) > 1) {
         x <- x/sum(x)

diff --git a/R/example_dataset.R → R/tcga_brca_luma_dataset.R b/R/example_dataset.R → R/tcga_brca_luma_dataset.R
@@ -8,7 +8,13 @@
 #'
 #' @usage data(example_dataset)
 #'
-#' @format An object of class \code{"cross"}; see \code{\link[qtl]{read.cross}}.
+#' @format A data frame with 1054 rows and 41 columns. The first 3 columns
+#' contain the following:
+#' \describe{
+#'   \item{genes}{Gene names}
+#'   \item{TCGA-A7-A0CH_N}{TCGA-A7-A0CH patient normal tissue sample}
+#'   \item{TCGA-A7-A0CH_T}{TCGA-A7-A0CH patient tumor tissue sample}
+#' }
 #'
 #' @keywords datasets
 #'
@@ -17,4 +23,4 @@
 #'
 #' @source \href{https://portal.gdc.cancer.gov/legacy-archive}{TCGA Legacy}
 #'
-"example_dataset"
+"tcga_brca_luma_dataset"
diff --git a/data/example_dataset.RData b/data/example_dataset.RData
diff --git a/data/tcga_brca_luma_dataset.RData b/data/tcga_brca_luma_dataset.RData
diff --git a/man/calculate_difference.Rd b/man/calculate_difference.Rd
diff --git a/man/calculate_gini.Rd b/man/calculate_gini.Rd
diff --git a/man/calculate_inverse_simpson.Rd b/man/calculate_inverse_simpson.Rd
diff --git a/man/calculate_laplace_entropy.Rd b/man/calculate_laplace_entropy.Rd
diff --git a/man/calculate_simpson.Rd b/man/calculate_simpson.Rd
diff --git a/man/example_dataset.Rd → man/tcga_brca_luma_dataset.Rd b/man/example_dataset.Rd → man/tcga_brca_luma_dataset.Rd
diff --git a/vignettes/SplicingFactory.Rmd b/vignettes/SplicingFactory.Rmd
@@ -37,7 +37,7 @@ knitr::opts_chunk$set(
   collapse = TRUE,
   comment = "#>"
 )
-```	
+```
 
 # Standard workflow
 
@@ -84,26 +84,26 @@ a numeric value, specifying the assay to be analyzed.
 
 ### Example dataset
 
-The package contains an example dataset called `example_dataset`. The data was
-downloaded from The Cancer Genome Atlas on 12th of April, 2020. It contains
-transcript level read counts for 300 pre-selected genes of 40 patients with
-Luminal A type breast cancer (primary tumor and solid normal samples).
+The package contains an example dataset called `tcga_brca_luma_dataset`. The
+data was downloaded from The Cancer Genome Atlas on 12th of April, 2020. It
+contains transcript level read counts for 300 pre-selected genes of 40 patients
+with Luminal A type breast cancer (primary tumor and solid normal samples).
 Transcript level expression was estimated with RSEM.
 
 ```{r setup}
 library("SplicingFactory")
 
-data(example_dataset)
+data(tcga_brca_luma_dataset)
 
 # Extract gene names
-genes <- example_dataset[, 1] 
+genes <- tcga_brca_luma_dataset[, 1]
 
 # Extract sample conditions
-samples <- ifelse(grepl("_N", colnames(example_dataset)[-1]),
+samples <- ifelse(grepl("_N", colnames(tcga_brca_luma_dataset)[-1]),
                   "Solid tissue normal", "Primary Tumor")
 
 # Extract read count matrix
-x <- example_dataset[, -1]
+x <- tcga_brca_luma_dataset[, -1]
 
 dim(x)