Bump Version 1.5 (#7)

* add snp-only aln support * update_readme * update .gitignore' ' '' * update workflow * update gitignore * .gitignore updated * update gitignore * update gitignore * TODO:SnpEff_local * bundle_snpEff * bug_fix * save_fit_data * minor_bug_fix * to_do * skip_tanglegram * NewYear_bugfix * Drop genbankr dependency * remove devel test from workflows * Mega_alignments * Improve Readme * logo resize * update icon * LDweaver_1.5 * Fix logo height * Fix logo height * Update r.yml Update work.yml for hard dependencies --------- Co-authored-by: Sudaraka88 <sudarakm@ua-eduroam-ten-25-132-182.uniaccess.unimelb.edu.au> Co-authored-by: Sudaraka88 <sudarakm@Sudarakas-MacBook-Pro.local> Co-authored-by: Sudaraka88 <sudarakm@ua-eduroam-ten-25-135-47.uniaccess.unimelb.edu.au> Co-authored-by: Sudaraka88 <sudarakm@Sudarakas-MBP.home>
Sudaraka88 · Apr 22, 2024 · fd6ef3d · fd6ef3d
1 parent 597269f
commit fd6ef3d
Show file tree

Hide file tree

Showing 23 changed files with 541 additions and 1,547 deletions.
diff --git a/.github/workflows/r.yml b/.github/workflows/r.yml
@@ -8,7 +8,7 @@ on:
   schedule:
     - cron:  '1 1 1 * *'
 
-name: R-CMD-check
+name: R-CMD-check-hard
 
 jobs:
   R-CMD-check:
@@ -24,14 +24,14 @@ jobs:
           - {os: windows-latest, r: 'release'}
          # - {os: ubuntu-latest,   r: 'devel', http-user-agent: 'release'}
           - {os: ubuntu-latest,   r: 'release'}
-          - {os: ubuntu-latest,   r: 'oldrel-1'}
+         # - {os: ubuntu-latest,   r: 'oldrel-1'}
 
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
       R_KEEP_PKG_SOURCE: yes
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - uses: r-lib/actions/setup-pandoc@v2
 
@@ -43,7 +43,13 @@ jobs:
 
       - uses: r-lib/actions/setup-r-dependencies@v2
         with:
-          extra-packages: any::rcmdcheck
+          dependencies: '"hard"'
+          cache: false
+          extra-packages: |
+            any::rcmdcheck
+            any::testthat
+            any::knitr
+            any::rmarkdown
           needs: check
 
       - uses: r-lib/actions/check-r-package@v2

diff --git a/.gitignore b/.gitignore
@@ -7,3 +7,6 @@
 .DS_Store
 *.Rmd
 Makevars
+
+testscripts.R
+testscript_op
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: LDWeaver
 Type: Package
 Title: Genomewide Epistasis Analysis on Bacteria
-Version: 1.4
+Version: 1.5
 Authors@R: person("Sudaraka", "Mallawaarachchi", email = "smallawaarachchi@gmail.com", role = c("aut", "cre"))
 Maintainer: Sudaraka Mallawaarachchi <smallawaarachchi@gmail.com>
 Description: Perform genomewide epistasis analysis by evaluating the LD structure in bacteria.
@@ -10,7 +10,7 @@ Encoding: UTF-8
 LazyData: true
 biocViews: Software
 Depends: R (>= 4.0.0),
-Imports: 
+Imports:          
          ape,
          Biostrings,
          chromoMap,
@@ -19,9 +19,7 @@ Imports:
          fitdistrplus,
          GenomicRanges,
          GenomeInfoDb,
-         ggnewscale,
          ggplot2,
-         ggtree,
          ggraph,
          grDevices,
          heatmap3,
@@ -32,7 +30,6 @@ Imports:
          MatrixExtra,
          methods,
          parallel,
-         phytools,
          plyr,
          RColorBrewer,
          Rcpp,
@@ -41,8 +38,14 @@ Imports:
          stats,
          utils,
          VariantAnnotation
+Suggests:
+         phytools,
+         ggtree,
+         ggnewscale,
+         spam,
+         spam64
 LinkingTo: Rcpp, RcppArmadillo
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 URL: https://github.com/Sudaraka88/LDWeaver
 BugReports: https://github.com/Sudaraka88/LDWeaver/issues
 Biarch: TRUE
diff --git a/NAMESPACE b/NAMESPACE
@@ -64,7 +64,6 @@ importFrom(S4Vectors,subjectHits)
 importFrom(VariantAnnotation,VRanges)
 importFrom(VariantAnnotation,makeVRangesFromGRanges)
 importFrom(ape,read.gff)
-importFrom(ape,read.tree)
 importFrom(chromoMap,chromoMap)
 importFrom(data.table,"%between%")
 importFrom(data.table,.I)
@@ -74,7 +73,6 @@ importFrom(data.table,setattr)
 importFrom(dplyr,`%>%`)
 importFrom(dplyr,summarise)
 importFrom(fitdistrplus,fitdist)
-importFrom(ggnewscale,new_scale_fill)
 importFrom(ggplot2,aes)
 importFrom(ggplot2,facet_wrap)
 importFrom(ggplot2,geom_point)
@@ -89,8 +87,6 @@ importFrom(ggraph,geom_edge_arc2)
 importFrom(ggraph,geom_node_label)
 importFrom(ggraph,ggraph)
 importFrom(ggraph,scale_edge_colour_discrete)
-importFrom(ggtree,ggtree)
-importFrom(ggtree,gheatmap)
 importFrom(grDevices,colorRampPalette)
 importFrom(grDevices,png)
 importFrom(heatmap3,heatmap3)
@@ -101,7 +97,6 @@ importFrom(methods,as)
 importFrom(methods,is)
 importFrom(methods,new)
 importFrom(parallel,detectCores)
-importFrom(phytools,midpoint.root)
 importFrom(plyr,.)
 importFrom(plyr,ddply)
 importFrom(stats,coef)

diff --git a/R/BacGWES.R b/R/BacGWES.R
@@ -6,42 +6,53 @@
 #' @importFrom utils timestamp packageVersion
 #'
 #' @param dset name of the dataset, all outputs will be saved to the folder <dset>
-#' @param aln_path path to the multi fasta alignment
-#' @param aln_has_all_bases specify whether the alignment has all bases in the reference genome (default = T). For example, if it's a SNP only alignment, set to F and provide pos
-#' @param pos numeric vector of positions for each base in the alignment (default = NULL). Only required if sites are missing from the alignment (e.g. SNP alignment output from snp-sites or https://github.com/Sudaraka88/FastaR)
-#' @param gbk_path path to genbank annotations file (default = NULL). Only provide one of genbank or gff3 annotation files.
-#' @param gff3_path path to gff3 annotations file (default = NULL). Only provide one of genbank or gff3 annotation files.
-#' @param ref_fasta_path path to Reference fasta file. The file MUST be in fasta format and contain exactly one sequence! Required for gff3 annotations,
-#' not required for genbank annotations if the file contains the reference sequence.
-#' @param validate_ref_ann_lengths check if the gbk reference sequence length matches the with fasta alignment (default = T)
-#' @param snp_filt_method specify the filtering method for SNP extraction: 'relaxed' or 'default' (default = 'default'). Unlike default, relaxed mode considers ambiguous/gap characters (N) as minor alleles when applying the
-#' maf_freq filter. Eg: Under default filter values, a site with allele frequencies A:0.85, C:0.0095, N:0.1405 will be respectively dropped and allowed by 'default' and 'relaxed' methods.
+#' @param aln_path path to the multi fasta alignment - can be a SNP only or full alignment
+#' @param aln_has_all_bases specify whether the alignment has all bases in the reference genome (default = T). For a SNP only alignment, set to F and provide SNP positions in pos (see example below)
+#' @param pos numeric vector of positions for each base in the alignment (default = NULL). Only required for SNP alignments (e.g. Output from snp-sites or FastaR https://github.com/Sudaraka88/FastaR)
+#' @param gbk_path path to genbank annotations file (default = NULL). Only provide one of genbank or gff3
+#' @param gff3_path path to gff3 annotations file (default = NULL). Only provide one of genbank or gff3
+#' @param ref_fasta_path path to Reference fasta file. This must be in fasta format and contain exactly one sequence! Only required for gff3 annotations
+#' @param validate_ref_ann_lengths check if the reference sequence length matches with fasta alignment. (default = T for full alignments, = F for snp-only alignments.). Set to F for alignments that only contain a portion of the genome
+#' @param snp_filt_method specify SNP filtering method: 'relaxed' or 'default' (default = 'default'). Unlike default, relaxed considers ambiguous/gap characters (N) as minor alleles when applying the
+#' maf_freq filter. Eg: Under default filter values, a site with allele frequencies A:0.85, C:0.0095, N:0.1405 will be respectively, dropped and allowed by 'default' and 'relaxed'
 #' @param gap_freq sites with a gap frequency >gap_greq will be dropped (default = 0.15)
 #' @param maf_freq sites with a minor allele frequency <maf_freq will be dropped (default = 0.01)
-#' @param hdw_threshold Hamming distance similarity threshold (default = 0.1, i.e. 10\%) - lower values will force stricter population structure control at the cost of masking real signal.
-#' @param perform_SR_analysis_only specify whether to only perform the short range link analysis (default = FALSE)
+#' @param hdw_threshold Hamming distance similarity threshold (default = 0.1, i.e. 10\%) - lower values will force stricter population structure control at the cost of masking real signal
+#' @param perform_SR_analysis_only specify whether to only perform LDWeaver (short-range link) analysis. (default = FALSE) will analyse all links (i.e., LDWeaver + SpydrPick)
 #' @param SnpEff_Annotate specify whether to perform annotations using SnpEff (default = TRUE)
-#' @param sr_dist links less than <sr_dist> apart are considered 'short range' (default = 20000), range 1000 - 25000 bp.
-#' @param lr_retain_links specify the maximum number of long-range MI links to retain (default = 1000000) - in each block, only a top subset of links will be saved
+#' @param sr_dist links shorter than <sr_dist> apart are considered 'short range' (default = 20000), range 1000 - 25000 bp
+#' @param lr_retain_links specify the maximum number of long-range MI links to retain (default = 1000000) - in each block, only a top subset of links will be saved such approximately this many links will be retained
 #' @param max_tophits specify the maximum number of short range links to save as <sr_tophits.tsv>. Note: all short-range links will be annotated (and saved separately),
 #' but only the top <max_tophits> will be used for visualisation (default = 250), range 50 - 1000
-#' @param num_clusts_CDS parition to genome into num_clusts_CDS regions using k-means (default = 3) range 1 - 10
-#' @param srp_cutoff specify the short-range -log10(p) cut-off value to discard short-range links before returning the data.frame. This setting has no impact on the
-#' modelling since all links are used. However, setting a threshold > 2 will generally reduce the memory usage, plotting time (default = 3, i.e. corresponding to p = 0.001),
-#' and run time for ARACNE. If all links are required to be returned, set to 0 (i.e. corresponding to p = 1), range 0 - 5
-#' @param tanglegram_break_segments specify the number of genome segments to prepare - one tanglegram per segment (default = 5), range 1 - 10. Set NULL to skip tanglegram
-#' @param write_gwesExplorer specify whether output for GWESExplorer is required (default = T)
+#' @param num_clusts_CDS partition to genome into num_clusts_CDS regions using k-means (default = 3) range 1 - 10 - accounts for within genome heterogeneity in the short-range analysis
+#' @param srp_cutoff specify the short-range -log10(p) cut-off value to discard short-range links before returning the data.frame (default = 3, i.e., p = 10^-3). This setting has no impact on modelling since all links are used.
+#' Larger values will reduce memory usage, plotting time and ARACNE run time. If all links are required, set to 0 (i.e. p = 10^0 = 1), range 0 - 5
+#' @param tanglegram_break_segments specify the number of genome tanglegram segments to prepare (default = 5 segments). Set NULL to skip tanglegram. range 1 - 10.
+#' @param write_gwesExplorer specify whether to write output for GWESExplorer (default = T)
 #' @param multicore specify whether to use parallel processing (default = T)
-#' @param ncores specify the number of cores to use for parallel processing (default = NULL), will auto detect if NULL
+#' @param ncores specify the number of cores to use for parallel processing. Auto detect (detect = NULL)
 #' @param max_blk_sz specify maximum block size for MI computation (default = 10000), larger sizes require more RAM, range 1000 - 100000
 #' @param save_additional_outputs specify whether to save outputs such as extracted SNPs and Hamming distance weights. Recommended for very large datasets to save time on re-computation (default = F)
+#' @param mega_dset specify whether the datasets is megascale. This mode requires spam and spam64 packages. This is upto 5 times slower, set to TRUE only if the normal analysis fails (default = F)
 #'
-#'
-#' @return Numeric Value 0 if successful (all generated outputs will be saved)
+#' @return All generated outputs will be saved to folder <dset>.
 #'
 #' @examples
 #' \dontrun{
-#' sr_links_red = LDWeaver(dset = "efcm", aln_path = "<efcm_aln>", gbk_path = "<efcm.gbk>")
+#' # These examples use data included with the package.
+#'
+#' # Example 1 - using a full alignment that includes non SNP sites
+#' dset <- "full_dset"
+#' gbk_path <- system.file("extdata", "sample.gbk", package = "LDWeaver")
+#' aln_path <- system.file("extdata", "sample.aln.gz", package = "LDWeaver")
+#' LDWeaver::LDWeaver(dset = dset,  aln_path = aln_path,  gbk_path = gbk_path, validate_ref_ann_lengths = F)
+#'
+#' # Example 2- using a SNP only alignment
+#' dset <- "snp_dset"
+#' gbk_path <- system.file("extdata", "sample.gbk", package = "LDWeaver")
+#' aln_path <- system.file("extdata", "snp_sample.fa.gz", package = "LDWeaver")
+#' pos <- as.numeric(readLines(system.file("extdata", "snp_sample.fa.pos", package = "LDWeaver")))
+#' LDWeaver::LDWeaver(dset = dset,  aln_path = aln_path, aln_has_all_bases = F, pos = pos, gbk_path = gbk_path)
 #' }
 #' @export
 LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path = NULL, gff3_path = NULL,
@@ -50,7 +61,8 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
                     SnpEff_Annotate = T, sr_dist = 20000, lr_retain_links = 1e6,
                     max_tophits = 250, num_clusts_CDS = 3, srp_cutoff = 3, tanglegram_break_segments = 5,
                     write_gwesExplorer = T, multicore = T, max_blk_sz = 10000, ncores = NULL,
-                    save_additional_outputs = F){
+                    save_additional_outputs = F, mega_dset = F){
+
   # Build blocks
   # BLK1: Extract SNPs and create sparse Mx from MSA (fasta)
   # BLK2: Parse GBK or GFF+REF
@@ -65,10 +77,10 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
   # BLK11: Cleanup
 
   #TODO: Provide the option to skip SNP extraction and use the whole provided alignment (redundant if pre-filtered)
-  #TODO: Add the option to provide genbank file without reference sequence
+  #TODO: Add the option to provide GFF file without reference sequence
   #TODO: Count through blocks and automate the displayed BLOCK NUMBER
-  #TODO: genbankr is being droped from the newest bioconductor, add alternative (https://github.com/gmbecker/genbankr)
   #TODO: Add Hamming Distance plot, can we have a SNP Tree + Hamming Distance weights to show population structure control?
+  #TODO: Benchmark spam VS matrixExtra
 
   #NOTE: SnpEff does not parse the GBK and GFF3 file from the same refseq reference genome the same way. There might be differences between annotations/tophits/etc.
   # # Welcome message # #
@@ -82,8 +94,8 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
     # Added snpEff to inst/extdata
     snpeff_jar_path = system.file("extdata", "snpEff.jar", package = "LDWeaver")
     ######################################## These checks must be unnecessary now ########################################
-    if(is.null(snpeff_jar_path)) stop("<snpeff_jar_path> must be provided for annotations. To run without annotations, set SnpEff_Annotate = F")
-    if(!file.exists(snpeff_jar_path)) stop(paste("<SnpEff.jar> not found at:", snpeff_jar_path, "please check the path provided"))
+    # if(is.null(snpeff_jar_path)) stop("<snpeff_jar_path> must be provided for annotations. To run without annotations, set SnpEff_Annotate = F")
+    # if(!file.exists(snpeff_jar_path)) stop(paste("<SnpEff.jar> not found at:", snpeff_jar_path, "please check the path provided"))
     ######################################################################################################################
     order_links = F # sr_links should be ordered at the end after adding annotations
   } else {
@@ -158,6 +170,15 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
     validate_ref_ann_lengths = F
   }
 
+  if(mega_dset){ # This is a mega dataset, we need spam and spam64 packages
+    if(!requireNamespace("spam") & !requireNamespace("spam64")){
+      message("mega_dset is set to TRUE but spam and spam64 dependencies are not installed.")
+      return(invisible())
+    }
+  }
+
+
+  ######################################################################################################################
   # normalise_input_paths
   aln_path = normalizePath(aln_path)
   if(!is.null(gbk_path)) gbk_path = normalizePath(gbk_path)
@@ -318,7 +339,7 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
   cat("\n\n #################### BLOCK 3 #################### \n\n")
   if(!file.exists(cds_var_path)) {
     cat("Estimating the variation in CDS \n")
-    cds_var = LDWeaver::estimate_variation_in_CDS(gbk = gbk, gff = gff, snp.dat = snp.dat, ncores = ncores, num_clusts_CDS = num_clusts_CDS, clust_plt_path = clust_plt_path)
+    cds_var = LDWeaver::estimate_variation_in_CDS(gbk = gbk, gff = gff, snp.dat = snp.dat, ncores = ncores, num_clusts_CDS = num_clusts_CDS, clust_plt_path = clust_plt_path, mega_dset = mega_dset)
     if(save_additional_outputs){
       saveRDS(cds_var, cds_var_path)
     }
@@ -332,7 +353,7 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
   if(!file.exists(hdw_path)) {
     cat("Estimating per sequence Hamming distance \n")
     # hdw = perform_pop_struct_correction_sparse(snp.matrix = snp.dat$snp.matrix, nsnp = snp.dat$nsnp)
-    hdw = LDWeaver::estimate_Hamming_distance_weights(snp.dat = snp.dat, threshold = hdw_threshold)
+    hdw = LDWeaver::estimate_Hamming_distance_weights(snp.dat = snp.dat, threshold = hdw_threshold, mega_dset = mega_dset)
     if(save_additional_outputs){
       saveRDS(hdw, hdw_path)
     }
@@ -459,6 +480,4 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
   LDWeaver::cleanup(dset = dset, delete_after_moving = F)
 
   cat(paste("\n\n ** All done in", round(difftime(Sys.time(), t_global, units = "mins"), 3), "m ** \n"))
-
-
 }