Minor fixes

* add snp-only aln support * update_readme * update .gitignore' ' '' * update workflow * update gitignore * .gitignore updated * update gitignore * update gitignore * TODO:SnpEff_local * bundle_snpEff * bug_fix * save_fit_data * minor_bug_fix * to_do * skip_tanglegram * NewYear_bugfix * Drop genbankr dependency * remove devel test from workflows * Mega_alignments * Improve Readme * logo resize * update icon * LDweaver_1.5 * Fix logo height * Fix logo height * Update r.yml Update work.yml for hard dependencies * minor bug fixes * LogoFix * logofix * fix title * Fix heading * typoFix * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md * Update README.md --------- Co-authored-by: Sudaraka88 <sudarakm@ua-eduroam-ten-25-132-182.uniaccess.unimelb.edu.au> Co-authored-by: Sudaraka88 <sudarakm@Sudarakas-MacBook-Pro.local> Co-authored-by: Sudaraka88 <sudarakm@ua-eduroam-ten-25-135-47.uniaccess.unimelb.edu.au> Co-authored-by: Sudaraka88 <sudarakm@Sudarakas-MBP.home>
Sudaraka88 · Apr 23, 2024 · 1ddec3e · 1ddec3e
1 parent fd6ef3d
commit 1ddec3e
Show file tree

Hide file tree

Showing 9 changed files with 136 additions and 91 deletions.
diff --git a/.gitignore b/.gitignore
@@ -10,3 +10,5 @@ Makevars
 
 testscripts.R
 testscript_op
+tests
+
diff --git a/R/BacGWES.R b/R/BacGWES.R
@@ -33,7 +33,8 @@
 #' @param ncores specify the number of cores to use for parallel processing. Auto detect (detect = NULL)
 #' @param max_blk_sz specify maximum block size for MI computation (default = 10000), larger sizes require more RAM, range 1000 - 100000
 #' @param save_additional_outputs specify whether to save outputs such as extracted SNPs and Hamming distance weights. Recommended for very large datasets to save time on re-computation (default = F)
-#' @param mega_dset specify whether the datasets is megascale. This mode requires spam and spam64 packages. This is upto 5 times slower, set to TRUE only if the normal analysis fails (default = F)
+#' @param mega_dset specify whether the datasets is megascale. This mode requires spam and spam64 packages. This is  >5 times slower, set to TRUE only if the normal analysis fails (default = F)
+
 #'
 #' @return All generated outputs will be saved to folder <dset>.
 #'
@@ -53,6 +54,12 @@
 #' aln_path <- system.file("extdata", "snp_sample.fa.gz", package = "LDWeaver")
 #' pos <- as.numeric(readLines(system.file("extdata", "snp_sample.fa.pos", package = "LDWeaver")))
 #' LDWeaver::LDWeaver(dset = dset,  aln_path = aln_path, aln_has_all_bases = F, pos = pos, gbk_path = gbk_path)
+#'
+# Example 3 - Redoing the full analysis as a mega scale dataset
+#' dset <- "full_dset_spam"
+#' gbk_path <- system.file("extdata", "sample.gbk", package = "LDWeaver")
+#' aln_path <- system.file("extdata", "sample.aln.gz", package = "LDWeaver")
+#' LDWeaver::LDWeaver(dset = dset,  aln_path = aln_path,  gbk_path = gbk_path, validate_ref_ann_lengths = F, mega_dset = T)
 #' }
 #' @export
 LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path = NULL, gff3_path = NULL,
@@ -175,6 +182,9 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
       message("mega_dset is set to TRUE but spam and spam64 dependencies are not installed.")
       return(invisible())
     }
+    message("mega_dset is selected. Warning! This mode has a much slower run time. Setting spam.force64=TRUE (see https://cran.r-project.org/web/packages/spam64/spam64.pdf)")
+    options(spam.force64 = TRUE)
+
   }
 
 
@@ -235,7 +245,7 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
     if(perform_SR_analysis_only) cat("Only short-range analysis requested. \n")
     cat(paste("All outputs will be saved to:", normalizePath(dset), "\n"))
     cat(paste("\n *** Input paths *** \n\n"))
-    cat(paste("* Alignment:", aln_path, "\n"))
+    if(mega_dset) cat(paste("* Mega Alignment:", aln_path, "\n")) else cat(paste("* Alignment:", aln_path, "\n"))
     if(!is.null(gbk_path)) {
       cat(paste("* GenBank Annotation:", gbk_path, "\n"))
       cat(paste("* Parser built using genbankr source (https://github.com/gmbecker/genbankr) \n"))
@@ -254,7 +264,7 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
     cat(paste("Links <=", sr_dist, "bp-apart will be classified as short-range (sr-links) \n"))
     if(!perform_SR_analysis_only) cat(paste("Approx. top", lr_retain_links, "long range links will be saved \n"))
     cat(paste("Top sr-links with -log10(p) >", srp_cutoff, "will be saved \n"))
-    cat(paste("Tanglegram/GWESExplorer outputs will illustrate upto:", max_tophits, "top sr-links \n"))
+    if(!is.null(tanglegram_break_segments)) cat(paste("Tanglegram/GWESExplorer outputs will illustrate upto:", max_tophits, "top sr-links \n"))
     cat(paste("MI Computation will use a max block size of:", max_blk_sz, "x", max_blk_sz, "SNPs! Reduce <max_blk_sz> if RAM is scarce!\n\n"))
     cat(paste("~~~~~ https://github.com/Sudaraka88/LDWeaver/ ~~~~~"))
   }
@@ -269,13 +279,13 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
 
     # Adding support for SNP-only alignments
     if(aln_has_all_bases == T){
-      snp.dat = LDWeaver::parse_fasta_alignment(aln_path = aln_path, method = snp_filt_method, gap_freq = gap_freq, maf_freq = maf_freq)
+      snp.dat = LDWeaver::parse_fasta_alignment(aln_path = aln_path, method = snp_filt_method, gap_freq = gap_freq, maf_freq = maf_freq, mega_dset = mega_dset)
       if(save_additional_outputs){
         cat("Step 5: Savings snp.dat...")
         saveRDS(snp.dat, ACGTN_snp_path)
       }
     } else {
-      snp.dat = LDWeaver::parse_fasta_SNP_alignment(aln_path = aln_path, pos = pos, method = snp_filt_method, gap_freq = gap_freq, maf_freq = maf_freq)
+      snp.dat = LDWeaver::parse_fasta_SNP_alignment(aln_path = aln_path, pos = pos, method = snp_filt_method, gap_freq = gap_freq, maf_freq = maf_freq, mega_dset = mega_dset)
       # Note that snp.dat$g = NULL (we cannot measure this, need to get it from the genbank file)
       # we cannot save snp.dat here due to absent snp.dat$g, moving downstream (block 2)
     }
@@ -375,7 +385,8 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
                                                 lr_save_path = lr_save_path, sr_save_path = sr_save_path,
                                                 plt_folder = dset, sr_dist = sr_dist, lr_retain_links = lr_retain_links,
                                                 max_blk_sz = max_blk_sz, srp_cutoff = srp_cutoff, runARACNE = T,
-                                                perform_SR_analysis_only = perform_SR_analysis_only, order_links = order_links)
+                                                perform_SR_analysis_only = perform_SR_analysis_only,
+                                                order_links = order_links,mega_dset = mega_dset)
   }
 
 
@@ -436,7 +447,9 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
     gwesexplorer_path = file.path(dset, "SR_GWESExplorer")
     if(!file.exists(gwesexplorer_path)) dir.create(gwesexplorer_path)
     cat("\n\n #################### BLOCK 10 #################### \n\n")
-    LDWeaver::write_output_for_gwes_explorer(snp.dat = snp.dat, tophits = tophits, gwes_explorer_folder = gwesexplorer_path)
+    if(mega_dset) {
+      message("GWES Explorer output currently not generated for mega datasets\n")
+      } else LDWeaver::write_output_for_gwes_explorer(snp.dat = snp.dat, tophits = tophits, gwes_explorer_folder = gwesexplorer_path)
   }
 
 
@@ -455,13 +468,13 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
       if( !(  (file.exists(file.path(dset, "lr_tophits.tsv"))) | (file.exists(file.path(dset, "Tophits/lr_tophits.tsv"))) ) ) { # if the annotated_links file exists, no need to run again
         LDWeaver::analyse_long_range_links(dset = dset, lr_links_path =  lr_save_path, sr_links_path = sr_save_path, SnpEff_Annotate = T, snpeff_jar_path = snpeff_jar_path,
                                            gbk_path = gbk_path, gff3_path = gff3_path, snp.dat = snp.dat, cds_var = cds_var, ref_fasta_path = ref_fasta_path,
-                                           validate_ref_ann_lengths = validate_ref_ann_lengths)
+                                           validate_ref_ann_lengths = validate_ref_ann_lengths, mega_dset = mega_dset)
       } else {
         cat("Results from previous LR anlayis exist!")
       }
     } else {
       if( !(  (file.exists(file.path(dset, "lr_gwes.png"))) | (file.exists(file.path(dset, "GWESPlots/lr_gwes.png"))) ) ) { # if the lr_gwes plot exist, no need to run again
-        LDWeaver::analyse_long_range_links(dset = dset, lr_links_path =  lr_save_path, sr_links_path = sr_save_path, SnpEff_Annotate = F)
+        LDWeaver::analyse_long_range_links(dset = dset, lr_links_path =  lr_save_path, sr_links_path = sr_save_path, SnpEff_Annotate = F, mega_dset = mega_dset)
       } else {
         cat("Results from previous LR anlayis exist!")
       }

diff --git a/R/computePairwiseMI.R b/R/computePairwiseMI.R
@@ -46,6 +46,14 @@
 perform_MI_computation = function(snp.dat, hdw, cds_var, ncores, lr_save_path = NULL, sr_save_path = NULL, plt_folder = NULL,
                                   sr_dist = 20000, lr_retain_links = 1e6, max_blk_sz = 10000, srp_cutoff = 3, runARACNE = TRUE,
                                   perform_SR_analysis_only = FALSE, order_links = T, mega_dset = F){
+
+  ## DEBUG LINES - DO NOT DELETE and REMEMBER TO COMMENT
+  # lr_save_path = "testscript_op/lr_links_spam.tsv"
+  # sr_save_path = "testscript_op/sr_links_spam.tsv"
+  # plt_folder = "testscript_op"
+  # sr_dist = 20000; lr_retain_links = 1e6; max_blk_sz = 10000; srp_cutoff = 3
+  # Rcpp::sourceCpp("src/computeMI.cpp"); Rcpp::sourceCpp("src/fintersect.cpp")
+
   t000 = Sys.time()
   # TODO: if no paths are given, we need a way to stop overwriting (use timestamp()?)
   if(is.null(lr_save_path)) lr_save_path = file.path(getwd(), "lr_links.tsv")

diff --git a/R/extractSNPs.R b/R/extractSNPs.R
@@ -53,6 +53,10 @@ parse_fasta_alignment <- function(aln_path, gap_freq = 0.15, maf_freq = 0.01, me
       message("This feature requires spam and spam64 packages.")
       return(invisible())
     } else {
+      # We need to make sure we are using spam64, set it quietly
+      if(!getOption("spam.force64")) options(spam.force64 = T)
+
+
       snp.matrix_A <- spam::spam(list(i=snp.data$i_A, j=snp.data$j_A, values=as.logical(snp.data$x_A)),
                                  nrow = snp.param$num.seqs, ncol = snp.param$num.snps)
       snp.data$i_A = snp.data$j_A = snp.data$x_A = NULL
@@ -185,6 +189,10 @@ parse_fasta_SNP_alignment <- function(aln_path, pos, gap_freq = 0.15, maf_freq =
       message("This feature requires spam and spam64 packages.")
       return(invisible())
     } else {
+
+      # We need to make sure we are using spam64, set it quietly
+      if(!getOption("spam.force64")) options(spam.force64 = T)
+
       snp.matrix_A <- spam::spam(list(i=snp.data$i_A, j=snp.data$j_A, values=as.logical(snp.data$x_A)),
                                  nrow = snp.param$num.seqs, ncol = snp.param$num.snps)
       snp.data$i_A = snp.data$j_A = snp.data$x_A = NULL

diff --git a/R/lr_analyser.R b/R/lr_analyser.R
@@ -20,6 +20,7 @@
 #' @param max_tophits specify the maximum number of long range links to save as <lr_tophits.tsv>. Note: all short-range links will be annotated (and saved separately),
 #' but only the top <max_tophits> will be used for visualisation (default = 500)
 #' @param links_from_spydrpick are the links computed using spydrpick (default = F)
+#' @param mega_dset set TRUE for mega scale datasets (default = F)
 #'
 #' @examples
 #' \dontrun{
@@ -29,8 +30,8 @@
 analyse_long_range_links = function(dset, lr_links_path, sr_links_path, are_lrlinks_ordered = F, SnpEff_Annotate = F,
                                     snpeff_jar_path = NULL, gbk_path = NULL, gff3_path = NULL, ref_fasta_path = NULL,
                                     validate_ref_ann_lengths = T, snp.dat = NULL, cds_var = NULL, max_tophits = 500,
-                                    links_from_spydrpick = F){
-                                    # tanglegram_break_segments = 5){
+                                    links_from_spydrpick = F, mega_dset = F){
+  # tanglegram_break_segments = 5){
 
   #TODO: We are redoing the SnpEff annotation for long-range links, might be better to do it in one run
   # it makes sense to have a larger max_tophits for long range links - there will be a lot more of long-range links compared to short
@@ -158,9 +159,9 @@ analyse_long_range_links = function(dset, lr_links_path, sr_links_path, are_lrli
     }
 
     tophits = LDWeaver::perform_snpEff_annotations(dset_name = dset, annotation_folder = file.path(getwd(), dset),
-                                                  snpeff_jar = snpeff_jar_path, gbk = gbk, gbk_path = gbk_path,
-                                                  gff = gff, cds_var = cds_var, links_df = lr_links_red, snp.dat = snp.dat,
-                                                  tophits_path = tophits_path, max_tophits = max_tophits, links_type = "LR")
+                                                   snpeff_jar = snpeff_jar_path, gbk = gbk, gbk_path = gbk_path,
+                                                   gff = gff, cds_var = cds_var, links_df = lr_links_red, snp.dat = snp.dat,
+                                                   tophits_path = tophits_path, max_tophits = max_tophits, links_type = "LR")
 
     # Tanglegram is difficult to read when plotted like this, best to avoid!
     # tanglegram_path = file.path(dset, "LR_Tanglegram")
@@ -171,8 +172,12 @@ analyse_long_range_links = function(dset, lr_links_path, sr_links_path, are_lrli
     cat("\n")
     gwesexplorer_path = file.path(dset, "LR_GWESExplorer")
     if(!file.exists(gwesexplorer_path)) dir.create(gwesexplorer_path)
-    LDWeaver::write_output_for_gwes_explorer(snp.dat = snp.dat, tophits = tophits,
-                                            gwes_explorer_folder = gwesexplorer_path, links_type = "LR")
+    if(mega_dset) {
+      message("GWES Explorer output currently not generated for mega datasets\n")
+    } else LDWeaver::write_output_for_gwes_explorer(snp.dat = snp.dat, tophits = tophits,
+                                                    gwes_explorer_folder = gwesexplorer_path, links_type = "LR")
+
+
 
     cat("\n")