Skip to content

Commit

Permalink
Minor fixes
Browse files Browse the repository at this point in the history
* add snp-only aln support

* update_readme

* update .gitignore'

'
''

* update workflow

* update gitignore

* .gitignore updated

* update gitignore

* update gitignore

* TODO:SnpEff_local

* bundle_snpEff

* bug_fix

* save_fit_data

* minor_bug_fix

* to_do

* skip_tanglegram

* NewYear_bugfix

* Drop genbankr dependency

* remove devel test from workflows

* Mega_alignments

* Improve Readme

* logo resize

* update icon

* LDweaver_1.5

* Fix logo height

* Fix logo height

* Update r.yml

Update work.yml for hard dependencies

* minor bug fixes

* LogoFix

* logofix

* fix title

* Fix heading

* typoFix

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* Update README.md

* Update README.md

---------

Co-authored-by: Sudaraka88 <sudarakm@ua-eduroam-ten-25-132-182.uniaccess.unimelb.edu.au>
Co-authored-by: Sudaraka88 <sudarakm@Sudarakas-MacBook-Pro.local>
Co-authored-by: Sudaraka88 <sudarakm@ua-eduroam-ten-25-135-47.uniaccess.unimelb.edu.au>
Co-authored-by: Sudaraka88 <sudarakm@Sudarakas-MBP.home>
  • Loading branch information
5 people committed Apr 23, 2024
1 parent fd6ef3d commit 1ddec3e
Show file tree
Hide file tree
Showing 9 changed files with 136 additions and 91 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,5 @@ Makevars

testscripts.R
testscript_op
tests

31 changes: 22 additions & 9 deletions R/BacGWES.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@
#' @param ncores specify the number of cores to use for parallel processing. Auto detect (detect = NULL)
#' @param max_blk_sz specify maximum block size for MI computation (default = 10000), larger sizes require more RAM, range 1000 - 100000
#' @param save_additional_outputs specify whether to save outputs such as extracted SNPs and Hamming distance weights. Recommended for very large datasets to save time on re-computation (default = F)
#' @param mega_dset specify whether the datasets is megascale. This mode requires spam and spam64 packages. This is upto 5 times slower, set to TRUE only if the normal analysis fails (default = F)
#' @param mega_dset specify whether the datasets is megascale. This mode requires spam and spam64 packages. This is >5 times slower, set to TRUE only if the normal analysis fails (default = F)

#'
#' @return All generated outputs will be saved to folder <dset>.
#'
Expand All @@ -53,6 +54,12 @@
#' aln_path <- system.file("extdata", "snp_sample.fa.gz", package = "LDWeaver")
#' pos <- as.numeric(readLines(system.file("extdata", "snp_sample.fa.pos", package = "LDWeaver")))
#' LDWeaver::LDWeaver(dset = dset, aln_path = aln_path, aln_has_all_bases = F, pos = pos, gbk_path = gbk_path)
#'
# Example 3 - Redoing the full analysis as a mega scale dataset
#' dset <- "full_dset_spam"
#' gbk_path <- system.file("extdata", "sample.gbk", package = "LDWeaver")
#' aln_path <- system.file("extdata", "sample.aln.gz", package = "LDWeaver")
#' LDWeaver::LDWeaver(dset = dset, aln_path = aln_path, gbk_path = gbk_path, validate_ref_ann_lengths = F, mega_dset = T)
#' }
#' @export
LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path = NULL, gff3_path = NULL,
Expand Down Expand Up @@ -175,6 +182,9 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
message("mega_dset is set to TRUE but spam and spam64 dependencies are not installed.")
return(invisible())
}
message("mega_dset is selected. Warning! This mode has a much slower run time. Setting spam.force64=TRUE (see https://cran.r-project.org/web/packages/spam64/spam64.pdf)")
options(spam.force64 = TRUE)

}


Expand Down Expand Up @@ -235,7 +245,7 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
if(perform_SR_analysis_only) cat("Only short-range analysis requested. \n")
cat(paste("All outputs will be saved to:", normalizePath(dset), "\n"))
cat(paste("\n *** Input paths *** \n\n"))
cat(paste("* Alignment:", aln_path, "\n"))
if(mega_dset) cat(paste("* Mega Alignment:", aln_path, "\n")) else cat(paste("* Alignment:", aln_path, "\n"))
if(!is.null(gbk_path)) {
cat(paste("* GenBank Annotation:", gbk_path, "\n"))
cat(paste("* Parser built using genbankr source (https://github.com/gmbecker/genbankr) \n"))
Expand All @@ -254,7 +264,7 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
cat(paste("Links <=", sr_dist, "bp-apart will be classified as short-range (sr-links) \n"))
if(!perform_SR_analysis_only) cat(paste("Approx. top", lr_retain_links, "long range links will be saved \n"))
cat(paste("Top sr-links with -log10(p) >", srp_cutoff, "will be saved \n"))
cat(paste("Tanglegram/GWESExplorer outputs will illustrate upto:", max_tophits, "top sr-links \n"))
if(!is.null(tanglegram_break_segments)) cat(paste("Tanglegram/GWESExplorer outputs will illustrate upto:", max_tophits, "top sr-links \n"))
cat(paste("MI Computation will use a max block size of:", max_blk_sz, "x", max_blk_sz, "SNPs! Reduce <max_blk_sz> if RAM is scarce!\n\n"))
cat(paste("~~~~~ https://github.com/Sudaraka88/LDWeaver/ ~~~~~"))
}
Expand All @@ -269,13 +279,13 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path

# Adding support for SNP-only alignments
if(aln_has_all_bases == T){
snp.dat = LDWeaver::parse_fasta_alignment(aln_path = aln_path, method = snp_filt_method, gap_freq = gap_freq, maf_freq = maf_freq)
snp.dat = LDWeaver::parse_fasta_alignment(aln_path = aln_path, method = snp_filt_method, gap_freq = gap_freq, maf_freq = maf_freq, mega_dset = mega_dset)
if(save_additional_outputs){
cat("Step 5: Savings snp.dat...")
saveRDS(snp.dat, ACGTN_snp_path)
}
} else {
snp.dat = LDWeaver::parse_fasta_SNP_alignment(aln_path = aln_path, pos = pos, method = snp_filt_method, gap_freq = gap_freq, maf_freq = maf_freq)
snp.dat = LDWeaver::parse_fasta_SNP_alignment(aln_path = aln_path, pos = pos, method = snp_filt_method, gap_freq = gap_freq, maf_freq = maf_freq, mega_dset = mega_dset)
# Note that snp.dat$g = NULL (we cannot measure this, need to get it from the genbank file)
# we cannot save snp.dat here due to absent snp.dat$g, moving downstream (block 2)
}
Expand Down Expand Up @@ -375,7 +385,8 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
lr_save_path = lr_save_path, sr_save_path = sr_save_path,
plt_folder = dset, sr_dist = sr_dist, lr_retain_links = lr_retain_links,
max_blk_sz = max_blk_sz, srp_cutoff = srp_cutoff, runARACNE = T,
perform_SR_analysis_only = perform_SR_analysis_only, order_links = order_links)
perform_SR_analysis_only = perform_SR_analysis_only,
order_links = order_links,mega_dset = mega_dset)
}


Expand Down Expand Up @@ -436,7 +447,9 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
gwesexplorer_path = file.path(dset, "SR_GWESExplorer")
if(!file.exists(gwesexplorer_path)) dir.create(gwesexplorer_path)
cat("\n\n #################### BLOCK 10 #################### \n\n")
LDWeaver::write_output_for_gwes_explorer(snp.dat = snp.dat, tophits = tophits, gwes_explorer_folder = gwesexplorer_path)
if(mega_dset) {
message("GWES Explorer output currently not generated for mega datasets\n")
} else LDWeaver::write_output_for_gwes_explorer(snp.dat = snp.dat, tophits = tophits, gwes_explorer_folder = gwesexplorer_path)
}


Expand All @@ -455,13 +468,13 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
if( !( (file.exists(file.path(dset, "lr_tophits.tsv"))) | (file.exists(file.path(dset, "Tophits/lr_tophits.tsv"))) ) ) { # if the annotated_links file exists, no need to run again
LDWeaver::analyse_long_range_links(dset = dset, lr_links_path = lr_save_path, sr_links_path = sr_save_path, SnpEff_Annotate = T, snpeff_jar_path = snpeff_jar_path,
gbk_path = gbk_path, gff3_path = gff3_path, snp.dat = snp.dat, cds_var = cds_var, ref_fasta_path = ref_fasta_path,
validate_ref_ann_lengths = validate_ref_ann_lengths)
validate_ref_ann_lengths = validate_ref_ann_lengths, mega_dset = mega_dset)
} else {
cat("Results from previous LR anlayis exist!")
}
} else {
if( !( (file.exists(file.path(dset, "lr_gwes.png"))) | (file.exists(file.path(dset, "GWESPlots/lr_gwes.png"))) ) ) { # if the lr_gwes plot exist, no need to run again
LDWeaver::analyse_long_range_links(dset = dset, lr_links_path = lr_save_path, sr_links_path = sr_save_path, SnpEff_Annotate = F)
LDWeaver::analyse_long_range_links(dset = dset, lr_links_path = lr_save_path, sr_links_path = sr_save_path, SnpEff_Annotate = F, mega_dset = mega_dset)
} else {
cat("Results from previous LR anlayis exist!")
}
Expand Down
8 changes: 8 additions & 0 deletions R/computePairwiseMI.R
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,14 @@
perform_MI_computation = function(snp.dat, hdw, cds_var, ncores, lr_save_path = NULL, sr_save_path = NULL, plt_folder = NULL,
sr_dist = 20000, lr_retain_links = 1e6, max_blk_sz = 10000, srp_cutoff = 3, runARACNE = TRUE,
perform_SR_analysis_only = FALSE, order_links = T, mega_dset = F){

## DEBUG LINES - DO NOT DELETE and REMEMBER TO COMMENT
# lr_save_path = "testscript_op/lr_links_spam.tsv"
# sr_save_path = "testscript_op/sr_links_spam.tsv"
# plt_folder = "testscript_op"
# sr_dist = 20000; lr_retain_links = 1e6; max_blk_sz = 10000; srp_cutoff = 3
# Rcpp::sourceCpp("src/computeMI.cpp"); Rcpp::sourceCpp("src/fintersect.cpp")

t000 = Sys.time()
# TODO: if no paths are given, we need a way to stop overwriting (use timestamp()?)
if(is.null(lr_save_path)) lr_save_path = file.path(getwd(), "lr_links.tsv")
Expand Down
8 changes: 8 additions & 0 deletions R/extractSNPs.R
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@ parse_fasta_alignment <- function(aln_path, gap_freq = 0.15, maf_freq = 0.01, me
message("This feature requires spam and spam64 packages.")
return(invisible())
} else {
# We need to make sure we are using spam64, set it quietly
if(!getOption("spam.force64")) options(spam.force64 = T)


snp.matrix_A <- spam::spam(list(i=snp.data$i_A, j=snp.data$j_A, values=as.logical(snp.data$x_A)),
nrow = snp.param$num.seqs, ncol = snp.param$num.snps)
snp.data$i_A = snp.data$j_A = snp.data$x_A = NULL
Expand Down Expand Up @@ -185,6 +189,10 @@ parse_fasta_SNP_alignment <- function(aln_path, pos, gap_freq = 0.15, maf_freq =
message("This feature requires spam and spam64 packages.")
return(invisible())
} else {

# We need to make sure we are using spam64, set it quietly
if(!getOption("spam.force64")) options(spam.force64 = T)

snp.matrix_A <- spam::spam(list(i=snp.data$i_A, j=snp.data$j_A, values=as.logical(snp.data$x_A)),
nrow = snp.param$num.seqs, ncol = snp.param$num.snps)
snp.data$i_A = snp.data$j_A = snp.data$x_A = NULL
Expand Down
19 changes: 12 additions & 7 deletions R/lr_analyser.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#' @param max_tophits specify the maximum number of long range links to save as <lr_tophits.tsv>. Note: all short-range links will be annotated (and saved separately),
#' but only the top <max_tophits> will be used for visualisation (default = 500)
#' @param links_from_spydrpick are the links computed using spydrpick (default = F)
#' @param mega_dset set TRUE for mega scale datasets (default = F)
#'
#' @examples
#' \dontrun{
Expand All @@ -29,8 +30,8 @@
analyse_long_range_links = function(dset, lr_links_path, sr_links_path, are_lrlinks_ordered = F, SnpEff_Annotate = F,
snpeff_jar_path = NULL, gbk_path = NULL, gff3_path = NULL, ref_fasta_path = NULL,
validate_ref_ann_lengths = T, snp.dat = NULL, cds_var = NULL, max_tophits = 500,
links_from_spydrpick = F){
# tanglegram_break_segments = 5){
links_from_spydrpick = F, mega_dset = F){
# tanglegram_break_segments = 5){

#TODO: We are redoing the SnpEff annotation for long-range links, might be better to do it in one run
# it makes sense to have a larger max_tophits for long range links - there will be a lot more of long-range links compared to short
Expand Down Expand Up @@ -158,9 +159,9 @@ analyse_long_range_links = function(dset, lr_links_path, sr_links_path, are_lrli
}

tophits = LDWeaver::perform_snpEff_annotations(dset_name = dset, annotation_folder = file.path(getwd(), dset),
snpeff_jar = snpeff_jar_path, gbk = gbk, gbk_path = gbk_path,
gff = gff, cds_var = cds_var, links_df = lr_links_red, snp.dat = snp.dat,
tophits_path = tophits_path, max_tophits = max_tophits, links_type = "LR")
snpeff_jar = snpeff_jar_path, gbk = gbk, gbk_path = gbk_path,
gff = gff, cds_var = cds_var, links_df = lr_links_red, snp.dat = snp.dat,
tophits_path = tophits_path, max_tophits = max_tophits, links_type = "LR")

# Tanglegram is difficult to read when plotted like this, best to avoid!
# tanglegram_path = file.path(dset, "LR_Tanglegram")
Expand All @@ -171,8 +172,12 @@ analyse_long_range_links = function(dset, lr_links_path, sr_links_path, are_lrli
cat("\n")
gwesexplorer_path = file.path(dset, "LR_GWESExplorer")
if(!file.exists(gwesexplorer_path)) dir.create(gwesexplorer_path)
LDWeaver::write_output_for_gwes_explorer(snp.dat = snp.dat, tophits = tophits,
gwes_explorer_folder = gwesexplorer_path, links_type = "LR")
if(mega_dset) {
message("GWES Explorer output currently not generated for mega datasets\n")
} else LDWeaver::write_output_for_gwes_explorer(snp.dat = snp.dat, tophits = tophits,
gwes_explorer_folder = gwesexplorer_path, links_type = "LR")



cat("\n")

Expand Down

0 comments on commit 1ddec3e

Please sign in to comment.