Skip to content

Commit

Permalink
Bump Version 1.5 (#7)
Browse files Browse the repository at this point in the history
* add snp-only aln support

* update_readme

* update .gitignore'

'
''

* update workflow

* update gitignore

* .gitignore updated

* update gitignore

* update gitignore

* TODO:SnpEff_local

* bundle_snpEff

* bug_fix

* save_fit_data

* minor_bug_fix

* to_do

* skip_tanglegram

* NewYear_bugfix

* Drop genbankr dependency

* remove devel test from workflows

* Mega_alignments

* Improve Readme

* logo resize

* update icon

* LDweaver_1.5

* Fix logo height

* Fix logo height

* Update r.yml

Update work.yml for hard dependencies

---------

Co-authored-by: Sudaraka88 <sudarakm@ua-eduroam-ten-25-132-182.uniaccess.unimelb.edu.au>
Co-authored-by: Sudaraka88 <sudarakm@Sudarakas-MacBook-Pro.local>
Co-authored-by: Sudaraka88 <sudarakm@ua-eduroam-ten-25-135-47.uniaccess.unimelb.edu.au>
Co-authored-by: Sudaraka88 <sudarakm@Sudarakas-MBP.home>
  • Loading branch information
5 people committed Apr 22, 2024
1 parent 597269f commit fd6ef3d
Show file tree
Hide file tree
Showing 23 changed files with 541 additions and 1,547 deletions.
14 changes: 10 additions & 4 deletions .github/workflows/r.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ on:
schedule:
- cron: '1 1 1 * *'

name: R-CMD-check
name: R-CMD-check-hard

jobs:
R-CMD-check:
Expand All @@ -24,14 +24,14 @@ jobs:
- {os: windows-latest, r: 'release'}
# - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'}
- {os: ubuntu-latest, r: 'release'}
- {os: ubuntu-latest, r: 'oldrel-1'}
# - {os: ubuntu-latest, r: 'oldrel-1'}

env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
R_KEEP_PKG_SOURCE: yes

steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- uses: r-lib/actions/setup-pandoc@v2

Expand All @@ -43,7 +43,13 @@ jobs:

- uses: r-lib/actions/setup-r-dependencies@v2
with:
extra-packages: any::rcmdcheck
dependencies: '"hard"'
cache: false
extra-packages: |
any::rcmdcheck
any::testthat
any::knitr
any::rmarkdown
needs: check

- uses: r-lib/actions/check-r-package@v2
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,6 @@
.DS_Store
*.Rmd
Makevars

testscripts.R
testscript_op
15 changes: 9 additions & 6 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: LDWeaver
Type: Package
Title: Genomewide Epistasis Analysis on Bacteria
Version: 1.4
Version: 1.5
Authors@R: person("Sudaraka", "Mallawaarachchi", email = "smallawaarachchi@gmail.com", role = c("aut", "cre"))
Maintainer: Sudaraka Mallawaarachchi <smallawaarachchi@gmail.com>
Description: Perform genomewide epistasis analysis by evaluating the LD structure in bacteria.
Expand All @@ -10,7 +10,7 @@ Encoding: UTF-8
LazyData: true
biocViews: Software
Depends: R (>= 4.0.0),
Imports:
Imports:
ape,
Biostrings,
chromoMap,
Expand All @@ -19,9 +19,7 @@ Imports:
fitdistrplus,
GenomicRanges,
GenomeInfoDb,
ggnewscale,
ggplot2,
ggtree,
ggraph,
grDevices,
heatmap3,
Expand All @@ -32,7 +30,6 @@ Imports:
MatrixExtra,
methods,
parallel,
phytools,
plyr,
RColorBrewer,
Rcpp,
Expand All @@ -41,8 +38,14 @@ Imports:
stats,
utils,
VariantAnnotation
Suggests:
phytools,
ggtree,
ggnewscale,
spam,
spam64
LinkingTo: Rcpp, RcppArmadillo
RoxygenNote: 7.2.3
RoxygenNote: 7.3.1
URL: https://github.com/Sudaraka88/LDWeaver
BugReports: https://github.com/Sudaraka88/LDWeaver/issues
Biarch: TRUE
5 changes: 0 additions & 5 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@ importFrom(S4Vectors,subjectHits)
importFrom(VariantAnnotation,VRanges)
importFrom(VariantAnnotation,makeVRangesFromGRanges)
importFrom(ape,read.gff)
importFrom(ape,read.tree)
importFrom(chromoMap,chromoMap)
importFrom(data.table,"%between%")
importFrom(data.table,.I)
Expand All @@ -74,7 +73,6 @@ importFrom(data.table,setattr)
importFrom(dplyr,`%>%`)
importFrom(dplyr,summarise)
importFrom(fitdistrplus,fitdist)
importFrom(ggnewscale,new_scale_fill)
importFrom(ggplot2,aes)
importFrom(ggplot2,facet_wrap)
importFrom(ggplot2,geom_point)
Expand All @@ -89,8 +87,6 @@ importFrom(ggraph,geom_edge_arc2)
importFrom(ggraph,geom_node_label)
importFrom(ggraph,ggraph)
importFrom(ggraph,scale_edge_colour_discrete)
importFrom(ggtree,ggtree)
importFrom(ggtree,gheatmap)
importFrom(grDevices,colorRampPalette)
importFrom(grDevices,png)
importFrom(heatmap3,heatmap3)
Expand All @@ -101,7 +97,6 @@ importFrom(methods,as)
importFrom(methods,is)
importFrom(methods,new)
importFrom(parallel,detectCores)
importFrom(phytools,midpoint.root)
importFrom(plyr,.)
importFrom(plyr,ddply)
importFrom(stats,coef)
Expand Down
85 changes: 52 additions & 33 deletions R/BacGWES.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,42 +6,53 @@
#' @importFrom utils timestamp packageVersion
#'
#' @param dset name of the dataset, all outputs will be saved to the folder <dset>
#' @param aln_path path to the multi fasta alignment
#' @param aln_has_all_bases specify whether the alignment has all bases in the reference genome (default = T). For example, if it's a SNP only alignment, set to F and provide pos
#' @param pos numeric vector of positions for each base in the alignment (default = NULL). Only required if sites are missing from the alignment (e.g. SNP alignment output from snp-sites or https://github.com/Sudaraka88/FastaR)
#' @param gbk_path path to genbank annotations file (default = NULL). Only provide one of genbank or gff3 annotation files.
#' @param gff3_path path to gff3 annotations file (default = NULL). Only provide one of genbank or gff3 annotation files.
#' @param ref_fasta_path path to Reference fasta file. The file MUST be in fasta format and contain exactly one sequence! Required for gff3 annotations,
#' not required for genbank annotations if the file contains the reference sequence.
#' @param validate_ref_ann_lengths check if the gbk reference sequence length matches the with fasta alignment (default = T)
#' @param snp_filt_method specify the filtering method for SNP extraction: 'relaxed' or 'default' (default = 'default'). Unlike default, relaxed mode considers ambiguous/gap characters (N) as minor alleles when applying the
#' maf_freq filter. Eg: Under default filter values, a site with allele frequencies A:0.85, C:0.0095, N:0.1405 will be respectively dropped and allowed by 'default' and 'relaxed' methods.
#' @param aln_path path to the multi fasta alignment - can be a SNP only or full alignment
#' @param aln_has_all_bases specify whether the alignment has all bases in the reference genome (default = T). For a SNP only alignment, set to F and provide SNP positions in pos (see example below)
#' @param pos numeric vector of positions for each base in the alignment (default = NULL). Only required for SNP alignments (e.g. Output from snp-sites or FastaR https://github.com/Sudaraka88/FastaR)
#' @param gbk_path path to genbank annotations file (default = NULL). Only provide one of genbank or gff3
#' @param gff3_path path to gff3 annotations file (default = NULL). Only provide one of genbank or gff3
#' @param ref_fasta_path path to Reference fasta file. This must be in fasta format and contain exactly one sequence! Only required for gff3 annotations
#' @param validate_ref_ann_lengths check if the reference sequence length matches with fasta alignment. (default = T for full alignments, = F for snp-only alignments.). Set to F for alignments that only contain a portion of the genome
#' @param snp_filt_method specify SNP filtering method: 'relaxed' or 'default' (default = 'default'). Unlike default, relaxed considers ambiguous/gap characters (N) as minor alleles when applying the
#' maf_freq filter. Eg: Under default filter values, a site with allele frequencies A:0.85, C:0.0095, N:0.1405 will be respectively, dropped and allowed by 'default' and 'relaxed'
#' @param gap_freq sites with a gap frequency >gap_greq will be dropped (default = 0.15)
#' @param maf_freq sites with a minor allele frequency <maf_freq will be dropped (default = 0.01)
#' @param hdw_threshold Hamming distance similarity threshold (default = 0.1, i.e. 10\%) - lower values will force stricter population structure control at the cost of masking real signal.
#' @param perform_SR_analysis_only specify whether to only perform the short range link analysis (default = FALSE)
#' @param hdw_threshold Hamming distance similarity threshold (default = 0.1, i.e. 10\%) - lower values will force stricter population structure control at the cost of masking real signal
#' @param perform_SR_analysis_only specify whether to only perform LDWeaver (short-range link) analysis. (default = FALSE) will analyse all links (i.e., LDWeaver + SpydrPick)
#' @param SnpEff_Annotate specify whether to perform annotations using SnpEff (default = TRUE)
#' @param sr_dist links less than <sr_dist> apart are considered 'short range' (default = 20000), range 1000 - 25000 bp.
#' @param lr_retain_links specify the maximum number of long-range MI links to retain (default = 1000000) - in each block, only a top subset of links will be saved
#' @param sr_dist links shorter than <sr_dist> apart are considered 'short range' (default = 20000), range 1000 - 25000 bp
#' @param lr_retain_links specify the maximum number of long-range MI links to retain (default = 1000000) - in each block, only a top subset of links will be saved such approximately this many links will be retained
#' @param max_tophits specify the maximum number of short range links to save as <sr_tophits.tsv>. Note: all short-range links will be annotated (and saved separately),
#' but only the top <max_tophits> will be used for visualisation (default = 250), range 50 - 1000
#' @param num_clusts_CDS parition to genome into num_clusts_CDS regions using k-means (default = 3) range 1 - 10
#' @param srp_cutoff specify the short-range -log10(p) cut-off value to discard short-range links before returning the data.frame. This setting has no impact on the
#' modelling since all links are used. However, setting a threshold > 2 will generally reduce the memory usage, plotting time (default = 3, i.e. corresponding to p = 0.001),
#' and run time for ARACNE. If all links are required to be returned, set to 0 (i.e. corresponding to p = 1), range 0 - 5
#' @param tanglegram_break_segments specify the number of genome segments to prepare - one tanglegram per segment (default = 5), range 1 - 10. Set NULL to skip tanglegram
#' @param write_gwesExplorer specify whether output for GWESExplorer is required (default = T)
#' @param num_clusts_CDS partition to genome into num_clusts_CDS regions using k-means (default = 3) range 1 - 10 - accounts for within genome heterogeneity in the short-range analysis
#' @param srp_cutoff specify the short-range -log10(p) cut-off value to discard short-range links before returning the data.frame (default = 3, i.e., p = 10^-3). This setting has no impact on modelling since all links are used.
#' Larger values will reduce memory usage, plotting time and ARACNE run time. If all links are required, set to 0 (i.e. p = 10^0 = 1), range 0 - 5
#' @param tanglegram_break_segments specify the number of genome tanglegram segments to prepare (default = 5 segments). Set NULL to skip tanglegram. range 1 - 10.
#' @param write_gwesExplorer specify whether to write output for GWESExplorer (default = T)
#' @param multicore specify whether to use parallel processing (default = T)
#' @param ncores specify the number of cores to use for parallel processing (default = NULL), will auto detect if NULL
#' @param ncores specify the number of cores to use for parallel processing. Auto detect (detect = NULL)
#' @param max_blk_sz specify maximum block size for MI computation (default = 10000), larger sizes require more RAM, range 1000 - 100000
#' @param save_additional_outputs specify whether to save outputs such as extracted SNPs and Hamming distance weights. Recommended for very large datasets to save time on re-computation (default = F)
#' @param mega_dset specify whether the datasets is megascale. This mode requires spam and spam64 packages. This is upto 5 times slower, set to TRUE only if the normal analysis fails (default = F)
#'
#'
#' @return Numeric Value 0 if successful (all generated outputs will be saved)
#' @return All generated outputs will be saved to folder <dset>.
#'
#' @examples
#' \dontrun{
#' sr_links_red = LDWeaver(dset = "efcm", aln_path = "<efcm_aln>", gbk_path = "<efcm.gbk>")
#' # These examples use data included with the package.
#'
#' # Example 1 - using a full alignment that includes non SNP sites
#' dset <- "full_dset"
#' gbk_path <- system.file("extdata", "sample.gbk", package = "LDWeaver")
#' aln_path <- system.file("extdata", "sample.aln.gz", package = "LDWeaver")
#' LDWeaver::LDWeaver(dset = dset, aln_path = aln_path, gbk_path = gbk_path, validate_ref_ann_lengths = F)
#'
#' # Example 2- using a SNP only alignment
#' dset <- "snp_dset"
#' gbk_path <- system.file("extdata", "sample.gbk", package = "LDWeaver")
#' aln_path <- system.file("extdata", "snp_sample.fa.gz", package = "LDWeaver")
#' pos <- as.numeric(readLines(system.file("extdata", "snp_sample.fa.pos", package = "LDWeaver")))
#' LDWeaver::LDWeaver(dset = dset, aln_path = aln_path, aln_has_all_bases = F, pos = pos, gbk_path = gbk_path)
#' }
#' @export
LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path = NULL, gff3_path = NULL,
Expand All @@ -50,7 +61,8 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
SnpEff_Annotate = T, sr_dist = 20000, lr_retain_links = 1e6,
max_tophits = 250, num_clusts_CDS = 3, srp_cutoff = 3, tanglegram_break_segments = 5,
write_gwesExplorer = T, multicore = T, max_blk_sz = 10000, ncores = NULL,
save_additional_outputs = F){
save_additional_outputs = F, mega_dset = F){

# Build blocks
# BLK1: Extract SNPs and create sparse Mx from MSA (fasta)
# BLK2: Parse GBK or GFF+REF
Expand All @@ -65,10 +77,10 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
# BLK11: Cleanup

#TODO: Provide the option to skip SNP extraction and use the whole provided alignment (redundant if pre-filtered)
#TODO: Add the option to provide genbank file without reference sequence
#TODO: Add the option to provide GFF file without reference sequence
#TODO: Count through blocks and automate the displayed BLOCK NUMBER
#TODO: genbankr is being droped from the newest bioconductor, add alternative (https://github.com/gmbecker/genbankr)
#TODO: Add Hamming Distance plot, can we have a SNP Tree + Hamming Distance weights to show population structure control?
#TODO: Benchmark spam VS matrixExtra

#NOTE: SnpEff does not parse the GBK and GFF3 file from the same refseq reference genome the same way. There might be differences between annotations/tophits/etc.
# # Welcome message # #
Expand All @@ -82,8 +94,8 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
# Added snpEff to inst/extdata
snpeff_jar_path = system.file("extdata", "snpEff.jar", package = "LDWeaver")
######################################## These checks must be unnecessary now ########################################
if(is.null(snpeff_jar_path)) stop("<snpeff_jar_path> must be provided for annotations. To run without annotations, set SnpEff_Annotate = F")
if(!file.exists(snpeff_jar_path)) stop(paste("<SnpEff.jar> not found at:", snpeff_jar_path, "please check the path provided"))
# if(is.null(snpeff_jar_path)) stop("<snpeff_jar_path> must be provided for annotations. To run without annotations, set SnpEff_Annotate = F")
# if(!file.exists(snpeff_jar_path)) stop(paste("<SnpEff.jar> not found at:", snpeff_jar_path, "please check the path provided"))
######################################################################################################################
order_links = F # sr_links should be ordered at the end after adding annotations
} else {
Expand Down Expand Up @@ -158,6 +170,15 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
validate_ref_ann_lengths = F
}

if(mega_dset){ # This is a mega dataset, we need spam and spam64 packages
if(!requireNamespace("spam") & !requireNamespace("spam64")){
message("mega_dset is set to TRUE but spam and spam64 dependencies are not installed.")
return(invisible())
}
}


######################################################################################################################
# normalise_input_paths
aln_path = normalizePath(aln_path)
if(!is.null(gbk_path)) gbk_path = normalizePath(gbk_path)
Expand Down Expand Up @@ -318,7 +339,7 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
cat("\n\n #################### BLOCK 3 #################### \n\n")
if(!file.exists(cds_var_path)) {
cat("Estimating the variation in CDS \n")
cds_var = LDWeaver::estimate_variation_in_CDS(gbk = gbk, gff = gff, snp.dat = snp.dat, ncores = ncores, num_clusts_CDS = num_clusts_CDS, clust_plt_path = clust_plt_path)
cds_var = LDWeaver::estimate_variation_in_CDS(gbk = gbk, gff = gff, snp.dat = snp.dat, ncores = ncores, num_clusts_CDS = num_clusts_CDS, clust_plt_path = clust_plt_path, mega_dset = mega_dset)
if(save_additional_outputs){
saveRDS(cds_var, cds_var_path)
}
Expand All @@ -332,7 +353,7 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
if(!file.exists(hdw_path)) {
cat("Estimating per sequence Hamming distance \n")
# hdw = perform_pop_struct_correction_sparse(snp.matrix = snp.dat$snp.matrix, nsnp = snp.dat$nsnp)
hdw = LDWeaver::estimate_Hamming_distance_weights(snp.dat = snp.dat, threshold = hdw_threshold)
hdw = LDWeaver::estimate_Hamming_distance_weights(snp.dat = snp.dat, threshold = hdw_threshold, mega_dset = mega_dset)
if(save_additional_outputs){
saveRDS(hdw, hdw_path)
}
Expand Down Expand Up @@ -459,6 +480,4 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path
LDWeaver::cleanup(dset = dset, delete_after_moving = F)

cat(paste("\n\n ** All done in", round(difftime(Sys.time(), t_global, units = "mins"), 3), "m ** \n"))


}
Loading

0 comments on commit fd6ef3d

Please sign in to comment.