diff --git a/.gitignore b/.gitignore index 24b7e8a..da2398b 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,5 @@ Makevars testscripts.R testscript_op +tests + diff --git a/R/BacGWES.R b/R/BacGWES.R index e16de33..9362189 100644 --- a/R/BacGWES.R +++ b/R/BacGWES.R @@ -33,7 +33,8 @@ #' @param ncores specify the number of cores to use for parallel processing. Auto detect (detect = NULL) #' @param max_blk_sz specify maximum block size for MI computation (default = 10000), larger sizes require more RAM, range 1000 - 100000 #' @param save_additional_outputs specify whether to save outputs such as extracted SNPs and Hamming distance weights. Recommended for very large datasets to save time on re-computation (default = F) -#' @param mega_dset specify whether the datasets is megascale. This mode requires spam and spam64 packages. This is upto 5 times slower, set to TRUE only if the normal analysis fails (default = F) +#' @param mega_dset specify whether the datasets is megascale. This mode requires spam and spam64 packages. This is >5 times slower, set to TRUE only if the normal analysis fails (default = F) + #' #' @return All generated outputs will be saved to folder . 
#' @@ -53,6 +54,12 @@ #' aln_path <- system.file("extdata", "snp_sample.fa.gz", package = "LDWeaver") #' pos <- as.numeric(readLines(system.file("extdata", "snp_sample.fa.pos", package = "LDWeaver"))) #' LDWeaver::LDWeaver(dset = dset, aln_path = aln_path, aln_has_all_bases = F, pos = pos, gbk_path = gbk_path) +#' +# Example 3 - Redoing the full analysis as a mega scale dataset +#' dset <- "full_dset_spam" +#' gbk_path <- system.file("extdata", "sample.gbk", package = "LDWeaver") +#' aln_path <- system.file("extdata", "sample.aln.gz", package = "LDWeaver") +#' LDWeaver::LDWeaver(dset = dset, aln_path = aln_path, gbk_path = gbk_path, validate_ref_ann_lengths = F, mega_dset = T) #' } #' @export LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path = NULL, gff3_path = NULL, @@ -175,6 +182,9 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path message("mega_dset is set to TRUE but spam and spam64 dependencies are not installed.") return(invisible()) } + message("mega_dset is selected. Warning! This mode has a much slower run time. Setting spam.force64=TRUE (see https://cran.r-project.org/web/packages/spam64/spam64.pdf)") + options(spam.force64 = TRUE) + } @@ -235,7 +245,7 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path if(perform_SR_analysis_only) cat("Only short-range analysis requested. 
\n") cat(paste("All outputs will be saved to:", normalizePath(dset), "\n")) cat(paste("\n *** Input paths *** \n\n")) - cat(paste("* Alignment:", aln_path, "\n")) + if(mega_dset) cat(paste("* Mega Alignment:", aln_path, "\n")) else cat(paste("* Alignment:", aln_path, "\n")) if(!is.null(gbk_path)) { cat(paste("* GenBank Annotation:", gbk_path, "\n")) cat(paste("* Parser built using genbankr source (https://github.com/gmbecker/genbankr) \n")) @@ -254,7 +264,7 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path cat(paste("Links <=", sr_dist, "bp-apart will be classified as short-range (sr-links) \n")) if(!perform_SR_analysis_only) cat(paste("Approx. top", lr_retain_links, "long range links will be saved \n")) cat(paste("Top sr-links with -log10(p) >", srp_cutoff, "will be saved \n")) - cat(paste("Tanglegram/GWESExplorer outputs will illustrate upto:", max_tophits, "top sr-links \n")) + if(!is.null(tanglegram_break_segments)) cat(paste("Tanglegram/GWESExplorer outputs will illustrate upto:", max_tophits, "top sr-links \n")) cat(paste("MI Computation will use a max block size of:", max_blk_sz, "x", max_blk_sz, "SNPs! 
Reduce if RAM is scarce!\n\n")) cat(paste("~~~~~ https://github.com/Sudaraka88/LDWeaver/ ~~~~~")) } @@ -269,13 +279,13 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path # Adding support for SNP-only alignments if(aln_has_all_bases == T){ - snp.dat = LDWeaver::parse_fasta_alignment(aln_path = aln_path, method = snp_filt_method, gap_freq = gap_freq, maf_freq = maf_freq) + snp.dat = LDWeaver::parse_fasta_alignment(aln_path = aln_path, method = snp_filt_method, gap_freq = gap_freq, maf_freq = maf_freq, mega_dset = mega_dset) if(save_additional_outputs){ cat("Step 5: Savings snp.dat...") saveRDS(snp.dat, ACGTN_snp_path) } } else { - snp.dat = LDWeaver::parse_fasta_SNP_alignment(aln_path = aln_path, pos = pos, method = snp_filt_method, gap_freq = gap_freq, maf_freq = maf_freq) + snp.dat = LDWeaver::parse_fasta_SNP_alignment(aln_path = aln_path, pos = pos, method = snp_filt_method, gap_freq = gap_freq, maf_freq = maf_freq, mega_dset = mega_dset) # Note that snp.dat$g = NULL (we cannot measure this, need to get it from the genbank file) # we cannot save snp.dat here due to absent snp.dat$g, moving downstream (block 2) } @@ -375,7 +385,8 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path lr_save_path = lr_save_path, sr_save_path = sr_save_path, plt_folder = dset, sr_dist = sr_dist, lr_retain_links = lr_retain_links, max_blk_sz = max_blk_sz, srp_cutoff = srp_cutoff, runARACNE = T, - perform_SR_analysis_only = perform_SR_analysis_only, order_links = order_links) + perform_SR_analysis_only = perform_SR_analysis_only, + order_links = order_links,mega_dset = mega_dset) } @@ -436,7 +447,9 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path gwesexplorer_path = file.path(dset, "SR_GWESExplorer") if(!file.exists(gwesexplorer_path)) dir.create(gwesexplorer_path) cat("\n\n #################### BLOCK 10 #################### \n\n") - LDWeaver::write_output_for_gwes_explorer(snp.dat = snp.dat, 
tophits = tophits, gwes_explorer_folder = gwesexplorer_path) + if(mega_dset) { + message("GWES Explorer output currently not generated for mega datasets\n") + } else LDWeaver::write_output_for_gwes_explorer(snp.dat = snp.dat, tophits = tophits, gwes_explorer_folder = gwesexplorer_path) } @@ -455,13 +468,13 @@ LDWeaver = function(dset, aln_path, aln_has_all_bases = T, pos = NULL, gbk_path if( !( (file.exists(file.path(dset, "lr_tophits.tsv"))) | (file.exists(file.path(dset, "Tophits/lr_tophits.tsv"))) ) ) { # if the annotated_links file exists, no need to run again LDWeaver::analyse_long_range_links(dset = dset, lr_links_path = lr_save_path, sr_links_path = sr_save_path, SnpEff_Annotate = T, snpeff_jar_path = snpeff_jar_path, gbk_path = gbk_path, gff3_path = gff3_path, snp.dat = snp.dat, cds_var = cds_var, ref_fasta_path = ref_fasta_path, - validate_ref_ann_lengths = validate_ref_ann_lengths) + validate_ref_ann_lengths = validate_ref_ann_lengths, mega_dset = mega_dset) } else { cat("Results from previous LR anlayis exist!") } } else { if( !( (file.exists(file.path(dset, "lr_gwes.png"))) | (file.exists(file.path(dset, "GWESPlots/lr_gwes.png"))) ) ) { # if the lr_gwes plot exist, no need to run again - LDWeaver::analyse_long_range_links(dset = dset, lr_links_path = lr_save_path, sr_links_path = sr_save_path, SnpEff_Annotate = F) + LDWeaver::analyse_long_range_links(dset = dset, lr_links_path = lr_save_path, sr_links_path = sr_save_path, SnpEff_Annotate = F, mega_dset = mega_dset) } else { cat("Results from previous LR anlayis exist!") } diff --git a/R/computePairwiseMI.R b/R/computePairwiseMI.R index 0a5f43b..690b219 100644 --- a/R/computePairwiseMI.R +++ b/R/computePairwiseMI.R @@ -46,6 +46,14 @@ perform_MI_computation = function(snp.dat, hdw, cds_var, ncores, lr_save_path = NULL, sr_save_path = NULL, plt_folder = NULL, sr_dist = 20000, lr_retain_links = 1e6, max_blk_sz = 10000, srp_cutoff = 3, runARACNE = TRUE, perform_SR_analysis_only = FALSE, order_links = T, 
mega_dset = F){ + + ## DEBUG LINES - DO NOT DELETE and REMEMBER TO COMMENT + # lr_save_path = "testscript_op/lr_links_spam.tsv" + # sr_save_path = "testscript_op/sr_links_spam.tsv" + # plt_folder = "testscript_op" + # sr_dist = 20000; lr_retain_links = 1e6; max_blk_sz = 10000; srp_cutoff = 3 + # Rcpp::sourceCpp("src/computeMI.cpp"); Rcpp::sourceCpp("src/fintersect.cpp") + t000 = Sys.time() # TODO: if no paths are given, we need a way to stop overwriting (use timestamp()?) if(is.null(lr_save_path)) lr_save_path = file.path(getwd(), "lr_links.tsv") diff --git a/R/extractSNPs.R b/R/extractSNPs.R index 1e90799..f839cd0 100644 --- a/R/extractSNPs.R +++ b/R/extractSNPs.R @@ -53,6 +53,10 @@ parse_fasta_alignment <- function(aln_path, gap_freq = 0.15, maf_freq = 0.01, me message("This feature requires spam and spam64 packages.") return(invisible()) } else { + # We need to make sure we are using spam64, set it quietly + if(!getOption("spam.force64")) options(spam.force64 = T) + + snp.matrix_A <- spam::spam(list(i=snp.data$i_A, j=snp.data$j_A, values=as.logical(snp.data$x_A)), nrow = snp.param$num.seqs, ncol = snp.param$num.snps) snp.data$i_A = snp.data$j_A = snp.data$x_A = NULL @@ -185,6 +189,10 @@ parse_fasta_SNP_alignment <- function(aln_path, pos, gap_freq = 0.15, maf_freq = message("This feature requires spam and spam64 packages.") return(invisible()) } else { + + # We need to make sure we are using spam64, set it quietly + if(!getOption("spam.force64")) options(spam.force64 = T) + snp.matrix_A <- spam::spam(list(i=snp.data$i_A, j=snp.data$j_A, values=as.logical(snp.data$x_A)), nrow = snp.param$num.seqs, ncol = snp.param$num.snps) snp.data$i_A = snp.data$j_A = snp.data$x_A = NULL diff --git a/R/lr_analyser.R b/R/lr_analyser.R index 3d3f749..42b8de8 100644 --- a/R/lr_analyser.R +++ b/R/lr_analyser.R @@ -20,6 +20,7 @@ #' @param max_tophits specify the maximum number of long range links to save as . 
Note: all short-range links will be annotated (and saved separately), #' but only the top will be used for visualisation (default = 500) #' @param links_from_spydrpick are the links computed using spydrpick (default = F) +#' @param mega_dset set TRUE for mega scale datasets (default = F) #' #' @examples #' \dontrun{ @@ -29,8 +30,8 @@ analyse_long_range_links = function(dset, lr_links_path, sr_links_path, are_lrlinks_ordered = F, SnpEff_Annotate = F, snpeff_jar_path = NULL, gbk_path = NULL, gff3_path = NULL, ref_fasta_path = NULL, validate_ref_ann_lengths = T, snp.dat = NULL, cds_var = NULL, max_tophits = 500, - links_from_spydrpick = F){ - # tanglegram_break_segments = 5){ + links_from_spydrpick = F, mega_dset = F){ + # tanglegram_break_segments = 5){ #TODO: We are redoing the SnpEff annotation for long-range links, might be better to do it in one run # it makes sense to have a larger max_tophits for long range links - there will be a lot more of long-range links compared to short @@ -158,9 +159,9 @@ analyse_long_range_links = function(dset, lr_links_path, sr_links_path, are_lrli } tophits = LDWeaver::perform_snpEff_annotations(dset_name = dset, annotation_folder = file.path(getwd(), dset), - snpeff_jar = snpeff_jar_path, gbk = gbk, gbk_path = gbk_path, - gff = gff, cds_var = cds_var, links_df = lr_links_red, snp.dat = snp.dat, - tophits_path = tophits_path, max_tophits = max_tophits, links_type = "LR") + snpeff_jar = snpeff_jar_path, gbk = gbk, gbk_path = gbk_path, + gff = gff, cds_var = cds_var, links_df = lr_links_red, snp.dat = snp.dat, + tophits_path = tophits_path, max_tophits = max_tophits, links_type = "LR") # Tanglegram is difficult to read when plotted like this, best to avoid! 
# tanglegram_path = file.path(dset, "LR_Tanglegram") @@ -171,8 +172,12 @@ analyse_long_range_links = function(dset, lr_links_path, sr_links_path, are_lrli cat("\n") gwesexplorer_path = file.path(dset, "LR_GWESExplorer") if(!file.exists(gwesexplorer_path)) dir.create(gwesexplorer_path) - LDWeaver::write_output_for_gwes_explorer(snp.dat = snp.dat, tophits = tophits, - gwes_explorer_folder = gwesexplorer_path, links_type = "LR") + if(mega_dset) { + message("GWES Explorer output currently not generated for mega datasets\n") + } else LDWeaver::write_output_for_gwes_explorer(snp.dat = snp.dat, tophits = tophits, + gwes_explorer_folder = gwesexplorer_path, links_type = "LR") + + cat("\n") diff --git a/README.md b/README.md index 1145fb6..79c89c9 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,3 @@ -## Genomewide Co-selection and Epistasis in Bacteria - [![R](https://github.com/Sudaraka88/LDWeaver/workflows/R-CMD-check/badge.svg)](https://github.com/Sudaraka88/LDWeaver/actions) @@ -7,7 +5,8 @@ [![LICESNSE](https://anaconda.org/bioconda/r-ldweaver/badges/license.svg)](https://spdx.org/licenses/GPL-3.0-or-later.html) -## About +## Genomewide Co-selection and Epistasis in Bacteria + LDWeaver accepts a sequence alignment (fasta) and its reference annotation (genbank or gff) as inputs and identifies linkage disequilibrium (LD) between @@ -88,7 +87,7 @@ aln_path <- system.file("extdata", "sample.aln.gz", package = "LDWeaver") gbk_path <- system.file("extdata", "sample.gbk", package = "LDWeaver") snp_filt_method = "relaxed" LDWeaver(dset = dset, aln_path = aln_path, gbk_path = gbk_path, validate_ref_ann_lengths = F, - num_clusts_CDS = 2, SnpEff_Annotate = F, snp_filt_method = snp_filt_method) +num_clusts_CDS = 2, SnpEff_Annotate = F, snp_filt_method = snp_filt_method) ``` >**Note** If you are using a SNP-only alignment, set `aln_has_all_bases = F` and provide `pos`, a numeric vector of SNP positions. Each SNP in the SNP-only alignment must have a unique SNP position. 
@@ -125,9 +124,9 @@ aln_path <- "spn23f_msch.aln.gz" gbk_path <- system.file("extdata", "sample.gbk", package = "LDWeaver") LDWeaver::LDWeaver(dset = dset, - aln_path = aln_path, - gbk_path = gbk_path, - save_additional_outputs = T) +aln_path = aln_path, +gbk_path = gbk_path, +save_additional_outputs = T) ``` `LDWeaver::LDWeaver()` one-liner is versatile for most @@ -149,9 +148,9 @@ ncores = parallel::detectCores() snp.dat = LDWeaver::parse_fasta_alignment(aln_path = aln_path) # parse the alignment and extract SNPs gbk = LDWeaver::parse_genbank_file(gbk_path = gbk_path, g = snp.dat$g) # parse the annotation cds_var = LDWeaver::estimate_variation_in_CDS(gbk = gbk, snp.dat = snp.dat, - ncores = ncores, - num_clusts_CDS = 3, - clust_plt_path = "msch/CDS_clustering.png") +ncores = ncores, +num_clusts_CDS = 3, +clust_plt_path = "msch/CDS_clustering.png") ``` ![](inst/sup/CDS_clustering.png) @@ -161,10 +160,10 @@ hdw = LDWeaver::estimate_Hamming_distance_weights(snp.dat = snp.dat) # Hamming d # Perform MI computation model fitting and ARACNE - this will take some time... 
sr_links = LDWeaver::perform_MI_computation(snp.dat = snp.dat, hdw = hdw, - cds_var = cds_var, ncores = ncores, - lr_save_path = "msch/lr_links.tsv", - sr_save_path = "msch/sr_links.tsv", - plt_folder = dset) +cds_var = cds_var, ncores = ncores, +lr_save_path = "msch/lr_links.tsv", +sr_save_path = "msch/sr_links.tsv", +plt_folder = dset) ``` ![](inst/sup/c1_fit.png) ![](inst/sup/c2_fit.png) @@ -180,9 +179,9 @@ LDWeaver::make_gwes_plots(sr_links = sr_links, plt_folder = dset) ``` r # Identify the top hits by performing snpEff annotations tophits = LDWeaver::perform_snpEff_annotations(dset_name = dset, annotation_folder = file.path(getwd(), dset), - gbk = gbk, gbk_path = gbk_path, cds_var = cds_var, - links_df = sr_links, snp.dat = snp.dat, - tophits_path = "msch/sr_tophits.tsv") +gbk = gbk, gbk_path = gbk_path, cds_var = cds_var, +links_df = sr_links, snp.dat = snp.dat, +tophits_path = "msch/sr_tophits.tsv") ``` This will generate several outputs comprising annotations into the @@ -199,7 +198,7 @@ should look like this: ![](inst/sup/Tanglegram_screenshot.png) ``` r # Generate GWES Explorer outputs LDWeaver::write_output_for_gwes_explorer(snp.dat = snp.dat, tophits = tophits, - gwes_explorer_folder = "msch/SR_GWESExplorer") +gwes_explorer_folder = "msch/SR_GWESExplorer") ``` Above line will create three files in \ that can be @@ -214,8 +213,8 @@ Next step is to analyse the long range links ``` r # Analyse long range links LDWeaver::analyse_long_range_links(dset = dset, lr_links_path = "msch/lr_links.tsv", - sr_links_path = "msch/sr_links.tsv", SnpEff_Annotate = T, - snp.dat = snp.dat, gbk_path = gbk_path, cds_var = cds_var) +sr_links_path = "msch/sr_links.tsv", SnpEff_Annotate = T, +snp.dat = snp.dat, gbk_path = gbk_path, cds_var = cds_var) ``` ![](inst/sup/lr_gwes.png) @@ -230,8 +229,8 @@ LDWeaver::cleanup(dset) It is possible to generate a genomewide LD distribution map using the following: ``` r LDWeaver::genomewide_LDMap(lr_links_path = "msch/Temp/lr_links.tsv", 
- sr_links_path = "msch/Temp/sr_links.tsv", - plot_save_path = "msch/GWLD.png") +sr_links_path = "msch/Temp/sr_links.tsv", +plot_save_path = "msch/GWLD.png") ``` > **Note** The paths have now updated after running LDWeaver::cleanup(). @@ -244,19 +243,21 @@ sites and their magnitude can be generated using: # Generate the Network Plot for pbp genes network = LDWeaver::create_network_for_gene("pbp", - sr_annotated_path = "msch/Annotated_links/sr_links_annotated.tsv", - lr_annotated_path = "msch/Annotated_links/lr_links_annotated.tsv", - level = 2) +sr_annotated_path = "msch/Annotated_links/sr_links_annotated.tsv", +lr_annotated_path = "msch/Annotated_links/lr_links_annotated.tsv", +level = 2) LDWeaver::create_network(network, - plot_title = "pbp network", - netplot_path = "msch/pbp_network.png", - plot_w = 2000, plot_h = 2000) +plot_title = "pbp network", +netplot_path = "msch/pbp_network.png", +plot_w = 2000, plot_h = 2000) ``` ![](inst/sup/network_plot.png) ## Additional Information +> **Note** With LDWeaver >1.5, you can analyse mega scale datasets with more than 2^31 - 1 elements (the limit of 32-bit integer indexing). This requires the spam and spam64 packages. Set `mega_dset=T` in `LDWeaver::LDWeaver()` to use this feature. Warning! This mode is currently considerably slower than the default mode (`mega_dset=F`) and only supports single-core operation. There will also be minor discrepancies between the two modes due to floating point errors; however, these should only have a minimal impact on the final link ranking. + ### Key Outputs If the above steps worked as expected, the following output will be saved to a @@ -265,26 +266,27 @@ folder called `sample`, which should be created in the current working directory - Figures - 1. sample/cX_fit.png - shows the distribution and modelling of the - background linkage disequilibrium (estimated using weighted Mutual - Information) vs. bp-separation within each cluster (X = 1,2 in the - example) - 2. 
sample/CDS_clustering.png - shows the genome partitioning, based on - the CDS diversity (compared to the reference sequence) - 3. sample/sr_gwes_clust.png - short-range GWES plot for each cluster (2 in - this case) - 4. sample/sr_gwes_combi.png - combined short-range GWES plot (for links with - bp positions spanning two clusters, the max srp_value is used) - 5. sample/lr_gwes.png - Long range GWES plot (similar to the output from - SpydrPick) +1. sample/cX_fit.png - shows the distribution and modelling of the +background linkage disequilibrium (estimated using weighted Mutual +Information) vs. bp-separation within each cluster (X = 1,2 in the +example) +2. sample/CDS_clustering.png - shows the genome partitioning, based on +the CDS diversity (compared to the reference sequence) +3. sample/sr_gwes_clust.png - short-range GWES plot for each cluster (2 in +this case) +4. sample/sr_gwes_combi.png - combined short-range GWES plot (for links with +bp positions spanning two clusters, the max srp_value is used) +5. sample/lr_gwes.png - Long range GWES plot (similar to the output from +SpydrPick) - Outputs - 1. sample/sr_links.tsv - tab separated file containing details on - short-range links (i.e. links \<= sr_dist bp apart) - 2. sample/lr_links.tsv - tab separated file containing details on long-range - links (i.e. links \> sr_dist bp apart) - +1. sample/sr_links.tsv - tab separated file containing details on +short-range links (i.e. links \<= sr_dist bp apart) +2. sample/lr_links.tsv - tab separated file containing details on long-range +links (i.e. links \> sr_dist bp apart) + + ### Extra Outputs > **Note** The default `sr_dist` value in LDWeaver is 20000bp (user modifiable). @@ -292,11 +294,11 @@ folder called `sample`, which should be created in the current working directory - Additional Outputs (*not generated*) - can be used to avoid costly re-computations. - 1. Additional_Outputs/snp_ACGTN.rds - list comprising sparse SNP data from the alignment - 2. 
Additional_Outputs/parsed_gbk.rds - GenBankRecord of the genbank annotation data - 3. Additional_Outputs/hdw.rds - named vector comprising Hamming distance weights for - each sequence - 4. Additional_Outputs/cds_var.rds - list comprising alignment diversity information +1. Additional_Outputs/snp_ACGTN.rds - list comprising sparse SNP data from the alignment +2. Additional_Outputs/parsed_gbk.rds - GenBankRecord of the genbank annotation data +3. Additional_Outputs/hdw.rds - named vector comprising Hamming distance weights for +each sequence +4. Additional_Outputs/cds_var.rds - list comprising alignment diversity information > **Note** For very large datsets, the user has the option to set `save_additional_outputs=T`. > When these four files are present in \/Additional_Outputs/, the saved information @@ -311,18 +313,18 @@ refers to **sr** (short range) or **lr** (long range). - Outputs - 1. Annotated_links/X_links_annotated.tsv - tab separated file similar to - sample/X_links.tsv with additional SnpEff annotations and allele - distribution information - 2. Tophits/X_tophits.tsv - tab separated file containing the top 250 - links (user modifiable with `max_tophipts`) . Several filters are applied - to extract the top links from Annotated_links/X_links_annotated.tsv - 3. SR_Tanglegram - folder compirising html tanglegrams to easily - visualise links and the corresponding genomic regions - 4. GWESExplorer/X_GWESExplorer - folder containing the outputs necessary to dynamically - explore links using - GWESExplorer - (X = sr,lr). +1. Annotated_links/X_links_annotated.tsv - tab separated file similar to +sample/X_links.tsv with additional SnpEff annotations and allele +distribution information +2. Tophits/X_tophits.tsv - tab separated file containing the top 250 +links (user modifiable with `max_tophits`). Several filters are applied +to extract the top links from Annotated_links/X_links_annotated.tsv +3. 
SR_Tanglegram - folder comprising html tanglegrams to easily +visualise links and the corresponding genomic regions +4. GWESExplorer/X_GWESExplorer - folder containing the outputs necessary to dynamically +explore links using +GWESExplorer +(X = sr,lr). > **Note** The default srp_cutoff is 3 (i.e., p=0.001). Short-range links > with p\>0.001 are automatically discarded, this can be modified using @@ -332,15 +334,14 @@ refers to **sr** (short range) or **lr** (long range). - Temporary files created during snpEff annotations. These are all written to \/Temp and can be ignored or safely deleted) - 1. Temp/snpEff_data - data folder for snpEff - 2. Temp/snpEff.config - configuration file for snpEff - 3. Temp/X_annotations.tsv - tab separated file containing full snpEff - annotations on each site associated with a short-range GWES link - with srp_max \> srp_cutoff - 4. Temp/X_annotataed_stats.genes.txt - annotations and statistics in tab - separated format - 5. Temp/X_annotated_stats.html - annotations and statistics in html - format - 6. Temp/X_snps.vcf, Temp/X_snps_ann.vcf - input and output from the snpEff - annotation pipeline - +1. Temp/snpEff_data - data folder for snpEff +2. Temp/snpEff.config - configuration file for snpEff +3. Temp/X_annotations.tsv - tab separated file containing full snpEff +annotations on each site associated with a short-range GWES link +with srp_max \> srp_cutoff +4. Temp/X_annotataed_stats.genes.txt - annotations and statistics in tab +separated format +5. Temp/X_annotated_stats.html - annotations and statistics in html +format +6. 
Temp/X_snps.vcf, Temp/X_snps_ann.vcf - input and output from the snpEff +annotation pipeline \ No newline at end of file diff --git a/images/icon_dark.jpg b/images/icon_dark.jpg new file mode 100644 index 0000000..c4d73f7 Binary files /dev/null and b/images/icon_dark.jpg differ diff --git a/images/icon_light.jpg b/images/icon_light.jpg new file mode 100644 index 0000000..2ecb76c Binary files /dev/null and b/images/icon_light.jpg differ diff --git a/man/LDWeaver.Rd b/man/LDWeaver.Rd index f643ed1..690ebe3 100644 --- a/man/LDWeaver.Rd +++ b/man/LDWeaver.Rd @@ -87,7 +87,9 @@ Larger values will reduce memory usage, plotting time and ARACNE run time. If al \item{save_additional_outputs}{specify whether to save outputs such as extracted SNPs and Hamming distance weights. Recommended for very large datasets to save time on re-computation (default = F)} -\item{mega_dset}{specify whether the datasets is megascale. This mode requires spam and spam64 packages. This is upto 5 times slower, set to TRUE only if the normal analysis fails (default = F)} + +\item{mega_dset}{specify whether the datasets is megascale. This mode requires spam and spam64 packages. This is >5 times slower, set to TRUE only if the normal analysis fails (default = F)} + } \value{ All generated outputs will be saved to folder . @@ -111,5 +113,11 @@ gbk_path <- system.file("extdata", "sample.gbk", package = "LDWeaver") aln_path <- system.file("extdata", "snp_sample.fa.gz", package = "LDWeaver") pos <- as.numeric(readLines(system.file("extdata", "snp_sample.fa.pos", package = "LDWeaver"))) LDWeaver::LDWeaver(dset = dset, aln_path = aln_path, aln_has_all_bases = F, pos = pos, gbk_path = gbk_path) + +dset <- "full_dset_spam" +gbk_path <- system.file("extdata", "sample.gbk", package = "LDWeaver") +aln_path <- system.file("extdata", "sample.aln.gz", package = "LDWeaver") +LDWeaver::LDWeaver(dset = dset, aln_path = aln_path, gbk_path = gbk_path, validate_ref_ann_lengths = F, mega_dset = T) + } }