From 8425db1892a6a501a67f91878e43fd8cf4865ed4 Mon Sep 17 00:00:00 2001 From: skchronicles Date: Tue, 19 Sep 2023 15:57:27 -0400 Subject: [PATCH] Adding rule to align against filtered, high-conf sqanti novel/known transcriptome --- config/cluster.json | 9 +++++++ workflow/Snakefile | 6 +++++ workflow/rules/sqanti.smk | 56 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 70 insertions(+), 1 deletion(-) diff --git a/config/cluster.json b/config/cluster.json index 65e9360..ae8081b 100644 --- a/config/cluster.json +++ b/config/cluster.json @@ -142,5 +142,14 @@ "threads": "16", "mem": "64g", "time": "0-12:00:00" + }, + "sqanti_minimap2": { + "threads": "4", + "mem": "32g", + "time": "0-08:00:00" + }, + "sqanti_nanocount": { + "threads": "2", + "mem": "16g" } } diff --git a/workflow/Snakefile b/workflow/Snakefile index 58d1fc7..4a16c8e 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -268,6 +268,12 @@ rule all: # Filters characterized transcripts from Sqanti QC # @imported from `rule sqanti_ml_filter` in rules/sqanti.smk join(workpath, "project", "counts", "novel", "sqanti.isoforms_MLresult_classification.txt"), + # Align to filtered, high-confidence SQANTI3 transcriptome + # @imported from `rules sqanti_minimap2` in rules/sqanti.smk + expand( + join(workpath, "{name}", "bams", "{name}.sorted.sqanti.transcriptome.bam"), + name=samples + ), # Nanopolish polyA tail length estimation, # needs index to map basecalled reads to # raw signal from the ONT sequencer diff --git a/workflow/rules/sqanti.smk b/workflow/rules/sqanti.smk index c062045..563b6f8 100644 --- a/workflow/rules/sqanti.smk +++ b/workflow/rules/sqanti.smk @@ -108,4 +108,58 @@ rule sqanti_ml_filter: --gtf {input.gtf} \\ --isoforms {input.fa} \\ {input.txt} - """ \ No newline at end of file + """ + + +rule sqanti_minimap2: + """ + Data-processing step to align direct RNA reads against filtered, high- + confidence annotated transcriptome from SQANTI. + @Input: + Nanofilt quality filtered FastQ file (scatter), + Transcriptomic FASTA file + @Output: + Transcriptomic BAM file + """ + input: + fq = join(workpath, "{name}", "fastqs", "{name}.filtered.fastq.gz"), + ref = join(workpath, "project", "counts", "novel", "sqanti.isoforms.filtered.fasta"), + output: + bam = join(workpath, "{name}", "bams", "{name}.sorted.sqanti.transcriptome.bam"), + bai = join(workpath, "{name}", "bams", "{name}.sorted.sqanti.transcriptome.bam.bai"), + params: + rname = 'sqanti_minimap2', + tmpdir = join(workpath, "{name}", "bams", "sqanti_transcriptome_tmp"), + conda: depending(join(workpath, config['conda']['modr']), use_conda) + container: depending(config['images']['modr'], use_singularity) + threads: int(allocated("threads", "sqanti_minimap2", cluster)) + shell: + """ + # Setups temporary directory for + # intermediate files with built-in + # mechanism for deletion on exit + if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi + tmp=$(mktemp -d -p "{params.tmpdir}") + trap 'rm -rf "${{tmp}}"' EXIT + + # Align against reference genome, + # minimap2 automatically handles + # conversion of U to T bps, See + # this issue for the author of + # minimap2's recomendations for + # aligning direct RNA reads against + # the transcriptome: + # https://github.com/lh3/minimap2/issues/340 + minimap2 \\ + -ax map-ont \\ + -N 10 \\ + -k10 \\ + {input.ref} \\ + {input.fq} \\ + | samtools sort -@{threads} \\ + -T "${{tmp}}" \\ + -O bam \\ + --write-index \\ + -o {output.bam}##idx##{output.bai} \\ + - + """ \ No newline at end of file