From 8425db1892a6a501a67f91878e43fd8cf4865ed4 Mon Sep 17 00:00:00 2001
From: skchronicles <kuhnsa3@gmail.com>
Date: Tue, 19 Sep 2023 15:57:27 -0400
Subject: [PATCH] Adding rule to align against filtered, high-conf sqanti
 novel/known transcriptome

---
 config/cluster.json       |  9 +++++++
 workflow/Snakefile        |  6 +++++
 workflow/rules/sqanti.smk | 56 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 70 insertions(+), 1 deletion(-)

diff --git a/config/cluster.json b/config/cluster.json
index 65e9360..ae8081b 100644
--- a/config/cluster.json
+++ b/config/cluster.json
@@ -142,5 +142,14 @@
         "threads": "16",
         "mem": "64g",
         "time": "0-12:00:00"
+    },
+    "sqanti_minimap2": {
+        "threads": "4",
+        "mem": "32g",
+        "time": "0-08:00:00"
+    },
+    "sqanti_nanocount": {
+        "threads": "2",
+        "mem": "16g"
     }
 }
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 58d1fc7..4a16c8e 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -268,6 +268,12 @@ rule all:
         # Filters characterized transcripts from Sqanti QC
         # @imported from `rule sqanti_ml_filter` in rules/sqanti.smk
         join(workpath, "project", "counts", "novel", "sqanti.isoforms_MLresult_classification.txt"),
+        # Align to filtered, high-confidence SQANTI3 transcriptome
+        # @imported from `rules sqanti_minimap2` in rules/sqanti.smk
+        expand(
+            join(workpath, "{name}", "bams", "{name}.sorted.sqanti.transcriptome.bam"),
+            name=samples
+        ),
         # Nanopolish polyA tail length estimation,
         # needs index to map basecalled reads to
         # raw signal from the ONT sequencer
diff --git a/workflow/rules/sqanti.smk b/workflow/rules/sqanti.smk
index c062045..563b6f8 100644
--- a/workflow/rules/sqanti.smk
+++ b/workflow/rules/sqanti.smk
@@ -108,4 +108,58 @@ rule sqanti_ml_filter:
         --gtf {input.gtf} \\
         --isoforms {input.fa} \\
         {input.txt}
-    """
\ No newline at end of file
+    """
+
+
+rule sqanti_minimap2:
+    """
+    Data-processing step to align direct RNA reads against filtered, high-
+    confidence annotated transcriptome from SQANTI.
+    @Input:
+        Nanofilt quality filtered FastQ file (scatter),
+        Transcriptomic FASTA file
+    @Output:
+        Transcriptomic BAM file
+    """
+    input:
+        fq  = join(workpath, "{name}", "fastqs", "{name}.filtered.fastq.gz"),
+        ref  = join(workpath, "project", "counts", "novel", "sqanti.isoforms.filtered.fasta"),
+    output:
+        bam = join(workpath, "{name}", "bams", "{name}.sorted.sqanti.transcriptome.bam"),
+        bai = join(workpath, "{name}", "bams", "{name}.sorted.sqanti.transcriptome.bam.bai"),
+    params:
+        rname = 'sqanti_minimap2',
+        tmpdir = join(workpath, "{name}", "bams", "sqanti_transcriptome_tmp"),
+    conda: depending(join(workpath, config['conda']['modr']), use_conda)
+    container: depending(config['images']['modr'], use_singularity)
+    threads: int(allocated("threads", "sqanti_minimap2", cluster)) 
+    shell: 
+        """
+        # Setups temporary directory for
+        # intermediate files with built-in 
+        # mechanism for deletion on exit
+        if [ ! -d "{params.tmpdir}" ]; then mkdir -p "{params.tmpdir}"; fi
+        tmp=$(mktemp -d -p "{params.tmpdir}")
+        trap 'rm -rf "${{tmp}}"' EXIT
+
+        # Align against reference genome,
+        # minimap2 automatically handles
+        # conversion of U to T bps, See 
+        # this issue for the author of 
+        # minimap2's recomendations for 
+        # aligning direct RNA reads against 
+        # the transcriptome:
+        # https://github.com/lh3/minimap2/issues/340  
+        minimap2 \\
+            -ax map-ont \\
+            -N 10 \\
+            -k10 \\
+            {input.ref} \\
+            {input.fq} \\
+        | samtools sort -@{threads} \\
+            -T "${{tmp}}" \\
+            -O bam \\
+            --write-index \\
+            -o {output.bam}##idx##{output.bai} \\
+            -
+        """
\ No newline at end of file