Merge pull request #1 from TRON-Bioinformatics/support-bams

Support bams
TRON-Bioinformatics · Mar 25, 2024 · 8a52bb9 · 8a52bb9
2 parents 099c6d7 + 2c08937
commit 8a52bb9
Show file tree

Hide file tree

Showing 21 changed files with 1,788 additions and 61 deletions.
diff --git a/.gitignore b/.gitignore
@@ -3,8 +3,9 @@
 /workspace.xml
 work
 .nextflow.log*
-report.html
-timeline.html
+report.html*
+timeline.html*
+trace.txt*
 trace.txt
 dag.dot
 .nextflow

diff --git a/README.md b/README.md
@@ -14,13 +14,21 @@ Nextflow (Di Tommaso, 2017) pipeline for HLA typing using HLA-HD (Kawaguchi, 201
 
 ## How to run it
 
-Prepare an input table with the FASTQs for each sample with three tab-separated columns **without a header**
+Prepare an input table with the FASTQs for each sample with three tab-separated columns **without a header** using `--input_fastqs`.
 
-| Sample name          | FASTQ1                      | FASTQ2                  |
-|----------------------|---------------------------------|------------------------------|
-| sample_1             | /path/to/sample_1.1.fq      |    /path/to/sample_1.2.fq   |
-| sample_2             | /path/to/sample_2.1.fq      |    /path/to/sample_2.2.fq  |
+| Sample name          | FASTQ1                    | FASTQ2                  |
+|----------------------|---------------------------|------------------------------|
+| sample_1             | /path/to/sample_1.1.fq.gz |    /path/to/sample_1.2.fq.gz   |
+| sample_2             | /path/to/sample_2.1.fq.gz    |    /path/to/sample_2.2.fq.gz  |
 
+Alternatively, provide a table with BAM files using `--input_bams`.
+
+| Sample name          | BAM                   |
+|----------------------|-----------------------|
+| sample_1             | /path/to/sample_1.bam |
+| sample_2             | /path/to/sample_2.bam |
+
+BAM files should be indexed.
 
 Run as indicated below.
 
@@ -34,22 +42,27 @@ Usage:
     nextflow run main.nf --input_files input_files --output output_folder
 
 Input:
-    * input_files: the path to a tab-separated values file containing in each row the sample name, FASTQ 1 and FASTQ 2
+    * input_fastqs: the path to a tab-separated values file containing in each row the sample name, FASTQ 1 and FASTQ 2
     The input file does not have header!
     Example input file:
     name1       fastq1.fq.gz    fastq2.fq.gz
     name2       fastq1.fq.gz    fastq2.fq.gz
+    * input_bams: the path to a tab-separated values file containing in each row the sample name and BAM
+    The input file does not have header!
+    Example input file:
+    name1       name1.bam
+    name2       name2.bam
     * output: output folder where results will be stored
 
 Optional input:
+    * reference: the reference genome to use (default: hg38, possible values: hg38 or hg19)
     * read_length: the read length (default: 50)
     * hlahd_folder: the HLA-HD folder (default: /code/hlahd.1.2.0.1)
     * bowtie2_folder: the bowtie2 folder (default: /code/bowtie/2.3.4.3)
     * bowtie2_module: the module to load with bowtie2
     * ld_library_path: the value to set in LD_LIBRARY_PATH
     * cpus: the number of CPUs per sample (default: 15)
     * memory: the amount of memory per sample (default: 30g)
-
 ```
 
 

diff --git a/main.nf b/main.nf
@@ -2,6 +2,9 @@
 
 nextflow.enable.dsl = 2
 
+include { BAM2FASTQ } from './modules/00_bam2fastq'
+include { HLAHD } from './modules/01_hlahd'
+
 
 def helpMessage() {
     log.info params.help_message
@@ -13,63 +16,42 @@ if (params.help) {
 }
 
 // checks required inputs
-if (params.input_files) {
+if (params.input_fastqs) {
   Channel
-    .fromPath(params.input_files)
+    .fromPath(params.input_fastqs)
     .splitCsv(header: ['name', 'fastq1', 'fastq2'], sep: "\t")
     .map{ row-> tuple(row.name, file(row.fastq1), file(row.fastq2)) }
-    .set { input_files }
+    .set { input_fastqs }
+} else if (params.input_bams) {
+  Channel
+    .fromPath(params.input_bams)
+    .splitCsv(header: ['name', 'bam'], sep: "\t")
+    .map{ row-> tuple(row.name, file(row.bam)) }
+    .set { input_bams }
 } else {
-  exit 1, "Input file not specified!"
+  exit 1, "Provide either --input_fastqs or --input_bams"
 }
 
 
-process HLAHD {
-    cpus params.cpus
-    memory params.memory
-    tag "${name}"
-    publishDir "${params.output}/${name}", mode: "copy", pattern: "*.txt"
-    module params.bowtie2_module
-
-    input:
-    tuple val(name), val(fastq1), val(fastq2)
-
-    output:
-    tuple val("${name}"), path("*final*")
-
-    script:
-    """
-    mkdir temp
-
-    # HLA-HD wants its own binaries and bowtie2 in path
-    export PATH=${params.hlahd_folder}/bin/:${params.bowtie2_folder}:\$PATH
-    export LD_LIBRARY_PATH=${params.ld_library_path}
+if (params.reference == 'hg38') {
+    contigs = "$baseDir/references/contigs_hla_reads_hg38.bed"
+}
+else if (params.reference == 'hg19') {
+    contigs = "$baseDir/references/contigs_hla_reads_hg19.bed"
+}
+else {
+    log.error "--reference only supports hg38 or hg19"
+    exit 1
+}
 
-    zcat ${fastq1} > input_fastq1.fastq
-    zcat ${fastq2} > input_fastq2.fastq
 
-    # HLA HD does not accept gzipped fastq files, unzip them first
-    hlahd.sh \
-        -m ${params.read_length} \
-        -t ${task.cpus} \
-        -f ${params.hlahd_folder}/freq_data/ \
-        input_fastq1.fastq \
-        input_fastq2.fastq \
-        ${params.hlahd_folder}/HLA_gene.split.txt \
-        ${params.hlahd_folder}/dictionary/ \
-        ${name} \
-        temp
 
-    # moves the final result to base folder
-    mv temp/${name}/result/* .
+workflow {
 
-    # deletes temp folder
-    rm -rf temp
-    rm -f input_fastq1.fastq
-    rm -f input_fastq2.fastq
-    """
-}
+    if (input_bams) {
+        BAM2FASTQ(input_bams, contigs)
+        input_fastqs =BAM2FASTQ.out
+    }
 
-workflow {
-    HLAHD(input_files)
+    HLAHD(input_fastqs)
 }
diff --git a/modules/00_bam2fastq.nf b/modules/00_bam2fastq.nf
@@ -0,0 +1,42 @@
+process BAM2FASTQ {
+    cpus params.cpus
+    memory params.memory
+    tag "${name}"
+    publishDir "${params.output}/${name}", mode: "copy", pattern: "*.txt"
+    module params.bowtie2_module
+
+    conda (params.enable_conda ? "bioconda::samtools=1.18 bioconda::gatk4=4.2.5.0" : null)
+
+    input:
+    tuple val(name), val(bam)
+    val(contigs)
+
+    output:
+    tuple val("${name}"), path("${name}.hla.1.fastq.gz"), path("${name}.hla.2.fastq.gz")
+
+    script:
+    """
+    samtools index $bam
+
+    # gets reads in the provided regions
+    # only non duplicated reads
+    samtools view \
+        -b \
+        -L ${contigs} \
+        -F 1024 \
+        ${bam} > ${name}.mhc.bam
+
+    # Extract unmap reads
+    samtools view -b -f 4 $bam > ${name}.unmap.bam
+
+    #Merge bam files
+    samtools merge -o ${name}.merge.bam ${name}.unmap.bam ${name}.mhc.bam
+
+    #Create fastq
+    gatk SamToFastq --VALIDATION_STRINGENCY SILENT -I ${name}.merge.bam -F ${name}.hlatmp.1.fastq -F2 ${name}.hlatmp.2.fastq
+
+    #Change fastq ID
+    cat ${name}.hlatmp.1.fastq |awk '{if(NR%4 == 1){O=\$0;gsub("/1"," 1",O);print O}else{print \$0}}' | gzip > ${name}.hla.1.fastq.gz
+    cat ${name}.hlatmp.2.fastq |awk '{if(NR%4 == 1){O=\$0;gsub("/2"," 2",O);print O}else{print \$0}}' | gzip > ${name}.hla.2.fastq.gz
+    """
+}
diff --git a/modules/01_hlahd.nf b/modules/01_hlahd.nf
@@ -0,0 +1,45 @@
+process HLAHD {
+    cpus params.cpus
+    memory params.memory
+    tag "${name}"
+    publishDir "${params.output}/${name}", mode: "copy", pattern: "*.txt"
+    module params.bowtie2_module
+
+    input:
+    tuple val(name), val(fastq1), val(fastq2)
+
+    output:
+    tuple val("${name}"), path("*final*")
+
+    script:
+    """
+    mkdir temp
+
+    # HLA-HD wants its own binaries and bowtie2 in path
+    export PATH=${params.hlahd_folder}/bin/:${params.bowtie2_folder}:\$PATH
+    export LD_LIBRARY_PATH=${params.ld_library_path}
+
+    zcat ${fastq1} > input_fastq1.fastq
+    zcat ${fastq2} > input_fastq2.fastq
+
+    # HLA HD does not accept gzipped fastq files, unzip them first
+    hlahd.sh \
+        -m ${params.read_length} \
+        -t ${task.cpus} \
+        -f ${params.hlahd_folder}/freq_data/ \
+        input_fastq1.fastq \
+        input_fastq2.fastq \
+        ${params.hlahd_folder}/HLA_gene.split.txt \
+        ${params.hlahd_folder}/dictionary/ \
+        ${name} \
+        temp
+
+    # moves the final result to base folder
+    mv temp/${name}/result/* .
+
+    # deletes temp folder
+    rm -rf temp
+    rm -f input_fastq1.fastq
+    rm -f input_fastq2.fastq
+    """
+}
diff --git a/nextflow.config b/nextflow.config
@@ -14,9 +14,21 @@ params.bowtie2_folder = "/code/bowtie/2.3.4.3"
 params.bowtie2_module = "bioinf/bowtie2/2.3.4.3"
 params.ld_library_path = "/usr/local/lib64/"
 params.read_length = 50
+params.reference = 'hg38'
 
 profiles {
+  conda {
+    params.enable_conda = true
+  }
   debug { process.beforeScript = 'echo $HOSTNAME' }
+  test {
+    params.cpus = 1
+    params.memory = "3g"
+    timeline.enabled = false
+    report.enabled = false
+    trace.enabled = false
+    dag.enabled = false
+  }
 }
 
 // Export this variable to prevent local Python libraries from conflicting with those in the container
@@ -27,7 +39,7 @@ env {
 // Capture exit codes from upstream processes when piping
 process.shell = ['/bin/bash', '-euo', 'pipefail']
 
-VERSION = '0.1.0'
+VERSION = '0.2.0'
 
 manifest {
   name = 'TRON-Bioinformatics/tronflow-hlahd'
@@ -40,20 +52,28 @@ manifest {
 }
 
 params.help_message = """
-TronFlow HLA-HD v${VERSION}
+nextflow run tron-bioinformatics/tronflow-hla-hd --help
+
+Launching `main.nf` [intergalactic_shannon] - revision: e707c77d7b
 
 Usage:
     nextflow run main.nf --input_files input_files --output output_folder
 
 Input:
-    * input_files: the path to a tab-separated values file containing in each row the sample name, FASTQ 1 and FASTQ 2
+    * input_fastqs: the path to a tab-separated values file containing in each row the sample name, FASTQ 1 and FASTQ 2
+    The input file does not have header!
+    Example input file:
+    name1       fastq1.fq.gz    fastq2.fq.gz
+    name2       fastq1.fq.gz    fastq2.fq.gz
+    * input_bams: the path to a tab-separated values file containing in each row the sample name and BAM
     The input file does not have header!
     Example input file:
-    name1	fastq1.fq.gz	fastq2.fq.gz
-    name2	fastq1.fq.gz	fastq2.fq.gz
+    name1       name1.bam
+    name2       name2.bam
     * output: output folder where results will be stored
 
 Optional input:
+    * reference: the reference genome to use (default: hg38, possible values: hg38 or hg19)
     * read_length: the read length (default: 50)
     * hlahd_folder: the HLA-HD folder (default: /code/hlahd.1.2.0.1)
     * bowtie2_folder: the bowtie2 folder (default: /code/bowtie/2.3.4.3)

diff --git a/references/contigs_hla_reads_hg19.bed b/references/contigs_hla_reads_hg19.bed
@@ -0,0 +1,8 @@
+chr6	28477797	33448354
+chr6_apd_hap1	1	4622290
+chr6_cox_hap2	1	4795371
+chr6_dbb_hap3	1	4610396
+chr6_mann_hap4	1	4683263
+chr6_mcf_hap5	1	4833398
+chr6_qbl_hap6	1	4611984
+chr6_ssto_hap7	1	4928567