Merge pull request #17 from skchronicles/main

Feat: adding whole exome sequencing (WES) pipeline, resolves #16
OpenOmics · Feb 1, 2024 · 6988c86 · 6988c86
2 parents 6b85cf4 + eb64f8e
commit 6988c86
Show file tree

Hide file tree

Showing 19 changed files with 817 additions and 43 deletions.
diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
@@ -15,7 +15,7 @@ jobs:
     steps:
     - uses: actions/checkout@v2
     - uses: docker://snakemake/snakemake:v7.25.3
-    - name: Dry Run with test data, all options
+    - name: Dry Run WGS pipeline with test data, all options
       run: |
         docker run -v $PWD:/opt2 snakemake/snakemake:v7.25.3 \
         /opt2/genome-seek run --input \
@@ -27,7 +27,20 @@ jobs:
         --call-hla --call-sv --call-cnv --call-somatic --open-cravat \
         --oc-annotators encode_tfbs ccre_screen vista_enhancer gnomad3 thousandgenomes cadd \
         --pon /opt2/.tests/1000g_pon.hg38.vcf.gz --dry-run
-    - name: Dry Run with test data, skip QC
+    - name: Dry Run WES pipeline with test data, all options
+      run: |
+        docker run -v $PWD:/opt2 snakemake/snakemake:v7.25.3 \
+        /opt2/genome-seek run --input \
+        /opt2/.tests/WT_S1.R1.fastq.gz /opt2/.tests/WT_S1.R2.fastq.gz \
+        /opt2/.tests/WT_S2_R1.fastq.gz /opt2/.tests/WT_S2_R2.fastq.gz \
+        /opt2/.tests/WT_S3_1.fastq.gz /opt2/.tests/WT_S3_2.fastq.gz \
+        /opt2/.tests/WT_S4_R1.001.fastq.gz /opt2/.tests/WT_S4_R2.001.fastq.gz \
+        --output /opt2/output --mode local --pairs /opt2/.tests/pairs.tsv \
+        --call-hla --call-sv --call-cnv --call-somatic --open-cravat \
+        --oc-annotators encode_tfbs ccre_screen vista_enhancer gnomad3 thousandgenomes cadd \
+        --pon /opt2/.tests/1000g_pon.hg38.vcf.gz \
+        --wes-mode --wes-bed /opt2/.tests/wes_regions.bed --dry-run
+    - name: Dry Run WGS pipeline with test data, skip QC
       run: |
         docker run -v $PWD:/opt2 snakemake/snakemake:v7.25.3 \
         /opt2/genome-seek run --input \

diff --git a/.tests/wes_regions.bed b/.tests/wes_regions.bed
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-0.6.0
+0.7.0
diff --git a/config/cluster.json b/config/cluster.json
@@ -52,6 +52,16 @@
         "partition": "norm",
         "time": "12:00:00"
     },
+    "cnvkit_access": {
+        "threads": "2",
+        "mem": "8G",
+        "time": "2:00:00"
+    },
+    "cnvkit_batch": {
+        "threads": "8",
+        "mem": "64G",
+        "time": "1-00:00:00"
+    },
     "collectvariantcallmetrics": {
         "threads": "2",
         "time": "12:00:00"
@@ -333,6 +343,16 @@
         "time": "12:00:00",
         "mem": "24G"
     },
+    "sequenza": {
+        "threads": "8",
+        "mem": "64G",
+        "time": "1-00:00:00"
+    },
+    "sequenza_utils": {
+        "threads": "8",
+        "mem": "64G",
+        "time": "1-00:00:00"
+    },
     "snpeff": {
         "mem": "24G",
         "time": "12:00:00"

diff --git a/config/containers.json b/config/containers.json
@@ -5,7 +5,7 @@
         "genome-seek_somatic":  "docker://skchronicles/genome-seek_somatic:v0.1.0",
         "genome-seek_qc":       "docker://skchronicles/genome-seek_qc:v0.1.0",
         "genome-seek_sv":       "docker://skchronicles/genome-seek_sv:v0.2.0",
-        "genome-seek_cnv":      "docker://skchronicles/genome-seek_cnv:v0.2.0",
+        "genome-seek_cnv":      "docker://skchronicles/genome-seek_cnv:v0.3.0",
         "genome-seek_hla":      "docker://skchronicles/genome-seek_hla:v0.1.0",
         "base":                 "docker://skchronicles/ccbr_wes_base:v0.1.0",
         "deepvariant_gpu":      "docker://google/deepvariant:1.5.0-gpu",
@@ -14,6 +14,7 @@
         "open_cravat":          "docker://skchronicles/ncbr_opencravat:v0.1.0",
         "octopus":              "docker://skchronicles/ncbr_octopus:v0.2.0",
         "sigprofiler":          "docker://skchronicles/ncbr_sigprofiler:v0.1.0",
+        "sequenza":             "docker://sequenza/sequenza:3.0.0",
         "vcf2maf":              "docker://skchronicles/ncbr_vcf2maf:v0.1.0"
     }
 }
diff --git a/config/genome.json b/config/genome.json
@@ -99,6 +99,8 @@
         "OCTOPUS_ERROR_MODEL": "PCR-FREE.NOVASEQ",
         "PEDDY_FILTER": "chr22",
         "PON": "/data/OpenOmics/references/genome-seek/PON/hg38.noCOSMIC_ClinVar.pon.vcf.gz",
+        "SEQUENZA_GC": "/data/OpenOmics/references/genome-seek/FREEC/hg38_gc50Base.txt.gz",
+        "SEQUENZA_SPECIES": "Human",
         "SIGPROFILER_GENOME": "GRCh38",
         "SNPEFF_GENOME": "GRCh38.86",
         "SNPEFF_CONFIG": "/data/OpenOmics/references/genome-seek/snpEff/4.3t/snpEff.config",
@@ -113,6 +115,7 @@
         "VEP_BUILD": "GRCh38",
         "VEP_DATA": "/data/OpenOmics/references/genome-seek/VEP/106/cache",
         "VEP_SPECIES": "homo_sapiens",
-        "VEP_REF_VERSION": "106"
+        "VEP_REF_VERSION": "106",
+        "WES_BED": "/data/OpenOmics/references/genome-seek/wes_bedfiles/gencode_v44_protein-coding_exons.bed"
     }
-}
+}
diff --git a/config/modules.json b/config/modules.json
@@ -8,6 +8,7 @@
         "bedtools": "bedtools/2.27.1",
         "canvas": "Canvas/1.40",
         "circos": "circos/0.69-9",
+        "cnvkit": "cnvkit/0.9.9",
         "deepvariant": "deepvariant/1.4.0",
         "perl": "perl/5.24.3",
         "peddy": "peddy/0.3.1",
@@ -20,6 +21,7 @@
         "bwa_mem2": "bwa-mem2/2.2.1",
         "rlang": "R/4.2.0",
         "samblaster": "samblaster/0.1.25",
+        "sequenza": "sequenza-utils/3.0.0",
         "strelka": "strelka/2.9.10",
         "svtools": "svtools/0.5.1",
         "python2": "python/2.7",

diff --git a/genome-seek b/genome-seek
@@ -32,7 +32,6 @@ EXAMPLE:
 
 # Python standard library
 from __future__ import print_function
-from operator import sub
 import sys, os, subprocess, re, json, textwrap
 
 # 3rd party imports from pypi
@@ -51,13 +50,14 @@ from src.utils import (
     pairs,
     permissions,
     check_cache,
-    require)
+    require
+)
 
 
 # Pipeline Metadata
 __version__ = version
-__authors__ = 'Skyler Kuhn, Justin Lack'
-__email__ = 'skyler.kuhn@nih.gov, justin.lack@nih.gov'
+__authors__ = 'Skyler Kuhn, Justin Lack, Keyur Talsania'
+__email__ = 'skyler.kuhn <AT> nih.gov, justin.lack <AT> nih.gov, keyur.talsania <AT> nih.gov'
 __home__  =  os.path.dirname(os.path.abspath(__file__))
 _name = os.path.basename(sys.argv[0])
 _description = 'Clinical whole genome sequencing pipeline'
@@ -188,6 +188,10 @@ def run(sub_args):
     if sub_args.pon: 
         # Use custom provided PON
         config['references']['PON'] = sub_args.pon
+
+    if sub_args.wes_bed:
+        # Use custom provided WES catpure kit
+        config['references']['WES_BED'] = sub_args.wes_bed
 
     # Save config to output directory
     with open(os.path.join(sub_args.output, 'config.json'), 'w') as fh:
@@ -355,8 +359,9 @@ def parsed_arguments(name, description):
                 [--mode {{slurm,local}}] [--job-name JOB_NAME] [--batch-id BATCH_ID] \\
                 [--call-cnv] [--call-sv] [--call-hla] [--call-somatic] [--open-cravat] \\
                 [--skip-qc] [--oc-annotators OC_ANNOTATORS] [--oc-modules OC_MODULES] \\
-                [--pairs PAIRS] [--pon PANEL_OF_NORMALS] [--tmp-dir TMP_DIR] [--silent] \\
-                [--sif-cache SIF_CACHE] [--singularity-cache SINGULARITY_CACHE] \\
+                [--pairs PAIRS] [--pon PANEL_OF_NORMALS] [--wes-mode] [--wes-bed WES_BED] \\
+                [--tmp-dir TMP_DIR] [--silent] [--sif-cache SIF_CACHE] \\
+                [--singularity-cache SINGULARITY_CACHE] \\
                 [--dry-run] [--threads THREADS] \\
                 --input INPUT [INPUT ...] \\
                 --output OUTPUT
@@ -419,6 +424,24 @@ def parsed_arguments(name, description):
                                 changes to data processing steps or when evaluating 
                                 the overall accuracy and precision of the pipeline.
                                   Example: --skip-qc
+          
+          --wes-mode            Run the whole exome pipeline. By default, the whole
+                                genome sequencing pipeline is run. This option allows
+                                a user to process and analyze whole exome sequencing 
+                                data. Please note when this mode is enabled, a sub-
+                                set of the WGS rules will run. Please see the option
+                                below for more information about providing a custom
+                                exome targets BED file.
+                                  Example: --wes-mode
+
+          --wes-bed WES_BED
+                                Path to exome targets BED file. This file can be
+                                obtained from the manufacturer of the target capture
+                                kit that was used. By default, a set of BED files 
+                                generated from GENCODE's exon annotation for protein
+                                coding gene's exon is used.
+                                  Example: --wes-bed gencode_v44_protein-coding_exons.bed
+        
           --batch-id BATCH_ID   
                                 Unique identifer for a batch of samples. A batch 
                                 identifer should be a string containing no spaces.
@@ -667,6 +690,25 @@ def parsed_arguments(name, description):
         help = argparse.SUPPRESS
     )
 
+    # Run the WES pipeline
+    subparser_run.add_argument(
+        '--wes-mode',
+        action = 'store_true',
+        required = False,
+        default = False,
+        help = argparse.SUPPRESS
+    )
+
+    # WES capture targets BED file
+    subparser_run.add_argument(
+        '--wes-bed',
+        # Check if the file exists and if it is readable
+        type = lambda file: permissions(parser, file, os.R_OK),
+        required = False,
+        default = None,
+        help = argparse.SUPPRESS
+    )
+
     # Tumor-normal pairs file
     subparser_run.add_argument(
         '--pairs',

diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -33,6 +33,7 @@ tumors   = list(config['pairs'].keys())        # List of tumor samples
 normals  = list(set(config['pairs'].values())) # List of normal samples
 normals  = [n for n in normals if n]           # Remove tumor-onlys, i.e. empty strings
 tumor2normal =  config['pairs']                # Dict to map a tumor to its normal
+
 # List of tumor samples with a paired normal
 tumorWnormal = [t for t in tumors if tumor2normal[t]]
 
@@ -56,6 +57,20 @@ run_oc  = str_bool(                      # Run OpenCRAVAT rules,
     config['options']['open_cravat']     # default: False
 )
 
+# Whole-exome sequencing pipeline options
+# By default, the WGS pipeline runs.
+# The WES pipeline will run if the 
+# --wes  option or the --wes-bed
+# option are provided. If just the
+# --wes option is provided the default
+# GENCODE exons BED file is used.
+wes_bed_provided = config['options']['wes_bed']   # default: None
+wes_bed_file = config['references']['WES_BED']    # default: gencode_v44_protein-coding_exons.bed
+# Run WES pipeline, default: False
+run_wes = (wes_bed_provided != "None" or str_bool(config['options']['wes_mode']))
+
+
+
 # List of somatic callers
 # TODO: add as cli option,
 # make sure at least one
@@ -80,6 +95,14 @@ with open(join('config', 'cluster.json')) as fh:
 # Final ouput files of the pipeline
 rule all:
     input:
+        # Build and reformats WES BED capture kit,
+        # Optional step, only runs if --wes or --wes-bed
+        # option(s) provided.
+        # @imported from rules/wes.smk
+        provided(
+            [join(workpath, "references", "wes_regions_50bp_padded.bed")],
+            run_wes
+        ),
         # FastQ Information, flowcell and lanes
         # Optional step, not run if --skip-qc 
         # @imported from rules/qc.smk
@@ -418,33 +441,53 @@ rule all:
         # HMF Tools Purple, estimates CNV, purity and ploidy
         # only runs if provided --call-somatic AND --call-cnv,
         # purple will optionally use somatic SVs from MANTA 
-        # if --call-sv is also provided at runtime.
+        # if --call-sv is also provided at runtime. These
+        # steps will only run for WGS data, if the --wes-mode
+        # option is provided the steps will be skipped. 
         # @imported from rules/cnv.smk
         expand( 
             join(workpath, "hmftools", "amber", "{name}", "{name}.amber.baf.tsv"),
-            name=provided(provided(tumors, call_somatic), call_cnv)
+            name=provided(provided(tumors, call_somatic), call_cnv and not run_wes)
         ),
         expand( 
             join(workpath, "hmftools", "cobalt", "{name}", "{name}.cobalt.ratio.tsv"),
-            name=provided(provided(tumors, call_somatic), call_cnv)
+            name=provided(provided(tumors, call_somatic), call_cnv and not run_wes)
         ),
         expand( 
             join(workpath, "hmftools", "purple", "{name}", "{name}.purple.cnv.somatic.tsv"),
-            name=provided(provided(tumors, call_somatic), call_cnv)
+            name=provided(provided(tumors, call_somatic), call_cnv and not run_wes)
         ),
         expand( 
             join(workpath, "hmftools", "purple", "{name}", "{name}.purple.maf"),
-            name=provided(provided(tumors, call_somatic), call_cnv)
+            name=provided(provided(tumors, call_somatic), call_cnv and not run_wes)
         ),
         # Maftools, create cohort-level summary plots from purple somatic VCF,
-        # only runs if provided --call-somatic AND --call-cnv
+        # only runs if provided --call-somatic AND --call-cnv and NOT WES mode,
+        # This step will only run for WGS data.
         # @imported from rules/somatic.smk    
         provided(
             provided(
                 [join(workpath, "hmftools", "cohort_oncoplot.pdf")],
                 call_somatic
             ),
-            call_cnv
+            call_cnv and not run_wes
+        ),
+        # Sequenza, estimates purity/ploidy and CNV,
+        # only runs if provided --call-somatic AND --call-cnv AND --wes-mode
+        # AND only runs with tumor-normal pairs,
+        # @imported from rules/wes.smk
+        expand( 
+            join(workpath, "sequenza_out", "{name}_alternative_solutions.txt"),
+            name=provided(tumorWnormal, call_somatic and call_cnv and run_wes)
+        ),
+        # cnvkit, infer and visualize CNVs in targeted sequencing data,
+        # only runs if provided --call-somatic AND --call-cnv AND --wes-mode
+        # AND only runs with tumor-normal pairs
+        # NOTE: currently only being run in the WES pipeline
+        # @imported from rules/wes.smk
+        expand( 
+            join(workpath, "cnvkit", "{name}", "{name}.call.cns"),
+            name=provided(tumorWnormal, call_somatic and call_cnv and run_wes)
         ),
         # OpenCRAVAT, somatic annotation, rank and score variants,
         # only runs if provided --open-cravat AND --call-somatic
@@ -478,4 +521,5 @@ include: join("rules", "sv.smk")
 include: join("rules", "open_cravat.smk")
 include: join("rules", "gatk_recal.smk")
 include: join("rules", "somatic.smk")
+include: join("rules", "wes.smk")
 include: join("rules", "hooks.smk")
diff --git a/workflow/rules/cnv.smk b/workflow/rules/cnv.smk
@@ -507,4 +507,4 @@ rule purple_cohort_maftools:
         {input.maf} \\
         {output.summary} \\
         {output.oncoplot}
-    """
+    """