Skip to content

Commit

Permalink
Merge pull request #17 from skchronicles/main
Browse files Browse the repository at this point in the history
Feat: adding whole exome sequencing (WES) pipeline, resolves #16
  • Loading branch information
skchronicles committed Feb 1, 2024
2 parents 6b85cf4 + eb64f8e commit 6988c86
Show file tree
Hide file tree
Showing 19 changed files with 817 additions and 43 deletions.
17 changes: 15 additions & 2 deletions .github/workflows/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
steps:
- uses: actions/checkout@v2
- uses: docker://snakemake/snakemake:v7.25.3
- name: Dry Run with test data, all options
- name: Dry Run WGS pipeline with test data, all options
run: |
docker run -v $PWD:/opt2 snakemake/snakemake:v7.25.3 \
/opt2/genome-seek run --input \
Expand All @@ -27,7 +27,20 @@ jobs:
--call-hla --call-sv --call-cnv --call-somatic --open-cravat \
--oc-annotators encode_tfbs ccre_screen vista_enhancer gnomad3 thousandgenomes cadd \
--pon /opt2/.tests/1000g_pon.hg38.vcf.gz --dry-run
- name: Dry Run with test data, skip QC
- name: Dry Run WES pipeline with test data, all options
run: |
docker run -v $PWD:/opt2 snakemake/snakemake:v7.25.3 \
/opt2/genome-seek run --input \
/opt2/.tests/WT_S1.R1.fastq.gz /opt2/.tests/WT_S1.R2.fastq.gz \
/opt2/.tests/WT_S2_R1.fastq.gz /opt2/.tests/WT_S2_R2.fastq.gz \
/opt2/.tests/WT_S3_1.fastq.gz /opt2/.tests/WT_S3_2.fastq.gz \
/opt2/.tests/WT_S4_R1.001.fastq.gz /opt2/.tests/WT_S4_R2.001.fastq.gz \
--output /opt2/output --mode local --pairs /opt2/.tests/pairs.tsv \
--call-hla --call-sv --call-cnv --call-somatic --open-cravat \
--oc-annotators encode_tfbs ccre_screen vista_enhancer gnomad3 thousandgenomes cadd \
--pon /opt2/.tests/1000g_pon.hg38.vcf.gz \
--wes-mode --wes-bed /opt2/.tests/wes_regions.bed --dry-run
- name: Dry Run WGS pipeline with test data, skip QC
run: |
docker run -v $PWD:/opt2 snakemake/snakemake:v7.25.3 \
/opt2/genome-seek run --input \
Expand Down
Empty file added .tests/wes_regions.bed
Empty file.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.6.0
0.7.0
20 changes: 20 additions & 0 deletions config/cluster.json
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,16 @@
"partition": "norm",
"time": "12:00:00"
},
"cnvkit_access": {
"threads": "2",
"mem": "8G",
"time": "2:00:00"
},
"cnvkit_batch": {
"threads": "8",
"mem": "64G",
"time": "1-00:00:00"
},
"collectvariantcallmetrics": {
"threads": "2",
"time": "12:00:00"
Expand Down Expand Up @@ -333,6 +343,16 @@
"time": "12:00:00",
"mem": "24G"
},
"sequenza": {
"threads": "8",
"mem": "64G",
"time": "1-00:00:00"
},
"sequenza_utils": {
"threads": "8",
"mem": "64G",
"time": "1-00:00:00"
},
"snpeff": {
"mem": "24G",
"time": "12:00:00"
Expand Down
3 changes: 2 additions & 1 deletion config/containers.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"genome-seek_somatic": "docker://skchronicles/genome-seek_somatic:v0.1.0",
"genome-seek_qc": "docker://skchronicles/genome-seek_qc:v0.1.0",
"genome-seek_sv": "docker://skchronicles/genome-seek_sv:v0.2.0",
"genome-seek_cnv": "docker://skchronicles/genome-seek_cnv:v0.2.0",
"genome-seek_cnv": "docker://skchronicles/genome-seek_cnv:v0.3.0",
"genome-seek_hla": "docker://skchronicles/genome-seek_hla:v0.1.0",
"base": "docker://skchronicles/ccbr_wes_base:v0.1.0",
"deepvariant_gpu": "docker://google/deepvariant:1.5.0-gpu",
Expand All @@ -14,6 +14,7 @@
"open_cravat": "docker://skchronicles/ncbr_opencravat:v0.1.0",
"octopus": "docker://skchronicles/ncbr_octopus:v0.2.0",
"sigprofiler": "docker://skchronicles/ncbr_sigprofiler:v0.1.0",
"sequenza": "docker://sequenza/sequenza:3.0.0",
"vcf2maf": "docker://skchronicles/ncbr_vcf2maf:v0.1.0"
}
}
7 changes: 5 additions & 2 deletions config/genome.json
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@
"OCTOPUS_ERROR_MODEL": "PCR-FREE.NOVASEQ",
"PEDDY_FILTER": "chr22",
"PON": "/data/OpenOmics/references/genome-seek/PON/hg38.noCOSMIC_ClinVar.pon.vcf.gz",
"SEQUENZA_GC": "/data/OpenOmics/references/genome-seek/FREEC/hg38_gc50Base.txt.gz",
"SEQUENZA_SPECIES": "Human",
"SIGPROFILER_GENOME": "GRCh38",
"SNPEFF_GENOME": "GRCh38.86",
"SNPEFF_CONFIG": "/data/OpenOmics/references/genome-seek/snpEff/4.3t/snpEff.config",
Expand All @@ -113,6 +115,7 @@
"VEP_BUILD": "GRCh38",
"VEP_DATA": "/data/OpenOmics/references/genome-seek/VEP/106/cache",
"VEP_SPECIES": "homo_sapiens",
"VEP_REF_VERSION": "106"
"VEP_REF_VERSION": "106",
"WES_BED": "/data/OpenOmics/references/genome-seek/wes_bedfiles/gencode_v44_protein-coding_exons.bed"
}
}
}
2 changes: 2 additions & 0 deletions config/modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"bedtools": "bedtools/2.27.1",
"canvas": "Canvas/1.40",
"circos": "circos/0.69-9",
"cnvkit": "cnvkit/0.9.9",
"deepvariant": "deepvariant/1.4.0",
"perl": "perl/5.24.3",
"peddy": "peddy/0.3.1",
Expand All @@ -20,6 +21,7 @@
"bwa_mem2": "bwa-mem2/2.2.1",
"rlang": "R/4.2.0",
"samblaster": "samblaster/0.1.25",
"sequenza": "sequenza-utils/3.0.0",
"strelka": "strelka/2.9.10",
"svtools": "svtools/0.5.1",
"python2": "python/2.7",
Expand Down
54 changes: 48 additions & 6 deletions genome-seek
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ EXAMPLE:

# Python standard library
from __future__ import print_function
from operator import sub
import sys, os, subprocess, re, json, textwrap

# 3rd party imports from pypi
Expand All @@ -51,13 +50,14 @@ from src.utils import (
pairs,
permissions,
check_cache,
require)
require
)


# Pipeline Metadata
__version__ = version
__authors__ = 'Skyler Kuhn, Justin Lack'
__email__ = 'skyler.kuhn@nih.gov, justin.lack@nih.gov'
__authors__ = 'Skyler Kuhn, Justin Lack, Keyur Talsania'
__email__ = 'skyler.kuhn <AT> nih.gov, justin.lack <AT> nih.gov, keyur.talsania <AT> nih.gov'
__home__ = os.path.dirname(os.path.abspath(__file__))
_name = os.path.basename(sys.argv[0])
_description = 'Clinical whole genome sequencing pipeline'
Expand Down Expand Up @@ -188,6 +188,10 @@ def run(sub_args):
if sub_args.pon:
# Use custom provided PON
config['references']['PON'] = sub_args.pon

if sub_args.wes_bed:
# Use custom provided WES catpure kit
config['references']['WES_BED'] = sub_args.wes_bed

# Save config to output directory
with open(os.path.join(sub_args.output, 'config.json'), 'w') as fh:
Expand Down Expand Up @@ -355,8 +359,9 @@ def parsed_arguments(name, description):
[--mode {{slurm,local}}] [--job-name JOB_NAME] [--batch-id BATCH_ID] \\
[--call-cnv] [--call-sv] [--call-hla] [--call-somatic] [--open-cravat] \\
[--skip-qc] [--oc-annotators OC_ANNOTATORS] [--oc-modules OC_MODULES] \\
[--pairs PAIRS] [--pon PANEL_OF_NORMALS] [--tmp-dir TMP_DIR] [--silent] \\
[--sif-cache SIF_CACHE] [--singularity-cache SINGULARITY_CACHE] \\
[--pairs PAIRS] [--pon PANEL_OF_NORMALS] [--wes-mode] [--wes-bed WES_BED] \\
[--tmp-dir TMP_DIR] [--silent] [--sif-cache SIF_CACHE] \\
[--singularity-cache SINGULARITY_CACHE] \\
[--dry-run] [--threads THREADS] \\
--input INPUT [INPUT ...] \\
--output OUTPUT
Expand Down Expand Up @@ -419,6 +424,24 @@ def parsed_arguments(name, description):
changes to data processing steps or when evaluating
the overall accuracy and precision of the pipeline.
Example: --skip-qc
--wes-mode Run the whole exome pipeline. By default, the whole
genome sequencing pipeline is run. This option allows
a user to process and analyze whole exome sequencing
data. Please note when this mode is enabled, a sub-
set of the WGS rules will run. Please see the option
below for more information about providing a custom
exome targets BED file.
Example: --wes-mode
--wes-bed WES_BED
Path to exome targets BED file. This file can be
obtained from the manufacturer of the target capture
kit that was used. By default, a set of BED files
generated from GENCODE's exon annotation for protein
coding gene's exon is used.
Example: --wes-bed gencode_v44_protein-coding_exons.bed
--batch-id BATCH_ID
Unique identifer for a batch of samples. A batch
identifer should be a string containing no spaces.
Expand Down Expand Up @@ -667,6 +690,25 @@ def parsed_arguments(name, description):
help = argparse.SUPPRESS
)

# Run the WES pipeline
subparser_run.add_argument(
'--wes-mode',
action = 'store_true',
required = False,
default = False,
help = argparse.SUPPRESS
)

# WES capture targets BED file
subparser_run.add_argument(
'--wes-bed',
# Check if the file exists and if it is readable
type = lambda file: permissions(parser, file, os.R_OK),
required = False,
default = None,
help = argparse.SUPPRESS
)

# Tumor-normal pairs file
subparser_run.add_argument(
'--pairs',
Expand Down
58 changes: 51 additions & 7 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ tumors = list(config['pairs'].keys()) # List of tumor samples
normals = list(set(config['pairs'].values())) # List of normal samples
normals = [n for n in normals if n] # Remove tumor-onlys, i.e. empty strings
tumor2normal = config['pairs'] # Dict to map a tumor to its normal

# List of tumor samples with a paired normal
tumorWnormal = [t for t in tumors if tumor2normal[t]]

Expand All @@ -56,6 +57,20 @@ run_oc = str_bool( # Run OpenCRAVAT rules,
config['options']['open_cravat'] # default: False
)

# Whole-exome sequencing pipeline options
# By default, the WGS pipeline runs.
# The WES pipeline will run if the
# --wes option or the --wes-bed
# option are provided. If just the
# --wes option is provided the default
# GENCODE exons BED file is used.
wes_bed_provided = config['options']['wes_bed'] # default: None
wes_bed_file = config['references']['WES_BED'] # default: gencode_v44_protein-coding_exons.bed
# Run WES pipeline, default: False
run_wes = (wes_bed_provided != "None" or str_bool(config['options']['wes_mode']))



# List of somatic callers
# TODO: add as cli option,
# make sure at least one
Expand All @@ -80,6 +95,14 @@ with open(join('config', 'cluster.json')) as fh:
# Final ouput files of the pipeline
rule all:
input:
# Build and reformats WES BED capture kit,
# Optional step, only runs if --wes or --wes-bed
# option(s) provided.
# @imported from rules/wes.smk
provided(
[join(workpath, "references", "wes_regions_50bp_padded.bed")],
run_wes
),
# FastQ Information, flowcell and lanes
# Optional step, not run if --skip-qc
# @imported from rules/qc.smk
Expand Down Expand Up @@ -418,33 +441,53 @@ rule all:
# HMF Tools Purple, estimates CNV, purity and ploidy
# only runs if provided --call-somatic AND --call-cnv,
# purple will optionally use somatic SVs from MANTA
# if --call-sv is also provided at runtime.
# if --call-sv is also provided at runtime. These
# steps will only run for WGS data, if the --wes-mode
# option is provided the steps will be skipped.
# @imported from rules/cnv.smk
expand(
join(workpath, "hmftools", "amber", "{name}", "{name}.amber.baf.tsv"),
name=provided(provided(tumors, call_somatic), call_cnv)
name=provided(provided(tumors, call_somatic), call_cnv and not run_wes)
),
expand(
join(workpath, "hmftools", "cobalt", "{name}", "{name}.cobalt.ratio.tsv"),
name=provided(provided(tumors, call_somatic), call_cnv)
name=provided(provided(tumors, call_somatic), call_cnv and not run_wes)
),
expand(
join(workpath, "hmftools", "purple", "{name}", "{name}.purple.cnv.somatic.tsv"),
name=provided(provided(tumors, call_somatic), call_cnv)
name=provided(provided(tumors, call_somatic), call_cnv and not run_wes)
),
expand(
join(workpath, "hmftools", "purple", "{name}", "{name}.purple.maf"),
name=provided(provided(tumors, call_somatic), call_cnv)
name=provided(provided(tumors, call_somatic), call_cnv and not run_wes)
),
# Maftools, create cohort-level summary plots from purple somatic VCF,
# only runs if provided --call-somatic AND --call-cnv
# only runs if provided --call-somatic AND --call-cnv and NOT WES mode,
# This step will only run for WGS data.
# @imported from rules/somatic.smk
provided(
provided(
[join(workpath, "hmftools", "cohort_oncoplot.pdf")],
call_somatic
),
call_cnv
call_cnv and not run_wes
),
# Sequenza, estimates purity/ploidy and CNV,
# only runs if provided --call-somatic AND --call-cnv AND --wes-mode
# AND only runs with tumor-normal pairs,
# @imported from rules/wes.smk
expand(
join(workpath, "sequenza_out", "{name}_alternative_solutions.txt"),
name=provided(tumorWnormal, call_somatic and call_cnv and run_wes)
),
# cnvkit, infer and visualize CNVs in targeted sequencing data,
# only runs if provided --call-somatic AND --call-cnv AND --wes-mode
# AND only runs with tumor-normal pairs
# NOTE: currently only being run in the WES pipeline
# @imported from rules/wes.smk
expand(
join(workpath, "cnvkit", "{name}", "{name}.call.cns"),
name=provided(tumorWnormal, call_somatic and call_cnv and run_wes)
),
# OpenCRAVAT, somatic annotation, rank and score variants,
# only runs if provided --open-cravat AND --call-somatic
Expand Down Expand Up @@ -478,4 +521,5 @@ include: join("rules", "sv.smk")
include: join("rules", "open_cravat.smk")
include: join("rules", "gatk_recal.smk")
include: join("rules", "somatic.smk")
include: join("rules", "wes.smk")
include: join("rules", "hooks.smk")
2 changes: 1 addition & 1 deletion workflow/rules/cnv.smk
Original file line number Diff line number Diff line change
Expand Up @@ -507,4 +507,4 @@ rule purple_cohort_maftools:
{input.maf} \\
{output.summary} \\
{output.oncoplot}
"""
"""
Loading

0 comments on commit 6988c86

Please sign in to comment.