diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index f8c6c1e..9194ebd 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -15,7 +15,7 @@ jobs: steps: - uses: actions/checkout@v2 - uses: docker://snakemake/snakemake:v7.25.3 - - name: Dry Run with test data, all options + - name: Dry Run WGS pipeline with test data, all options run: | docker run -v $PWD:/opt2 snakemake/snakemake:v7.25.3 \ /opt2/genome-seek run --input \ @@ -27,7 +27,20 @@ jobs: --call-hla --call-sv --call-cnv --call-somatic --open-cravat \ --oc-annotators encode_tfbs ccre_screen vista_enhancer gnomad3 thousandgenomes cadd \ --pon /opt2/.tests/1000g_pon.hg38.vcf.gz --dry-run - - name: Dry Run with test data, skip QC + - name: Dry Run WES pipeline with test data, all options + run: | + docker run -v $PWD:/opt2 snakemake/snakemake:v7.25.3 \ + /opt2/genome-seek run --input \ + /opt2/.tests/WT_S1.R1.fastq.gz /opt2/.tests/WT_S1.R2.fastq.gz \ + /opt2/.tests/WT_S2_R1.fastq.gz /opt2/.tests/WT_S2_R2.fastq.gz \ + /opt2/.tests/WT_S3_1.fastq.gz /opt2/.tests/WT_S3_2.fastq.gz \ + /opt2/.tests/WT_S4_R1.001.fastq.gz /opt2/.tests/WT_S4_R2.001.fastq.gz \ + --output /opt2/output --mode local --pairs /opt2/.tests/pairs.tsv \ + --call-hla --call-sv --call-cnv --call-somatic --open-cravat \ + --oc-annotators encode_tfbs ccre_screen vista_enhancer gnomad3 thousandgenomes cadd \ + --pon /opt2/.tests/1000g_pon.hg38.vcf.gz \ + --wes-mode --wes-bed /opt2/.tests/wes_regions.bed --dry-run + - name: Dry Run WGS pipeline with test data, skip QC run: | docker run -v $PWD:/opt2 snakemake/snakemake:v7.25.3 \ /opt2/genome-seek run --input \ diff --git a/.tests/wes_regions.bed b/.tests/wes_regions.bed new file mode 100644 index 0000000..e69de29 diff --git a/VERSION b/VERSION index a918a2a..faef31a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.6.0 +0.7.0 diff --git a/config/cluster.json b/config/cluster.json index 4190fbf..046ffbb 100644 --- a/config/cluster.json +++ b/config/cluster.json @@ -52,6 +52,16 @@ "partition": "norm", "time": "12:00:00" }, + "cnvkit_access": { + "threads": "2", + "mem": "8G", + "time": "2:00:00" + }, + "cnvkit_batch": { + "threads": "8", + "mem": "64G", + "time": "1-00:00:00" + }, "collectvariantcallmetrics": { "threads": "2", "time": "12:00:00" @@ -333,6 +343,16 @@ "time": "12:00:00", "mem": "24G" }, + "sequenza": { + "threads": "8", + "mem": "64G", + "time": "1-00:00:00" + }, + "sequenza_utils": { + "threads": "8", + "mem": "64G", + "time": "1-00:00:00" + }, "snpeff": { "mem": "24G", "time": "12:00:00" diff --git a/config/containers.json b/config/containers.json index 40e63cf..e57119f 100644 --- a/config/containers.json +++ b/config/containers.json @@ -5,7 +5,7 @@ "genome-seek_somatic": "docker://skchronicles/genome-seek_somatic:v0.1.0", "genome-seek_qc": "docker://skchronicles/genome-seek_qc:v0.1.0", "genome-seek_sv": "docker://skchronicles/genome-seek_sv:v0.2.0", - "genome-seek_cnv": "docker://skchronicles/genome-seek_cnv:v0.2.0", + "genome-seek_cnv": "docker://skchronicles/genome-seek_cnv:v0.3.0", "genome-seek_hla": "docker://skchronicles/genome-seek_hla:v0.1.0", "base": "docker://skchronicles/ccbr_wes_base:v0.1.0", "deepvariant_gpu": "docker://google/deepvariant:1.5.0-gpu", @@ -14,6 +14,7 @@ "open_cravat": "docker://skchronicles/ncbr_opencravat:v0.1.0", "octopus": "docker://skchronicles/ncbr_octopus:v0.2.0", "sigprofiler": "docker://skchronicles/ncbr_sigprofiler:v0.1.0", + "sequenza": "docker://sequenza/sequenza:3.0.0", "vcf2maf": "docker://skchronicles/ncbr_vcf2maf:v0.1.0" } } diff --git a/config/genome.json b/config/genome.json index 47d7a14..d23dee6 100644 --- a/config/genome.json +++ b/config/genome.json @@ -99,6 +99,8 @@ "OCTOPUS_ERROR_MODEL": "PCR-FREE.NOVASEQ", "PEDDY_FILTER": "chr22", "PON": "/data/OpenOmics/references/genome-seek/PON/hg38.noCOSMIC_ClinVar.pon.vcf.gz", + "SEQUENZA_GC": "/data/OpenOmics/references/genome-seek/FREEC/hg38_gc50Base.txt.gz", + "SEQUENZA_SPECIES": "Human", "SIGPROFILER_GENOME": "GRCh38", "SNPEFF_GENOME": "GRCh38.86", "SNPEFF_CONFIG": "/data/OpenOmics/references/genome-seek/snpEff/4.3t/snpEff.config", @@ -113,6 +115,7 @@ "VEP_BUILD": "GRCh38", "VEP_DATA": "/data/OpenOmics/references/genome-seek/VEP/106/cache", "VEP_SPECIES": "homo_sapiens", - "VEP_REF_VERSION": "106" + "VEP_REF_VERSION": "106", + "WES_BED": "/data/OpenOmics/references/genome-seek/wes_bedfiles/gencode_v44_protein-coding_exons.bed" } -} +} \ No newline at end of file diff --git a/config/modules.json b/config/modules.json index aa35c7e..b22dc60 100644 --- a/config/modules.json +++ b/config/modules.json @@ -8,6 +8,7 @@ "bedtools": "bedtools/2.27.1", "canvas": "Canvas/1.40", "circos": "circos/0.69-9", + "cnvkit": "cnvkit/0.9.9", "deepvariant": "deepvariant/1.4.0", "perl": "perl/5.24.3", "peddy": "peddy/0.3.1", @@ -20,6 +21,7 @@ "bwa_mem2": "bwa-mem2/2.2.1", "rlang": "R/4.2.0", "samblaster": "samblaster/0.1.25", + "sequenza": "sequenza-utils/3.0.0", "strelka": "strelka/2.9.10", "svtools": "svtools/0.5.1", "python2": "python/2.7", diff --git a/genome-seek b/genome-seek index 480addc..5256cd7 100755 --- a/genome-seek +++ b/genome-seek @@ -32,7 +32,6 @@ EXAMPLE: # Python standard library from __future__ import print_function -from operator import sub import sys, os, subprocess, re, json, textwrap # 3rd party imports from pypi @@ -51,13 +50,14 @@ from src.utils import ( pairs, permissions, check_cache, - require) + require +) # Pipeline Metadata __version__ = version -__authors__ = 'Skyler Kuhn, Justin Lack' -__email__ = 'skyler.kuhn@nih.gov, justin.lack@nih.gov' +__authors__ = 'Skyler Kuhn, Justin Lack, Keyur Talsania' +__email__ = 'skyler.kuhn nih.gov, justin.lack nih.gov, keyur.talsania nih.gov' __home__ = os.path.dirname(os.path.abspath(__file__)) _name = os.path.basename(sys.argv[0]) _description = 'Clinical whole genome sequencing pipeline' @@ -188,6 +188,10 @@ def run(sub_args): if sub_args.pon: # Use custom provided PON config['references']['PON'] = sub_args.pon + + if sub_args.wes_bed: + # Use custom provided WES catpure kit + config['references']['WES_BED'] = sub_args.wes_bed # Save config to output directory with open(os.path.join(sub_args.output, 'config.json'), 'w') as fh: @@ -355,8 +359,9 @@ def parsed_arguments(name, description): [--mode {{slurm,local}}] [--job-name JOB_NAME] [--batch-id BATCH_ID] \\ [--call-cnv] [--call-sv] [--call-hla] [--call-somatic] [--open-cravat] \\ [--skip-qc] [--oc-annotators OC_ANNOTATORS] [--oc-modules OC_MODULES] \\ - [--pairs PAIRS] [--pon PANEL_OF_NORMALS] [--tmp-dir TMP_DIR] [--silent] \\ - [--sif-cache SIF_CACHE] [--singularity-cache SINGULARITY_CACHE] \\ + [--pairs PAIRS] [--pon PANEL_OF_NORMALS] [--wes-mode] [--wes-bed WES_BED] \\ + [--tmp-dir TMP_DIR] [--silent] [--sif-cache SIF_CACHE] \\ + [--singularity-cache SINGULARITY_CACHE] \\ [--dry-run] [--threads THREADS] \\ --input INPUT [INPUT ...] \\ --output OUTPUT @@ -419,6 +424,24 @@ def parsed_arguments(name, description): changes to data processing steps or when evaluating the overall accuracy and precision of the pipeline. Example: --skip-qc + + --wes-mode Run the whole exome pipeline. By default, the whole + genome sequencing pipeline is run. This option allows + a user to process and analyze whole exome sequencing + data. Please note when this mode is enabled, a sub- + set of the WGS rules will run. Please see the option + below for more information about providing a custom + exome targets BED file. + Example: --wes-mode + + --wes-bed WES_BED + Path to exome targets BED file. This file can be + obtained from the manufacturer of the target capture + kit that was used. By default, a set of BED files + generated from GENCODE's exon annotation for protein + coding gene's exon is used. + Example: --wes-bed gencode_v44_protein-coding_exons.bed + --batch-id BATCH_ID Unique identifer for a batch of samples. A batch identifer should be a string containing no spaces. @@ -667,6 +690,25 @@ def parsed_arguments(name, description): help = argparse.SUPPRESS ) + # Run the WES pipeline + subparser_run.add_argument( + '--wes-mode', + action = 'store_true', + required = False, + default = False, + help = argparse.SUPPRESS + ) + + # WES capture targets BED file + subparser_run.add_argument( + '--wes-bed', + # Check if the file exists and if it is readable + type = lambda file: permissions(parser, file, os.R_OK), + required = False, + default = None, + help = argparse.SUPPRESS + ) + # Tumor-normal pairs file subparser_run.add_argument( '--pairs', diff --git a/workflow/Snakefile b/workflow/Snakefile index 0f0ccd6..6b185da 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -33,6 +33,7 @@ tumors = list(config['pairs'].keys()) # List of tumor samples normals = list(set(config['pairs'].values())) # List of normal samples normals = [n for n in normals if n] # Remove tumor-onlys, i.e. empty strings tumor2normal = config['pairs'] # Dict to map a tumor to its normal + # List of tumor samples with a paired normal tumorWnormal = [t for t in tumors if tumor2normal[t]] @@ -56,6 +57,20 @@ run_oc = str_bool( # Run OpenCRAVAT rules, config['options']['open_cravat'] # default: False ) +# Whole-exome sequencing pipeline options +# By default, the WGS pipeline runs. +# The WES pipeline will run if the +# --wes option or the --wes-bed +# option are provided. If just the +# --wes option is provided the default +# GENCODE exons BED file is used. +wes_bed_provided = config['options']['wes_bed'] # default: None +wes_bed_file = config['references']['WES_BED'] # default: gencode_v44_protein-coding_exons.bed +# Run WES pipeline, default: False +run_wes = (wes_bed_provided != "None" or str_bool(config['options']['wes_mode'])) + + + # List of somatic callers # TODO: add as cli option, # make sure at least one @@ -80,6 +95,14 @@ with open(join('config', 'cluster.json')) as fh: # Final ouput files of the pipeline rule all: input: + # Build and reformats WES BED capture kit, + # Optional step, only runs if --wes or --wes-bed + # option(s) provided. + # @imported from rules/wes.smk + provided( + [join(workpath, "references", "wes_regions_50bp_padded.bed")], + run_wes + ), # FastQ Information, flowcell and lanes # Optional step, not run if --skip-qc # @imported from rules/qc.smk @@ -418,33 +441,53 @@ rule all: # HMF Tools Purple, estimates CNV, purity and ploidy # only runs if provided --call-somatic AND --call-cnv, # purple will optionally use somatic SVs from MANTA - # if --call-sv is also provided at runtime. + # if --call-sv is also provided at runtime. These + # steps will only run for WGS data, if the --wes-mode + # option is provided the steps will be skipped. # @imported from rules/cnv.smk expand( join(workpath, "hmftools", "amber", "{name}", "{name}.amber.baf.tsv"), - name=provided(provided(tumors, call_somatic), call_cnv) + name=provided(provided(tumors, call_somatic), call_cnv and not run_wes) ), expand( join(workpath, "hmftools", "cobalt", "{name}", "{name}.cobalt.ratio.tsv"), - name=provided(provided(tumors, call_somatic), call_cnv) + name=provided(provided(tumors, call_somatic), call_cnv and not run_wes) ), expand( join(workpath, "hmftools", "purple", "{name}", "{name}.purple.cnv.somatic.tsv"), - name=provided(provided(tumors, call_somatic), call_cnv) + name=provided(provided(tumors, call_somatic), call_cnv and not run_wes) ), expand( join(workpath, "hmftools", "purple", "{name}", "{name}.purple.maf"), - name=provided(provided(tumors, call_somatic), call_cnv) + name=provided(provided(tumors, call_somatic), call_cnv and not run_wes) ), # Maftools, create cohort-level summary plots from purple somatic VCF, - # only runs if provided --call-somatic AND --call-cnv + # only runs if provided --call-somatic AND --call-cnv and NOT WES mode, + # This step will only run for WGS data. # @imported from rules/somatic.smk provided( provided( [join(workpath, "hmftools", "cohort_oncoplot.pdf")], call_somatic ), - call_cnv + call_cnv and not run_wes + ), + # Sequenza, estimates purity/ploidy and CNV, + # only runs if provided --call-somatic AND --call-cnv AND --wes-mode + # AND only runs with tumor-normal pairs, + # @imported from rules/wes.smk + expand( + join(workpath, "sequenza_out", "{name}_alternative_solutions.txt"), + name=provided(tumorWnormal, call_somatic and call_cnv and run_wes) + ), + # cnvkit, infer and visualize CNVs in targeted sequencing data, + # only runs if provided --call-somatic AND --call-cnv AND --wes-mode + # AND only runs with tumor-normal pairs + # NOTE: currently only being run in the WES pipeline + # @imported from rules/wes.smk + expand( + join(workpath, "cnvkit", "{name}", "{name}.call.cns"), + name=provided(tumorWnormal, call_somatic and call_cnv and run_wes) ), # OpenCRAVAT, somatic annotation, rank and score variants, # only runs if provided --open-cravat AND --call-somatic @@ -478,4 +521,5 @@ include: join("rules", "sv.smk") include: join("rules", "open_cravat.smk") include: join("rules", "gatk_recal.smk") include: join("rules", "somatic.smk") +include: join("rules", "wes.smk") include: join("rules", "hooks.smk") \ No newline at end of file diff --git a/workflow/rules/cnv.smk b/workflow/rules/cnv.smk index 7490635..b1eaf47 100644 --- a/workflow/rules/cnv.smk +++ b/workflow/rules/cnv.smk @@ -507,4 +507,4 @@ rule purple_cohort_maftools: {input.maf} \\ {output.summary} \\ {output.oncoplot} - """ + """ \ No newline at end of file diff --git a/workflow/rules/germline.smk b/workflow/rules/germline.smk index ffe8da1..1c85d45 100644 --- a/workflow/rules/germline.smk +++ b/workflow/rules/germline.smk @@ -1,6 +1,9 @@ # Functions and rules for calling germline variants -from scripts.common import abstract_location, allocated - +from scripts.common import ( + abstract_location, + allocated, + provided +) rule deepvariant: """ @@ -25,6 +28,10 @@ rule deepvariant: rname = "deepvar", genome = config['references']['GENOME'], tmpdir = tmpdir, + # Building option for glnexus config, where: + # @WES = --model_type=WES + # @WGS = --model_type=WGS + dv_model_type = lambda _: "WES" if run_wes else "WGS", message: "Running DeepVariant on '{input.bam}' input file" threads: int(allocated("threads", "deepvariant", cluster)) container: config['images']['deepvariant'] @@ -38,7 +45,7 @@ rule deepvariant: trap 'rm -rf "${{tmp}}"' EXIT run_deepvariant \\ - --model_type=WGS \\ + --model_type={params.dv_model_type} \\ --ref={params.genome} \\ --reads={input.bam} \\ --output_gvcf={output.gvcf} \\ @@ -62,6 +69,7 @@ rule glnexus: """ input: gvcf = expand(join(workpath,"deepvariant","gVCFs","{name}.g.vcf.gz"), name=samples), + bed = provided(join(workpath, "references", "wes_regions_50bp_padded.bed"), run_wes), output: gvcfs = join(workpath, "deepvariant", "VCFs", "gvcfs.list"), bcf = join(workpath, "deepvariant", "VCFs", "joint.bcf"), @@ -73,6 +81,16 @@ rule glnexus: gvcfdir = join(workpath, "deepvariant", "gVCFs"), memory = allocated("mem", "glnexus", cluster).rstrip('G'), genome = config['references']['GENOME'], + # Building option for glnexus config, where: + # @WES = --config DeepVariantWES + # @WGS = --config DeepVariant_unfiltered + gl_config = lambda _: "DeepVariantWES" if run_wes else "DeepVariant_unfiltered", + # Building option for GLnexus bed file: + # @WES = --bed wes_regions_50bp_padded.bed + # @WGS = '' + wes_bed_option = lambda _: "--bed {0}".format( + join(workpath, "references", "wes_regions_50bp_padded.bed"), + ) if run_wes else "", message: "Running GLnexus on a set of gVCF files" threads: int(allocated("threads", "glnexus", cluster)) container: config['images']['glnexus'] @@ -96,7 +114,7 @@ rule glnexus: glnexus_cli \\ --dir ${{tmp_dne}} \\ - --config DeepVariant_unfiltered \\ + --config {params.gl_config} {params.wes_bed_option} \\ --list {output.gvcfs} \\ --threads {threads} \\ --mem-gbytes {params.memory} \\ @@ -138,20 +156,26 @@ rule gatk_selectvariants: """ input: vcf = join(workpath, "deepvariant", "VCFs", "joint.glnexus.norm.vcf.gz"), + bed = provided(wes_bed_file, run_wes), output: vcf = join(workpath, "deepvariant", "VCFs", "{name}.germline.vcf.gz"), params: rname = "varselect", genome = config['references']['GENOME'], sample = "{name}", - memory = allocated("mem", "gatk_selectvariants", cluster).rstrip('G') + memory = allocated("mem", "gatk_selectvariants", cluster).rstrip('G'), + # Building WES options for gatk selectvariants, where: + # @WES = --intervals gencode_v44_protein-coding_exons.bed -ip 100 + # @WGS = '' + wes_bed_option = lambda _: "--intervals {0}".format(wes_bed_file) if run_wes else "", + wes_bed_padding = lambda _: "-ip 100" if run_wes else "", message: "Running GATK4 SelectVariants on '{input.vcf}' input file" threads: int(allocated("threads", "gatk_selectvariants", cluster)) container: config['images']['genome-seek'] envmodules: config['tools']['gatk4'] shell: """ gatk --java-options '-Xmx{params.memory}g -XX:ParallelGCThreads={threads}' SelectVariants \\ - -R {params.genome} \\ + -R {params.genome} {params.wes_bed_option} {params.wes_bed_padding} \\ --variant {input.vcf} \\ --sample-name {params.sample} \\ --exclude-non-variants \\ diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk index e6c1b1c..175967d 100644 --- a/workflow/rules/qc.smk +++ b/workflow/rules/qc.smk @@ -6,6 +6,7 @@ from scripts.common import ( provided ) + # Pre alignment QC-related rules rule fc_lane: """ @@ -113,6 +114,7 @@ rule fastq_screen: {input.fq2} """ + # Post-alignment QC-related rules rule fastqc_bam: """ @@ -159,12 +161,14 @@ rule qualimap: """ input: bam = join(workpath, "BAM", "{name}.sorted.bam"), + bed = provided(wes_bed_file, run_wes), output: txt = join(workpath,"QC","{name}","genome_results.txt"), html = join(workpath,"QC","{name}","qualimapReport.html") params: outdir = join(workpath,"QC","{name}"), - rname = "qualibam" + rname = "qualibam", + gff_option = lambda _: "-gff {0}".format(wes_bed_file) if run_wes else "", message: "Running QualiMap BAM QC with {threads} threads on '{input.bam}' input file" threads: int(allocated("threads", "qualimap", cluster)) container: config['images']['genome-seek_qc'] @@ -179,7 +183,7 @@ rule qualimap: -nt {threads} \\ --skip-duplicated \\ -nw 500 \\ - -p NON-STRAND-SPECIFIC + -p NON-STRAND-SPECIFIC {params.gff_option} """ diff --git a/workflow/rules/somatic.smk b/workflow/rules/somatic.smk index 52d3410..e477465 100644 --- a/workflow/rules/somatic.smk +++ b/workflow/rules/somatic.smk @@ -120,6 +120,7 @@ rule octopus_merge: """ input: vcfs = expand(join(workpath, "octopus", "somatic", "chunks", "{region}", "{{name}}.vcf.gz"), region=regions), + bed = provided(join(workpath, "references", "wes_regions_50bp_padded.bed"), run_wes), output: lsl = join(workpath, "octopus", "somatic", "{name}.list"), raw = join(workpath, "octopus", "somatic", "{name}.octopus.raw.vcf"), @@ -134,7 +135,12 @@ rule octopus_merge: genome = config['references']['GENOME'], rname = "octomerge", tumor = "{name}", - octopath = join(workpath, "octopus", "somatic", "chunks") + octopath = join(workpath, "octopus", "somatic", "chunks"), + # Building option for WES, if WES use padded + # WES BED file as regions file + wes_regions_option = lambda _: "--regions-file {0}".format( + join(workpath, "references", "wes_regions_50bp_padded.bed"), + ) if run_wes else '', threads: int(allocated("threads", "octopus_merge", cluster)) container: config['images']['genome-seek'] @@ -173,7 +179,7 @@ rule octopus_merge: -a \\ -f {output.sortlsl} \\ -o {output.raw} \\ - -O v + -O v {params.wes_regions_option} # Filter Octopus callset for # variants with SOMATIC tag grep -E "#|CHROM|SOMATIC" {output.raw} \\ @@ -343,6 +349,7 @@ rule gatk_gather_mutect2: """ input: vcf = expand(join(workpath, "mutect2", "chrom_split", "{{name}}.{chrom}.vcf"), chrom=chunks), + bed = provided(wes_bed_file, run_wes), output: vcf = temp(join(workpath, "mutect2", "somatic", "{name}.mutect2.tmp.vcf")), params: @@ -355,6 +362,11 @@ rule gatk_gather_mutect2: '--variant', expand(join(workpath, "mutect2", "chrom_split", "{{name}}.{chrom}.vcf"), chrom=chunks), ), + # Building option for WES, if WES run + # build option for intervals file + wes_intervals_option = lambda _: "--intervals {0}".format( + wes_bed_file, + ) if run_wes else '', threads: max(int(allocated("threads", "gatk_gather_mutect2", cluster))-1, 1) container: config['images']['genome-seek'] @@ -373,7 +385,7 @@ rule gatk_gather_mutect2: -R {params.genome} \\ --filteredrecordsmergetype KEEP_UNCONDITIONAL \\ --assumeIdenticalSamples \\ - -o {output.vcf} \\ + -o {output.vcf} {params.wes_intervals_option} \\ {params.multi_variant_option} """ @@ -612,6 +624,10 @@ rule muse: normal_header = lambda w: "\\nNORMAL\\t{0}".format( tumor2normal[w.name] ) if tumor2normal[w.name] else "", + # Building option for WGS/WES, + # if WES then "muse sump -E" + # else (WGS) set "muse sump -G" + muse_sump_option = lambda _: "-E" if run_wes else '-G', threads: # MuSE over-alllocates threads, # see this issue for more info: @@ -633,7 +649,7 @@ rule muse: # sample specific error model MuSE sump \\ -n {threads} \\ - -G \\ + {params.muse_sump_option} \\ -I {output.txt} \\ -O {output.vcf} \\ -D {params.dbsnp} @@ -674,7 +690,6 @@ rule strelka: purple_jar = config['references']['HMFTOOLS_PURPLE_JAR'], outdir = join(workpath, "strelka", "{name}"), workflow = join(workpath, "strelka", "{name}", "runWorkflow.py"), - regions = config['references']['MANTA_CALLREGIONS'], genome = config['references']['GENOME'], pon = config['references']['PON'], memory = allocated("mem", "strelka", cluster).rstrip('G'), @@ -687,6 +702,13 @@ rule strelka: normal_header = lambda w: "\\nNORMAL\\t{0}".format( tumor2normal[w.name] ) if tumor2normal[w.name] else "", + # Building option for WGS/WES, if WES use padded + # WES BED file, else use manta default + regions = lambda _: "{0}".format( + join(workpath, "references", "wes_regions_50bp_padded.bed.gz"), + ) if run_wes else config['references']['MANTA_CALLREGIONS'], + # Building option for WES flag + wes = lambda _: "--exome" if run_wes else "", threads: max(int(allocated("threads", "strelka", cluster))-1, 1) container: config['images']['genome-seek_somatic'] @@ -713,7 +735,7 @@ rule strelka: --referenceFasta {params.genome} \\ --tumorBam {input.tumor} {params.normal_option} \\ --runDir {params.outdir} \\ - --callRegions {params.regions} + --callRegions {params.regions} {params.wes} # Call somatic variants with Strelka echo "Starting Strelka workflow..." @@ -805,14 +827,21 @@ rule somatic_selectvar: """ input: vcf = join(workpath, "{caller}", "somatic", "{name}.{caller}.vcf"), + bed = provided(join(workpath, "references", "wes_regions_50bp_padded.bed.gz"), run_wes), output: norm = join(workpath, "{caller}", "somatic", "{name}.{caller}.norm.vcf"), + ngz = join(workpath, "{caller}", "somatic", "{name}.{caller}.norm.vcf.gz"), + tbi = join(workpath, "{caller}", "somatic", "{name}.{caller}.norm.vcf.gz.tbi"), filt = join(workpath, "{caller}", "somatic", "{name}.{caller}.filtered.norm.vcf"), params: rname = 'somselect', genome = config['references']['GENOME'], pon = config['references']['PON'], memory = allocated("mem", "somatic_selectvar", cluster).rstrip('G'), + # Building intervals option for WES + wes_intervals_option = lambda _: "--intervals {0}".format( + join(workpath, "references", "wes_regions_50bp_padded.bed.gz"), + ) if run_wes else '', threads: int(allocated("threads", "somatic_selectvar", cluster)) container: config['images']['genome-seek_somatic'] @@ -832,14 +861,23 @@ rule somatic_selectvar: -f {params.genome} \\ -o {output.norm} \\ {input.vcf} + # Index normalized VCF for GATK + # SelectVariants, which requires + # a tabix index-ed bgzip-ed VCF + # if the --interval option is used + bgzip -c \\ + {output.norm} \\ + > {output.ngz} + tabix -p vcf \\ + {output.ngz} # Remove filtered sites and output # variants not called in the PON echo "Running SelectVariants..." gatk --java-options "-Xmx{params.memory}g" SelectVariants \\ -R {params.genome} \\ - --variant {output.norm} \\ + --variant {output.ngz} \\ --discordance {params.pon} \\ - --exclude-filtered \\ + --exclude-filtered {params.wes_intervals_option} \\ --output {output.filt} # Fix format number metadata, gatk # SelectVariants converts Number diff --git a/workflow/rules/sv.smk b/workflow/rules/sv.smk index 9bd3c9a..a803160 100644 --- a/workflow/rules/sv.smk +++ b/workflow/rules/sv.smk @@ -18,6 +18,7 @@ def get_normal_recal_bam(wildcards): # Runs in tumor-only mode return [] + # Germline SV calling rule manta_germline: """ @@ -40,13 +41,20 @@ rule manta_germline: input: bam = join(workpath, "BAM", "{name}.sorted.bam"), bai = join(workpath, "BAM", "{name}.sorted.bam.bai"), + bed = provided(join(workpath, "references", "wes_regions_50bp_padded.bed.gz"), run_wes), output: vcf = join(workpath, "MANTA", "germline", "{name}", "results", "variants", "diploidSV.vcf.gz"), params: rname = "manta_germline", outdir = join(workpath, "MANTA", "germline", "{name}"), workflow = join(workpath, "MANTA", "germline", "{name}", "runWorkflow.py"), - regions = config['references']['MANTA_CALLREGIONS'], + # Building option for WGS/WES, if WES use padded + # WES BED file, else use manta default + regions = lambda _: "{0}".format( + join(workpath, "references", "wes_regions_50bp_padded.bed.gz"), + ) if run_wes else config['references']['MANTA_CALLREGIONS'], + # Building option for WES flag + wes = lambda _: "--exome" if run_wes else "", genome = config['references']['GENOME'], memory = allocated("mem", "manta_germline", cluster).rstrip('G'), threads: int(allocated("threads", "manta_germline", cluster)) @@ -64,7 +72,7 @@ rule manta_germline: --callRegions {params.regions} \\ --bam {input.bam} \\ --referenceFasta {params.genome} \\ - --runDir {params.outdir} + --runDir {params.outdir} {params.wes} # Call germline SV with Manta workflow echo "Starting Manta workflow..." @@ -73,6 +81,7 @@ rule manta_germline: -g {params.memory} """ + # Somatic SV calling rule manta_somatic: """ @@ -94,7 +103,13 @@ rule manta_somatic: rname = "manta_somatic", outdir = join(workpath, "MANTA", "somatic", "{name}"), workflow = join(workpath, "MANTA", "somatic", "{name}", "runWorkflow.py"), - regions = config['references']['MANTA_CALLREGIONS'], + # Building option for WGS/WES, if WES use padded + # WES BED file, else use manta default + regions = lambda _: "{0}".format( + join(workpath, "references", "wes_regions_50bp_padded.bed.gz"), + ) if run_wes else config['references']['MANTA_CALLREGIONS'], + # Building option for WES flag + wes = lambda _: "--exome" if run_wes else "", genome = config['references']['GENOME'], memory = allocated("mem", "manta_somatic", cluster).rstrip('G'), # Building optional argument for paired normal @@ -126,7 +141,7 @@ rule manta_somatic: --tumorBam {input.tumor} \\ --referenceFasta {params.genome} \\ --runDir {params.outdir} \\ - --outputContig + --outputContig {params.wes} # Call somatic SV with Manta workflow echo "Starting Manta workflow..." @@ -136,6 +151,7 @@ rule manta_somatic: {params.symlink} """ + # Filter Somatic SV calls rule manta_somatic_filter: """ diff --git a/workflow/rules/wes.smk b/workflow/rules/wes.smk new file mode 100644 index 0000000..e8a2db2 --- /dev/null +++ b/workflow/rules/wes.smk @@ -0,0 +1,293 @@ +# Functions and rules related to whole-exome sequencing +from scripts.common import ( + abstract_location, + allocated +) + + +# Helper functions for tumor, normal pairs +def get_normal_recal_bam(wildcards): + """ + Returns a tumor samples paired normal + See config['pairs'] for tumor, normal pairs. + """ + normal = tumor2normal[wildcards.name] + if normal: + # Runs in a tumor, normal mode + return join(workpath, "BAM", "{0}.recal.bam".format(normal)) + else: + # Runs in tumor-only mode + return [] + + +rule build_exome_bed: + """ + Data processing step reformat and build a new exome capture + kit bed files. This step will create a padded WES BED file + and a bgzip-ed and tabix-ed BED file. + @Input: + WES capture kit BED file (singleton) + @Output: + Chromomsome to size TSV file + Padded WES capture kit BED file + bgzip padded WES capture kit BED file + Tabix index of bgzip padded WES capture kit BED file + """ + input: + bed = wes_bed_file, + output: + sizes = join(workpath, "references", "genome_chrom_sizes.tsv"), + padded = join(workpath, "references", "wes_regions_50bp_padded.bed"), + bgzip = join(workpath, "references", "wes_regions_50bp_padded.bed.gz"), + tabix = join(workpath, "references", "wes_regions_50bp_padded.bed.gz.tbi"), + params: + rname = "wes_bed", + padding = '50', + tmpdir = tmpdir, + genome = config['references']['GENOME'], + threads: int(allocated("threads", "build_exome_bed", cluster)) + container: config['images']['genome-seek'] + envmodules: config['tools']['vcftools'] + shell: """ + # Get sizes of each chrom + samtools faidx {params.genome} -o - \\ + | cut -f1,2 \\ + > {output.sizes} + + # Determine if BED file contains strand + # information. If so, pass this along to + # bedtools merge cmd, via the '-s' option. + # If any features in the BED file are set + # to '.' for strand (6th column), then + # strand information is used with merge + # cmd. This is important because bedtools + # merge will not merge or consider any + # features where the strand is set to + # '.' with the '-s' option. This will + # result in an empty BED file to be + # produced or for entries in the BED + # file to be dropped if the strand + # column contains a mixture of "+/-/." + # strandedness=$( + # awk -F '\\t' -v OFS='\\t' -v option="-s" \\ + # '$6=="." || $6=="" {{option=""; exit}} END {{print option}}' \\ + # {input.bed} + # ) + # echo "Setting the bedtools merge strandedness option: ${{strandedness}}" + # Enforcing BED is in BED6 format, + # Padding features +/- N base pairs + # according to strand and merge any + # overlapping features after padding + awk -F '\\t' -v OFS='\\t' \\ + 'function trim(s) {{ sub(/^[ ]+/, "", s); sub(/[ ]+$/, "", s); return s }}; \\ + NF>="3" {{print \\ + trim($1), \\ + trim($2), \\ + trim($3), \\ + ((trim($4)=="" || trim($4)==".") ? trim($1)"_"trim($2)"_"trim($3)"_"NR : trim($4)), \\ + ($5=="" ? "." : trim($5)), \\ + ($6=="" ? "." : trim($6)) \\ + }}' {input.bed} \\ + | bedtools slop \\ + -s \\ + -l {params.padding} \\ + -r {params.padding} \\ + -g {output.sizes} \\ + -i - \\ + | bedtools sort -i - \\ + | bedtools merge \\ + -i - \\ + -c 4,5,6 \\ + -o distinct,mean,distinct \\ + > {output.padded} + + # Bgzip padded file and create + # a tabix index + bgzip -c {output.padded} > {output.bgzip} + tabix -f -p bed {output.bgzip} + """ + + +rule cnvkit_access: + """ + CNVkit is a Python library and command-line toolkit to infer and + visualize Copy Number Variant (CNV) detection from targeted DNA + sequencing. This rule calculates the sequence-accessible coordinates + in chromosomes from the given reference genome, output as a BED file. + This BED file is used as an optional argument for CNVkit batch cmd. + For more information, please visit: https://github.com/etal/cnvkit + @Input: + Genomic FASTA file of reference genome (singleton) + @Output: + BED file of sequence-accessible coordinates + """ + output: + bed = join(workpath, "references", "genome.cnvkit_access.bed"), + params: + rname = "cnvkit_access", + fa = config['references']['GENOME'], + threads: int(allocated("threads", "cnvkit_access", cluster)), + container: config['images']['genome-seek_cnv'], + envmodules: + config['tools']['cnvkit'], + shell: """ + # Run CNVkit access to calculate + # sequence-accessible coordinates + cnvkit.py access \\ + -o {output.bed} \\ + {params.fa} + """ + + +# Rule(s) for calling Copy Number Variation (CNV) from WES data +rule cnvkit_batch: + """ + CNVkit is a Python library and command-line toolkit to infer and + visualize Copy Number Variant (CNV) detection from targeted DNA + sequencing. It is designed for use with hybrid capture, including + both whole-exome and custom target panels, and short-read sequencing. + It can also be used with whole-genome sequencing, although we are + not using it for that purpose (better tools are available for WGS). + CNVkit will only run if a tumor-normal pair is provided. For more + information, please visit: https://github.com/etal/cnvkit + @Input: + Realigned, recalibrated BAM file (scatter-per-tumor-sample), + WES capture kit BED file + """ + input: + tumor = join(workpath, "BAM", "{name}.recal.bam"), + bed = join(workpath, "references", "genome.cnvkit_access.bed"), + normal = get_normal_recal_bam, + output: + fit = join(workpath, "cnvkit", "{name}", "{name}.call.cns"), + params: + rname = "cnvkit_batch", + outdir = workpath, + tumor = "{name}", + fasta = config['references']['GENOME'], + bed = wes_bed_file, + # Building optional argument for paired normal, + # if normal not provided, cnvkit will NOT run + normal_option = lambda w: "-n {0}.recal.bam".format( + join(workpath, "BAM", tumor2normal[w.name]) + ) if tumor2normal[w.name] else "", + threads: int(allocated("threads", "cnvkit_batch", cluster)), + container: config['images']['genome-seek_cnv'], + envmodules: + config['tools']['cnvkit'], + shell: """ + # Run CNVkit with a TN pair + cnvkit.py batch \\ + --scatter {params.normal_option} \\ + -d cnvkit/{params.tumor}/ \\ + --diagram \\ + -t {params.bed} \\ + -f {params.fasta} \\ + -p {threads} \\ + --drop-low-coverage \\ + -g {input.bed} \\ + -y \\ + --short-names {input.tumor} + """ + + +rule sequenza_utils: + """ + Sequenza can be used to estimate and quantify of purity/ploidy + and copy number alteration in sequencing experiments. It contains + a set of tools to analyze genomic sequencing data from paired + normal-tumor samples, including cellularity and ploidy estimation; + mutation and copy number (allele-specific and total copy number) + detection, quantification and visualization. For more information, + please visit: https://bitbucket.org/sequenzatools/sequenza/src/master/ + Sequenza will only run if a tumor-normal pair is provided. + @Input: + Realigned, recalibrated BAM file (scatter-per-tumor-sample), + WES capture kit BED file + @Output: + 1kb Binned Sequenza SEQZ file + """ + input: + tumor = join(workpath, "BAM", "{name}.recal.bam"), + normal = get_normal_recal_bam, + output: + bins = join(workpath, "sequenza_out", "{name}", "{name}.1kb.seqz.gz"), + params: + rname = "sequenza_utils", + outdir = workpath, + tumor = "{name}", + fasta = config['references']['GENOME'], + species = config['references']['SEQUENZA_SPECIES'], + gc = config['references']['SEQUENZA_GC'], + shell_script = join("workflow", "scripts", "run_sequenza.sh"), + # Building optional argument for paired normal, + # if normal not provided, sequenza will NOT run + normal = lambda w: "{0}".format( + tumor2normal[w.name] + ) if tumor2normal[w.name] else "", + # Building optional flag for wes capture kit + wes_flag = lambda _: "-b {0}".format(wes_bed_file) if run_wes else "", + threads: int(allocated("threads", "sequenza_utils", cluster)), + container: config['images']['genome-seek_cnv'], + envmodules: + config['tools']['sequenza'], + shell: """ + # Create output directory + mkdir -p sequenza_out/{params.tumor} + # Runs sequenza-utils bam2seqz to convert BAM + # to seqz file format AND runs sequenza-utils + # seqz_binning to bin seqz file into 1kb bins + {params.shell_script} \\ + -s {params.tumor} \\ + -t {input.tumor} \\ + -n {input.normal} \\ + -r {params.fasta} \\ + -c {threads} \\ + -g {params.gc} \\ + -e {params.species} {params.wes_flag} + """ + + +rule sequenza: + """ + Sequenza can be used to estimate and quantify of purity/ploidy + and copy number alteration in sequencing experiments. It contains + a set of tools to analyze genomic sequencing data from paired + normal-tumor samples, including cellularity and ploidy estimation; + mutation and copy number (allele-specific and total copy number) + detection, quantification and visualization. For more information, + please visit: https://bitbucket.org/sequenzatools/sequenza/src/master/ + Sequenza will only run if a tumor-normal pair is provided. + @Input: + Realigned, recalibrated BAM file (scatter-per-tumor-sample), + WES capture kit BED file + """ + input: + bins = join(workpath, "sequenza_out", "{name}", "{name}.1kb.seqz.gz"), + output: + solutions = join(workpath, "sequenza_out", "{name}_alternative_solutions.txt"), + params: + rname = "sequenza", + outdir = workpath, + tumor = "{name}", + rlang_script = join("workflow", "scripts", "run_sequenza.R"), + # Building optional argument for paired normal, + # if normal not provided, sequenza will NOT run + normal = lambda w: "{0}".format( + tumor2normal[w.name] + ) if tumor2normal[w.name] else "", + threads: int(allocated("threads", "sequenza", cluster)), + container: config['images']['sequenza'], + envmodules: + config['tools']['sequenza'], + shell: """ + # Runs sequenza to estimate purity/ploidy and CNVs + Rscript {params.rlang_script} \\ + {input.bins} \\ + {params.outdir}/sequenza_out/{params.tumor} \\ + {threads} \\ + {params.normal}+{params.tumor} + mv \\ + {params.outdir}/sequenza_out/{params.tumor}/{params.normal}+{params.tumor}_alternative_solutions.txt \\ + {output.solutions} + """ diff --git a/workflow/scripts/common.py b/workflow/scripts/common.py index 0e51470..aa31092 100644 --- a/workflow/scripts/common.py +++ b/workflow/scripts/common.py @@ -183,7 +183,7 @@ def str_bool(s): val = s.lower() if val in ['true', '1', 'y', 'yes']: return True - elif val in ['false', '0', 'n', 'no', '']: + elif val in ['false', '0', 'n', 'no', '', 'none']: return False else: # Provided value could not be @@ -201,4 +201,4 @@ def joint_option(prefix, valueslist): for v in valueslist: s += "{} {} ".format(prefix, v) s = s.rstrip() - return s \ No newline at end of file + return s diff --git a/workflow/scripts/flowcell_lane.py b/workflow/scripts/flowcell_lane.py index fad4182..5a08690 100755 --- a/workflow/scripts/flowcell_lane.py +++ b/workflow/scripts/flowcell_lane.py @@ -72,7 +72,7 @@ def get_flowcell_lane(sequence_identifer): if len(id_list) < 7: # No Flowcell IDs in this format # Return next instrument id instead (next best thing) - if sequence_identifer.startswith('@SRR'): + if id_list[0].startswith('@SRR'): # SRA format or downloaded SRA FastQ file # SRA format 1: contains machine and lane information # @SRR001666.1 071112_SLXA-EAS1_s_7:5:1:817:345 length=36 diff --git a/workflow/scripts/run_sequenza.R b/workflow/scripts/run_sequenza.R new file mode 100755 index 0000000..9106c23 --- /dev/null +++ b/workflow/scripts/run_sequenza.R @@ -0,0 +1,93 @@ +#!/usr/bin/env Rscript +library("sequenza") + +args <- commandArgs(trailingOnly = TRUE) + +if (length(args) == 0) { + stop("Must provide a seqz file") +} else { + seqz_file <- args[1] + if (! file.exists(seqz_file)) { + stop( + paste0( + "Can't find this SEQZ output file: ", + seqz_file + ) + ) + } +} + +if (length(args) > 1) { + out_dir <- args[2] +} else { + out_dir <- dirname(seqz_file) +} + +if (length(args) > 2) { + n_cores <- as.numeric(args[3]) +} else { + n_cores <- as.numeric( + Sys.getenv("SLURM_CPUS_PER_TASK", 1) + ) +} +if (is.na(n_cores)) { + n_cores = 1 +} + +if (length(args) > 3) { + sampleid <- args[4] +} else { + sampleid <- gsub( + ".seqz.gz", + "", + basename(seqz_file) + ) +} + +print(paste0("Using ", n_cores, " cores...")) +print("Extracting seqz data...") +date() +seqzdata <- sequenza.extract( + seqz_file, + min.reads = 30, + min.reads.normal= 20 +) + +print("Fitting model...") +date() +# CP.example <- sequenza.fit(seqzdata, mc.cores = n_cores) +CP.example <- sequenza.fit(seqzdata) + +# Sequenza.extract seems to fail if too few mutations +# num_mutations <- unlist(lapply(seqzdata$mutations, nrow)) +# chrom_list <- names(num_mutations)[num_mutations >= 0] +# But it might actually be segments, idk? +# num_mutations <- unlist(lapply(seqzdata$segments, nrow)) +# chrom_list <- names(num_mutations)[num_segments > 0] +chrom_list <- c( + "chr1", "chr2", "chr3", + "chr4", "chr5", "chr6", + "chr7", "chr8", "chr9", + "chr10", "chr11", "chr12", + "chr13", "chr14", "chr15", + "chr16", "chr17", "chr18", + "chr19", "chr20", "chr21", + "chr22", "chrX" +) +# not_included <- setdiff(names(num_mutations), chrom_list) +# if (length(not_included) > 0) { +# print("Excluding these chromosomes because of too few mutations...") +# print(not_included) +# } +date() +print("Printing results...") +sequenza.results( + sequenza.extract = seqzdata, + cp.table = CP.example, + sample.id = sampleid, + out.dir = out_dir, + chromosome.list = chrom_list +) + +date() +print("Done") \ No newline at end of file diff --git a/workflow/scripts/run_sequenza.sh b/workflow/scripts/run_sequenza.sh new file mode 100755 index 0000000..0d43d84 --- /dev/null +++ b/workflow/scripts/run_sequenza.sh @@ -0,0 +1,181 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Usage and help message +function usage() { + cat << EOF +Usage: $(basename "$0") [-h] \\ + [-c THREADS] \\ + [-b CAPTURE_BED] \\ + -s SAMPLE_ID \\ + -t TUMOR_BAM \\ + -n NORMAL_BAM \\ + -r REF_FASTA \\ + -g GC_WIGGLE \\ + -e SPECIES \\ + -b CAPTURE + +Required: + -s SAMPLE_ID Sample ID + -t TUMOR_BAM Tumor BAM file + -n NORMAL_BAM Normal BAM file + -r REF_FASTA FASTA file + -g GC_WIGGLE GC Wiggle file + -e SPECIES Human or Mouse + +Optional: + -c THREADS The number of threads (default 1) + -b CAPTURE WES Bed file of capture (default: none) + -h Display help message and exit +EOF +} + +# Functions +function err() { cat <<< "$@" 1>&2; } +function fatal() { cat <<< "$@" 1>&2; usage; exit 1; } +function timestamp() { date +"%Y-%m-%d_%H-%M-%S"; } +function require(){ + # Requires an executable is in $PATH, + # as a last resort it will attempt to load + # the executable or dependency as a module + # @INPUT $@ = List of executables to check + for exe in "${@}"; do + # Check if executable is in $PATH + command -V "${exe}" &> /dev/null && continue; + # Try to load exe as lua module + module load "${exe}" &> /dev/null || \ + fatal "Failed to find or load '${exe}', not installed on target system." + done +} + + +# Parse command-line options +# Set defaults for non-required options +num_threads=1 # default: number of threads to use +new_bait="none" # default: none (i.e. run in WGS-mode) +while getopts s:t:n:r:c:g:e:b:h OPT; do + case $OPT in + s ) sample_id=$OPTARG;; + t ) tumor_bam=$OPTARG;; + n ) normal_bam=$OPTARG;; + r ) reference_fasta=$OPTARG;; + c ) num_threads=$OPTARG;; + g ) gc_wiggle=$OPTARG;; + e ) species=$OPTARG;; + b ) new_bait=$OPTARG;; + h ) usage && exit 0;; + ? ) usage && exit 1;; + esac +done + +# Sanity check: was anything was provided?! +{ [ -z "${1:-}" ] ; } \ + && fatal "Error: Did not provide any of the required options!" + +# Check if all required options +# where provided at runtime +declare -A required_options +required_options=( + ["s"]="${sample_id:-}" + ["t"]="${tumor_bam:-}" + ["n"]="${normal_bam:-}" + ["r"]="${reference_fasta:-}" + ["g"]="${gc_wiggle:-}" + ["e"]="${species:-}" +) +echo "Running with provided required options:" +for k in "${!required_options[@]}"; do + v="${required_options[$k]:-}" + echo " -$k $v" + if [[ -z "${v}" ]]; then + fatal "Error: Failed to provide all required options... missing -${k} OPTION" + fi +done + + +# Check for software dependencies, +# as last resort try to module load +# the specified tool or dependency +require bedtools samtools sequenza-utils + + +# Set sequenza-utils run options +if [ "$species" == "Human" ]; then + echo "Human as reference so chromosome 1 to 22 & X will be analyzed"; + chromosomes="chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX" +else + echo "Mouse as reference so chromosome 1 to 19 & X will be analyzed"; + chromosomes="chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chrX" +fi + + +echo "[$(timestamp)] Running sequenza-utils bam2seqz with the follow bam files: ${normal_bam} ${tumor_bam}"; +sequenza-utils bam2seqz \ + -n "${normal_bam}" \ + -t "${tumor_bam}" \ + --fasta "${reference_fasta}" \ + -gc "${gc_wiggle}" \ + -o "sequenza_out/${sample_id}/${sample_id}.seqz.gz" \ + -C ${chromosomes} \ + --parallel ${num_threads} \ + -N 40 + + +# Merge the output across all chromosomes, +# not sure why we are doing it this way but +# didn't want to change the original script +# too much, it gets the job done, so let's +# keep it +{ + for chr in $chromosomes; do + if [[ "$chr" = "chr1" ]]; then + zcat "sequenza_out/${sample_id}/${sample_id}_chr1.seqz.gz" \ + > "sequenza_out/${sample_id}/${sample_id}.combine.chr1.seqz" + else + zcat "sequenza_out/${sample_id}/${sample_id}_${chr}.seqz.gz" \ + | tail -n +2 \ + >> "sequenza_out/${sample_id}/${sample_id}.combine.chrall.seqz" + fi + done +} +# Merge and gzip, probably could add +# a clean up step here as well +cat "sequenza_out/${sample_id}/${sample_id}.combine.chr1.seqz" "sequenza_out/${sample_id}/${sample_id}.combine.chrall.seqz" \ + | gzip -c \ +> "sequenza_out/${sample_id}/${sample_id}.combine.seqz.gz" + + +if [ "$new_bait" != "none" ]; then + # WES capture kit provided via + # the -b CAPTURE option, default + # is none or run in WGS-mode + zgrep -v "chromosome" "sequenza_out/${sample_id}/${sample_id}.combine.seqz.gz" \ + | awk '{print $1"\t"$2"\t"$2+1"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12"\t"$13"\t"$14}' \ + | bedtools intersect -a stdin -b "$new_bait" \ + | cut -f-2,4- \ + > "sequenza_out/${sample_id}/${sample_id}.combine.seqz.body" + gunzip -c "sequenza_out/${sample_id}/${sample_id}.combine.seqz.gz" \ + | head -n 1 \ + > "sequenza_out/${sample_id}/head" || true # head cmd causes SIGPIPE, so ignore it + cat "sequenza_out/${sample_id}/head" "sequenza_out/${sample_id}/${sample_id}.combine.seqz.body" \ + | gzip -c \ + > "sequenza_out/${sample_id}/${sample_id}.combine.ontarget.seqz.gz" + + echo "[$(timestamp)] Running sequenza-utils seqz_binning with the follow input file: sequenza_out/${sample_id}/${sample_id}.combine.ontarget.seqz.gz"; + sequenza-utils seqz_binning \ + --seqz "sequenza_out/${sample_id}/${sample_id}.combine.ontarget.seqz.gz" \ + -w 1000 \ + -o "sequenza_out/${sample_id}/${sample_id}.1kb.seqz.gz" +else + # Run in WGS-like-mode + echo "[$(timestamp)] Running sequenza-utils seqz_binning with the follow input file: sequenza_out/${sample_id}/${sample_id}.combine.seqz.gz"; + sequenza-utils seqz_binning \ + --seqz "sequenza_out/${sample_id}/${sample_id}.combine.seqz.gz" \ + -w 1000 \ + -o "sequenza_out/${sample_id}/${sample_id}.1kb.seqz.gz" +fi + + +# Clean tmp/intermediate files +rm -f "sequenza_out/${sample_id}/${sample_id}.combine.chr1.seqz" +rm -f "sequenza_out/${sample_id}/${sample_id}.combine.chrall.seqz" \ No newline at end of file