From 28765c504c61ad2238518166287b1f2c58e056ea Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Thu, 1 Nov 2018 16:27:10 +0100 Subject: [PATCH 1/8] modifications for awsbatch --- conf/aws-batch.config | 5 +++-- conf/containers.config | 3 +++ conf/genomes.config | 4 ++-- germlineVC.nf | 14 +++++++------- 4 files changed, 15 insertions(+), 11 deletions(-) diff --git a/conf/aws-batch.config b/conf/aws-batch.config index 4c4c837610..88f2101ff8 100644 --- a/conf/aws-batch.config +++ b/conf/aws-batch.config @@ -8,7 +8,8 @@ */ params { - genome_base = params.genome == 'GRCh37' ? "s3://caw-references/grch37" : params.genome == 'GRCh38' ? "s3://caw-references/grch38" : "s3://caw-references/smallgrch37" + genome_base = params.genome == 'GRCh37' ? "s3://sarek-references/Homo_sapiens/GATK/GRCh37" : params.genome == 'GRCh38' ? "s3://sarek-references/Homo_sapiens/GATK/GRCh38" : "s3://sarek-references/small" + publishDirMode = 'copy' } executor.name = 'awsbatch' @@ -16,7 +17,7 @@ executor.awscli = '/home/ec2-user/miniconda/bin/aws' process { executor = 'awsbatch' - queue = 'caw-job-queue' + queue = 'Sarek-queue' errorStrategy = {task.exitStatus == 143 ? 'retry' : 'terminate'} maxErrors = '-1' diff --git a/conf/containers.config b/conf/containers.config index 6e0ab0a1ad..ad2becd9e5 100644 --- a/conf/containers.config +++ b/conf/containers.config @@ -26,6 +26,9 @@ process { withName:ConcatVCF { container = "${params.repository}/sarek:${params.tag}" } + withName:CreateIntervalBeds { + container = "${params.repository}/sarek:${params.tag}" + } withName:CreateRecalibrationTable { container = "${params.repository}/sarek:${params.tag}" } diff --git a/conf/genomes.config b/conf/genomes.config index 53ceecf9cf..88c53fc23d 100644 --- a/conf/genomes.config +++ b/conf/genomes.config @@ -43,8 +43,8 @@ params { knownIndelsIndex = "${params.genome_base}/{Mills_and_1000G_gold_standard.indels.hg38,beta/Homo_sapiens_assembly38.known_indels}.vcf.gz.tbi" snpeffDb = "GRCh38.86" // This a nasty-looking list of allele-frequencies files. Add/remove files to match to your sets - //AF_files = "${params.genome_base}/{00-All.dbsnp_151.hg38.CAF.TOPMED.alternate.allele.freq,hapmap_3.3_grch38_pop_stratified_af.HMAF,SweGen_hg38_stratified.SWAF}.vcf" - //AF_indexes = "${params.genome_base}/{00-All.dbsnp_151.hg38.CAF.TOPMED.alternate.allele.freq,hapmap_3.3_grch38_pop_stratified_af.HMAF,SweGen_hg38_stratified.SWAF}.vcf.idx" + //AF_files = "${params.genome_base}/{00-All.dbsnp_151.hg38.CAF.TOPMED.alternate.allele.freq,hapmap_3.3_grch38_pop_stratified_af.HMAF,SweGen_hg38_stratified.SWAF}.vcf" + //AF_indexes = "${params.genome_base}/{00-All.dbsnp_151.hg38.CAF.TOPMED.alternate.allele.freq,hapmap_3.3_grch38_pop_stratified_af.HMAF,SweGen_hg38_stratified.SWAF}.vcf.idx" } 'smallGRCh37' { acLoci = "${params.genome_base}/1000G_phase3_20130502_SNP_maf0.3.small.loci" diff --git a/germlineVC.nf b/germlineVC.nf index 1cc9f39ddc..7b53d1006c 100644 --- a/germlineVC.nf +++ b/germlineVC.nf @@ -104,7 +104,7 @@ if (params.verbose) recalibratedBam = recalibratedBam.view { process RunSamtoolsStats { tag {idPatient + "-" + idSample} - publishDir directoryMap.samtoolsStats, mode: 'link' + publishDir directoryMap.samtoolsStats, mode: params.publishDirMode input: set idPatient, status, idSample, file(bam), file(bai) from bamForSamToolsStats @@ -125,7 +125,7 @@ if (params.verbose) samtoolsStatsReport = samtoolsStatsReport.view { process RunBamQC { tag {idPatient + "-" + idSample} - publishDir directoryMap.bamQC, mode: 'link' + publishDir directoryMap.bamQC, mode: params.publishDirMode input: set idPatient, status, idSample, file(bam), file(bai) from bamForBamQC @@ -356,7 +356,7 @@ if (params.verbose) vcfsToMerge = vcfsToMerge.view { process ConcatVCF { tag {variantCaller + "-" + idSampleNormal} - publishDir "${directoryMap."$variantCaller"}", mode: 'link' + publishDir "${directoryMap."$variantCaller"}", mode: params.publishDirMode input: set variantCaller, idPatient, idSampleNormal, idSampleTumor, file(vcFiles) from vcfsToMerge @@ -394,7 +394,7 @@ if (params.verbose) vcfConcatenated = vcfConcatenated.view { process RunSingleStrelka { tag {idSample} - publishDir directoryMap.strelka, mode: 'link' + publishDir directoryMap.strelka, mode: params.publishDirMode input: set idPatient, status, idSample, file(bam), file(bai) from bamsForSingleStrelka @@ -447,7 +447,7 @@ if (params.verbose) singleStrelkaOutput = singleStrelkaOutput.view { process RunSingleManta { tag {idSample + " - Single Diploid"} - publishDir directoryMap.manta, mode: 'link' + publishDir directoryMap.manta, mode: params.publishDirMode input: set idPatient, status, idSample, file(bam), file(bai) from bamsForSingleManta @@ -511,7 +511,7 @@ vcfForQC = Channel.empty().mix( process RunBcftoolsStats { tag {vcf} - publishDir directoryMap.bcftoolsStats, mode: 'link' + publishDir directoryMap.bcftoolsStats, mode: params.publishDirMode input: set variantCaller, file(vcf) from vcfForBCFtools @@ -534,7 +534,7 @@ bcfReport.close() process RunVcftools { tag {vcf} - publishDir directoryMap.vcftools, mode: 'link' + publishDir directoryMap.vcftools, mode: params.publishDirMode input: set variantCaller, file(vcf) from vcfForVCFtools From fd427c9c57755f3dc14bf3c940b9bc23d700ee71 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Thu, 1 Nov 2018 16:59:48 +0100 Subject: [PATCH 2/8] update submodule --- Sarek-data | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Sarek-data b/Sarek-data index c2da0d2a8a..5eeaa59631 160000 --- a/Sarek-data +++ b/Sarek-data @@ -1 +1 @@ -Subproject commit c2da0d2a8a1c1a8e9b9b0930b84e34073ea43d03 +Subproject commit 5eeaa5963110c07e200c9622cca5e86cb53d8a39 From 7e3c877b2ca7c1e1bf105bfd354457aa02e94179 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Thu, 1 Nov 2018 17:05:31 +0100 Subject: [PATCH 3/8] update CHANGELOG [skip-ci] --- CHANGELOG.md | 3 +++ conf/singularity-path.config | 3 +++ 2 files changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b8256d0027..cb579a42de 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,16 +11,19 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - [#671](https://github.com/SciLifeLab/Sarek/pull/671) - New `publishDirMode` param and docs - [#673](https://github.com/SciLifeLab/Sarek/pull/673) - Profiles for BinAC and CFC clusters in Tübingen +- [#679](https://github.com/SciLifeLab/Sarek/pull/679) - Add container for `CreateIntervalBeds` ### `Changed` - [#663](https://github.com/SciLifeLab/Sarek/pull/663) - Update `do_release.sh` script - [#671](https://github.com/SciLifeLab/Sarek/pull/671) - publishDir modes are now params +- [#679](https://github.com/SciLifeLab/Sarek/pull/679) - Update old awsbatch configuration ### `Fixed` - [#665](https://github.com/SciLifeLab/Sarek/pull/665) - Input bam file now has always the same name (whether it is from a single fastq pair or multiple) in the MarkDuplicates process, so metrics too - [#672](https://github.com/SciLifeLab/Sarek/pull/672) - process `PullSingularityContainers` from `buildContainers.nf` now expect a file with the correct `.simg` extension for singularity images, and no longer the `.img` one. +- [#679](https://github.com/SciLifeLab/Sarek/pull/679) - Add publishDirMode for `germlineVC.nf` ## [2.2.1] - 2018-10-04 diff --git a/conf/singularity-path.config b/conf/singularity-path.config index 448e7d1432..d41cf45059 100644 --- a/conf/singularity-path.config +++ b/conf/singularity-path.config @@ -31,6 +31,9 @@ process { withName:ConcatVCF { container = "${params.containerPath}/sarek-${params.tag}.simg" } + withName:CreateIntervalBeds { + container = "${params.containerPath}/sarek-${params.tag}.simg" + } withName:CreateRecalibrationTable { container = "${params.containerPath}/sarek-${params.tag}.simg" } From 146eb939ccc8d0da4e5181cffffeede9ff984a75 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Thu, 1 Nov 2018 17:09:39 +0100 Subject: [PATCH 4/8] update CHANGELOG [skip ci] --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cb579a42de..5bc611a19c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,13 +10,14 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### `Added` - [#671](https://github.com/SciLifeLab/Sarek/pull/671) - New `publishDirMode` param and docs -- [#673](https://github.com/SciLifeLab/Sarek/pull/673) - Profiles for BinAC and CFC clusters in Tübingen +- [#673](https://github.com/SciLifeLab/Sarek/pull/673), [#675](https://github.com/SciLifeLab/Sarek/pull/675), [#676](https://github.com/SciLifeLab/Sarek/pull/676) - Profiles for BinAC and CFC clusters in Tübingen - [#679](https://github.com/SciLifeLab/Sarek/pull/679) - Add container for `CreateIntervalBeds` ### `Changed` - [#663](https://github.com/SciLifeLab/Sarek/pull/663) - Update `do_release.sh` script - [#671](https://github.com/SciLifeLab/Sarek/pull/671) - publishDir modes are now params +- [#677](https://github.com/SciLifeLab/Sarek/pull/677) - Update docs - [#679](https://github.com/SciLifeLab/Sarek/pull/679) - Update old awsbatch configuration ### `Fixed` From 96c54344aeacc6497aee49029e07d558cef184ee Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Fri, 2 Nov 2018 08:59:26 +0100 Subject: [PATCH 5/8] update Sarek-data submodule --- Sarek-data | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Sarek-data b/Sarek-data index 5eeaa59631..d1f1848688 160000 --- a/Sarek-data +++ b/Sarek-data @@ -1 +1 @@ -Subproject commit 5eeaa5963110c07e200c9622cca5e86cb53d8a39 +Subproject commit d1f1848688d2f0a4f8c792373c3002ab046063c0 From b3ca47a8fedf71c833d635ebe777e2335f937ed4 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Fri, 2 Nov 2018 09:29:54 +0100 Subject: [PATCH 6/8] add genome for AWS test and remove COSMIC --- conf/aws-batch.config | 2 +- conf/genomes.config | 19 +++++++++++++------ somaticVC.nf | 11 ++--------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/conf/aws-batch.config b/conf/aws-batch.config index 88f2101ff8..ec8a9e9139 100644 --- a/conf/aws-batch.config +++ b/conf/aws-batch.config @@ -8,7 +8,7 @@ */ params { - genome_base = params.genome == 'GRCh37' ? "s3://sarek-references/Homo_sapiens/GATK/GRCh37" : params.genome == 'GRCh38' ? "s3://sarek-references/Homo_sapiens/GATK/GRCh38" : "s3://sarek-references/small" + genome_base = params.genome == 'GRCh37' ? "s3://sarek-references/Homo_sapiens/GATK/GRCh37" : params.genome == 'iGRCh38' ? "s3://sarek-references/Homo_sapiens/GATK/GRCh38" : "s3://sarek-references/small" publishDirMode = 'copy' } diff --git a/conf/genomes.config b/conf/genomes.config index 88c53fc23d..3e2d7dd97b 100644 --- a/conf/genomes.config +++ b/conf/genomes.config @@ -15,8 +15,6 @@ params { genomes { 'GRCh37' { acLoci = "${params.genome_base}/1000G_phase3_20130502_SNP_maf0.3.loci" - cosmic = "${params.genome_base}/GRCh37_Cosmic_v83.vcf" - cosmicIndex = "${cosmic}.idx" dbsnp = "${params.genome_base}/dbsnp_138.b37.vcf" dbsnpIndex = "${dbsnp}.idx" genomeFile = "${params.genome_base}/human_g1k_v37_decoy.fasta" @@ -30,8 +28,6 @@ params { } 'GRCh38' { acLoci = "${params.genome_base}/1000G_phase3_GRCh38_maf0.3.loci" - cosmic = "${params.genome_base}/COSMICv80.vcf" - cosmicIndex = "${cosmic}.idx" dbsnp = "${params.genome_base}/dbsnp_146.hg38.vcf.gz" dbsnpIndex = "${dbsnp}.tbi" genomeFile = "${params.genome_base}/Homo_sapiens_assembly38.fasta" @@ -46,10 +42,21 @@ params { //AF_files = "${params.genome_base}/{00-All.dbsnp_151.hg38.CAF.TOPMED.alternate.allele.freq,hapmap_3.3_grch38_pop_stratified_af.HMAF,SweGen_hg38_stratified.SWAF}.vcf" //AF_indexes = "${params.genome_base}/{00-All.dbsnp_151.hg38.CAF.TOPMED.alternate.allele.freq,hapmap_3.3_grch38_pop_stratified_af.HMAF,SweGen_hg38_stratified.SWAF}.vcf.idx" } + 'iGRCh38' { + acLoci = "${params.genome_base}/Annotation/ASCAT/1000G_phase3_GRCh38_maf0.3.loci" + dbsnp = "${params.genome_base}/Annotation/GATKBundle/dbsnp_146.hg38.vcf.gz" + dbsnpIndex = "${params.genome_base}/Annotation/GATKBundle/dbsnp_146.hg38.vcf.gz.tbi" + genomeFile = "${params.genome_base}/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta" + genomeDict = "${params.genome_base}/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.dict" + genomeIndex = "${params.genome_base}/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fai" + bwaIndex = "${params.genome_base}/Sequence/BWAIndex/Homo_sapiens_assembly38.fasta.64.{alt,amb,ann,bwt,pac,sa}" + intervals = "${params.genome_base}/Annotation/intervals/wgs_calling_regions.hg38.bed" + knownIndels = "${params.genome_base}/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,Homo_sapiens_assembly38.known_indels}.vcf.gz" + knownIndelsIndex = "${params.genome_base}/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,Homo_sapiens_assembly38.known_indels}.vcf.gz.tbi" + snpeffDb = "GRCh38.86" + } 'smallGRCh37' { acLoci = "${params.genome_base}/1000G_phase3_20130502_SNP_maf0.3.small.loci" - cosmic = "${params.genome_base}/b37_cosmic_v74.noCHR.sort.4.1.small.vcf" - cosmicIndex = "${cosmic}.idx" dbsnp = "${params.genome_base}/dbsnp_138.b37.small.vcf" dbsnpIndex = "${dbsnp}.idx" genomeFile = "${params.genome_base}/human_g1k_v37_decoy.small.fasta" diff --git a/somaticVC.nf b/somaticVC.nf index 89a814da2c..9a99aebebd 100644 --- a/somaticVC.nf +++ b/somaticVC.nf @@ -279,14 +279,12 @@ process RunMutect2 { input: set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor), file(intervalBed) from bamsFMT2 - set file(genomeFile), file(genomeIndex), file(genomeDict), file(dbsnp), file(dbsnpIndex), file(cosmic), file(cosmicIndex) from Channel.value([ + set file(genomeFile), file(genomeIndex), file(genomeDict), file(dbsnp), file(dbsnpIndex) from Channel.value([ referenceMap.genomeFile, referenceMap.genomeIndex, referenceMap.genomeDict, referenceMap.dbsnp, - referenceMap.dbsnpIndex, - referenceMap.cosmic, - referenceMap.cosmicIndex + referenceMap.dbsnpIndex ]) output: @@ -832,9 +830,6 @@ def defineReferenceMap() { 'acLoci' : checkParamReturnFile("acLoci"), 'dbsnp' : checkParamReturnFile("dbsnp"), 'dbsnpIndex' : checkParamReturnFile("dbsnpIndex"), - // cosmic VCF with VCF4.1 header - 'cosmic' : checkParamReturnFile("cosmic"), - 'cosmicIndex' : checkParamReturnFile("cosmicIndex"), // genome reference dictionary 'genomeDict' : checkParamReturnFile("genomeDict"), // FASTA genome reference @@ -923,8 +918,6 @@ def minimalInformationMessage() { log.info " Tag : " + params.tag log.info "Reference files used:" log.info " acLoci :\n\t" + referenceMap.acLoci - log.info " cosmic :\n\t" + referenceMap.cosmic - log.info "\t" + referenceMap.cosmicIndex log.info " dbsnp :\n\t" + referenceMap.dbsnp log.info "\t" + referenceMap.dbsnpIndex log.info " genome :\n\t" + referenceMap.genomeFile From d1440695e5eba433d12a9a9c5ee9e6627fa18c67 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Fri, 2 Nov 2018 09:35:28 +0100 Subject: [PATCH 7/8] hacking the hack to make sure that annotation works with iGRCh38 --- annotate.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annotate.nf b/annotate.nf index 7ae5fb828a..1599239767 100644 --- a/annotate.nf +++ b/annotate.nf @@ -215,7 +215,7 @@ process RunVEP { script: finalannotator = annotator == "snpeff" ? 'merge' : 'vep' genome = params.genome == 'smallGRCh37' ? 'GRCh37' : params.genome - cache_version = params.genome == 'GRCh38' ? 92 : 91 + cache_version = params.genome == 'GRCh38' || params.genome == 'iGRCh38' ? 92 : 91 """ /opt/vep/src/ensembl-vep/vep --dir /opt/vep/.vep/ \ -i ${vcf} \ From 9364d2a3b2e1f05946fe3f6f22a90d12d3149bc8 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Fri, 2 Nov 2018 10:14:33 +0100 Subject: [PATCH 8/8] fix typo --- conf/genomes.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/genomes.config b/conf/genomes.config index 3e2d7dd97b..b736bc9e4f 100644 --- a/conf/genomes.config +++ b/conf/genomes.config @@ -48,7 +48,7 @@ params { dbsnpIndex = "${params.genome_base}/Annotation/GATKBundle/dbsnp_146.hg38.vcf.gz.tbi" genomeFile = "${params.genome_base}/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta" genomeDict = "${params.genome_base}/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.dict" - genomeIndex = "${params.genome_base}/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fai" + genomeIndex = "${params.genome_base}/Sequence/WholeGenomeFasta/Homo_sapiens_assembly38.fasta.fai" bwaIndex = "${params.genome_base}/Sequence/BWAIndex/Homo_sapiens_assembly38.fasta.64.{alt,amb,ann,bwt,pac,sa}" intervals = "${params.genome_base}/Annotation/intervals/wgs_calling_regions.hg38.bed" knownIndels = "${params.genome_base}/Annotation/GATKBundle/{Mills_and_1000G_gold_standard.indels.hg38,Homo_sapiens_assembly38.known_indels}.vcf.gz"