Skip to content

Commit

Permalink
issue #1014 - bcftools normalize
Browse files Browse the repository at this point in the history
  • Loading branch information
davmlaw committed May 27, 2024
1 parent 189fc7b commit 5f4c743
Show file tree
Hide file tree
Showing 7 changed files with 106 additions and 65 deletions.
65 changes: 63 additions & 2 deletions library/genomics/vcf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import cyvcf2
import vcf

from snpdb.models import Variant, Sequence
from snpdb.models import Variant, Sequence, GenomeFasta


def cyvcf2_header_types(cyvcf2_reader):
Expand Down Expand Up @@ -121,9 +121,70 @@ def vcf_get_ref_alt_svlen(variant: cyvcf2.Variant):
return ref, alt, svlen


def get_vcf_header_contig_lines(contigs: list[tuple]):
def get_vcf_header_contig_lines(contigs: list[tuple]) -> list[str]:
    """ Format (contig, length, assembly) tuples as VCF '##contig' header lines """
    return [
        f"##contig=<ID={contig},length={length},assembly={assembly}>"
        for contig, length, assembly in contigs
    ]


def get_contigs_header_lines(genome_build, standard_only=True, contig_allow_list: set = None) -> list[str]:
    """ VCF '##contig' header lines for a genome build.

        standard_only - restrict to the build's standard contigs (default True)
        contig_allow_list - if provided, keep only contigs whose refseq_accession is in this set """
    contig_qs = genome_build.standard_contigs if standard_only else genome_build.contigs
    contig_tuples = [
        (contig.refseq_accession, contig.length, genome_build.name)
        for contig in contig_qs
        if contig_allow_list is None or contig.refseq_accession in contig_allow_list
    ]
    return get_vcf_header_contig_lines(contig_tuples)


def write_cleaned_vcf_header(genome_build, source_vcf_filename: str, output_filename: str, new_info_lines: list[str] = None):
    """ Copy the VCF header from source_vcf_filename to output_filename, replacing the
        original '##contig' lines with ones generated from genome_build, and optionally
        inserting new_info_lines just before the '#CHROM' column-names line.

        Original contig lines are validated (length vs reference) before being stripped,
        so a header from a different genome build fails loudly rather than being swapped.

        Raises ValueError if a header contig length disagrees with the reference contig,
        or if the header has no '#CHROM' line. """
    contig_regex = re.compile(r"^##contig=<ID=(.+),length=(\d+)")

    # Read only the header (lines starting with '#') - stop at the first record
    header_lines = []
    with open(source_vcf_filename) as in_f:
        for line in in_f:
            if not line.startswith("#"):
                break  # End of header
            header_lines.append(line.strip())

    # These are used to validate contigs in header
    genome_fasta = GenomeFasta.get_for_genome_build(genome_build)
    chrom_to_contig_id = genome_build.get_chrom_contig_id_mappings()
    contig_lengths = dict(genome_build.contigs.values_list("pk", "length"))
    contig_to_fasta_names = genome_fasta.get_contig_id_to_name_mappings()

    with open(output_filename, "w") as f:
        found_column_names_line = False
        for line in header_lines:
            if line.startswith("#CHROM"):
                found_column_names_line = True
                # This is where we dump the new stuff (INFO then replacement contigs),
                # then fall through so the '#CHROM' line itself is written after them
                if new_info_lines:
                    for new_info_line in new_info_lines:
                        f.write(new_info_line + "\n")
                for contig_line in get_contigs_header_lines(genome_build):
                    f.write(contig_line + "\n")

            elif m := contig_regex.match(line):
                # Strip existing contig lines from header - though check they match so we don't get build swaps
                contig_name, provided_contig_length = m.groups()
                if contig_id := chrom_to_contig_id.get(contig_name):
                    if fasta_chrom := contig_to_fasta_names.get(contig_id):
                        provided_contig_length = int(provided_contig_length)
                        ref_contig_length = contig_lengths[contig_id]
                        if provided_contig_length != ref_contig_length:
                            msg = f"VCF header contig '{contig_name}' (length={provided_contig_length}) has " + \
                                  f"different length than ref contig {fasta_chrom} (length={ref_contig_length})"
                            raise ValueError(msg)
                # Skip the original contig line - replacements are written at '#CHROM' above.
                # Without this, the output header contains duplicate contig definitions.
                continue

            f.write(line + "\n")

    if not found_column_names_line:
        raise ValueError("VCF header was missing line starting with '#CHROM'")
12 changes: 2 additions & 10 deletions snpdb/liftover.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from genes.hgvs import HGVSMatcher
from library.django_utils.django_file_utils import get_import_processing_dir
from library.genomics.vcf_utils import write_vcf_from_tuples, get_vcf_header_contig_lines
from library.genomics.vcf_utils import write_vcf_from_tuples, get_vcf_header_contig_lines, get_contigs_header_lines
from library.guardian_utils import admin_bot
from library.log_utils import log_traceback
from snpdb.clingen_allele import populate_clingen_alleles_for_variants
Expand All @@ -24,14 +24,6 @@
from upload.upload_processing import process_upload_pipeline


def get_used_contigs_header_lines(genome_build, used_contigs: set) -> list[str]:
    """ VCF '##contig' header lines for only those build contigs whose name is in used_contigs """
    contig_tuples = [
        (contig.name, contig.length, genome_build.name)
        for contig in genome_build.contigs
        if contig.name in used_contigs
    ]
    return get_vcf_header_contig_lines(contig_tuples)


def create_liftover_pipelines(user: User, alleles: Iterable[Allele],
import_source: ImportSource,
inserted_genome_build: GenomeBuild,
Expand Down Expand Up @@ -74,7 +66,7 @@ def create_liftover_pipelines(user: User, alleles: Iterable[Allele],
if allele_liftover_records:
AlleleLiftover.objects.bulk_create(allele_liftover_records, batch_size=2000)

header_lines = get_used_contigs_header_lines(vcf_genome_build, used_contigs)
header_lines = get_contigs_header_lines(vcf_genome_build, contig_allow_list=used_contigs)
write_vcf_from_tuples(vcf_filename, av_tuples, tuples_have_id_field=True, header_lines=header_lines)
uploaded_file = UploadedFile.objects.create(path=liftover_vcf_filename,
import_source=import_source,
Expand Down
4 changes: 4 additions & 0 deletions snpdb/models/models_genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,10 @@ def contigs(self):
qs = Contig.objects.filter(genomebuildcontig__genome_build=self)
return qs.order_by("genomebuildcontig__order")

@cached_property
def standard_contigs(self):
return self.contigs.filter(role=SequenceRole.ASSEMBLED_MOLECULE)

@property
def genome_builds(self):
""" For PreviewModelMixin - search knows it's a genome build """
Expand Down
19 changes: 7 additions & 12 deletions snpdb/vcf_export_utils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
from library.genomics.vcf_utils import get_vcf_header_contig_lines
from library.genomics.vcf_utils import get_vcf_header_contig_lines, get_contigs_header_lines


def get_vcf_header_lines(top_lines=None, info_dict=None, formats=None, contigs=None, samples=None):
""" info_dict - values = dict with keys of 'type', 'description' (optional 'number' default = 1)
contigs - tuples of (contig, length, assembly)
"""
def get_vcf_header_lines(top_lines=None, info_dict=None, formats=None, contig_lines=None, samples=None):
""" info_dict - values = dict with keys of 'type', 'description' (optional 'number' default = 1) """

if samples is None:
samples = []
Expand All @@ -25,8 +23,8 @@ def get_vcf_header_lines(top_lines=None, info_dict=None, formats=None, contigs=N
if use_format:
header_lines.extend(formats)

if contigs:
header_lines.extend(get_vcf_header_contig_lines(contigs))
if contig_lines:
header_lines.extend(contig_lines)

colnames = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
if use_format:
Expand All @@ -40,15 +38,12 @@ def get_vcf_header_lines(top_lines=None, info_dict=None, formats=None, contigs=N
def get_vcf_header_from_contigs(genome_build, info_dict=None, samples=None):
    """ Build VCF header lines (with standard FORMAT declarations and the build's contigs).

        info_dict - values = dict which contains ('number', 'type', 'description') """

    # Standard FORMAT declarations written into every header we generate
    formats = ['##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">',
               '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">',
               '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
               '##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">',
               '##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">',
               '##FORMAT=<ID=AF,Number=A,Type=Float,Description="Estimated allele frequency in the range (0,1)">']

    contig_lines = get_contigs_header_lines(genome_build)
    return get_vcf_header_lines(info_dict=info_dict, formats=formats, contig_lines=contig_lines, samples=samples)
2 changes: 1 addition & 1 deletion snpdb/views/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -1644,7 +1644,7 @@ def view_genome_build(request, genome_build_name):
genome_build = GenomeBuild.get_name_or_alias(genome_build_name)
context = {
"genome_build": genome_build,
"standard_contigs": genome_build.contigs.filter(role=SequenceRole.ASSEMBLED_MOLECULE)
"standard_contigs": genome_build.standard_contigs,
}
return render(request, "snpdb/genomics/view_genome_build.html", context)

Expand Down
32 changes: 15 additions & 17 deletions upload/management/commands/vcf_clean_and_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class Command(BaseCommand):

def add_arguments(self, parser):
parser.add_argument('--vcf', help='VCF file, default: - (stdin)', default="-")
parser.add_argument('--replace-header', help='VCF header file', required=False)
parser.add_argument('--genome-build', help='GenomeBuild name', required=True)
parser.add_argument('--remove-info', action='store_true', help='clear INFO field')
parser.add_argument('--skipped-contigs-stats-file', help='File name')
Expand All @@ -35,6 +36,7 @@ def handle(self, *args, **options):
vcf_filename = options["vcf"]
build_name = options["genome_build"]
remove_info = options["remove_info"]
replace_header = options["replace_header"]
skipped_contigs_stats_file = options.get("skipped_contigs_stats_file")
skipped_records_stats_file = options.get("skipped_records_stats_file")
skipped_filters_stats_file = options.get("skipped_filters_stats_file")
Expand All @@ -56,35 +58,31 @@ def handle(self, *args, **options):

ref_standard_bases_pattern = re.compile(r"[GATC]")
alt_standard_bases_pattern = re.compile(r"[GATC,\.]") # Can be multi-alts, or "." for reference
contig_regex = re.compile(r"^##contig=<ID=(.+),length=(\d+)")

skip_patterns = {}
if skip_regex := getattr(settings, "VCF_IMPORT_SKIP_RECORD_REGEX", {}):
for name, regex in skip_regex.items():
skip_patterns[name] = re.compile(regex)

vcf_header_lines = []
if replace_header:
with open(replace_header, "r") as rh_f:
vcf_header_lines = rh_f.readlines()

first_non_header_line = True
defined_filters = None
for line in f:
if line[0] == '#':
if m := contig_regex.match(line):
contig_name, provided_contig_length = m.groups()
if contig_id := chrom_to_contig_id.get(contig_name):
if fasta_chrom := contig_to_fasta_names.get(contig_id):
provided_contig_length = int(provided_contig_length)
ref_contig_length = contig_lengths[contig_id]
if provided_contig_length != ref_contig_length:
msg = f"VCF header contig '{contig_name}' (length={provided_contig_length}) has " + \
f"different length than ref contig {fasta_chrom} (length={ref_contig_length})"
raise ValueError(msg)
# This contig would be replaced below, so change header
line = f"##contig=<ID={fasta_chrom},length={ref_contig_length}>\n"

vcf_header_lines.append(line)
sys.stdout.write(line)
if not replace_header:
vcf_header_lines.append(line)
else:
if defined_filters is None:
if first_non_header_line:
# Dump out the header
for header_line in vcf_header_lines:
sys.stdout.write(header_line)
defined_filters = self._get_defined_vcf_filters(vcf_header_lines)
first_non_header_line = False

columns = line.split("\t")
chrom = columns[VCFColumns.CHROM]
fasta_chrom = None
Expand Down
37 changes: 14 additions & 23 deletions upload/vcf/vcf_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@
from collections import OrderedDict
from subprocess import Popen, PIPE, CalledProcessError

import cyvcf2
import pandas as pd
from django.conf import settings
from django.utils import timezone

from library.django_utils.django_file_utils import get_import_processing_filename
from library.genomics.vcf_utils import write_cleaned_vcf_header
from library.utils.file_utils import name_from_filename
from upload.models import ModifiedImportedVariants, ToolVersion, UploadStep, \
UploadStepTaskType, VCFSkippedContigs, VCFSkippedContig, UploadStepMultiFileOutput, VCFPipelineStage, \
Expand Down Expand Up @@ -61,26 +61,17 @@ def create_sub_step(upload_step, sub_step_name, sub_step_commands):
task_type=UploadStepTaskType.TOOL)


def _write_cleaned_header(genome_build, upload_pipeline, vcf_filename) -> str:
    """ Write a cleaned/replacement VCF header file and return its filename.

        Used for the initial read of vcf_clean_and_filter, then catted on top of every split VCF """
    clean_vcf_dir = upload_pipeline.get_pipeline_processing_subdir("clean_vcf_dir")
    cleaned_vcf_header_filename = os.path.join(clean_vcf_dir, name_from_filename(vcf_filename, remove_gz=True) + ".header.vcf")
    # TODO need to change this for bcftools whatever it uses
    # INFO lines added by the normalizer describing the pre-normalization variant encoding
    VT_HEADERS = [
        '##INFO=<ID=OLD_MULTIALLELIC,Number=1,Type=String,Description="Original chr:pos:ref:alt encoding">',
        '##INFO=<ID=OLD_VARIANT,Number=.,Type=String,Description="Original chr:pos:ref:alt encoding">',
    ]
    write_cleaned_vcf_header(genome_build, vcf_filename, cleaned_vcf_header_filename, new_info_lines=VT_HEADERS)
    return cleaned_vcf_header_filename


def preprocess_vcf(upload_step, remove_info=False, annotate_gnomad_af=False):
Expand Down Expand Up @@ -112,10 +103,14 @@ def preprocess_vcf(upload_step, remove_info=False, annotate_gnomad_af=False):
skipped_records_stats_file = get_import_processing_filename(upload_pipeline.pk, f"{vcf_name}.skipped_records.tsv")
skipped_filters_stats_file = get_import_processing_filename(upload_pipeline.pk, f"{vcf_name}.skipped_filters.tsv")

cleaned_vcf_header_filename = _write_cleaned_header(genome_build, upload_pipeline, vcf_filename)

manage_command = settings.MANAGE_COMMAND
read_variants_cmd = manage_command + ["vcf_clean_and_filter",
"--genome-build",
genome_build.name,
"--replace-header",
cleaned_vcf_header_filename,
"--skipped-contigs-stats-file",
skipped_contigs_stats_file,
"--skipped-records-stats-file",
Expand All @@ -141,20 +136,16 @@ def preprocess_vcf(upload_step, remove_info=False, annotate_gnomad_af=False):
else:
pipe_commands[NORMALIZE_SUB_STEP] = [settings.BCFTOOLS_COMMAND, "norm",
"--multiallelics=-", "--rm-dup=exact",
# "--check-ref=w", # Not sure if this works?
"--check-ref=w", # Not sure if this works?
f"--fasta-ref={genome_build.reference_fasta}", "-"]
pipe_commands[REMOVE_HEADER_SUB_STEP] = [settings.BCFTOOLS_COMMAND, "view", "--no-header", "-"]
norm_substep_names = [NORMALIZE_SUB_STEP] # Just 1

# Split up the VCF
split_vcf_dir = upload_pipeline.get_pipeline_processing_subdir("split_vcf")
# We'll cat split_headers_filename on top of every split VCF
split_headers_filename = os.path.join(split_vcf_dir, name_from_filename(vcf_filename, remove_gz=True))
_write_split_headers(vcf_filename, remove_info, split_headers_filename)

pipe_commands[SPLIT_VCF_SUB_STEP] = ["split", "-", vcf_name, "--additional-suffix=.vcf.gz", "--numeric-suffixes",
"--lines", str(settings.VCF_IMPORT_FILE_SPLIT_ROWS),
f"--filter='sh -c \"{{ cat {split_headers_filename}; cat; }} | bgzip -c > {split_vcf_dir}/$FILE\"'"]
f"--filter='sh -c \"{{ cat {cleaned_vcf_header_filename}; cat; }} | bgzip -c > {split_vcf_dir}/$FILE\"'"]

for sub_step_name in norm_substep_names:
sub_step_commands = pipe_commands[sub_step_name]
Expand Down

0 comments on commit 5f4c743

Please sign in to comment.