Skip to content

Commit

Permalink
issue #1014 - bcftools normalize
Browse files Browse the repository at this point in the history
  • Loading branch information
davmlaw committed May 27, 2024
1 parent 189fc7b commit 5f4c743
Show file tree
Hide file tree
Showing 7 changed files with 106 additions and 65 deletions.
65 changes: 63 additions & 2 deletions library/genomics/vcf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import cyvcf2
import vcf

from snpdb.models import Variant, Sequence
from snpdb.models import Variant, Sequence, GenomeFasta


def cyvcf2_header_types(cyvcf2_reader):
Expand Down Expand Up @@ -121,9 +121,70 @@ def vcf_get_ref_alt_svlen(variant: cyvcf2.Variant):
return ref, alt, svlen


def get_vcf_header_contig_lines(contigs: list[tuple]):
def get_vcf_header_contig_lines(contigs: list[tuple]) -> list[str]:
    """ Format (contig, length, assembly) tuples as VCF '##contig' header lines """
    return [
        f"##contig=<ID={contig},length={length},assembly={assembly}>"
        for contig, length, assembly in contigs
    ]


def get_contigs_header_lines(genome_build, standard_only=True, contig_allow_list: set = None) -> list[str]:
    """ VCF '##contig' header lines for a genome build.

        standard_only - restrict to the build's standard contigs (default True)
        contig_allow_list - if provided, keep only contigs whose refseq_accession is in this set """
    contig_qs = genome_build.standard_contigs if standard_only else genome_build.contigs
    contig_tuples = [
        (contig.refseq_accession, contig.length, genome_build.name)
        for contig in contig_qs
        if contig_allow_list is None or contig.refseq_accession in contig_allow_list
    ]
    return get_vcf_header_contig_lines(contig_tuples)


def write_cleaned_vcf_header(genome_build, source_vcf_filename: str, output_filename: str, new_info_lines: list[str] = None):
    """ Copy the VCF header from source_vcf_filename to output_filename, replacing the
        original '##contig' lines with ones generated from genome_build, and optionally
        inserting new_info_lines just before the '#CHROM' column-names line.

        Original contig lines are validated (length vs reference) before being stripped,
        so a header from a different genome build fails loudly rather than being swapped.

        Raises ValueError if a header contig length disagrees with the reference contig,
        or if the header has no '#CHROM' line. """
    contig_regex = re.compile(r"^##contig=<ID=(.+),length=(\d+)")

    # Read only the header (lines starting with '#') - stop at the first record
    header_lines = []
    with open(source_vcf_filename) as in_f:
        for line in in_f:
            if not line.startswith("#"):
                break  # End of header
            header_lines.append(line.strip())

    # These are used to validate contigs in header
    genome_fasta = GenomeFasta.get_for_genome_build(genome_build)
    chrom_to_contig_id = genome_build.get_chrom_contig_id_mappings()
    contig_lengths = dict(genome_build.contigs.values_list("pk", "length"))
    contig_to_fasta_names = genome_fasta.get_contig_id_to_name_mappings()

    with open(output_filename, "w") as f:
        found_column_names_line = False
        for line in header_lines:
            if line.startswith("#CHROM"):
                found_column_names_line = True
                # This is where we dump the new stuff (INFO then replacement contigs),
                # then fall through so the '#CHROM' line itself is written after them
                if new_info_lines:
                    for new_info_line in new_info_lines:
                        f.write(new_info_line + "\n")
                for contig_line in get_contigs_header_lines(genome_build):
                    f.write(contig_line + "\n")

            elif m := contig_regex.match(line):
                # Strip existing contig lines from header - though check they match so we don't get build swaps
                contig_name, provided_contig_length = m.groups()
                if contig_id := chrom_to_contig_id.get(contig_name):
                    if fasta_chrom := contig_to_fasta_names.get(contig_id):
                        provided_contig_length = int(provided_contig_length)
                        ref_contig_length = contig_lengths[contig_id]
                        if provided_contig_length != ref_contig_length:
                            msg = f"VCF header contig '{contig_name}' (length={provided_contig_length}) has " + \
                                  f"different length than ref contig {fasta_chrom} (length={ref_contig_length})"
                            raise ValueError(msg)
                # Skip the original contig line - replacements are written at '#CHROM' above.
                # Without this, the output header contains duplicate contig definitions.
                continue

            f.write(line + "\n")

    if not found_column_names_line:
        raise ValueError("VCF header was missing line starting with '#CHROM'")
12 changes: 2 additions & 10 deletions snpdb/liftover.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from genes.hgvs import HGVSMatcher
from library.django_utils.django_file_utils import get_import_processing_dir
from library.genomics.vcf_utils import write_vcf_from_tuples, get_vcf_header_contig_lines
from library.genomics.vcf_utils import write_vcf_from_tuples, get_vcf_header_contig_lines, get_contigs_header_lines
from library.guardian_utils import admin_bot
from library.log_utils import log_traceback
from snpdb.clingen_allele import populate_clingen_alleles_for_variants
Expand All @@ -24,14 +24,6 @@
from upload.upload_processing import process_upload_pipeline


def get_used_contigs_header_lines(genome_build, used_contigs: set) -> list[str]:
    """ VCF '##contig' header lines for only those build contigs whose name is in used_contigs """
    contig_tuples = [
        (contig.name, contig.length, genome_build.name)
        for contig in genome_build.contigs
        if contig.name in used_contigs
    ]
    return get_vcf_header_contig_lines(contig_tuples)


def create_liftover_pipelines(user: User, alleles: Iterable[Allele],
import_source: ImportSource,
inserted_genome_build: GenomeBuild,
Expand Down Expand Up @@ -74,7 +66,7 @@ def create_liftover_pipelines(user: User, alleles: Iterable[Allele],
if allele_liftover_records:
AlleleLiftover.objects.bulk_create(allele_liftover_records, batch_size=2000)

header_lines = get_used_contigs_header_lines(vcf_genome_build, used_contigs)
header_lines = get_contigs_header_lines(vcf_genome_build, contig_allow_list=used_contigs)
write_vcf_from_tuples(vcf_filename, av_tuples, tuples_have_id_field=True, header_lines=header_lines)
uploaded_file = UploadedFile.objects.create(path=liftover_vcf_filename,
import_source=import_source,
Expand Down
4 changes: 4 additions & 0 deletions snpdb/models/models_genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,10 @@ def contigs(self):
qs = Contig.objects.filter(genomebuildcontig__genome_build=self)
return qs.order_by("genomebuildcontig__order")

@cached_property
def standard_contigs(self):
return self.contigs.filter(role=SequenceRole.ASSEMBLED_MOLECULE)

@property
def genome_builds(self):
""" For PreviewModelMixin - search knows it's a genome build """
Expand Down
19 changes: 7 additions & 12 deletions snpdb/vcf_export_utils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
from library.genomics.vcf_utils import get_vcf_header_contig_lines
from library.genomics.vcf_utils import get_vcf_header_contig_lines, get_contigs_header_lines


def get_vcf_header_lines(top_lines=None, info_dict=None, formats=None, contigs=None, samples=None):
""" info_dict - values = dict with keys of 'type', 'description' (optional 'number' default = 1)
contigs - tuples of (contig, length, assembly)
"""
def get_vcf_header_lines(top_lines=None, info_dict=None, formats=None, contig_lines=None, samples=None):
""" info_dict - values = dict with keys of 'type', 'description' (optional 'number' default = 1) """

if samples is None:
samples = []
Expand All @@ -25,8 +23,8 @@ def get_vcf_header_lines(top_lines=None, info_dict=None, formats=None, contigs=N
if use_format:
header_lines.extend(formats)

if contigs:
header_lines.extend(get_vcf_header_contig_lines(contigs))
if contig_lines:
header_lines.extend(contig_lines)

colnames = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
if use_format:
Expand All @@ -40,15 +38,12 @@ def get_vcf_header_lines(top_lines=None, info_dict=None, formats=None, contigs=N
def get_vcf_header_from_contigs(genome_build, info_dict=None, samples=None):
    """ Build VCF header lines (with standard FORMAT declarations and the build's contigs).

        info_dict - values = dict which contains ('number', 'type', 'description') """

    # Standard FORMAT declarations written into every header we generate
    formats = ['##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">',
               '##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">',
               '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
               '##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">',
               '##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">',
               '##FORMAT=<ID=AF,Number=A,Type=Float,Description="Estimated allele frequency in the range (0,1)">']

    contig_lines = get_contigs_header_lines(genome_build)
    return get_vcf_header_lines(info_dict=info_dict, formats=formats, contig_lines=contig_lines, samples=samples)
2 changes: 1 addition & 1 deletion snpdb/views/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -1644,7 +1644,7 @@ def view_genome_build(request, genome_build_name):
genome_build = GenomeBuild.get_name_or_alias(genome_build_name)
context = {
"genome_build": genome_build,
"standard_contigs": genome_build.contigs.filter(role=SequenceRole.ASSEMBLED_MOLECULE)
"standard_contigs": genome_build.standard_contigs,
}
return render(request, "snpdb/genomics/view_genome_build.html", context)

Expand Down
32 changes: 15 additions & 17 deletions upload/management/commands/vcf_clean_and_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class Command(BaseCommand):

def add_arguments(self, parser):
parser.add_argument('--vcf', help='VCF file, default: - (stdin)', default="-")
parser.add_argument('--replace-header', help='VCF header file', required=False)
parser.add_argument('--genome-build', help='GenomeBuild name', required=True)
parser.add_argument('--remove-info', action='store_true', help='clear INFO field')
parser.add_argument('--skipped-contigs-stats-file', help='File name')
Expand All @@ -35,6 +36,7 @@ def handle(self, *args, **options):
vcf_filename = options["vcf"]
build_name = options["genome_build"]
remove_info = options["remove_info"]
replace_header = options["replace_header"]
skipped_contigs_stats_file = options.get("skipped_contigs_stats_file")
skipped_records_stats_file = options.get("skipped_records_stats_file")
skipped_filters_stats_file = options.get("skipped_filters_stats_file")
Expand All @@ -56,35 +58,31 @@ def handle(self, *args, **options):

ref_standard_bases_pattern = re.compile(r"[GATC]")
alt_standard_bases_pattern = re.compile(r"[GATC,\.]") # Can be multi-alts, or "." for reference
contig_regex = re.compile(r"^##contig=<ID=(.+),length=(\d+)")

skip_patterns = {}
if skip_regex := getattr(settings, "VCF_IMPORT_SKIP_RECORD_REGEX", {}):
for name, regex in skip_regex.items():
skip_patterns[name] = re.compile(regex)

vcf_header_lines = []
if replace_header:
with open(replace_header, "r") as rh_f:
vcf_header_lines = rh_f.readlines()

first_non_header_line = True
defined_filters = None
for line in f:
if line[0] == '#':
if m := contig_regex.match(line):
contig_name, provided_contig_length = m.groups()
if contig_id := chrom_to_contig_id.get(contig_name):
if fasta_chrom := contig_to_fasta_names.get(contig_id):
provided_contig_length = int(provided_contig_length)
ref_contig_length = contig_lengths[contig_id]
if provided_contig_length != ref_contig_length:
msg = f"VCF header contig '{contig_name}' (length={provided_contig_length}) has " + \
f"different length than ref contig {fasta_chrom} (length={ref_contig_length})"
raise ValueError(msg)
# This contig would be replaced below, so change header
line = f"##contig=<ID={fasta_chrom},length={ref_contig_length}>\n"

vcf_header_lines.append(line)
sys.stdout.write(line)
if not replace_header:
vcf_header_lines.append(line)
else:
if defined_filters is None:
if first_non_header_line:
# Dump out the header
for header_line in vcf_header_lines:
sys.stdout.write(header_line)
defined_filters = self._get_defined_vcf_filters(vcf_header_lines)
first_non_header_line = False

columns = line.split("\t")
chrom = columns[VCFColumns.CHROM]
fasta_chrom = None
Expand Down
37 changes: 14 additions & 23 deletions upload/vcf/vcf_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@
from collections import OrderedDict
from subprocess import Popen, PIPE, CalledProcessError

import cyvcf2
import pandas as pd
from django.conf import settings
from django.utils import timezone

from library.django_utils.django_file_utils import get_import_processing_filename
from library.genomics.vcf_utils import write_cleaned_vcf_header
from library.utils.file_utils import name_from_filename
from upload.models import ModifiedImportedVariants, ToolVersion, UploadStep, \
UploadStepTaskType, VCFSkippedContigs, VCFSkippedContig, UploadStepMultiFileOutput, VCFPipelineStage, \
Expand Down Expand Up @@ -61,26 +61,17 @@ def create_sub_step(upload_step, sub_step_name, sub_step_commands):
task_type=UploadStepTaskType.TOOL)


def _write_cleaned_header(genome_build, upload_pipeline, vcf_filename) -> str:
    """ Write a cleaned/replacement VCF header file and return its filename.

        Used for the initial read of vcf_clean_and_filter, then catted on top of every split VCF """
    clean_vcf_dir = upload_pipeline.get_pipeline_processing_subdir("clean_vcf_dir")
    cleaned_vcf_header_filename = os.path.join(clean_vcf_dir, name_from_filename(vcf_filename, remove_gz=True) + ".header.vcf")
    # TODO need to change this for bcftools whatever it uses
    # INFO lines added by the normalizer describing the pre-normalization variant encoding
    VT_HEADERS = [
        '##INFO=<ID=OLD_MULTIALLELIC,Number=1,Type=String,Description="Original chr:pos:ref:alt encoding">',
        '##INFO=<ID=OLD_VARIANT,Number=.,Type=String,Description="Original chr:pos:ref:alt encoding">',
    ]
    write_cleaned_vcf_header(genome_build, vcf_filename, cleaned_vcf_header_filename, new_info_lines=VT_HEADERS)
    return cleaned_vcf_header_filename


def preprocess_vcf(upload_step, remove_info=False, annotate_gnomad_af=False):
Expand Down Expand Up @@ -112,10 +103,14 @@ def preprocess_vcf(upload_step, remove_info=False, annotate_gnomad_af=False):
skipped_records_stats_file = get_import_processing_filename(upload_pipeline.pk, f"{vcf_name}.skipped_records.tsv")
skipped_filters_stats_file = get_import_processing_filename(upload_pipeline.pk, f"{vcf_name}.skipped_filters.tsv")

cleaned_vcf_header_filename = _write_cleaned_header(genome_build, upload_pipeline, vcf_filename)

manage_command = settings.MANAGE_COMMAND
read_variants_cmd = manage_command + ["vcf_clean_and_filter",
"--genome-build",
genome_build.name,
"--replace-header",
cleaned_vcf_header_filename,
"--skipped-contigs-stats-file",
skipped_contigs_stats_file,
"--skipped-records-stats-file",
Expand All @@ -141,20 +136,16 @@ def preprocess_vcf(upload_step, remove_info=False, annotate_gnomad_af=False):
else:
pipe_commands[NORMALIZE_SUB_STEP] = [settings.BCFTOOLS_COMMAND, "norm",
"--multiallelics=-", "--rm-dup=exact",
# "--check-ref=w", # Not sure if this works?
"--check-ref=w", # Not sure if this works?
f"--fasta-ref={genome_build.reference_fasta}", "-"]
pipe_commands[REMOVE_HEADER_SUB_STEP] = [settings.BCFTOOLS_COMMAND, "view", "--no-header", "-"]
norm_substep_names = [NORMALIZE_SUB_STEP] # Just 1

# Split up the VCF
split_vcf_dir = upload_pipeline.get_pipeline_processing_subdir("split_vcf")
# We'll cat split_headers_filename on top of every split VCF
split_headers_filename = os.path.join(split_vcf_dir, name_from_filename(vcf_filename, remove_gz=True))
_write_split_headers(vcf_filename, remove_info, split_headers_filename)

pipe_commands[SPLIT_VCF_SUB_STEP] = ["split", "-", vcf_name, "--additional-suffix=.vcf.gz", "--numeric-suffixes",
"--lines", str(settings.VCF_IMPORT_FILE_SPLIT_ROWS),
f"--filter='sh -c \"{{ cat {split_headers_filename}; cat; }} | bgzip -c > {split_vcf_dir}/$FILE\"'"]
f"--filter='sh -c \"{{ cat {cleaned_vcf_header_filename}; cat; }} | bgzip -c > {split_vcf_dir}/$FILE\"'"]

for sub_step_name in norm_substep_names:
sub_step_commands = pipe_commands[sub_step_name]
Expand Down

0 comments on commit 5f4c743

Please sign in to comment.