Skip to content

Commit

Permalink
issue #1014 - bcftools normalizing, store modified imported variant
Browse files Browse the repository at this point in the history
  • Loading branch information
davmlaw committed May 29, 2024
1 parent 20e14e8 commit a6cd6d4
Show file tree
Hide file tree
Showing 7 changed files with 130 additions and 57 deletions.
3 changes: 3 additions & 0 deletions snpdb/models/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,9 @@ class SoftwareVersion(models.Model):
name = models.TextField()
version = models.TextField()

def __str__(self):
return f"{self.name} ({self.version})"

class Meta:
abstract = True

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Generated by Django 4.2.10 on 2024-05-29 05:39

from django.db import migrations


def _one_off_move_modified_imported_variants_to_norm_substep(apps, sample):
ModifiedImportedVariants = apps.get_model("upload", "ModifiedImportedVariants")

# These are useless, get rid of them so we speed up migration
ModifiedImportedVariants.objects.filter(modifiedimportedvariant__isnull=True).delete()

# Move the step from "Preprocess VCF" to "normalize" (same pipeline)
records = []
for mivs in ModifiedImportedVariants.objects.filter(upload_step__name='Preprocess VCF'):
up = mivs.upload_step.upload_pipeline
# Should always be there but just in case use first()
if norm_step := up.uploadstep_set.filter(name='normalize').first():
mivs.upload_step = norm_step
records.append(mivs)

if records:
ModifiedImportedVariants.objects.bulk_update(records, ['upload_step'])


class Migration(migrations.Migration):

dependencies = [
('upload', '0017_alter_uploadedclinvarversion_clinvar_version'),
]

operations = [
migrations.RunPython(_one_off_move_modified_imported_variants_to_norm_substep)
]
41 changes: 36 additions & 5 deletions upload/models/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -568,7 +568,10 @@ def message(self):
class ModifiedImportedVariant(models.Model):
""" Keep track of variants that were modified during import pre-processing by vt,
so people can find out why a variant they expected didn't turn up. """
BCFTOOLS_OLD_VARIANT_TAG = "BCFTOOLS_OLD_VARIANT"
VT_OLD_VARIANT_PATTERN = re.compile(r"^([^:]+):(\d+):([^/]+)/([^/]+)$")
NORM_TOOL_VT = "vt"
NORM_TOOL_BCFTOOLS = "bcftools"

import_info = models.ForeignKey(ModifiedImportedVariants, on_delete=CASCADE, null=True)
variant = models.ForeignKey(Variant, on_delete=CASCADE)
Expand All @@ -578,6 +581,13 @@ class ModifiedImportedVariant(models.Model):
old_variant = models.TextField(null=True)
old_variant_formatted = models.TextField(null=True) # consistently format for retrieval

@property
def tool_name(self) -> Optional[str]:
name = None
if tv := self.import_info.upload_step.tool_version:
name = tv.name
return name

@property
def genome_build(self):
return self.import_info.upload_step.genome_build
Expand All @@ -593,19 +603,41 @@ def _to_variant_coordinate(old_variant) -> VariantCoordinate:
raise ValueError(f"{old_variant} didn't match regex {ModifiedImportedVariant.VT_OLD_VARIANT_PATTERN}")

@staticmethod
def format_old_variant(old_variant: str, genome_build: GenomeBuild) -> list[str]:
def vt_format_old_variant(old_variant: str, genome_build: GenomeBuild) -> list[str]:
""" We need consistent formatting (case and use of chrom) so we can retrieve it easily.
May return multiple values """
formatted_old_variants = []
for ov in ModifiedImportedVariant._split_old_variant(old_variant):
for ov in ModifiedImportedVariant._vt_split_old_variant(old_variant):
vc = ModifiedImportedVariant._to_variant_coordinate(ov)
contig = genome_build.chrom_contig_mappings[vc.chrom]
variant_coordinate = VariantCoordinate(chrom=contig.name, position=vc.position, ref=vc.ref, alt=vc.alt, svlen=vc.svlen)
formatted_old_variants.append(ModifiedImportedVariant.get_old_variant_from_variant_coordinate(variant_coordinate))
return formatted_old_variants

@staticmethod
def _split_old_variant(old_variant) -> list[str]:
def bcftools_format_old_variant(old_variant: str, genome_build: GenomeBuild) -> list[str]:
""" We need consistent formatting (case and use of chrom) so we can retrieve it easily.
May return multiple values """
formatted_old_variants = []
# old variant can either have 4 or 5 fields (last one is alt index if multi-allelic)
cols = old_variant.split("|")
chrom, position, ref, alts = cols[:4]
alt_list = alts.split(",")
if len(cols) > 4:
alt_index = int(cols[4]) - 1 # 1-based
alt = alt_list[alt_index]
else:
if len(alt_list) != 1:
raise ValueError(f"BCFTOOLS normalized old variant: {old_variant} has multi-alts but no provided alt index")
alt = alt_list[0]

contig = genome_build.chrom_contig_mappings[chrom]
variant_coordinate = VariantCoordinate.from_explicit_no_svlen(contig.name, position, ref, alt)
return [ModifiedImportedVariant.get_old_variant_from_variant_coordinate(variant_coordinate)]


@staticmethod
def _vt_split_old_variant(old_variant) -> list[str]:
""" VT decompose writes OLD_VARIANT as comma separated, but if not decomposed (eg someone uploads already
normalized) could be multi-alt that looks like 5:132240059:CT/CTT/T """
old_variants = []
Expand All @@ -624,8 +656,7 @@ def _split_old_variant(old_variant) -> list[str]:

@staticmethod
def get_old_variant_from_variant_coordinate(vc: VariantCoordinate) -> str:
# TODO - this doesn't work w/symbolic alts but neither does VT - will eventually rewrite this to use
# BCF tools
# This doesn't handle symbolic alts with SVLEN but BCF tools don't record SVLEN in old tag anyway
return f"{vc.chrom}:{int(vc.position)}:{vc.ref}/{vc.alt}"

@classmethod
Expand Down
34 changes: 28 additions & 6 deletions upload/tests/test_modified_imported_variant.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,27 +5,49 @@


class TestModifiedImportedVariant(TestCase):
def test_format_old_variant(self):
def test_vt_format_old_variant(self):
grch37 = GenomeBuild.get_name_or_alias('GRCh37')
OLD_VARIANT = "NC_000019.9:536068:G/GTCCTCGTCCTTCCGGGACCCGGGGCGCTGGGAGCCTCACG"
old_variant_formatted = ModifiedImportedVariant.format_old_variant(OLD_VARIANT, grch37)
old_variant_formatted = ModifiedImportedVariant.vt_format_old_variant(OLD_VARIANT, grch37)
ov = old_variant_formatted.pop()
chrom = ov.split(":", 1)[0]
old_chrom = OLD_VARIANT.split(":", 1)[0]
contig = grch37.chrom_contig_mappings[old_chrom]
self.assertEqual(chrom, contig.name, "contig converted to chrom name")

def test_format_old_variant_multi(self):
def test_bcftools_format_old_variant(self):
grch37 = GenomeBuild.get_name_or_alias('GRCh37')
OLD_VARIANT = "NC_000019.9|536068|G|GTCCTCGTCCTTCCGGGACCCGGGGCGCTGGGAGCCTCACG"
old_variant_formatted = ModifiedImportedVariant.bcftools_format_old_variant(OLD_VARIANT, grch37)
ov = old_variant_formatted.pop()
chrom = ov.split(":", 1)[0]
old_chrom = OLD_VARIANT.split("|", 1)[0]
contig = grch37.chrom_contig_mappings[old_chrom]
self.assertEqual(chrom, contig.name, "contig converted to chrom name")

def test_vt_format_old_variant_multi(self):
grch37 = GenomeBuild.get_name_or_alias('GRCh37')
OLD_VARIANT_MULTI = "19:536068:G/GTCCTCGTCCTTCCGGGACCCGGGGCGCTGGGAGCCTCACG,NC_000019.9:536046:C/CCCGGGGCGCTGGGAGCCTCACGTCCTCGTCCTTCCGGGAC"
old_variant_formatted = ModifiedImportedVariant.format_old_variant(OLD_VARIANT_MULTI, grch37)
old_variant_formatted = ModifiedImportedVariant.vt_format_old_variant(OLD_VARIANT_MULTI, grch37)
self.assertEqual(len(old_variant_formatted), 2)

def test_format_old_variant_non_decomposed_multi(self):
def test_bcftools_format_old_variant_multi(self):
grch37 = GenomeBuild.get_name_or_alias('GRCh37')
OLD_VARIANT_MULTI_1 = "19|536068|G|GA,GTCCTCGTCCTTCCGGGACCCGGGGCGCTGGGAGCCTCACG|1"
old_variant_formatted = ModifiedImportedVariant.bcftools_format_old_variant(OLD_VARIANT_MULTI_1, grch37)
alt = old_variant_formatted[0].split("/")[-1]
self.assertEqual(alt, "GA")

OLD_VARIANT_MULTI_2 = "19|536068|G|GA,GTCCTCGTCCTTCCGGGACCCGGGGCGCTGGGAGCCTCACG|2"
old_variant_formatted = ModifiedImportedVariant.bcftools_format_old_variant(OLD_VARIANT_MULTI_2, grch37)
alt = old_variant_formatted[0].split("/")[-1]
self.assertEqual(alt, "GTCCTCGTCCTTCCGGGACCCGGGGCGCTGGGAGCCTCACG")

def test_vt_format_old_variant_non_decomposed_multi(self):
""" People may upload a VCF which has already been normalized by VT (but not decomposed) so had the INFO
already set. As it's normalized already VT won't replace it, thus we'll get a multi but with slashes
not comma separated """
grch37 = GenomeBuild.get_name_or_alias('GRCh37')
OLD_VARIANT_MULTI = "5:132240059:CT/CTT/T" # Already set
old_variant_formatted = ModifiedImportedVariant.format_old_variant(OLD_VARIANT_MULTI, grch37)
old_variant_formatted = ModifiedImportedVariant.vt_format_old_variant(OLD_VARIANT_MULTI, grch37)
self.assertEqual(len(old_variant_formatted), 2)
24 changes: 15 additions & 9 deletions upload/vcf/abstract_bulk_vcf_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,21 +53,27 @@ def get_max_variant_id(self):
return None # No variants, or only reference

def add_modified_imported_variant(self, variant, variant_hash, miv_hash_list=None, miv_list=None):
old_multiallelic = variant.INFO.get("OLD_MULTIALLELIC")
old_variant = variant.INFO.get("OLD_VARIANT")

if old_multiallelic or old_variant:
# This used to handle VT tags: OLD_MULTIALLELIC / OLD_VARIANT but now we handle BCFTOOLS only
if bcftools_old_variant := variant.INFO.get(ModifiedImportedVariant.BCFTOOLS_OLD_VARIANT_TAG):
if miv_hash_list is None:
miv_hash_list = self.modified_imported_variant_hashes
if miv_list is None:
miv_list = self.modified_imported_variants

miv_hash_list.append(variant_hash)
if old_variant:
old_variant_formatted = ModifiedImportedVariant.format_old_variant(old_variant, self.genome_build)
if "," in bcftools_old_variant: # Was a multi-allelic
old_multiallelic = bcftools_old_variant
else:
old_multiallelic = None

old_position = int(bcftools_old_variant.split("|")[1])
if old_position != variant.locus.position:
old_variant = old_position
else:
old_variant_formatted = [None]
for ov in old_variant_formatted:
old_variant = None # Wasn't normalized

for ov in ModifiedImportedVariant.bcftools_format_old_variant(bcftools_old_variant, self.genome_build):
# These 2 need to be in sync
miv_hash_list.append(variant_hash)
miv_list.append((old_multiallelic, old_variant, ov))

def process_modified_imported_variants(self, variant_ids_by_hash):
Expand Down
51 changes: 15 additions & 36 deletions upload/vcf/vcf_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,10 @@
from library.utils.file_utils import name_from_filename
from upload.models import ModifiedImportedVariants, ToolVersion, UploadStep, \
UploadStepTaskType, VCFSkippedContigs, VCFSkippedContig, UploadStepMultiFileOutput, VCFPipelineStage, \
SimpleVCFImportInfo
SimpleVCFImportInfo, ModifiedImportedVariant
from upload.tasks.vcf.unknown_variants_task import SeparateUnknownVariantsTask, AnnotateImportedVCFTask


def get_vt_tool_version(vt_command):
p = Popen([vt_command, "--version"], stderr=PIPE)
_, stderr = p.communicate()
vt_version = stderr.decode().split('\n')[0]
tool_version, _ = ToolVersion.objects.get_or_create(name='vt', version=vt_version)
return tool_version


def get_bcftools_tool_version(bcftools_command):
p = Popen([bcftools_command, "--version"], stdout=PIPE, stderr=PIPE)
stdout, stderr = p.communicate()
Expand All @@ -48,8 +40,7 @@ def get_bcftools_tool_version(bcftools_command):

def create_sub_step(upload_step, sub_step_name, sub_step_commands):
start_date = timezone.now()
tool_version = get_vt_tool_version(settings.VCF_IMPORT_VT_COMMAND)
# tool_version = get_bcftools_tool_version(settings.BCFTOOLS_COMMAND)
tool_version = get_bcftools_tool_version(settings.BCFTOOLS_COMMAND)
command_line = ' '.join(sub_step_commands)
return UploadStep.objects.create(name=sub_step_name,
script=command_line,
Expand All @@ -65,20 +56,18 @@ def _write_cleaned_header(genome_build, upload_pipeline, vcf_filename) -> str:
# We clean/replace header. Used for initial read of vcf_clean_and_filter then added on top of every splice vcf
clean_vcf_dir = upload_pipeline.get_pipeline_processing_subdir("clean_vcf_dir")
cleaned_vcf_header_filename = os.path.join(clean_vcf_dir, name_from_filename(vcf_filename, remove_gz=True) + ".header.vcf")
# TODO need to change this for bcftools whatever it uses
VT_HEADERS = [
'##INFO=<ID=OLD_MULTIALLELIC,Number=1,Type=String,Description="Original chr:pos:ref:alt encoding">',
'##INFO=<ID=OLD_VARIANT,Number=.,Type=String,Description="Original chr:pos:ref:alt encoding">',
tag = ModifiedImportedVariant.BCFTOOLS_OLD_VARIANT_TAG
HEADERS = [
f'##INFO=<ID={tag},Number=1,Type=String,Description="Original variant. Format: CHR|POS|REF|ALT|USED_ALT_IDX">',
]
write_cleaned_vcf_header(genome_build, vcf_filename, cleaned_vcf_header_filename, new_info_lines=VT_HEADERS)
write_cleaned_vcf_header(genome_build, vcf_filename, cleaned_vcf_header_filename, new_info_lines=HEADERS)
return cleaned_vcf_header_filename


def preprocess_vcf(upload_step, remove_info=False, annotate_gnomad_af=False):
MAX_STDERR_OUTPUT = 5000 # How much stderr output per process to store in DB

VCF_CLEAN_AND_FILTER_SUB_STEP = "vcf_clean_and_filter"
DECOMPOSE_SUB_STEP = "decompose"
NORMALIZE_SUB_STEP = "normalize"
REMOVE_HEADER_SUB_STEP = "remove_header"
SPLIT_VCF_SUB_STEP = "split_vcf"
Expand Down Expand Up @@ -122,24 +111,13 @@ def preprocess_vcf(upload_step, remove_info=False, annotate_gnomad_af=False):
pipe_commands[VCF_CLEAN_AND_FILTER_SUB_STEP] = read_variants_cmd
sub_steps[VCF_CLEAN_AND_FILTER_SUB_STEP] = create_sub_step(upload_step, VCF_CLEAN_AND_FILTER_SUB_STEP, read_variants_cmd)

use_vt = False
norm_substep_names = []
if use_vt:
# VT isn't the bottleneck here, it's my programs - so no speed advantage to using "+" for Uncompressed BCF streams
pipe_commands[DECOMPOSE_SUB_STEP] = [settings.VCF_IMPORT_VT_COMMAND, "decompose", "-s", "-"]
pipe_commands[NORMALIZE_SUB_STEP] = [settings.VCF_IMPORT_VT_COMMAND, "normalize", "-n", "-r", genome_build.reference_fasta, "-"]
# We don't run 'uniq' anymore as neither Vt or Bcftools handle SVLEN properly (so removed from sub_step_name loop below)
# @see https://github.com/SACGF/variantgrid/issues/818
# pipe_commands[UNIQ_SUB_STEP] = [settings.VCF_IMPORT_VT_COMMAND, "uniq", "-"]
pipe_commands[REMOVE_HEADER_SUB_STEP] = [settings.VCF_IMPORT_VT_COMMAND, "view", "-"]
norm_substep_names = [DECOMPOSE_SUB_STEP, NORMALIZE_SUB_STEP]
else:
pipe_commands[NORMALIZE_SUB_STEP] = [settings.BCFTOOLS_COMMAND, "norm",
"--multiallelics=-", "--rm-dup=exact",
"--check-ref=w", # Not sure if this works?
f"--fasta-ref={genome_build.reference_fasta}", "-"]
pipe_commands[REMOVE_HEADER_SUB_STEP] = [settings.BCFTOOLS_COMMAND, "view", "--no-header", "-"]
norm_substep_names = [NORMALIZE_SUB_STEP] # Just 1
pipe_commands[NORMALIZE_SUB_STEP] = [settings.BCFTOOLS_COMMAND, "norm",
"--multiallelics=-", "--rm-dup=exact",
"--check-ref=w",
f"--old-rec-tag={ModifiedImportedVariant.BCFTOOLS_OLD_VARIANT_TAG}",
f"--fasta-ref={genome_build.reference_fasta}", "-"]
pipe_commands[REMOVE_HEADER_SUB_STEP] = [settings.BCFTOOLS_COMMAND, "view", "--no-header", "-"]
norm_substep_names = [NORMALIZE_SUB_STEP]

# Split up the VCF
split_vcf_dir = upload_pipeline.get_pipeline_processing_subdir("split_vcf")
Expand Down Expand Up @@ -223,7 +201,8 @@ def preprocess_vcf(upload_step, remove_info=False, annotate_gnomad_af=False):
_store_vcf_skip_stats(skipped_filters_stats_file, clean_sub_step, "FILTER")

# Create this here so downstream tasks can add modified imported variant messages
import_info, _ = ModifiedImportedVariants.objects.get_or_create(upload_step=upload_step)
normalize_sub_step = sub_steps[NORMALIZE_SUB_STEP]
import_info, _ = ModifiedImportedVariants.objects.get_or_create(upload_step=normalize_sub_step)
vcf_import_annotate_dir = upload_pipeline.get_pipeline_processing_subdir("vcf_import_annotate")
sort_order = upload_pipeline.get_max_step_sort_order()
for split_vcf_filename in glob.glob(f"{split_vcf_dir}/*.vcf.gz"):
Expand Down
1 change: 0 additions & 1 deletion variantgrid/settings/components/default_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -383,7 +383,6 @@
VCF_IMPORT_CREATE_COHORT_FROM_MULTISAMPLE_VCFS = True
VCF_IMPORT_NO_DNA_CONTROL_SAMPLE_REGEX = None
VCF_IMPORT_FILE_SPLIT_ROWS = 50000
VCF_IMPORT_VT_COMMAND = "vt" # Needs to be installed and in path
VCF_IMPORT_SKIP_RECORD_REGEX = {
"Fusion": "VARTYPE=fusion",
}
Expand Down

0 comments on commit a6cd6d4

Please sign in to comment.