Skip to content

Commit

Permalink
Merge ed0af6d into e296fc4
Browse files Browse the repository at this point in the history
  • Loading branch information
afrubin committed Jan 6, 2021
2 parents e296fc4 + ed0af6d commit dd53c95
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 50 deletions.
7 changes: 6 additions & 1 deletion docs/conf.py
Expand Up @@ -28,7 +28,12 @@
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = ["sphinx.ext.autodoc", "sphinx.ext.napoleon", "sphinx.ext.intersphinx", "sphinx.ext.autosectionlabel",]
extensions = [
"sphinx.ext.autodoc",
"sphinx.ext.napoleon",
"sphinx.ext.intersphinx",
"sphinx.ext.autosectionlabel",
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
Expand Down
89 changes: 53 additions & 36 deletions mavehgvs/variant.py
@@ -1,6 +1,6 @@
import re
import itertools
from typing import Optional, Union, List, Tuple, Mapping, Any
from typing import Optional, Union, List, Tuple, Mapping, Any, Generator

from mavehgvs.position import VariantPosition
from mavehgvs.patterns.combined import any_variant
Expand Down Expand Up @@ -45,7 +45,8 @@ def __init__(
If provided, the variant will be validated for agreement with this sequence.
Target sequence validation is not supported for variants using the extended position syntax.
The type of the target sequence (DNA, RNA, or amino acid) will be inferred.
This must be an amino acid sequence for protein variants or a nucleotide sequence for
coding/noncoding/genomic variants.
DNA and amino acid sequences should be in uppercase, RNA in lowercase.
relaxed_ordering : bool
Expand Down Expand Up @@ -87,7 +88,6 @@ def __init__(
self._groupdict, relaxed_ordering=relaxed_ordering
)
elif self.variant_count > 1:
# TODO: validate variant ordering
self._variant_types = list()
self._positions = list()
self._sequences = list()
Expand Down Expand Up @@ -156,15 +156,13 @@ def sort_key(x):
else:
raise ValueError("invalid position type")

variant_tuples = list(
zip(self._variant_types, self._positions, self._sequences)
)
ordered_tuples = sorted(variant_tuples, key=sort_key)
if variant_tuples != ordered_tuples:
if relaxed_ordering:
self._variant_types = [x[0] for x in ordered_tuples]
self._positions = [x[1] for x in ordered_tuples]
self._sequences = [x[2] for x in ordered_tuples]
variant_list = list(self.variant_tuples())
ordered_list = sorted(variant_list, key=sort_key)
if variant_list != ordered_list:
if relaxed_ordering: # re-sort the variants
self._variant_types = [x[0] for x in ordered_list]
self._positions = [x[1] for x in ordered_list]
self._sequences = [x[2] for x in ordered_list]
else:
raise MaveHgvsParseError(
"multi-variants not in sorted order"
Expand All @@ -180,15 +178,38 @@ def sort_key(x):
raise ValueError("can only create Variants from string or Mapping objects")

if targetseq is not None:
if self.variant_count == 1:
if self._variant_types == "sub":
self._target_validate_substitution(
self._positions, self._sequences[0], targetseq
)
elif self._variant_types in ("ins", "del", "dup", "delins"):
self._target_validate_indel(self._positions, targetseq)
elif self.variant_count > 1:
pass
for vtype, pos, seq in self.variant_tuples():
if vtype == "sub":
self._target_validate_substitution(pos, seq[0], targetseq)
elif vtype in ("ins", "del", "dup", "delins"):
self._target_validate_indel(pos, targetseq)

def variant_tuples(
    self,
) -> Generator[
    Tuple[
        str,
        Optional[Union[VariantPosition, Tuple[VariantPosition, VariantPosition]]],
        Optional[Union[str, Tuple[str, str]]],
    ],
    None,
    None,
]:
    """Generator that yields tuples containing the variant components.

    For a multi-variant, the stored variant types, positions, and sequences
    are walked in lockstep and one tuple is yielded per element. Otherwise
    exactly one tuple is yielded, built directly from the stored type,
    position, and sequence attributes, so callers can use the same loop for
    both cases.

    Yields
    ------
    Tuple
        Tuple of the variant type, position(s), and sequence(s) for each
        element in the variant.

    """
    if self.is_multi_variant():
        # zip() already produces (type, position, sequence) 3-tuples, so they
        # can be forwarded as-is instead of being unpacked and re-packed.
        yield from zip(self._variant_types, self._positions, self._sequences)
    else:
        yield self._variant_types, self._positions, self._sequences

# TODO: type hints and docstring
def __process_string_variant(self, groupdict, relaxed_ordering):
Expand Down Expand Up @@ -329,15 +350,12 @@ def format_variant(

if self.is_target_identical():
return f"{prefix}.{self._sequences}"
elif self.variant_count > 1:
elements = list()
for vtype, pos, seq in zip(
self._variant_types, self._positions, self._sequences
):
elements.append(format_variant(vtype, pos, seq))
return f"{prefix}.[{';'.join(elements)}]"
else:
return f"{prefix}.{format_variant(self._variant_types, self._positions, self._sequences)}"
elements = [format_variant(*t) for t in self.variant_tuples()]
if self.is_multi_variant():
return f"{prefix}.[{';'.join(elements)}]"
else:
return f"{prefix}.{elements[0]}"

@staticmethod
def _target_validate_substitution(
Expand All @@ -348,16 +366,16 @@ def _target_validate_substitution(
Note that variants using extended syntax cannot be validated with this method.
If an extended syntax variant is encountered, it will be interpreted as valid/matching.
# TODO: this needs to be aware of protein vs nucleotide targets
Parameters
----------
pos : VariantPosition
Position of the substitution.
ref : str
Reference base or amino acid.
Reference base or amino acid from the variant.
target : str
Target sequence.
Target sequence. This must be an amino acid sequence for protein variants or a nucleotide sequence
for coding/noncoding/genomic variants.
RNA sequences should be in lowercase, DNA sequences should be in uppercase.
Returns
-------
Expand Down Expand Up @@ -390,14 +408,13 @@ def _target_validate_indel(
Note that variants using extended syntax cannot be validated with this method.
If an extended syntax variant is encountered, it will be interpreted as valid/matching.
# TODO: this needs to be aware of protein vs nucleotide targets
Parameters
----------
pos : Union[VariantPosition, Tuple[VariantPosition, VariantPosition]]
Single variant position or start/end tuple for the indel.
target : str
Target sequence.
Target sequence. This must be an amino acid sequence for protein variants or a nucleotide sequence
for coding/noncoding/genomic variants.
Returns
-------
Expand Down
6 changes: 3 additions & 3 deletions setup.py
Expand Up @@ -4,18 +4,18 @@
with open("README.md", "r") as fh:
long_description = fh.read()

requirements = ["fqfa>=1.2.0"]
requirements = ["fqfa>=1.2.1"]
# fqfa requires backported dataclasses in Python 3.6
if sys.version_info.major == 3 and sys.version_info.minor == 6:
requirements.append("dataclasses")

setuptools.setup(
name="mavehgvs",
version="0.1.0",
version="0.2.0",
author="Daniel Esposito and Alan F Rubin",
author_email="alan.rubin@wehi.edu.au",
description=(
"Regular expression-based validation of HGVS variant strings for clinical genetics and genomics applications."
"Regular expression-based validation of HGVS-style variant strings for Multiplexed Assays of Variant Effect."
),
long_description=long_description,
long_description_content_type="text/markdown",
Expand Down
14 changes: 4 additions & 10 deletions tests/test_variant.py
Expand Up @@ -220,15 +220,15 @@ class TestCreateMultiVariantFromValues(unittest.TestCase):

class TestTargetSequenceValidation(unittest.TestCase):
def test_matching_dna_substitution(self):
variant_tuples = [("ACGT", "c.1A>T"), ("ACGT", "c.3G>C")]
variant_tuples = [("ACGT", "c.1A>T"), ("ACGT", "c.3G>C"), ("ACGT", "c.[1A>T;3G>C]")]

for target, s in variant_tuples:
with self.subTest(target=target, s=s):
v = Variant(s, targetseq=target)
self.assertEqual(s, str(v))

def test_nonmatching_dna_substitution(self):
variant_tuples = [("ACGT", "c.1C>T"), ("ACGT", "c.3T>C")]
variant_tuples = [("ACGT", "c.1C>T"), ("ACGT", "c.3T>C"), ("ACGT", "c.[1A>T;3T>C]")]

for target, s in variant_tuples:
with self.subTest(target=target, s=s):
Expand Down Expand Up @@ -284,10 +284,7 @@ def test_valid_dna_ins(self):
self.assertEqual(s, str(v))

def test_invalid_dna_ins(self):
variant_tuples = [
("ACGT", "c.4_5insA"),
("ACGT", "c.10_11insTCG"),
]
variant_tuples = [("ACGT", "c.4_5insA"), ("ACGT", "c.10_11insTCG")]

for target, s in variant_tuples:
with self.subTest(target=target, s=s):
Expand All @@ -303,10 +300,7 @@ def test_valid_dna_delins(self):
self.assertEqual(s, str(v))

def test_invalid_dna_delins(self):
variant_tuples = [
("ACGT", "c.4_5delinsA"),
("ACGT", "c.10_delinsTCG"),
]
variant_tuples = [("ACGT", "c.4_5delinsA"), ("ACGT", "c.10_delinsTCG")]

for target, s in variant_tuples:
with self.subTest(target=target, s=s):
Expand Down

0 comments on commit dd53c95

Please sign in to comment.