In [1]:
import re

from django.core.exceptions import ValidationError

In [2]:
# Regex fragment: an alternation of two character classes, matching a single
# 'g', 'm', 'c' or 'n'. Because it contains a top-level '|', it must be
# wrapped in a (non-capturing) group before being concatenated with other
# pattern pieces; otherwise the '|' splits the whole surrounding pattern.
prefix = "[gm]|[cn]"
# Characters interpolated into '[...]' character classes by the substitution
# patterns to match a single reference / new nucleotide symbol.
nucleotides = "ATCGX"

# Substitutions
See [here](http://varnomen.hgvs.org/recommendations/DNA/variant/substitution/) for HGVS substitution event nomenclature.

In [63]:
# Single substitution event, in order: position (digits), reference
# nucleotide, separator ('>', '=', '=/' or '=//'), new nucleotide.
# The position and both nucleotides are exposed as the named groups
# 'pos_sub', 'ref_nt' and 'new_nt'. No prefix is part of this pattern.
subsitution_grouped = "".join([
    r"(?P<pos_sub>\d+)",
    r"(?P<ref_nt>[{nts}]{{1}})",
    r"(?:>|=|=/|=//)",
    r"(?P<new_nt>[{nts}]{{1}})",
]).format(nts=nucleotides)

In [52]:
# Same shape as the grouped substitution pattern but without named groups,
# so it can be embedded several times in a larger pattern. The position is
# still captured as a plain numbered group.
subsitution_nongrouped = "".join([
    r"(\d+)",
    r"(?:[{nts}]{{1}})",
    r"(?:>|=|=/|=//)",
    r"(?:[{nts}]{{1}})",
]).format(nts=nucleotides)

In [53]:
# One substitution followed by one or more ';'-separated substitutions.
# The trailing negative lookahead rejects a dangling ';' after the last
# event.
multi_substitutions = r"({first_group})(;{groups}){{1,}}(?!;)".format(
    first_group=subsitution_nongrouped,
    groups=subsitution_nongrouped,
)

# Full multi-variant form: <prefix>.[<sub>;<sub>;...]
# `prefix` is itself an alternation ("[gm]|[cn]"), so it has to be wrapped in
# a non-capturing group. Wrapping it in square brackets (as was done before)
# yields "[[gm]|[cn]]{1}...", which the regex engine reads as
# "[[gm]"  OR  "[cn]]\.\[...\]" -- i.e. any string starting with 'g', 'm' or
# '[' matched, while valid 'c.'/'n.' variants never did.
multi_variant = r"(?:{prefix})\.\[{multi}\]".format(
    prefix=prefix,
    multi=multi_substitutions,
)

In [64]:
# Compile every pattern string once so the validators can reuse them.
(
    subsitution_grouped_re,
    subsitution_nongrouped_re,
    multi_substitutions_re,
    multi_variant_re,
) = map(
    re.compile,
    (
        subsitution_grouped,
        subsitution_nongrouped,
        multi_substitutions,
        multi_variant,
    ),
)

In [55]:
def is_multi(hgvs):
    """Return True when `hgvs` starts with a multi-substitution.

    Both the prefixed, bracketed form (`multi_variant_re`) and the bare
    ';'-delimited list (`multi_substitutions_re`) are accepted. Matching is
    anchored at the start of the string only.
    """
    candidates = (multi_variant_re, multi_substitutions_re)
    return any(pattern.match(hgvs) is not None for pattern in candidates)

def validate_multi_variant(hgvs):
    """Validate a prefixed, bracketed multi-variant HGVS string.

    Raises:
        ValidationError: if `hgvs` does not match `multi_variant_re` from
            its start, or if `validate_multi_substitutions` rejects it.
    """
    # Guard clause: reject anything that does not look like a multi-variant
    # before inspecting the individual substitution events.
    if multi_variant_re.match(hgvs) is None:
        raise ValidationError("Invalid HGVS string '{}'.".format(hgvs))
    validate_multi_substitutions(hgvs)
        
def validate_multi_substitutions(hgvs):
    """Validate the substitution events inside a multi-variant HGVS string.

    The first three characters and the last character of `hgvs` are dropped
    (for a string shaped like ``g.[...]`` that is the prefix, the dot and the
    square brackets); what remains must be a ';'-delimited list of two or
    more substitution events.

    Args:
        hgvs: The full multi-variant string, including prefix and brackets.

    Raises:
        ValidationError: if the bracket contents are not entirely a valid
            multi-substitution, if the same event is listed more than once,
            if an event's reference and new nucleotides are identical, or
            if an event's position is less than 1.
    """
    inner = hgvs[3:-1]  # removes prefix and square brackets

    # fullmatch rather than match: match() only anchors at the start, so
    # trailing garbage after the last event used to be accepted.
    if multi_substitutions_re.fullmatch(inner) is None:
        raise ValidationError("Invalid HGVS string '{}'.".format(hgvs))

    # str.split always returns at least one element, so no emptiness check
    # is needed here.
    subs = inner.split(';')
    if len(subs) != len(set(subs)):
        raise ValidationError(
            "Variant '{}' has defined the same substitution "
            "event more than once.".format(hgvs)
        )

    for sub in subs:
        sub_match = subsitution_grouped_re.fullmatch(sub)
        if sub_match is None:
            raise ValidationError(
                "Invalid substitution '{}' in '{}'.".format(sub, hgvs)
            )

        # All three named groups are mandatory in the pattern, so they are
        # always populated once the match has succeeded.
        position = sub_match.group('pos_sub')
        ref_nt = sub_match.group('ref_nt')
        new_nt = sub_match.group('new_nt')

        if ref_nt == new_nt:
            raise ValidationError(
                "Reference nucleotide cannot be the same as the "
                "new nucleotide for variant '{}' in '{}'.".format(sub, hgvs)
            )
        if int(position) < 1:
            raise ValidationError(
                "The substitution position for variant '{}' in '{}' "
                "must be 1 or greater.".format(sub, hgvs)
            )

# Deletions
See [here](http://varnomen.hgvs.org/recommendations/DNA/variant/deletion/) for HGVS deletion event nomenclature.

In [74]:
# NOTE(review): the output recorded below reports a full match for
# '93+1G>T', but the substitution pattern defined earlier in this notebook
# is digits followed directly by a nucleotide letter, so it cannot consume
# '+1'. The output was presumably produced by a different definition left in
# kernel state -- re-run on a fresh kernel to confirm.
subsitution_grouped_re.match('93+1G>T')

<_sre.SRE_Match object; span=(0, 7), match='93+1G>T'>

In [75]:
import hgvs

ImportError: No module named 'hgvs'