In [1]:
import re

from django.core.exceptions import ValidationError

In [2]:
def _strip_group_names(pattern, *names):
    """Return `pattern` with each listed named group made non-capturing.

    Every ``P<name>`` marker is replaced by ``:``, turning ``(?P<name>...)``
    into ``(?:...)``. The replacements are applied in the order given.
    """
    for name in names:
        pattern = pattern.replace('P<' + name + '>', ':')
    return pattern


# Characters accepted wherever a nucleotide is expected.
nucleotides = "ATCGXN"

# Optional single-character prefix ('*' or '-') captured as `utr`.
utr_descriptor = r"(?P<utr>[*-])"
# Either a plain number or number, '+'/'-', number. The alternation is not
# wrapped in its own group, so it must always be embedded inside one.
position = r"(\d+)|(\d+(\+|-)\d+)"
utr_descriptor_no_groups = _strip_group_names(utr_descriptor, 'utr')

# Two alternatives: a `start_end` range followed by 'del', or a single
# position followed by 'del' and exactly one nucleotide.
deletion = "|".join([
    r"(?P<del_start>{0})_(?P<del_end>{0})(?P<del>del)",
    r"(?P<del_pos>{0})(?P<del_single>del)(?P<del_base>[{1}])",
]).format(position, nucleotides)
deletion_no_groups = _strip_group_names(
    deletion,
    'del_start', 'del_end', 'del_pos', 'del_single', 'del_base', 'del',
)

# A `start_end` range, the literal 'ins', then one or more nucleotides.
insertion = "".join([
    r"(?P<ins_start>{0})_(?P<ins_end>{0})(?P<ins>ins)",
    r"(?P<ins_bases>[{1}]+)",
]).format(position, nucleotides)
insertion_no_groups = _strip_group_names(
    insertion, 'ins_start', 'ins_end', 'ins_bases', 'ins')

# A `start_end` range, the literal 'delins', then one or more nucleotides.
delins = "".join([
    r"(?P<delins_start>{0})_(?P<delins_end>{0})(?P<delins>delins)",
    r"(?P<delins_bases>[{1}]+)",
]).format(position, nucleotides)
delins_no_groups = _strip_group_names(
    delins, 'delins_start', 'delins_end', 'delins_bases', 'delins')

# Position, reference nucleotide, an operator (one of '=', '>', '=//', '=/'),
# then the new nucleotide.
substitution = "".join([
    r"(?P<sub_pos>{0})",
    r"(?P<sub_ref_nt>[{1}])",
    r"(?P<sub>=|>|=//|=/)",
    r"(?P<sub_new_nt>[{1}])",
]).format(position, nucleotides)
substitution_no_groups = _strip_group_names(
    substitution, 'sub_pos', 'sub_ref_nt', 'sub', 'sub_new_nt')

In [3]:
# Shared shape of a single event: an optional UTR descriptor followed by
# exactly one of the four event kinds.
_any_event_template = (
    r"({utr_descriptor})?"
    r"(({deletion})|({insertion})|({delins})|({substitution}))"
)

# Variant with named groups, used to pull the parts of one event apart.
any_event_groups = _any_event_template.format(
    utr_descriptor=utr_descriptor,
    deletion=deletion,
    insertion=insertion,
    delins=delins,
    substitution=substitution,
)

# Variant without named groups, so it can be repeated inside one pattern
# (a group name may only be defined once per regular expression).
any_event_no_groups = _any_event_template.format(
    utr_descriptor=utr_descriptor_no_groups,
    deletion=deletion_no_groups,
    insertion=insertion_no_groups,
    delins=delins_no_groups,
    substitution=substitution_no_groups,
)

# One of the prefixes c/n/g/m, a dot, then a single event.
single_variant = r"[cngm]\.{event}".format(event=any_event_groups)
# Prefix, dot, then two or more ';'-separated events inside square brackets;
# the negative lookahead rejects a ';' directly before the closing bracket.
multi_variant = r"[cngm]\.\[({event})(;{event}){{1,}}(?!;)\]".format(
    event=any_event_no_groups)

single_variant_re, multi_variant_re, any_event_groups_re = (
    re.compile(single_variant),
    re.compile(multi_variant),
    re.compile(any_event_groups),
)

In [45]:
def is_multi(hgvs):
    """Return True when `hgvs` fully matches the multi-variant pattern."""
    return multi_variant_re.fullmatch(hgvs) is not None


def validate_event(event):
    """Return a regex match object describing `event`.

    A string is matched in full against both the bare-event pattern and the
    prefixed single-variant pattern; the single-variant match is preferred
    when both succeed. Any object exposing `groupdict` is assumed to already
    be a match object and is returned unchanged.

    Raises:
        ValidationError: `event` is a string that matches neither pattern.
        TypeError: `event` is neither a string nor match-like.
    """
    if not isinstance(event, str):
        if hasattr(event, 'groupdict'):
            return event
        raise TypeError(
            "Expected `event` to be str or an re match object. "
            "Found {}.".format(type(event).__name__)
        )

    bare_match = any_event_groups_re.fullmatch(event)
    prefixed_match = single_variant_re.fullmatch(event)
    if prefixed_match:
        return prefixed_match
    if bare_match:
        return bare_match
    raise ValidationError("Invalid variant '{}'.".format(event))


def event_type(event):
    """Return the canonical type name of `event`.

    `event` is passed through `validate_event`, so it may be a string or a
    match object. The result is one of 'ins', 'del', 'delins' or 'sub' (the
    keys used by `validate_event_functions`), or None when none of the
    marker groups matched.

    Two cases previously produced a value that is not a dispatch key:
    single-base deletions populate the `del_single` group rather than `del`
    and so fell through to None, and substitutions returned the matched
    operator text (for example '>') instead of 'sub'.

    Raises:
        ValidationError, TypeError: propagated from `validate_event`.
    """
    groups = validate_event(event).groupdict()
    if groups.get('ins'):
        return 'ins'
    # Range deletions set `del`; single-base deletions set `del_single`.
    if groups.get('del') or groups.get('del_single'):
        return 'del'
    if groups.get('delins'):
        return 'delins'
    # The `sub` group holds the operator text; report the type name instead.
    if groups.get('sub'):
        return 'sub'
    return None
    
    
def parse_positions(start, end):
    """Convert two position strings into a pair of integers.

    Each string is split on '+' or '-'. For a position that contains such a
    separator, the number after the last separator is used; otherwise the
    number itself is used. When only one of the two positions contains a
    separator, the plain one is replaced by a placeholder: 1 for `start`,
    and `start + 1` for `end`.

    Returns:
        tuple: `(start, end)` as integers.
    """
    start_parts, end_parts = (
        re.split(r"[\+-]", text) for text in (start, end)
    )
    has_offset = len(start_parts) >= 2 or len(end_parts) >= 2

    if has_offset and len(start_parts) < 2:
        start_value = 1
    else:
        start_value = int(start_parts[-1])

    if has_offset and len(end_parts) < 2:
        end_value = start_value + 1
    else:
        end_value = int(end_parts[-1])

    return int(start_value), int(end_value)
        
     
def validate_substitution(event):
    """Check that a substitution event changes the nucleotide.

    `event` is passed through `validate_event`, so it may be a string or a
    match object.

    Raises:
        ValidationError: the reference or new nucleotide group is missing or
            empty, or both nucleotides are identical.
    """
    match = validate_event(event)
    groups = match.groupdict()
    ref_nt = groups.get('sub_ref_nt', None)
    new_nt = groups.get('sub_new_nt', None)

    if not (ref_nt and new_nt):
        raise ValidationError(
            "Invalid nucleotides for variant '{}'".format(match.string)
        )
    if ref_nt == new_nt:
        raise ValidationError(
            "Reference nucleotide cannot be the same as the "
            "new nucleotide for variant '{}'.".format(match.string)
        )


def validate_deletion(event):
    """Check that a deletion event is well formed.

    `event` is passed through `validate_event`, so it may be a string or a
    match object. A single-base deletion (the `del_single` group matched)
    has no range to check and is accepted as is. A range deletion must have
    a start position strictly less than its end position, as computed by
    `parse_positions`.

    Raises:
        ValidationError: the event has neither a start/end range nor the
            single-deletion marker, or the range is not increasing.
    """
    event = validate_event(event)
    groups = event.groupdict()
    start = groups.get('del_start', None)
    end = groups.get('del_end', None)
    single = groups.get('del_single', False)

    # Single-base deletions leave `del_start`/`del_end` as None; handing
    # those to `parse_positions` would raise TypeError inside `re.split`.
    if single:
        return

    if start is None or end is None:
        raise ValidationError("Invalid deletion variant '{}',".format(
            event.string))

    start, end = parse_positions(start, end)
    if start >= end:
        raise ValidationError(
            "Deletion starting position must be less than the ending "
            "position in variant '{}'.".format(event.string)
        )


def validate_insertion(event):
    """Check that an insertion event names two adjacent flanking positions.

    `event` is passed through `validate_event`, so it may be a string or a
    match object. Positions are converted with `parse_positions`.

    Raises:
        ValidationError: the start or end group is missing, the start is not
            less than the end, or the end is not exactly start + 1.
    """
    match = validate_event(event)
    groups = match.groupdict()
    raw_start = groups.get('ins_start', None)
    raw_end = groups.get('ins_end', None)
    if raw_start is None or raw_end is None:
        raise ValidationError("Invalid insertion variant '{}',".format(
            match.string))

    first, last = parse_positions(raw_start, raw_end)
    if first >= last:
        raise ValidationError(
            "Insertion starting position must be less than the ending "
            "position in variant '{}'.".format(match.string)
        )
    # An insertion site lies between two neighbouring positions.
    if first != last - 1:
        raise ValidationError(
            "Interval must define a flanking insertion site in '{}'.".format(
                match.string)
        )


def validate_delins(event):
    """Check that a deletion-insertion event spans an increasing range.

    `event` is passed through `validate_event`, so it may be a string or a
    match object. Positions are converted with `parse_positions`.

    Raises:
        ValidationError: the start or end group is missing, or the start is
            not less than the end.
    """
    match = validate_event(event)
    groups = match.groupdict()
    raw_start = groups.get('delins_start', None)
    raw_end = groups.get('delins_end', None)
    if raw_start is None or raw_end is None:
        raise ValidationError("Invalid indel variant '{}',".format(
            match.string))

    first, last = parse_positions(raw_start, raw_end)
    if not first < last:
        raise ValidationError(
            "Indel starting position must be less than the ending "
            "position in variant '{}'.".format(match.string)
        )


In [46]:
# Dispatch table mapping an event-type key to the validator for that kind
# of event.
validate_event_functions = dict([
    ('ins', validate_insertion),
    ('del', validate_deletion),
    ('delins', validate_delins),
    ('sub', validate_substitution),
])


def validate_multi_variant(hgvs):
    """Validate a bracketed multi-variant string.

    The whole string must match the multi-variant pattern; the individual
    events are then checked by `validate_single_variants`.

    Raises:
        ValidationError: `hgvs` does not match the multi-variant pattern, or
            one of its events is rejected.
    """
    if not multi_variant_re.fullmatch(hgvs):
        raise ValidationError("Invalid HGVS string '{}'.".format(hgvs))
    validate_single_variants(hgvs)
    
    
def validate_single_variants(hgvs):
    """Validate every event inside a bracketed multi-variant string.

    The first three characters (prefix letter, dot, opening bracket) and the
    final closing bracket are stripped, and the remainder is split on ';'.
    Each event must be unique, must fully match the single-event pattern,
    and must pass the validator registered for its type in
    `validate_event_functions`.

    Raises:
        ValidationError: nothing is left between the brackets, an event is
            repeated, an event does not match the event pattern, or a
            type-specific validator rejects it.
    """
    inner = hgvs[3:-1]  # removes prefix and square brackets
    # `str.split` always returns at least one item, so emptiness has to be
    # checked on the string itself for this error to be reachable.
    if not inner:
        raise ValidationError(
            "Variant '{}' has an invalid "
            "multi-variant format. "
            "Check that events are "
            "semi-colon delimited.".format(hgvs)
        )

    events = inner.split(';')
    if len(events) != len(set(events)):
        raise ValidationError(
            "Multi-variant '{}' has defined the same "
            "event more than once.".format(hgvs)
        )

    for event in events:
        match = any_event_groups_re.fullmatch(event)
        if not match:
            raise ValidationError(
                "Invalid event '{}' in '{}'.".format(event, hgvs))
        type_ = event_type(match)
        # Each validator takes a single argument; hand over the match object
        # so the event does not have to be parsed again.
        validate_event_functions[type_](match)