Skip to content

Commit

Permalink
Merge pull request #14 from VariantEffect/mite_fix
Browse files Browse the repository at this point in the history
MITE compatibility
  • Loading branch information
afrubin committed Jul 30, 2021
2 parents d44e159 + c0e2928 commit f2b82ed
Show file tree
Hide file tree
Showing 13 changed files with 669 additions and 110 deletions.
35 changes: 24 additions & 11 deletions docs/spec.rst
Expand Up @@ -86,21 +86,23 @@ RNA variants are intended to be used when assaying the functional consequences t
such as a tRNA or ribozyme.
Variants that are measured at the DNA level should generally not use the RNA syntax.

Substitution
------------

.. note:: TODO: add some noncoding ('n.' variants) to the examples.
Equality
--------

MAVE-HGVS supports substitutions of a single nucleotide or amino acid.
MAVE-HGVS allows variants to describe equality to the target in a variety of ways.

Unlike in HGVS, variants that describe identity to the reference (target) at a single position (e.g. :code:`c.44=`)
are not valid for nucleotide positions.
Variants describing identity to the full target sequence (e.g. :code:`c.=`) are valid and are the intended way to
specify identity to the target (wild-type) sequence.
This replaces the `Enrich2 <https://doi.org/10.1186/s13059-017-1272-5>`_ :code:`_wt` variant syntax.

Variants that describe identity to the reference (target) at a single position (e.g. :code:`c.44=`)
or a range of positions (e.g. :code:`c.1_3=`) are valid for coding and genomic sequences.
These should only be used for special cases, such as in MITE-seq datasets where the scores and counts are
reported separately for each wild-type codon.

The target-identity variants :code:`c.=` and :code:`p.=` are only valid on their own and are considered invalid as
part of multi-variants.
The variants that describe nucleotide identity to part of the reference are also invalid as part of multi-variants.

Variants that describe identity to the target at a single amino acid position (e.g. :code:`p.Cys22=`) are valid and
are the preferred way to describe specific synonymous variants.
Expand All @@ -115,6 +117,21 @@ This replaces the `Enrich2 <https://doi.org/10.1186/s13059-017-1272-5>`_ :code:
resulting in duplicate protein variants in the multi-variant.
This should also be considered invalid.

Examples of valid equality variants include:

* c.=
* c.22=
* g.123=
* p.Cys22=
* p.(=)

Substitution
------------

.. note:: TODO: add some noncoding ('n.' variants) to the examples.

MAVE-HGVS supports substitutions of a single nucleotide or amino acid.

MAVE-HGVS does not support extension variants, which extend an amino acid sequence to the N- or C-terminal end
(e.g. :code:`p.Met1ext-4` for gain of an upstream start or :code:`p.Ter345Lysext5` for a new downstream termination
codon).
Expand All @@ -126,24 +143,20 @@ Substitutions of more than one base at a time are covered under `Deletion-Insert
Examples of valid substitutions include:

* g.48C>A
* c.=
* c.122-6T>A
* c.*33G>C
* p.Glu27Trp
* p.Ter345Lys
* p.Cys22=
* r.22g>u
* r.33+12a>c

Examples of valid HGVS substitutions that are invalid in MAVE-HGVS:

* g.48C>W
* c.22=
* c.122=/T>A
* p.(Glu27Trp)
* p.*345Lys
* p.Glu23Xaa
* r.84=
* r.spl

Deletion
Expand Down
29 changes: 22 additions & 7 deletions mavehgvs/patterns/dna.py
Expand Up @@ -8,26 +8,34 @@
This does not include IUPAC ambiguity characters.
"""

dna_sub_c: str = rf"(?P<dna_sub_c>(?:(?P<position>{pos_intron_utr})(?P<ref>{dna_nt})>(?P<new>{dna_nt}))|(?P<equal>=))"
dna_equal_c: str = rf"(?P<dna_equal_c>(?:(?:(?P<start>{pos_intron_utr})_(?P<end>{pos_intron_utr}))|(?P<position>{pos_intron_utr}))?(?P<equal>=))"
"""str: Pattern matching DNA equality with numeric, intronic, or UTR positions.
"""

dna_sub_c: str = rf"(?P<dna_sub_c>(?P<position>{pos_intron_utr})(?P<ref>{dna_nt})>(?P<new>{dna_nt}))"
"""str: Pattern matching a DNA substitution with numeric, intronic, or UTR positions.
"""

dna_del_c: str = rf"(?P<dna_del_c>(?:(?:(?P<start>{pos_intron_utr})_(?P<end>{pos_intron_utr}))|(?P<pos>{pos_intron_utr}))del)"
dna_del_c: str = rf"(?P<dna_del_c>(?:(?:(?P<start>{pos_intron_utr})_(?P<end>{pos_intron_utr}))|(?P<position>{pos_intron_utr}))del)"
"""str: Pattern matching a DNA deletion with numeric, intronic, or UTR positions.
"""

dna_dup_c: str = rf"(?P<dna_dup_c>(?:(?:(?P<start>{pos_intron_utr})_(?P<end>{pos_intron_utr}))|(?P<pos>{pos_intron_utr}))dup)"
dna_dup_c: str = rf"(?P<dna_dup_c>(?:(?:(?P<start>{pos_intron_utr})_(?P<end>{pos_intron_utr}))|(?P<position>{pos_intron_utr}))dup)"
"""str: Pattern matching a DNA duplication with numeric, intronic, or UTR positions.
"""

dna_ins_c: str = rf"(?P<dna_ins_c>(?P<start>{pos_intron_utr})_(?P<end>{pos_intron_utr})ins(?P<seq>{dna_nt}+))"
"""str: Pattern matching a DNA insertion with numeric, intronic, or UTR positions.
"""

dna_delins_c: str = rf"(?P<dna_delins_c>(?:(?:(?P<start>{pos_intron_utr})_(?P<end>{pos_intron_utr}))|(?P<pos>{pos_intron_utr}))delins(?P<seq>{dna_nt}+))"
dna_delins_c: str = rf"(?P<dna_delins_c>(?:(?:(?P<start>{pos_intron_utr})_(?P<end>{pos_intron_utr}))|(?P<position>{pos_intron_utr}))delins(?P<seq>{dna_nt}+))"
"""str: Pattern matching a DNA deletion-insertion with numeric, intronic, or UTR positions.
"""

dna_equal_n: str = rf"(?P<dna_equal_n>(?P<equal>=))"
"""str: Pattern matching DNA equality with no position support.
"""

dna_sub_n: str = dna_sub_c.replace(pos_intron_utr, pos_intron).replace(
"(?P<dna_sub_c>", "(?P<dna_sub_n>"
)
Expand Down Expand Up @@ -58,6 +66,12 @@
"""str: Pattern matching a DNA deletion-insertion with numeric or intron positions for non-coding variants.
"""

dna_equal_gmo: str = dna_equal_c.replace(pos_intron_utr, pos).replace(
    "(?P<dna_equal_c>", "(?P<dna_equal_gmo>"
)
"""str: Pattern matching DNA equality with only numeric positions for genomic-style variants.
"""

dna_sub_gmo: str = dna_sub_c.replace(pos_intron_utr, pos).replace(
"(?P<dna_sub_c>", "(?P<dna_sub_gmo>"
)
Expand Down Expand Up @@ -89,19 +103,20 @@
"""

dna_variant_c: str = combine_patterns(
[dna_sub_c, dna_del_c, dna_dup_c, dna_ins_c, dna_delins_c], None
[dna_equal_c, dna_sub_c, dna_del_c, dna_dup_c, dna_ins_c, dna_delins_c], None
)
"""str: Pattern matching any of the coding DNA variants.
"""

dna_variant_n: str = combine_patterns(
[dna_sub_n, dna_del_n, dna_dup_n, dna_ins_n, dna_delins_n], None
[dna_equal_n, dna_sub_n, dna_del_n, dna_dup_n, dna_ins_n, dna_delins_n], None
)
"""str: Pattern matching any of the non-coding DNA variants.
"""

dna_variant_gmo: str = combine_patterns(
[dna_sub_gmo, dna_del_gmo, dna_dup_gmo, dna_ins_gmo, dna_delins_gmo], None
[dna_equal_gmo, dna_sub_gmo, dna_del_gmo, dna_dup_gmo, dna_ins_gmo, dna_delins_gmo],
None,
)
"""str: Pattern matching any of the genomic-style DNA variants.
"""
Expand Down
14 changes: 9 additions & 5 deletions mavehgvs/patterns/protein.py
Expand Up @@ -12,28 +12,32 @@
"""str: Pattern matching an amino acid code followed by a position.
"""

pro_sub: str = rf"(?P<pro_sub>(?:(?P<position>{aa_pos})(?P<new>{amino_acid}|=))|(?P<equal>=)|(?P<equal_sy>\(=\)))"
pro_equal: str = rf"(?P<pro_equal>(?:(?P<position>{aa_pos})?(?P<equal>=))|(?P<equal_sy>\(=\)))"
"""str: Pattern matching protein equality or synonymous variant.
"""

pro_sub: str = rf"(?P<pro_sub>(?P<position>{aa_pos})(?P<new>{amino_acid}))"
"""str: Pattern matching a protein substitution.
"""

pro_del: str = rf"(?P<pro_del>(?:(?P<start>{aa_pos})_(?P<end>{aa_pos})del)|(?:(?P<pos>{aa_pos})del))"
pro_del: str = rf"(?P<pro_del>(?:(?P<start>{aa_pos})_(?P<end>{aa_pos})del)|(?:(?P<position>{aa_pos})del))"
"""str: Pattern matching a protein deletion.
"""

pro_dup: str = rf"(?P<pro_dup>(?:(?P<start>{aa_pos})_(?P<end>{aa_pos})dup)|(?:(?P<pos>{aa_pos})dup))"
pro_dup: str = rf"(?P<pro_dup>(?:(?P<start>{aa_pos})_(?P<end>{aa_pos})dup)|(?:(?P<position>{aa_pos})dup))"
"""str: Pattern matching a protein duplication.
"""

pro_ins: str = rf"(?P<pro_ins>(?P<start>{aa_pos})_(?P<end>{aa_pos})ins(?P<seq>{amino_acid}+))"
"""str: Pattern matching a protein insertion.
"""

pro_delins: str = rf"(?P<pro_delins>(?:(?:(?P<start>{aa_pos})_(?P<end>{aa_pos}))|(?P<pos>{aa_pos}))delins(?P<seq>{amino_acid}+))"
pro_delins: str = rf"(?P<pro_delins>(?:(?:(?P<start>{aa_pos})_(?P<end>{aa_pos}))|(?P<position>{aa_pos}))delins(?P<seq>{amino_acid}+))"
"""str: Pattern matching a protein deletion-insertion.
"""

pro_variant: str = combine_patterns(
[pro_sub, pro_del, pro_dup, pro_ins, pro_delins], None
[pro_equal, pro_sub, pro_del, pro_dup, pro_ins, pro_delins], None
)
"""str: Pattern matching any single protein variant event.
"""
Expand Down
14 changes: 9 additions & 5 deletions mavehgvs/patterns/rna.py
Expand Up @@ -8,28 +8,32 @@
This does not include IUPAC ambiguity characters.
"""

rna_sub: str = rf"(?P<rna_sub>(?:(?P<position>{pos_intron})(?P<ref>{rna_nt})>(?P<new>{rna_nt}))|(?P<equal>=))"
rna_equal: str = rf"(?P<rna_equal>(?:(?:(?P<start>{pos_intron})_(?P<end>{pos_intron}))|(?P<position>{pos_intron}))?(?P<equal>=))"
"""str: Pattern matching RNA equality with numeric or relative-to-transcript positions.
"""

rna_sub: str = rf"(?P<rna_sub>(?P<position>{pos_intron})(?P<ref>{rna_nt})>(?P<new>{rna_nt}))"
"""str: Pattern matching a RNA substitution with numeric or relative-to-transcript positions.
"""

rna_del: str = rf"(?P<rna_del>(?:(?:(?P<start>{pos_intron})_(?P<end>{pos_intron}))|(?P<pos>{pos_intron}))del)"
rna_del: str = rf"(?P<rna_del>(?:(?:(?P<start>{pos_intron})_(?P<end>{pos_intron}))|(?P<position>{pos_intron}))del)"
"""str: Pattern matching a RNA deletion with numeric or relative-to-transcript positions.
"""

rna_dup: str = rf"(?P<rna_dup>(?:(?:(?P<start>{pos_intron})_(?P<end>{pos_intron})dup)|(?P<pos>{pos_intron}))dup)"
# The ``dup`` keyword appears once, after the range-or-single-position alternation,
# mirroring the structure of ``rna_del``.
rna_dup: str = rf"(?P<rna_dup>(?:(?:(?P<start>{pos_intron})_(?P<end>{pos_intron}))|(?P<position>{pos_intron}))dup)"
"""str: Pattern matching an RNA duplication with numeric or relative-to-transcript positions.
"""

rna_ins: str = rf"(?P<rna_ins>(?P<start>{pos_intron})_(?P<end>{pos_intron})ins(?P<seq>{rna_nt}+))"
"""str: Pattern matching a RNA insertion with numeric or relative-to-transcript positions.
"""

rna_delins: str = rf"(?P<rna_delins>(?:(?:(?P<start>{pos_intron})_(?P<end>{pos_intron}))|(?P<pos>{pos_intron}))delins(?P<seq>{rna_nt}+))"
rna_delins: str = rf"(?P<rna_delins>(?:(?:(?P<start>{pos_intron})_(?P<end>{pos_intron}))|(?P<position>{pos_intron}))delins(?P<seq>{rna_nt}+))"
"""str: Pattern matching a RNA deletion-insertion with numeric or relative-to-transcript positions.
"""

rna_variant: str = combine_patterns(
[rna_sub, rna_del, rna_dup, rna_ins, rna_delins], None
[rna_equal, rna_sub, rna_del, rna_dup, rna_ins, rna_delins], None
)
"""str: Pattern matching any single RNA variant event.
"""
Expand Down
5 changes: 4 additions & 1 deletion mavehgvs/position.py
Expand Up @@ -123,7 +123,10 @@ def __lt__(self, other: "VariantPosition") -> bool:
"""
if self.utr == other.utr:
if self.position == other.position:
if self.intronic_position == other.intronic_position:
if (
self.intronic_position == other.intronic_position
): # pragma: no cover
# this case is covered by __eq__
return False
elif self.intronic_position is None:
return other.intronic_position > 0
Expand Down

0 comments on commit f2b82ed

Please sign in to comment.