In [None]:
import re
import string
from typing import Tuple, List
import pandas as pd


# Load the recorded HGVS searches and select the rows whose "can_resolve"
# column equals False.
df = pd.read_csv("./hgvs_searches.csv")
non_resolve_mask = df["can_resolve"] == False  # noqa: E712 (element-wise comparison)

# Sort by the HGVS text, then discard EVERY row whose "hgvs" value occurs more
# than once (keep=False keeps none of the duplicates, not even the first).
hgvs_errors_df = (
    df[non_resolve_mask]
    .sort_values("hgvs")
    .drop_duplicates(subset="hgvs", keep=False)
)

# Persist the remaining single-occurrence failures for later inspection.
hgvs_errors_df.to_csv("hgvs_errors_uniq.csv")



In [None]:

# Bare last expression: the notebook renders the frame of unresolved HGVS searches.
hgvs_errors_df

In [None]:



# Kind letter (c, g, m, n or p) followed by a dot and digits, e.g. "c.4165".
# Group 1 is the kind letter, group 2 the digits.
pattern_kind_no_colon = re.compile(r"(c|g|m|n|p)\.(\d+)")
# Colon, then a kind letter directly followed by digits (no dot), e.g. ":c1403".
# Group 1 is the kind letter, group 2 the digits.
pattern_kind_no_dot = re.compile(r":(c|g|m|n|p)(\d+)")
# HGNC gene symbol - https://www.biostars.org/p/60118/#65063
# Both alternatives are anchored at both ends so that .match() only accepts a
# string that is a gene symbol in its entirety (the "orf" alternative used to
# lack the trailing "$" and accepted any string merely starting with one).
pattern_gene_symbol = re.compile(r"^[A-Z0-9-]+$|^C[0-9XY]+orf[0-9]+$")


# Copy/pasted from pyhgvs
# Each entry is a (accession prefix, molecule type, description) triple.
# The RefSeq standard for naming contigs/transcripts/proteins:
# http://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_accession_numbers_and_mole/?report=objectonly  # nopep8
REFSEQ_PREFIXES = [
    # --- genomic ---
    ('AC_', 'genomic', 'Complete genomic molecule, usually alternate assembly'),
    ('NC_', 'genomic', 'Complete genomic molecule, usually reference assembly'),
    ('NG_', 'genomic', 'Incomplete genomic region'),
    ('NT_', 'genomic', 'Contig or scaffold, clone-based or WGS'),
    ('NW_', 'genomic', 'Contig or scaffold, primarily WGS'),
    ('NS_', 'genomic', 'Environmental sequence'),
    ('NZ_', 'genomic', 'Unfinished WGS'),
    # --- transcripts ---
    ('NM_', 'mRNA', ''),
    ('NR_', 'RNA', ''),
    ('XM_', 'mRNA', 'Predicted model'),
    ('XR_', 'RNA', 'Predicted model'),
    # --- proteins ---
    ('AP_', 'Protein', 'Annotated on AC_ alternate assembly'),
    ('NP_', 'Protein', 'Associated with an NM_ or NC_ accession'),
    ('YP_', 'Protein', ''),
    ('XP_', 'Protein', 'Predicted model, associated with an XM_ accession'),
    ('ZP_', 'Protein', 'Predicted model, annotated on NZ_ genomic records'),
]




def remove_non_printable_characters(hgvs_string):
    """Return ``hgvs_string`` with every character outside ``string.printable`` dropped."""
    allowed = set(string.printable)
    return ''.join(char for char in hgvs_string if char in allowed)

def remove_whitespace(hgvs_string):
    """Return ``hgvs_string`` with all whitespace characters removed.

    This would be covered in remove_invalid_characters but this gives a nicer message
    """
    # Raw string: "\s" in a plain literal is an invalid escape sequence, which
    # Python reports as a DeprecationWarning/SyntaxWarning.
    return re.sub(r"\s", '', hgvs_string)

def remove_invalid_characters(hgvs_string):
    """Return ``hgvs_string`` with every character outside the allowed set removed.

    Allowed characters are ASCII letters and digits plus the punctuation
    ``_ - . : ; ( ) [ ] > = + * ?``. Anything else (whitespace, quotes, ...)
    is dropped.

    The previous whitelist omitted ``.``, ``:``, ``+``, ``*``, ``?``, ``[``,
    ``]`` and ``;``, so an otherwise valid string such as
    ``NM_001205293.2:c.4165C>T`` lost its version dot, its colon and the dot
    after the kind letter.
    """
    # Raw string avoids invalid-escape warnings; "-" is placed last so it is
    # always read as a literal rather than as a range operator.
    return re.sub(r"[^A-Za-z0-9_.:;()\[\]>=+*?-]", '', hgvs_string)


def clean_kind(hgvs_string):
    """Placeholder for fixing common typos; currently returns the input unchanged.

    Fixes noted for later implementation:

    * ``c,`` -> ``c.``
    * ``;c`` -> ``:c``  (semicolon)
    """
    return hgvs_string
    

def add_unmatched_brackets(hgvs_string):
    """Placeholder, not implemented yet: hands ``hgvs_string`` back untouched."""
    return hgvs_string

def add_missing_colon(hgvs_string):
    """Placeholder, not implemented yet: returns ``hgvs_string`` unchanged.

    Example strings noted for when this is written:

    * ``GLA c.``
    * ``NM_001205293.2(CACNA1E):c.4165C>T'``
    """
    return hgvs_string

def remove_duplicates(hgvs_string):
    """Collapse each run of two or more ':' (or of two or more '.') into one.

    Returns the resulting string; input without such runs comes back unchanged.
    """
    # Raw strings: "\." in a plain literal is an invalid escape sequence, which
    # Python reports as a DeprecationWarning/SyntaxWarning.
    hgvs_string = re.sub(r"::+", ":", hgvs_string)
    hgvs_string = re.sub(r"\.\.+", ".", hgvs_string)
    return hgvs_string


def fix_allele_case(allele_string):
    """Rewrite the keywords del, delins, dup, ins and inv in lower case.

    Each keyword is matched case-insensitively anywhere in ``allele_string``
    and replaced by its lower-case spelling; the rest of the string is kept.
    """
    for keyword in ('del', 'delins', 'dup', 'ins', 'inv'):
        any_case = re.compile(keyword, re.IGNORECASE)
        allele_string = any_case.sub(keyword, allele_string)
    return allele_string
    

# Cleaning steps, in the order they are declared: description -> function.
# Each function takes an HGVS string and returns the (possibly modified) string.
GLOBAL_CLEAN = {
    "remove_non_printable_characters": remove_non_printable_characters,
    "remove_whitespace": remove_whitespace,
    # Was written with "," instead of ":" between key and value, which is a
    # SyntaxError inside a dict display and stopped the whole cell from running.
    "remove_invalid_characters": remove_invalid_characters,
    "remove duplicates": remove_duplicates,
}

# Steps that are not enabled:
#
#   Optional - remove gene symbol - (for clingen and biocommons HGVS)
#     "remove_gene_symbol": remove_gene_symbol,
#
#     "clean_kind": clean_kind,
#     "add_unmatched_brackets": add_unmatched_brackets,
#     "add_missing_colon": add_missing_colon,


# Sample inputs for trying out the cleaning code by hand.
# All entries except the first end with a stray apostrophe.
test_hgvs = [
    "c.4165C>T", # This should fail as it has no transcript/gene
    "CACNA1E:c.4165C>T'", # gene name - it's the resolution that is tricky here
    "CACNA1E c.4165C>T'", # space instead of the colon
    "CACNA1Ec.4165C>T'", # missing colon
    "NM_001205293.2 :c.4165C>T'", # whitespace before the colon
    "NM_001205293.2(CACNA1E):c.4165C>T'", # transcript with gene symbol in brackets
    "NM_001205293.2 :c.4165C>T'", # whitespace (NOTE: identical to the entry two lines up)
]



def clean_hgvs(original_hgvs_string) -> Tuple[str, List[str]]:
    """Run every GLOBAL_CLEAN step over an HGVS string.

    The steps are applied one after another, each receiving the output of the
    previous one.

    Returns:
        A ``(cleaned_string, messages)`` tuple, where ``messages`` holds the
        GLOBAL_CLEAN key of every step that actually altered the string, in
        the order the steps ran.
    """
    applied_steps = []
    current = original_hgvs_string

    for step_name, step_func in GLOBAL_CLEAN.items():
        result = step_func(current)
        changed = result != current
        if changed:
            applied_steps.append(step_name)
        current = result

    # Next step (not written yet): split the string into reference/kind/allele

    return current, applied_steps
            

In [None]:
# Design note: the next step should be to split the string into
# reference / kind / allele before applying any further fixes.

# Quick manual check: clean one sample string and report the result.
original_hgvs_string = "GLA c.1277_1278delAA"
hgvs_string, clean_messages = clean_hgvs(original_hgvs_string)
print(f"{original_hgvs_string} -> {hgvs_string} ")
# One line per cleaning step that changed the string.
for msg in clean_messages:
    print(msg)
    


In [None]:
# This is from VariantGrid code


        # HGVS strings containing typos; each one is passed through
        # HGVSMatcher.clean_hgvs below and the result must parse as an HGVSName.
        BAD_HGVS = [
            # The comma after this first entry was missing, so Python's implicit
            # string concatenation silently merged it with the next entry.
            "NM_000038.6;c.4332A>T",  # Semicolon instead of colon
            "NM_205768 c.44A>G",  # Missing colon (no version)
            "NM_005629.3:c1403A>C",  # Missing dot after kind
            "NM_001101.4 c.95C>G",  # Missing colon
            "NM_00380.3: c.648_649delGA",  # space after colon
            "NC_000023.10:g. 31496384G>A",
            "NM_004245: :c.337G>T",  # Double colon
            "NC_000017.10:g.21085664 G>C",  # Space after numbers
            "NC_000023.10:g. 133547943G>A",  # Space after g.
            # Missing transcript underscore, Missing colon, Missing dot after g
            # Space between position and reference base
            "NC000002.10g39139341 C>T",
            # Unbalanced brackets
            "NM_001754.5):c.557T>A",
            "(NM_004991.4:c.2577+4A>T",
            # Good brackets HGVS (just testing gene symbol)
            "NM_001754.5(RUNX1):c.1415T>C",
            "NM_032638:c.1126_1133DUP",  # Case
            "NM_001754.5:557T>A",  # Missing "c."
            "NC_000007.13:117199563G>T",  # Missing "g."
        ]

        for bad_hgvs in BAD_HGVS:
            # NOTE(review): the exception raised by self.fail() is caught by the
            # handler below as well, so this check can never fail the test.
            # It is left that way on purpose: the list also holds an entry
            # described as a good HGVS, which a strict check would trip over.
            # Decide whether that entry belongs here before tightening this.
            try:
                HGVSName(bad_hgvs)
                self.fail(f"Expected '{bad_hgvs}' to fail!")
            except Exception:  # narrowed from a bare except so Ctrl-C still works
                pass  # Expected

            # The cleaned string must parse; HGVSName raising here fails the test.
            fixed_hgvs = HGVSMatcher.clean_hgvs(bad_hgvs)[0]
            HGVSName(fixed_hgvs)


