In [13]:
import yaml

# Load the gene dictionary: a list of {'name': ..., 'synonyms': [...]} records.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# default locale.
with open('data/genes.yaml', 'r', encoding='utf-8') as file:
    original_data = yaml.safe_load(file)

# Peek at the first few records (done after the file handle has been released).
print(original_data[:5])


[{'name': 'ELL', 'synonyms': ['c19orf17', 'ell1', 'men', 'ppp1r68', 'ell', 'elongation factor for rna polymerase ii', 'rna polymerase ii elongation factor ell']}, {'name': 'NUP214', 'synonyms': ['cain', 'can', 'd9s46e', 'n214', 'nucleoporin 214kd (cain)', 'nucleoporin 214kda', 'nup214', 'nucleoporin 214', 'nuclear pore complex protein nup214']}, {'name': 'MET', 'synonyms': ['dfnb97', 'hgfr', 'met', 'met proto-oncogene', 'rccp2', 'met proto-oncogene, receptor tyrosine kinase', 'hepatocyte growth factor receptor', 'auts9', 'mesenchymal-epithelial transition factor']}, {'name': 'SPP1', 'synonyms': ['bnsp', 'bone sialoprotein i', 'bspi', 'eta-1', 'opn', 'osteopontin', 'spp1', 'secreted phosphoprotein 1']}, {'name': 'RAD54B', 'synonyms': ['rad54b', 'rdh54', 'rad54 homolog b (s. cerevisiae)', 'dna repair and recombination protein rad54b']}]


In [14]:
# Sanity check: how many gene records were loaded, plus the first record in full.
(len(original_data), original_data[0])

(20,
 {'name': 'ELL',
  'synonyms': ['c19orf17',
   'ell1',
   'men',
   'ppp1r68',
   'ell',
   'elongation factor for rna polymerase ii',
   'rna polymerase ii elongation factor ell']})

In [21]:
def get_substring_coordinates(main_string: str, substring: str) -> list[tuple[int, int]]:
    """Find every case-insensitive occurrence of ``substring`` in ``main_string``.

    Overlapping occurrences are all reported, because the search resumes one
    character after the start of the previous hit.

    Args:
        main_string: Text to search in.
        substring: Text to search for.

    Returns:
        A list of ``(start, end)`` tuples, ``end`` exclusive, in ascending
        order of ``start``. Empty when there is no match or when
        ``substring`` is empty.
    """
    # An empty needle "matches" at every index (str.find('', i) == i), which
    # would yield len(main_string) + 1 meaningless zero-width spans.
    if not substring:
        return []

    # NOTE(review): offsets are computed on the lower-cased text; for the rare
    # characters whose lower-case form has a different length they may not line
    # up with the original string -- confirm inputs are plain ASCII/Latin text.
    haystack = main_string.lower()
    needle = substring.lower()
    width = len(needle)

    coordinates = []
    start = haystack.find(needle)
    while start != -1:
        coordinates.append((start, start + width))
        # Advance by one character only, so overlapping matches are kept.
        start = haystack.find(needle, start + 1)

    return coordinates


def check_gene(input_str: str, gene: dict) -> dict | None:
    all_gene_coords = []
    for synonym in gene.get('synonyms', ''):
        synonym_coords = get_substring_coordinates(input_str, synonym)
        all_gene_coords.extend(synonym_coords)
    if all_gene_coords:
        gene_found = {
            'name': gene.get('name'),
            'positions': all_gene_coords
        }
        return gene_found
        
        
    
def get_genes(input_str: str) -> dict:
    genes_found = []
    for gene in original_data:
        result = check_gene(input_str, gene)
        if result is not None:
            genes_found.append(result)
    return {
        "genes": genes_found
    }

In [22]:
# Smoke test on a sentence that mentions CAN (listed as a NUP214 synonym).
get_genes(
    'The human CAN protein, a putative oncogene product associated with myeloid leukemogenesis'
)

{'genes': [{'name': 'NUP214', 'positions': [(10, 13)]}]}