In [1]:
import yaml

# Load the gene catalogue: a list of {'name': ..., 'synonyms': [...]} records.
# The encoding is pinned because open() otherwise falls back to the platform's
# locale encoding, which would make the parsed synonyms machine-dependent.
with open('data/genes.yaml', 'r', encoding='utf-8') as file:
    original_data = yaml.safe_load(file)

# Preview the first few records only; the file handle is already closed here.
print(original_data[:5])


[{'name': 'ELL', 'synonyms': ['c19orf17', 'ell1', 'men', 'ppp1r68', 'ell', 'elongation factor for rna polymerase ii', 'rna polymerase ii elongation factor ell']}, {'name': 'NUP214', 'synonyms': ['cain', 'can', 'd9s46e', 'n214', 'nucleoporin 214kd (cain)', 'nucleoporin 214kda', 'nup214', 'nucleoporin 214', 'nuclear pore complex protein nup214']}, {'name': 'MET', 'synonyms': ['dfnb97', 'hgfr', 'met', 'met proto-oncogene', 'rccp2', 'met proto-oncogene, receptor tyrosine kinase', 'hepatocyte growth factor receptor', 'auts9', 'mesenchymal-epithelial transition factor']}, {'name': 'SPP1', 'synonyms': ['bnsp', 'bone sialoprotein i', 'bspi', 'eta-1', 'opn', 'osteopontin', 'spp1', 'secreted phosphoprotein 1']}, {'name': 'RAD54B', 'synonyms': ['rad54b', 'rdh54', 'rad54 homolog b (s. cerevisiae)', 'dna repair and recombination protein rad54b']}]


In [2]:
def get_substring_coordinates(main_string: str, substring: str) -> list[list[int]]:
    """
    Finds all case-insensitive occurrences of a substring within a string and returns their start and end indices.

    Overlapping occurrences are reported separately (searching 'aa' in 'aaa' yields two matches).

    Parameters:
    - main_string (str): The string in which to search for the substring.
    - substring (str): The substring to find within the main_string.

    Returns:
    - list: A list of [start, end] pairs (start inclusive, end exclusive), ordered by start index.
      The list is empty when there is no occurrence or when substring is empty.

    Note:
    - Both strings are lowercased before matching. For the few characters whose lowercase form has a
      different length (e.g. 'İ'), the indices refer to the lowercased text rather than to main_string.
    """
    # An empty needle "matches" at every index; those zero-length hits carry no
    # information for callers, so report no occurrences instead.
    if not substring:
        return []

    haystack = main_string.lower()
    needle = substring.lower()
    needle_len = len(needle)

    coordinates = []
    start = haystack.find(needle)
    while start != -1:
        coordinates.append([start, start + needle_len])
        # Advance by a single character so overlapping occurrences are kept.
        start = haystack.find(needle, start + 1)

    return coordinates


def check_gene(input_str: str, gene: dict) -> dict | None:
    """
    Checks if a gene is present in the input string and returns its information if found.

    The gene's name and each of its synonyms are searched as case-insensitive substrings.
    When several search terms match at the same start index, only the longest match is kept,
    so every start index appears at most once in the result.

    Parameters:
    - input_str (str): The input string to search for the gene.
    - gene (dict): A dictionary containing information about the gene: a 'name' and, optionally,
      a list of 'synonyms'. The dictionary is not modified.

    Returns:
    - dict or None: {'name': <gene name>, 'positions': [[start, end], ...]} if found (positions in
      order of first discovery, end exclusive), or None if not found.

    Raises:
    - KeyError: If gene has no 'name' key.
    """
    name = gene['name']
    # Build a fresh list: appending the name to gene['synonyms'] itself would
    # silently alter the shared gene catalogue on every call. Empty entries are
    # dropped because an empty search term cannot identify a gene.
    search_terms = [synonym for synonym in (gene.get('synonyms') or []) if synonym]
    # Matching is case-insensitive, so the name only needs its own search when
    # no synonym already spells it.
    if name and name.lower() not in {term.lower() for term in search_terms}:
        search_terms.append(name)

    # start index -> furthest end index seen so far. Dicts preserve insertion
    # order, so positions come out in order of first discovery.
    longest_match_end = {}
    for term in search_terms:
        for start, end in get_substring_coordinates(input_str, term):
            longest_match_end[start] = max(end, longest_match_end.get(start, end))

    if not longest_match_end:
        return None
    return {
        'name': name,
        'positions': [[start, end] for start, end in longest_match_end.items()]
    }
        
        
    
def get_genes(input_str: str, gene_catalog: list | None = None) -> dict:
    """
    Finds genes present in the input string and returns their information.

    Parameters:
    - input_str (str): The input string to search for genes.
    - gene_catalog (list or None): Gene records to search for, each in the form accepted by
      check_gene. When omitted, the notebook-level `original_data` catalogue is used.

    Returns:
    - dict: {'genes': [...]} where the list holds one check_gene result per gene found in the
      input string, in catalogue order. The list is empty when nothing matches.
    """
    if gene_catalog is None:
        # Default to the catalogue loaded into the notebook's global namespace.
        gene_catalog = original_data

    candidates = (check_gene(input_str, gene) for gene in gene_catalog)
    return {
        "genes": [found for found in candidates if found is not None]
    }

In [3]:
# Smoke test: 'can' is listed as a synonym of NUP214 in the catalogue preview and
# matching is case-insensitive, so the 'CAN' in this sentence should report NUP214.
get_genes('The human CAN protein, a putative oncogene product associated with myeloid leukemogenesis')

{'genes': [{'name': 'NUP214', 'positions': [[10, 13]]}]}