## Utils

In [1]:
import pandas as pd
import re
from tqdm import tqdm
tqdm.pandas()
import re
import json
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import jellyfish



In [2]:
from joblib import Parallel, delayed

def parallel_apply(series, func, n_jobs=-1):
    results = Parallel(n_jobs=n_jobs)(delayed(func)(x) for x in tqdm(series, desc="Processing", total=len(series)))
    return pd.Series(results)

## Extract data

In [4]:
def clean_name(name):
    name = name.lower()
    name = re.sub(r'\/', ' ', name)
    return name

In [5]:
# Check obsolet
def check_obsolete(x):
    if x.find('obsolete') != -1:
        return True
    else:
        return False

### MPO

https://www.informatics.jax.org/downloads/reports/VOC_MammalianPhenotype.rpt

To rename in mpo_info.txt and put in a folder "data"

In [6]:
# Read in the MPO data
mpo_df = pd.read_table('data/mpo_info.txt', header=None)
mpo_df.columns = ['MPO_ID', 'MPO_Name', 'MPO_Definition']
print(len(mpo_df))
mpo_df.head()

14155


Unnamed: 0,MPO_ID,MPO_Name,MPO_Definition
0,MP:0000001,mammalian phenotype,"the observable morphological, physiological, b..."
1,MP:0000002,obsolete Morphology,OBSOLETE.
2,MP:0000003,abnormal adipose tissue morphology,any structural anomaly of the connective tissu...
3,MP:0000005,increased brown adipose tissue amount,increased amount of the thermogenic form of ad...
4,MP:0000008,increased white adipose tissue amount,increased quantity of fat-storing cells/tissue


In [7]:
# Transform "/" in " " for the name
mpo_df['MPO_Name'] = mpo_df['MPO_Name'].apply(lambda x: clean_name(x))

# Remove obsolete
mpo_df['Obsolete'] = mpo_df['MPO_Name'].apply(lambda x: check_obsolete(x))
mpo_df = mpo_df[mpo_df['Obsolete'] == False].reset_index(drop=True)
del mpo_df['Obsolete']

print(len(mpo_df))
mpo_df.head()

13708


Unnamed: 0,MPO_ID,MPO_Name,MPO_Definition
0,MP:0000001,mammalian phenotype,"the observable morphological, physiological, b..."
1,MP:0000003,abnormal adipose tissue morphology,any structural anomaly of the connective tissu...
2,MP:0000005,increased brown adipose tissue amount,increased amount of the thermogenic form of ad...
3,MP:0000008,increased white adipose tissue amount,increased quantity of fat-storing cells/tissue
4,MP:0000010,abnormal abdominal fat pad morphology,any structural anomaly of the encapsulated adi...


### HPO

https://hpo.jax.org/app/data/ontology

Download the json file and put it in data folder

In [8]:
with open('data/hp.json') as f:
    hp = json.load(f)
    
hpo_df = pd.DataFrame(hp['graphs'][0]['nodes'])


hpo_df['ref'] = hpo_df['id'].apply(lambda x: x.split('/')[-1])
hpo_df['code_type'] = hpo_df['ref'].apply(lambda x: x[:2])
hpo_df = hpo_df[(hpo_df['code_type'] == 'HP') & (hpo_df['type'] == 'CLASS')].reset_index(drop=True)

def extract_definition(x):
    try:
        return x['definition']['val']
    except:
        return None
    
def extract_synonyms(x):
    try:
        synonyms = x['synonyms']
        synonyms = [s['val'] for s in synonyms]
        return synonyms
    except:
        return None

hpo_df['definition'] = hpo_df['meta'].progress_apply(lambda x: extract_definition(x))
hpo_df['synonyms'] = hpo_df['meta'].progress_apply(lambda x: extract_synonyms(x))

del hpo_df['id']
del hpo_df['type']
del hpo_df['meta']
del hpo_df['code_type']

hpo_df.columns = ['HPO_Name', 'HPO_ID', 'HPO_Definition', 'HPO_Synonyms']
cols_reorder = ['HPO_ID', 'HPO_Name', 'HPO_Definition', 'HPO_Synonyms']
hpo_df = hpo_df[cols_reorder]
print(len(hpo_df))
hpo_df.head()

100%|██████████| 17513/17513 [00:00<00:00, 713812.21it/s]
100%|██████████| 17513/17513 [00:00<00:00, 463320.59it/s]

17513





Unnamed: 0,HPO_ID,HPO_Name,HPO_Definition,HPO_Synonyms
0,HP_0000001,All,,
1,HP_0000002,Abnormality of body height,Deviation from the norm of height with respect...,[Abnormality of body height]
2,HP_0000003,Multicystic kidney dysplasia,Multicystic dysplasia of the kidney is charact...,"[Multicystic dysplastic kidney, Multicystic ki..."
3,HP_0000005,Mode of inheritance,The pattern in which a particular genetic trai...,[Inheritance]
4,HP_0000006,Autosomal dominant inheritance,A mode of inheritance that is observed for tra...,"[Autosomal dominant, monoallelic_autosomal, Au..."


In [9]:
# clean the name
hpo_df['HPO_Name'] = hpo_df['HPO_Name'].apply(lambda x: clean_name(x))

# clean the synonyms
hpo_df['HPO_Synonyms'] = hpo_df['HPO_Synonyms'].apply(lambda x: [clean_name(s) for s in x] if x is not None else None)

hpo_df.head()

Unnamed: 0,HPO_ID,HPO_Name,HPO_Definition,HPO_Synonyms
0,HP_0000001,all,,
1,HP_0000002,abnormality of body height,Deviation from the norm of height with respect...,[abnormality of body height]
2,HP_0000003,multicystic kidney dysplasia,Multicystic dysplasia of the kidney is charact...,"[multicystic dysplastic kidney, multicystic ki..."
3,HP_0000005,mode of inheritance,The pattern in which a particular genetic trai...,[inheritance]
4,HP_0000006,autosomal dominant inheritance,A mode of inheritance that is observed for tra...,"[autosomal dominant, monoallelic_autosomal, au..."


## Find potential matches

### Get all the terms

In [12]:
# Get all words in the MPO name
all_mpo_name = mpo_df['MPO_Name'].tolist()
all_mpo_name = [name.split(' ') for name in all_mpo_name]
all_mpo_terms = [item for sublist in all_mpo_name for item in sublist]
count_mpo_terms = pd.Series(all_mpo_terms).value_counts()
dict_mpo_terms = count_mpo_terms.to_dict()
unique_mpo_terms = list(dict_mpo_terms.keys())

In [13]:
# Get all words in the HPO name
all_hpo_name = hpo_df['HPO_Name'].tolist()
all_hpo_name = [name.split(' ') for name in all_hpo_name]
all_hpo_name = [item for sublist in all_hpo_name for item in sublist]
# Get all words in the HPO synonyms
all_hpo_synonyms = hpo_df['HPO_Synonyms'].tolist()
all_hpo_synonyms = [synonyms_list for synonyms_list in all_hpo_synonyms if synonyms_list is not None]
all_hpo_synonyms = [item for sublist in all_hpo_synonyms for item in sublist]
# Count
all_hpo_terms = all_hpo_name + all_hpo_synonyms
count_hpo_terms = pd.Series(all_hpo_terms).value_counts()
dict_hpo_terms = count_hpo_terms.to_dict()
unique_hpo_terms = list(dict_hpo_terms.keys())

### Compute all potential matching

Levenshtein distance of 1

In [11]:
def get_pot_hpo_term(mpo_term, unique_hpo_terms=unique_hpo_terms, thresh=1):
    pot_hpo_terms = []
    for hpo_term in unique_hpo_terms:
        dist = jellyfish.damerau_levenshtein_distance(mpo_term, hpo_term)
        if dist <= thresh:
            pot_hpo_terms.append(hpo_term)
    return pot_hpo_terms


sclerosis


['sclerosis']

In [12]:
mpo_associated_hpo_terms = {}
for mpo_term in tqdm(unique_mpo_terms, desc='Processing'):
    pot_hpo_terms = get_pot_hpo_term(mpo_term)
    mpo_associated_hpo_terms[mpo_term]= pot_hpo_terms

Processing: 100%|██████████| 4568/4568 [08:42<00:00,  8.74it/s]


### Get potential HPO

In [13]:
def get_all_hpo_terms(x):
    name = x['HPO_Name']
    synonyms = x['HPO_Synonyms']
    all_terms = name.split(' ')
    if synonyms is not None:
        for syn in synonyms:
            all_terms += syn.split(' ')
    return all_terms

hpo_df['all_hpo_terms'] = hpo_df.apply(lambda x: get_all_hpo_terms(x), axis=1)

In [14]:
def get_all_mpo_terms(x):
    name = x['MPO_Name']
    all_terms = name.split(' ')
    return all_terms

mpo_df['all_mpo_terms'] = mpo_df.apply(lambda x: get_all_mpo_terms(x), axis=1)

In [None]:
# Remove classic MPO terms
to_rem_mpo_terms = ['abnormal',
 'morphology',
 'increased',
 'decreased',
 'cell',
 'level',
 'absent',
 'number',
 'circulating',
 'gland',
 'to',
 'incidence',
 'of',
 ]

In [42]:
def check_if_elements_in_list_in_column(x, list_to_check):
    for element in x:
        if element in list_to_check:
            return True
    return False

def get_pot_hpo(mpo_terms, mpo_associated_hpo_terms=mpo_associated_hpo_terms, hpo_df=hpo_df):
    temp_hpo = hpo_df.copy()

    pot_hpo_terms = []
    for mpo_term in mpo_terms:
        if mpo_term not in to_rem_mpo_terms:
            pot_hpo_terms += mpo_associated_hpo_terms[mpo_term]
    pot_hpo_terms = list(set(pot_hpo_terms))
    
    temp_hpo['has_element'] = temp_hpo['all_hpo_terms'].apply(lambda x: check_if_elements_in_list_in_column(x, pot_hpo_terms))
    pot_hpo_ids = temp_hpo[temp_hpo['has_element']]['HPO_ID'].to_list()
    return pot_hpo_ids

# mpo_df['pot_hpo'] = mpo_df['all_mpo_terms'].progress_apply(lambda x: get_pot_hpo(x, mpo_associated_hpo_terms, hpo_df))

result = parallel_apply(mpo_df['all_mpo_terms'], get_pot_hpo, n_jobs=-1)
mpo_df['pot_hpo'] = result

Processing: 100%|██████████| 13708/13708 [13:04<00:00, 17.47it/s]


In [43]:
# Save as json
mpo_df.to_json('results/pot_matches.json', orient='records')

In [10]:
mpo_df = pd.read_json('results/pot_matches.json', orient='records')
mpo_df

Unnamed: 0,MPO_ID,MPO_Name,MPO_Definition,all_mpo_terms,pot_hpo
0,MP:0000001,mammalian phenotype,"the observable morphological, physiological, b...","[mammalian, phenotype]","[HP_0003812, HP_0025354, HP_0031153, HP_003411..."
1,MP:0000003,abnormal adipose tissue morphology,any structural anomaly of the connective tissu...,"[abnormal, adipose, tissue, morphology]","[HP_0000212, HP_0000287, HP_0000291, HP_000029..."
2,MP:0000005,increased brown adipose tissue amount,increased amount of the thermogenic form of ad...,"[increased, brown, adipose, tissue, amount]","[HP_0000212, HP_0000287, HP_0000291, HP_000029..."
3,MP:0000008,increased white adipose tissue amount,increased quantity of fat-storing cells/tissue,"[increased, white, adipose, tissue, amount]","[HP_0000212, HP_0000287, HP_0000291, HP_000029..."
4,MP:0000010,abnormal abdominal fat pad morphology,any structural anomaly of the encapsulated adi...,"[abnormal, abdominal, fat, pad, morphology]","[HP_0000062, HP_0000272, HP_0000287, HP_000029..."
...,...,...,...,...,...
13703,MP:0031464,abnormal spontaneous fetal mouth movement,any anomaly in the spontaneous neuromuscular m...,"[abnormal, spontaneous, fetal, mouth, movement]","[HP_0000153, HP_0000154, HP_0000155, HP_000016..."
13704,MP:3000001,abnormal gastrula morphology,any structural anomaly of the pear shaped tril...,"[abnormal, gastrula, morphology]",[]
13705,MP:3000003,abnormal ebner's gland morphology,any structural anomaly of the serous salivary ...,"[abnormal, ebner's, gland, morphology]",[]
13706,MP:3000004,abnormal nictitating membrane morphology,any structural anomaly of the translucent fold...,"[abnormal, nictitating, membrane, morphology]","[HP_0000145, HP_0000433, HP_0000434, HP_000052..."


## Score with potential matches

In [14]:
def get_levenstein_score(mpo_name, hpo_name):
    score = fuzz.ratio(mpo_name, hpo_name)
    return score

def get_similarity_score(mpo_name, hpo_name, dict_mpo_name=dict_mpo_terms, dict_hpo_name=dict_hpo_terms):
    mpo_words = mpo_name.split(' ')
    hpo_words = hpo_name.split(' ')
    try:
        score_mpo = 0
        for word in mpo_words:
            if word in dict_mpo_name:
                score_mpo += 1/dict_mpo_name[word]
        
        score_hpo = 0
        for word in hpo_words:
            if word in dict_hpo_name:
                score_hpo += 1/dict_hpo_name[word]
        
        score_union = score_mpo + score_hpo

        score_inter = 0
        for word in mpo_words:
            if word in hpo_words:
                score_inter += 1/dict_mpo_name[word]
        for word in hpo_words:
            if word in hpo_words:
                score_inter += 1/dict_hpo_name[word]
        
        score = score_inter/score_union

        return score
    except:
        return 0

In [29]:
mpo_df['info'] = mpo_df.apply(lambda row: (row['pot_hpo'], row['MPO_Name']), axis=1)

def get_hpo_correspondance(mpo_info, hpo_df=hpo_df):
    pot_hpo = mpo_info[0]
    mpo_name = mpo_info[1]

    temp_hpo_df = hpo_df[hpo_df['HPO_ID'].isin(pot_hpo)].copy()
    pot_correspondance = []

    for hpo_id in temp_hpo_df['HPO_ID']:
        hpo = temp_hpo_df[temp_hpo_df['HPO_ID']==hpo_id].iloc[0]
        hpo_name = hpo['HPO_Name']
        hpo_synonyms = hpo['HPO_Synonyms']
        score_levenstein = get_levenstein_score(mpo_name, hpo_name)
        score_similarity = get_similarity_score(mpo_name, hpo_name)
        if score_levenstein == 100 or (score_levenstein > 95 and score_similarity > 0.85) or (score_levenstein > 80 and score_similarity > 0.92):
            pot_correspondance.append((hpo_id, hpo_name, score_levenstein, score_similarity))
        elif hpo_synonyms is not None:
            for synonym in hpo_synonyms:
                score_levenstein = get_levenstein_score(mpo_name, synonym)
                score_similarity = get_similarity_score(mpo_name, synonym)
                if score_levenstein == 100 or (score_levenstein > 95 and score_similarity > 0.85) or (score_levenstein > 70 and score_similarity > 0.92):
                    pot_correspondance.append((hpo_id, hpo_name, score_levenstein, score_similarity))
                    break
    # get unique correspondance
    pot_correspondance = sorted(pot_correspondance, key=lambda x: x[1], reverse=True)

    return pot_correspondance

get_hpo_correspondance(mpo_df.iloc[1]['info'])

[('HP_0009124', 'abnormal adipose tissue morphology', 100, 1.0)]

In [30]:
try:
    result = parallel_apply(mpo_df['info'], get_hpo_correspondance, n_jobs=-1)
    mpo_df['res'] = result
except:
    mpo_df['res'] = mpo_df['info'].progress_apply(lambda x: get_hpo_correspondance(x), axis=1)

Processing: 100%|██████████| 13708/13708 [09:11<00:00, 24.86it/s]


## Format output

### HPO file

In [31]:
mpo_df['hpo_correspondance_id'] = mpo_df['res'].apply(lambda x: [y[0] for y in x])
mpo_df['hpo_correspondance_name'] = mpo_df['res'].apply(lambda x: [y[1] for y in x])
len(mpo_df[mpo_df['hpo_correspondance_id'].apply(lambda x: len(x)) > 0])

3850

In [34]:
mpo_df_match = mpo_df[mpo_df['hpo_correspondance_id'].apply(lambda x: len(x)) > 0].reset_index(drop=True).copy()
mpo_df_match.head()

Unnamed: 0,MPO_ID,MPO_Name,MPO_Definition,all_mpo_terms,pot_hpo,info,res,hpo_correspondance_id,hpo_correspondance_name
0,MP:0000003,abnormal adipose tissue morphology,any structural anomaly of the connective tissu...,"[abnormal, adipose, tissue, morphology]","[HP_0000212, HP_0000287, HP_0000291, HP_000029...","([HP_0000212, HP_0000287, HP_0000291, HP_00002...","[(HP_0009124, abnormal adipose tissue morpholo...",[HP_0009124],[abnormal adipose tissue morphology]
1,MP:0000013,abnormal adipose tissue distribution,alterations in the normal placement of body fat,"[abnormal, adipose, tissue, distribution]","[HP_0000212, HP_0000287, HP_0000291, HP_000029...","([HP_0000212, HP_0000287, HP_0000291, HP_00002...","[(HP_0007552, abnormal subcutaneous fat tissue...",[HP_0007552],[abnormal subcutaneous fat tissue distribution]
2,MP:0000015,abnormal ear pigmentation,anomaly in the coloration of the skin of the o...,"[abnormal, ear, pigmentation]","[HP_0000356, HP_0000357, HP_0000358, HP_000035...","([HP_0000356, HP_0000357, HP_0000358, HP_00003...","[(HP_0001106, periorbital hyperpigmentation, 7...","[HP_0001106, HP_0030790]","[periorbital hyperpigmentation, abnormal cerum..."
3,MP:0000017,big ears,outer ears of a greater than normal size,"[big, ears]","[HP_0000256, HP_0000293, HP_0000303, HP_000035...","([HP_0000256, HP_0000293, HP_0000303, HP_00003...","[(HP_0000256, macrocephaly, 75, 0.960297766749...",[HP_0000256],[macrocephaly]
4,MP:0000018,small ears,outer ears of a smaller than normal size,"[small, ears]","[HP_0000013, HP_0000046, HP_0000050, HP_000005...","([HP_0000013, HP_0000046, HP_0000050, HP_00000...","[(HP_0000385, small earlobe, 83, 0.92393332069...","[HP_0000385, HP_0008551]","[small earlobe, microtia]"


In [58]:
mpo_hpo_dict = mpo_df_match[['MPO_ID', "hpo_correspondance_id"]].set_index('MPO_ID').to_dict()['hpo_correspondance_id']

In [51]:
all_hpo = hpo_df['HPO_ID'].to_list()
hpo_mpo_dict = {}
for hpo in all_hpo:
    hpo_mpo_dict[hpo] = []
for mpo in mpo_hpo_dict:
    for hpo in mpo_hpo_dict[mpo]:
        hpo_mpo_dict[hpo].append(mpo)

In [56]:
hpo_df['mpo_correspondance_id'] = hpo_df['HPO_ID'].apply(lambda x: hpo_mpo_dict[x])
hpo_df['mpo_correspondance_name'] = hpo_df['mpo_correspondance_id'].progress_apply(lambda x: [mpo_df[mpo_df['MPO_ID']==mpo]['MPO_Name'].iloc[0] for mpo in x])

100%|██████████| 17513/17513 [00:06<00:00, 2572.28it/s]


### Final table

In [64]:
result_list1 = [(key, value) for key in mpo_hpo_dict for value in mpo_hpo_dict[key]]
result_list2 = [(key, value) for key in hpo_mpo_dict for value in hpo_mpo_dict[key]]
assert len(result_list1) == len(result_list2)

In [66]:
final_table = pd.DataFrame(result_list1, columns=['MPO_ID', 'HPO_ID'])
final_table = final_table.merge(mpo_df[['MPO_ID', 'MPO_Name']], on='MPO_ID', how='left').merge(hpo_df[['HPO_ID', 'HPO_Name', 'HPO_Synonyms']], on='HPO_ID', how='left')
final_table.head()

Unnamed: 0,MPO_ID,HPO_ID,MPO_Name,HPO_Name,HPO_Synonyms
0,MP:0000003,HP_0009124,abnormal adipose tissue morphology,abnormal adipose tissue morphology,"[abnormality of adipose tissue, abnormality of..."
1,MP:0000013,HP_0007552,abnormal adipose tissue distribution,abnormal subcutaneous fat tissue distribution,[abnormal fat tissue distribution below the skin]
2,MP:0000015,HP_0001106,abnormal ear pigmentation,periorbital hyperpigmentation,"[dark circles around the eyes, dark circles un..."
3,MP:0000015,HP_0030790,abnormal ear pigmentation,abnormal cerumen color,"[abnormal cerumen colour, abnormal cerumen pig..."
4,MP:0000017,HP_0000256,big ears,macrocephaly,"[big calvaria, big cranium, big head, big skul..."


## Save file

In [68]:
# Save as excel
# Save hpo_df and mpo_df to excel
with pd.ExcelWriter('results/hpo_mpo_correspondance.xlsx') as writer:
    hpo_df.to_excel(writer, sheet_name='hpo')
    mpo_df.to_excel(writer, sheet_name='mpo')
    final_table.to_excel(writer, sheet_name='final_table')