This notebook delves into MeSH mapping using two approaches in an attempt to map as much of recognized entities into a MeSH ID:
1. MeSH ID lookup using `mesh_lookup_table.csv` which is produced by `mesh_id_parsing.ipynb`
2. UMLS API which is to be used as a secondary search in case the first match fails; the `MeSH_ID_UMLS_Lookup.ipynb` has code that goes into detail about this approach

# Data

Data load and setup required before MeSH lookups

In [24]:
import pandas as pd
import ast
import time
import requests

In [4]:
# Path for datasets

datapath = '../data/'

In [5]:
# Load datasets

df_train = pd.read_csv(f'{datapath}' + 'OfficialTrainingSet1.csv')
df_val = pd.read_csv(f'{datapath}' + 'OfficialValidationSet1.csv')
df_test = pd.read_csv(f'{datapath}' + 'OfficialTestSet1.csv')

print("Shape of train dataset:", df_train.shape)
print("Shape of validation dataset:", df_val.shape)
print("Shape of test dataset:", df_test.shape)

df_train.head(3)

Shape of train dataset: (500, 13)
Shape of validation dataset: (500, 13)
Shape of test dataset: (500, 13)


Unnamed: 0,article_code,title,abstract,chemicals,diseases,chemical_start_indices,chemical_end_indices,disease_start_indices,disease_end_indices,chemical_ids,disease_ids,CID_chemical,CID_disease
0,227508,Naloxone reverses the antihypertensive effect ...,"In unanesthetized, spontaneously hypertensive ...","['Naloxone', 'clonidine', 'clonidine', 'nalozo...","['hypertensive', 'hypotensive', 'hypertensive'...","['0', '49', '181', '244', '306', '354', '364',...","['8', '58', '190', '252', '322', '362', '372',...","['93', '274', '469', '750']","['105', '285', '481', '762']","['D009270', 'D003000', 'D003000', '-1', 'D0087...","['D006973', 'D007022', 'D006973', 'D006973']",['D008750'],['D007022']
1,354896,Lidocaine-induced cardiac asystole.,Intravenous administration of a single 50-mg b...,"['Lidocaine', 'lidocaine', 'lidocaine']","['cardiac asystole', 'depression', 'bradyarrhy...","['0', '90', '409']","['9', '99', '418']","['18', '142', '331']","['34', '152', '347']","['D008012', 'D008012', 'D008012']","['D006323', 'D003866', 'D001919']",['D008012'],['D006323']
2,435349,Suxamethonium infusion rate and observed fasci...,Suxamethonium chloride (Sch) was administered ...,"['Suxamethonium', 'Suxamethonium chloride', 'S...","['fasciculations', 'tetanic', 'Fasciculations'...","['0', '80', '104', '312']","['13', '102', '107', '315']","['41', '265', '395', '483', '523', '538', '561...","['55', '272', '409', '496', '536', '544', '568...","['D013390', 'D013390', 'D013390', 'D013390']","['D005207', 'D013746', 'D005207', 'D005207', '...",['D013390'],['D005207']


In [6]:
# Data transformation functions

def convert_col_to_list(string):
    """
    Converts all string columns that look like lists (col index 3 to end) into actual lists 
    """
    return ast.literal_eval(string)


def lowercase_cols(lst):
    """
    Converts chemicals and diseases column to lowercase
    """
    return [item.lower() for item in lst]


def map_cid_to_chemical_name(row):
    """
    Maps CID of chemical in the CID_chemical column into the actual name of the chemical
    """
    cid_chemicals = row['CID_chemical']
    chemical_ids = row['chemical_ids']
    chemicals = row['chemicals']
    
    chemical_names = []
    
    for cid in cid_chemicals:
        if cid in chemical_ids:
            idx = chemical_ids.index(cid)
            chemical_names.append(chemicals[idx])
        else:
            chemical_names.append('unknown')
    
    return chemical_names


def map_cid_to_disease_name(row):
    """
    Maps CID of disease in the CID_disease column into the actual name of the disease
    """
    cid_diseases = row['CID_disease']
    disease_ids = row['disease_ids']
    diseases = row['diseases']
    
    disease_names = []
    
    for cid in cid_diseases:
        if cid in disease_ids:
            idx = disease_ids.index(cid) 
            disease_names.append(diseases[idx]) 
        else:
            disease_names.append('unknown')
    
    return disease_names


# Function to handle "unknown" for chemical names
def map_cid_to_chemical_name_unknown(data):
    '''
    Addresses 'unknown' instances of CID_chemical_names caused by chemicals with pipe (|) notation
    '''
    chemical_id_map = {}
    for i, row in data.iterrows():
        for cid, chemical in zip(row['chemical_ids'], row['chemicals']):
            chemical_id_map[cid] = chemical
    
    # Function to map "unknown" to the correct chemical name if possible
    def resolve_unknown_chemical_name(cids):
        names = []
        for cid in cids:
            # Split combined IDs (separated by '|') and check for matches in the map
            split_ids = cid.split('|')
            name = ' | '.join([chemical_id_map.get(split_id, 'unknown') for split_id in split_ids])
            names.append(name)
        return names

    # Apply the function only to rows where CID_chemical_name has "unknown"
    data['CID_chemical_name'] = data.apply(lambda row: resolve_unknown_chemical_name(row['CID_chemical']) 
                                       if 'unknown' in row['CID_chemical_name'] else row['CID_chemical_name'], axis=1)
    return data

# Function to handle "Unknown" for disease names
def map_cid_to_disease_name_unknown(data):
    '''
    Addresses 'unknown' instances of CID_disease_names caused by diseases with pipe (|) notation
    '''
    disease_id_map = {}
    for i, row in data.iterrows():
        for cid, disease in zip(row['disease_ids'], row['diseases']):
            disease_id_map[cid] = disease
    
    # Function to map "unknown" to the correct disease name if possible
    def resolve_unknown_disease_name(cids):
        names = []
        for cid in cids:
            # Split combined IDs (separated by '|') and check for matches in the map
            split_ids = cid.split('|')
            name = ' | '.join([disease_id_map.get(split_id, 'unknown') for split_id in split_ids])
            names.append(name)
        return names

    # Apply the function only to rows where CID_disease_name has "Unknown"
    data['CID_disease_name'] = data.apply(lambda row: resolve_unknown_disease_name(row['CID_disease']) 
                                      if 'unknown' in row['CID_disease_name'] else row['CID_disease_name'], axis=1)
    return data

In [9]:
# Apply the data transformations functions to all three datasets

list_columns = ['chemicals', 'diseases', 'chemical_ids', 'disease_ids', 'CID_chemical', 'CID_disease']
for col in list_columns:
    df_train[col] = df_train[col].apply(convert_col_to_list) 
    df_val[col] = df_val[col].apply(convert_col_to_list) 
    df_test[col] = df_test[col].apply(convert_col_to_list) 

df_train['chemicals'] = df_train['chemicals'].apply(lowercase_cols)
df_train['diseases'] = df_train['diseases'].apply(lowercase_cols)
df_val['chemicals'] = df_val['chemicals'].apply(lowercase_cols)
df_val['diseases'] = df_val['diseases'].apply(lowercase_cols)
df_test['chemicals'] = df_test['chemicals'].apply(lowercase_cols)
df_test['diseases'] = df_test['diseases'].apply(lowercase_cols)

df_train['CID_chemical_name'] = df_train.apply(map_cid_to_chemical_name, axis=1)
df_train['CID_disease_name'] = df_train.apply(map_cid_to_disease_name, axis=1)
df_val['CID_chemical_name'] = df_val.apply(map_cid_to_chemical_name, axis=1)
df_val['CID_disease_name'] = df_val.apply(map_cid_to_disease_name, axis=1)
df_test['CID_chemical_name'] = df_test.apply(map_cid_to_chemical_name, axis=1)
df_test['CID_disease_name'] = df_test.apply(map_cid_to_disease_name, axis=1)

df_train = map_cid_to_chemical_name_unknown(df_train)
df_train = map_cid_to_disease_name_unknown(df_train)
df_val = map_cid_to_chemical_name_unknown(df_val)
df_val = map_cid_to_disease_name_unknown(df_val)
df_test = map_cid_to_chemical_name_unknown(df_test)
df_test = map_cid_to_disease_name_unknown(df_test)

df_train.head(3)

Unnamed: 0,article_code,title,abstract,chemicals,diseases,chemical_start_indices,chemical_end_indices,disease_start_indices,disease_end_indices,chemical_ids,disease_ids,CID_chemical,CID_disease,CID_chemical_name,CID_disease_name
0,227508,Naloxone reverses the antihypertensive effect ...,"In unanesthetized, spontaneously hypertensive ...","[naloxone, clonidine, clonidine, nalozone, alp...","[hypertensive, hypotensive, hypertensive, hype...","['0', '49', '181', '244', '306', '354', '364',...","['8', '58', '190', '252', '322', '362', '372',...","['93', '274', '469', '750']","['105', '285', '481', '762']","[D009270, D003000, D003000, -1, D008750, D0092...","[D006973, D007022, D006973, D006973]",[D008750],[D007022],[alpha-methyldopa],[hypotensive]
1,354896,Lidocaine-induced cardiac asystole.,Intravenous administration of a single 50-mg b...,"[lidocaine, lidocaine, lidocaine]","[cardiac asystole, depression, bradyarrhythmias]","['0', '90', '409']","['9', '99', '418']","['18', '142', '331']","['34', '152', '347']","[D008012, D008012, D008012]","[D006323, D003866, D001919]",[D008012],[D006323],[lidocaine],[cardiac asystole]
2,435349,Suxamethonium infusion rate and observed fasci...,Suxamethonium chloride (Sch) was administered ...,"[suxamethonium, suxamethonium chloride, sch, sch]","[fasciculations, tetanic, fasciculations, fasc...","['0', '80', '104', '312']","['13', '102', '107', '315']","['41', '265', '395', '483', '523', '538', '561...","['55', '272', '409', '496', '536', '544', '568...","[D013390, D013390, D013390, D013390]","[D005207, D013746, D005207, D005207, D005207, ...",[D013390],[D005207],[suxamethonium],[fasciculations]


# MeSH ID Lookup Table Approach

MeSH database download: https://www.nlm.nih.gov/databases/download/mesh.html

2024 Descriptor MeSH XML: https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/desc2024.xml

In [10]:
# MeSH Lookup Table
mesh_lookup_df = pd.read_csv('mesh_lookup_table.csv')

In [15]:
# MeSH Lookup Table reversed so it finds MeSH ID from entity

mesh_lookup_expanded = mesh_lookup_df.assign(names=mesh_lookup_df['Names/Entry Terms'].str.split('|')).explode('names')
mesh_lookup_expanded['names'] = mesh_lookup_expanded['names'].str.strip().str.lower()
term_to_mesh = pd.Series(mesh_lookup_expanded['MeSH ID'].values, index=mesh_lookup_expanded['names']).to_dict()
term_to_mesh

{'calcimycin': 'D000001',
 '4-benzoxazolecarboxylic acid, 5-(methylamino)-2-((3,9,11-trimethyl-8-(1-methyl-2-oxo-2-(1h-pyrrol-2-yl)ethyl)-1,7-dioxaspiro(5.5)undec-2-yl)methyl)-, (6s-(6alpha(2s*,3s*),8beta(r*),9beta,11alpha))-': 'D000001',
 'a-23187': 'D000001',
 'a 23187': 'D000001',
 'antibiotic a23187': 'D000001',
 'a23187, antibiotic': 'D000001',
 'a23187': 'D000001',
 'temefos': 'D000002',
 'temephos': 'D000002',
 'difos': 'D000002',
 'abate': 'D000002',
 'abattoirs': 'D000003',
 'abattoir': 'D000003',
 'slaughterhouses': 'D000003',
 'slaughterhouse': 'D000003',
 'slaughter houses': 'D000003',
 'house, slaughter': 'D000003',
 'houses, slaughter': 'D000003',
 'slaughter house': 'D000003',
 'abbreviations as topic': 'D000004',
 'acronyms as topic': 'D000004',
 'abdomen': 'D000005',
 'abdomens': 'D000005',
 'abdomen, acute': 'D000006',
 'abdomens, acute': 'D000006',
 'acute abdomen': 'D000006',
 'acute abdomens': 'D000006',
 'abdominal injuries': 'D000007',
 'injuries, abdominal': 'D0

In [16]:
# Function to map terms to MeSH IDs
def get_mesh_id(terms, lookup):
    mesh_ids = []
    for term in terms:
        term = term.lower()
        mesh_id = lookup.get(term, '-1') # Returning -1 if no match is found
        mesh_ids.append(mesh_id)
    return mesh_ids

In [17]:
# Apply the function to the chemicals and diseases columns to get their MeSH IDs
df_train['mapped_chemical_ids'] = df_train['chemicals'].apply(lambda x: get_mesh_id(x, term_to_mesh))
df_train['mapped_disease_ids'] = df_train['diseases'].apply(lambda x: get_mesh_id(x, term_to_mesh))

In [18]:
def get_mismatched_chemicals(row):
    mismatches = []
    for chemical, mapped_id in zip(row['chemicals'], row['mapped_chemical_ids']):
        if mapped_id == '-1':
            mismatches.append(chemical)
    return mismatches

df_train['mismatched_chemicals'] = df_train.apply(get_mismatched_chemicals, axis=1)

mismatch_individual_chemicals = df_train[df_train['mismatched_chemicals'].apply(lambda x: len(x) > 0)]
mismatch_individual_chemicals[['chemicals', 'chemical_ids', 'mapped_chemical_ids', 'mismatched_chemicals']]

Unnamed: 0,chemicals,chemical_ids,mapped_chemical_ids,mismatched_chemicals
0,"[naloxone, clonidine, clonidine, nalozone, alp...","[D009270, D003000, D003000, -1, D008750, D0092...","[D009270, D003000, D003000, -1, D008750, D0092...","[nalozone, 3h-naloxone, 3h-dihydroergocryptine]"
2,"[suxamethonium, suxamethonium chloride, sch, sch]","[D013390, D013390, D013390, D013390]","[D013390, D013390, -1, -1]","[sch, sch]"
4,"[lithium, lithium, lithium, lithium, lithium, ...","[D008094, D008094, D008094, D008094, D008094, ...","[D008094, D008094, D008094, D008094, D008094, ...","[li, li, li]"
10,"[chloroacetaldehyde, cyclophosphamide, ifosfam...","[C004656, D003520, D007069, C004656, C004656, ...","[-1, D003520, D007069, -1, -1, -1, -1, -1, D01...","[chloroacetaldehyde, chloroacetaldehyde, caa, ..."
12,"[clotiazepam, clotiazepam, thienodiazepine, cl...","[C084599, C084599, C013295, C084599, D001569, ...","[-1, -1, -1, -1, D001569, -1, -1, -1, D001569]","[clotiazepam, clotiazepam, thienodiazepine, cl..."
...,...,...,...,...
489,"[lamivudine, hepatitis-b surface antigen, hbsa...","[D019259, D006514, D006514, D019259, D019259, ...","[D019259, -1, D006514, D019259, D019259, D0192...","[hepatitis-b surface antigen, lamivudin]"
490,"[ginsenoside rg1, morphine, rg1, ginsenoside, ...","[C035054, D009020, C035054, D036145, C035054, ...","[-1, D009020, -1, D036145, -1, D009020, -1, D0...","[ginsenoside rg1, rg1, rg1, rg1, rg1, rg1, rg1..."
494,"[gum arabic, gentamicin, gum arabic, gentamici...","[D006170, D005839, D006170, D005839, D005839, ...","[D006170, D005839, D006170, D005839, -1, D0061...","[gm, gm, gsh, gm, gm, gsh, gm, gm, gm, gm, gm]"
496,"[tyrosine, pan, tyrosine, puromycin aminonucle...","[D014443, D011692, D014443, D011692, D011692, ...","[D014443, -1, D014443, D011692, D011692, -1, -...","[pan, pan, pan, pan]"


In [19]:
id_to_names = mesh_lookup_expanded.groupby('MeSH ID')['names'].apply(lambda x: list(x)).to_dict()

def get_actual_chemical_names(mismatched_chemicals, chemical_ids):
    actual_names = []
    for chem, chem_id in zip(mismatched_chemicals, chemical_ids):
        if chem_id != '-1' and chem_id in id_to_names:
            actual_names.append(id_to_names[chem_id])
        else:
            actual_names.append(['-1'])  # In case we can't find a match
    return actual_names

mismatch_individual_chemicals['actual_chemical_names'] = mismatch_individual_chemicals.apply(
    lambda row: get_actual_chemical_names(row['mismatched_chemicals'], row['chemical_ids']), axis=1)

mismatch_individual_chemicals[['chemicals', 'chemical_ids', 'mapped_chemical_ids', 'mismatched_chemicals', 'actual_chemical_names']]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mismatch_individual_chemicals['actual_chemical_names'] = mismatch_individual_chemicals.apply(


Unnamed: 0,chemicals,chemical_ids,mapped_chemical_ids,mismatched_chemicals,actual_chemical_names
0,"[naloxone, clonidine, clonidine, nalozone, alp...","[D009270, D003000, D003000, -1, D008750, D0092...","[D009270, D003000, D003000, -1, D008750, D0092...","[nalozone, 3h-naloxone, 3h-dihydroergocryptine]","[[naloxone, naloxone, narcan, narcanti, naloxo..."
2,"[suxamethonium, suxamethonium chloride, sch, sch]","[D013390, D013390, D013390, D013390]","[D013390, D013390, -1, -1]","[sch, sch]","[[succinylcholine, succinylcholine, dicholine ..."
4,"[lithium, lithium, lithium, lithium, lithium, ...","[D008094, D008094, D008094, D008094, D008094, ...","[D008094, D008094, D008094, D008094, D008094, ...","[li, li, li]","[[lithium, lithium, lithium-7, lithium 7], [li..."
10,"[chloroacetaldehyde, cyclophosphamide, ifosfam...","[C004656, D003520, D007069, C004656, C004656, ...","[-1, D003520, D007069, -1, -1, -1, -1, -1, D01...","[chloroacetaldehyde, chloroacetaldehyde, caa, ...","[[-1], [cyclophosphamide, cyclophosphamide, cy..."
12,"[clotiazepam, clotiazepam, thienodiazepine, cl...","[C084599, C084599, C013295, C084599, D001569, ...","[-1, -1, -1, -1, D001569, -1, -1, -1, D001569]","[clotiazepam, clotiazepam, thienodiazepine, cl...","[[-1], [-1], [-1], [-1], [benzodiazepines, ben..."
...,...,...,...,...,...
489,"[lamivudine, hepatitis-b surface antigen, hbsa...","[D019259, D006514, D006514, D019259, D019259, ...","[D019259, -1, D006514, D019259, D019259, D0192...","[hepatitis-b surface antigen, lamivudin]","[[lamivudine, lamivudine, 2',3'-dideoxy-3'-thi..."
490,"[ginsenoside rg1, morphine, rg1, ginsenoside, ...","[C035054, D009020, C035054, D036145, C035054, ...","[-1, D009020, -1, D036145, -1, D009020, -1, D0...","[ginsenoside rg1, rg1, rg1, rg1, rg1, rg1, rg1...","[[-1], [morphine, morphine, morphia, morphine ..."
494,"[gum arabic, gentamicin, gum arabic, gentamici...","[D006170, D005839, D006170, D005839, D005839, ...","[D006170, D005839, D006170, D005839, -1, D0061...","[gm, gm, gsh, gm, gm, gsh, gm, gm, gm, gm, gm]","[[gum arabic, gum arabic, arabic, gum, acacia ..."
496,"[tyrosine, pan, tyrosine, puromycin aminonucle...","[D014443, D011692, D014443, D011692, D011692, ...","[D014443, -1, D014443, D011692, D011692, -1, -...","[pan, pan, pan, pan]","[[tyrosine, tyrosine, l-tyrosine, l tyrosine, ..."


In [20]:
def get_mismatched_diseases(row):
    mismatches = []
    for disease, mapped_id in zip(row['diseases'], row['mapped_disease_ids']):
        if mapped_id == '-1':
            mismatches.append(disease)
    return mismatches

df_train['mismatched_diseases'] = df_train.apply(get_mismatched_diseases, axis=1)

mismatch_individual_diseases = df_train[df_train['mismatched_diseases'].apply(lambda x: len(x) > 0)]
mismatch_individual_diseases[['diseases', 'disease_ids', 'mapped_disease_ids', 'mismatched_diseases']]

Unnamed: 0,diseases,disease_ids,mapped_disease_ids,mismatched_diseases
0,"[hypertensive, hypotensive, hypertensive, hype...","[D006973, D007022, D006973, D006973]","[-1, -1, -1, -1]","[hypertensive, hypotensive, hypertensive, hype..."
1,"[cardiac asystole, depression, bradyarrhythmias]","[D006323, D003866, D001919]","[-1, D003863, D001919]",[cardiac asystole]
2,"[fasciculations, tetanic, fasciculations, fasc...","[D005207, D013746, D005207, D005207, D005207, ...","[D005207, -1, D005207, D005207, D005207, -1, D...","[tetanic, twitch]"
3,[overdosage],[D062787],[-1],[overdosage]
4,"[chronic renal failure, nephropathy, renal fai...","[D007676, D007674, D051437, D011507, D006973, ...","[D007676, -1, D051437, D011507, D006973, -1, D...","[nephropathy, glomerulosclerosis, nephropathy]"
...,...,...,...,...
493,"[acute psychosis, psychosis, epileptic, psycho...","[D011605, D011605, D004827, D011605, D014277, ...","[-1, D011618, -1, D011618, D014277, -1, -1, D0...","[acute psychosis, epileptic, psychotic symptom..."
494,"[nephrotoxicity, acute renal failure, nephroto...","[D007674, D058186, D007674, D007674, D007683, ...","[-1, D058186, -1, -1, -1, -1, D007676]","[nephrotoxicity, nephrotoxicity, nephrotoxicit..."
496,"[nephrosis, glomerular injury, nephrosis, prot...","[D009401, D007674, D009401, D011507, D011507, ...","[D009401, -1, D009401, D011507, D011507, -1]","[glomerular injury, glomerulosclerosis]"
497,"[aplastic anemia, aplastic anemia, agranulocyt...","[D000741, D000741, D000380, D001855, D000741]","[D000741, D000741, D000380, -1, D000741]",[bone marrow suppression]


In [21]:
id_to_names = mesh_lookup_expanded.groupby('MeSH ID')['names'].apply(lambda x: list(x)).to_dict()

def get_actual_disease_names(mismatched_diseases, disease_ids):
    actual_names = []
    for dis, dis_id in zip(mismatched_diseases, disease_ids):
        if dis_id != '-1' and dis_id in id_to_names:
            actual_names.append(id_to_names[dis_id])
        else:
            actual_names.append(['-1'])  # In case we can't find a match
    return actual_names

mismatch_individual_diseases['actual_disease_names'] = mismatch_individual_diseases.apply(
    lambda row: get_actual_disease_names(row['mismatched_diseases'], row['disease_ids']), axis=1)

mismatch_individual_diseases[['diseases', 'disease_ids', 'mapped_disease_ids', 'mismatched_diseases', 'actual_disease_names']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mismatch_individual_diseases['actual_disease_names'] = mismatch_individual_diseases.apply(


Unnamed: 0,diseases,disease_ids,mapped_disease_ids,mismatched_diseases,actual_disease_names
0,"[hypertensive, hypotensive, hypertensive, hype...","[D006973, D007022, D006973, D006973]","[-1, -1, -1, -1]","[hypertensive, hypotensive, hypertensive, hype...","[[hypertension, hypertension, blood pressure, ..."
1,"[cardiac asystole, depression, bradyarrhythmias]","[D006323, D003866, D001919]","[-1, D003863, D001919]",[cardiac asystole],"[[heart arrest, heart arrest, arrest, heart, a..."
2,"[fasciculations, tetanic, fasciculations, fasc...","[D005207, D013746, D005207, D005207, D005207, ...","[D005207, -1, D005207, D005207, D005207, -1, D...","[tetanic, twitch]","[[fasciculation, fasciculation, fasciculations..."
3,[overdosage],[D062787],[-1],[overdosage],"[[drug overdose, drug overdose, drug overdoses..."
4,"[chronic renal failure, nephropathy, renal fai...","[D007676, D007674, D051437, D011507, D006973, ...","[D007676, -1, D051437, D011507, D006973, -1, D...","[nephropathy, glomerulosclerosis, nephropathy]","[[kidney failure, chronic, kidney failure, chr..."
...,...,...,...,...,...
493,"[acute psychosis, psychosis, epileptic, psycho...","[D011605, D011605, D004827, D011605, D014277, ...","[-1, D011618, -1, D011618, D014277, -1, -1, D0...","[acute psychosis, epileptic, psychotic symptom...","[[psychoses, substance-induced, psychoses, sub..."
494,"[nephrotoxicity, acute renal failure, nephroto...","[D007674, D058186, D007674, D007674, D007683, ...","[-1, D058186, -1, -1, -1, -1, D007676]","[nephrotoxicity, nephrotoxicity, nephrotoxicit...","[[kidney diseases, kidney diseases, disease, k..."
496,"[nephrosis, glomerular injury, nephrosis, prot...","[D009401, D007674, D009401, D011507, D011507, ...","[D009401, -1, D009401, D011507, D011507, -1]","[glomerular injury, glomerulosclerosis]","[[nephrosis, nephrosis, nephroses], [kidney di..."
497,"[aplastic anemia, aplastic anemia, agranulocyt...","[D000741, D000741, D000380, D001855, D000741]","[D000741, D000741, D000380, -1, D000741]",[bone marrow suppression],"[[anemia, aplastic, anemia, aplastic, aplastic..."


In [23]:
def calculate_unmatched_percentage(mapped_ids_column):
    # Flatten the list of lists for easier counting
    flat_mapped_ids = [item for sublist in mapped_ids_column for item in sublist]
    
    # Total number of IDs
    total_count = len(flat_mapped_ids)
    
    # Count how many are '-1'
    unmatched_count = flat_mapped_ids.count('-1')
    
    # Calculate the percentage
    if total_count == 0:
        return 0  # Prevent division by zero
    return (unmatched_count / total_count) * 100

# Calculate the percentage of '-1' mappings for chemicals and diseases
chemical_unmatched_percentage = calculate_unmatched_percentage(df_train['mapped_chemical_ids'])

print(f"Percentage of '-1' mappings among all individual chemicals: {chemical_unmatched_percentage:.2f}%")

disease_unmatched_percentage = calculate_unmatched_percentage(df_train['mapped_disease_ids'])

print(f"Percentage of '-1' mappings among all individual diseases: {disease_unmatched_percentage:.2f}%")


Percentage of '-1' mappings among all individual chemicals: 24.57%
Percentage of '-1' mappings among all individual diseases: 38.03%


# UMLS API Lookup Approach

**UMLS page:**
- Create account at https://www.nlm.nih.gov/research/umls/index.html.  Using Gmail is sufficient but need to sign up first.
- 1 - 2 day wait before getting approval usually, API key comes in by default.

**API instructions:**
- https://documentation.uts.nlm.nih.gov/rest/home.html


**API Terms of Use:**
- See: https://documentation.uts.nlm.nih.gov/terms-of-service.html
- In order to avoid overloading our servers, NLM requires that users **send no more than 20 requests per second per IP address**.
- Requests that exceed this limit may not be serviced, and **service will not be restored until the request rate falls beneath the limit**.
- To limit the number of requests that you send to the APIs, NLM **recommends caching results for a 12-24 hour period**. 

In [26]:
API_key = '08e3b919-7434-4261-b41a-c5d67a6bd95d' #ENTER YOUR KEY HERE

In [27]:
def UMLS_retrieve_CUID(entries, API_key, wait_time = 0.1, verbose = False, partial = False):
    '''
    Function to retrieve CUID from UMLS.  Extracts simply the first returned entry at the moment.
    
    Inputs:
    entries: string (single entity) or list of chemical/disease names (single or multiple entities)
    API_key: UMLS api key, string format
    wait_time: wait_time between each call to the API, defaulted to 0.1 which leaves some buffer for the 
               20 requests per second per IP address cap
    verbose: whether to provide additional info for returned entries.
    partial: whether to consider partial matches from UMLS. #### NOT YET IMPLEMENTED 
    
    Returns: 
    CUIDs in the original format (list or string)    
    '''
    base_template = 'https://uts-ws.nlm.nih.gov/rest'
    CUID_template = '/search/current?string='
    CUID_page = '/content/current/CUI/'
    API_template = '&apiKey=' + API_key
    
    assert type(entries) == str or type(entries) == list, f"Search term(s) for entry {entries} should be string or list."
    
    string_entered = False
    
    if type(entries) == str:
        entries = [entries]
        string_entered = True
        
    CUIDs = []
    
    for entry in entries:
        CUID_query = base_template + CUID_template + entry + API_template
        CUID_response = requests.get(CUID_query)
        
        assert CUID_response.status_code == 200, f"Error in calling API for entry {entry}, please check connection or API key."
        
        CUID_pages = CUID_response.json()
        
        if CUID_pages['result']['recCount'] == 0:
            if not partial:
                CUID = '-1'
                if verbose:
                    print(f"No complete match found for entry {entry}")
        
        if CUID_pages['result']['recCount'] > 0:       
            try:
                CUID = CUID_pages['result']['results'][0]['uri'][(CUID_pages['result']['results'][0]['uri'].find('CUI') + 4):]
            except:
                print(f"Error in reading the returned JSON on the CUID query for entry {entry}.  Call format may have changed?")
            
        CUIDs.append(CUID)
        
        time.sleep(wait_time)
        
    assert len(CUIDs) == len(entries), f"The returned number of CUIDs is {len(CUIDs)}, less than that of the inputs ({len(entries)})"
        
    if string_entered:
        CUIDs = str(CUIDs[0])
        
    return CUIDs   

In [28]:
def UMLS_retrieve_MeSHID(entries, entity_type, API_key, wait_time = 0.1001, verbose = False, partial = False):
    '''
    Function to retrieve MeSH ID from UMLS.  Extracts simply the first returned "suitable" entry at the moment.
    
    Specifically, suitable means:
    (i)   the termType is "MH"
    (ii)  "MSH" is found in the descriptor link
    (iii) phrase similar to "/D" is found in the descriptor link
    (iv)  semantic type of the CUID page matches that of the declared entity type
    
    Inputs:
    entries: string (single entity) or list of CUIDs (single or multiple entities),
             presumably coming from the "UMLS_retrieve_CUID" function
    entity_type: 'chemical' or 'disease'
    API_key: UMLS api key, string format
    wait_time: wait_time between each call to the API, defaulted to 0.1 which leaves some buffer for the 
               20 requests per second per IP address cap
    verbose: whether to provide additional info for returned entries.
    partial: whether to consider partial matches from UMLS. #### NOT YET IMPLEMENTED 
    
    Returns: 
    CUIDs in the original format (list or string)    
    '''
    base_template = 'https://uts-ws.nlm.nih.gov/rest'
    CUID_template = '/search/current?string='
    CUID_page = '/content/current/CUI/'
    API_template = '&apiKey=' + API_key
    ## manual selection from https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt
    chemical_typings = ['amino acid, peptide, or protein',
                        'amino acid sequence',
                        'antibiotic',
                        'biologically active substance',
                        'body substance',
                        'chemical',
                        'chemical viewed functionally',
                        'chemical viewed structurally',
                        'clinical drug',
                        'carbohydrate sequence',
                        'element, ion, or isotope',
                        'enzyme',
                        'hazardous or poisonous substance',
                        'hormone',
                        'inorganic chemical',
                        'indicator, reagent, or diagnostic aid',
                        'molecular sequence',
                        'nucleic acid, nucleoside, or nucleotide',
                        'nucleotide sequence',
                        'organic chemical',
                        'pharmacologic substance',
                        'plant',
                        'substance',
                        'vitamin']
    ## manual selection from https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt
    disease_typings = ['acquired abnormality',
                       'anatomical abnormality',
                       'bacterium',
                       'congenital abnormality',
                       'clinical attribute',
                       'cell or molecular dysfunction',
                       'disease or syndrome',
                       'experimental model of disease',
                       'event', ## "Fungus",
                       'injury or poisoning',
                       'mental or behavioral dysfunction',
                       'pathologic function', ## Patient or Disabled Group,
                       'sign or symptom',
                       'virus']
    
    assert type(entries) == str or type(entries) == list, f"Search CUIDs for entry {entries} should be string or list."
    assert entity_type in ['chemical', 'disease'], f"Entity type must be 'chemical' or 'disease' for entry {entries}"
    
    string_entered = False
    
    if type(entries) == str:
        entries = [entries]
        string_entered = True
        
    MeSH_IDs = []
    
    for entry in entries:
        ## entry of -1 gives MeSH ID of -1
        if entry == "-1":
            MeSH_ID = "-1"
            MeSH_IDs.append(MeSH_ID)
            continue
            
        ## Checking entity type
        CUID_info_query = base_template + CUID_page + entry + '?' + API_template[1:]
        CUID_info = requests.get(CUID_info_query)
        
        assert CUID_info.status_code == 200, f"Error in calling API for entry {entry}, please check connection or API key."
        semantic_type = CUID_info.json()['result']['semanticTypes'][0]['name']
        
        if entity_type == 'chemical' and semantic_type.lower() not in chemical_typings:
            MeSH_ID = "-1"
            MeSH_IDs.append(MeSH_ID)
            continue
        ## for disease, wonder what are the possible types?  
        ## There are terms like "abnormality", "dysfunction" and "injury or poisoning"
        ## See:https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/SemanticTypes_2018AB.txt
        elif entity_type == 'disease' and semantic_type.lower() not in disease_typings:
            MeSH_ID = "-1"
            MeSH_IDs.append(MeSH_ID)
            continue
            
        else:
            MeSHID_query = base_template + CUID_page + entry + '/atoms?' + API_template[1:]
            MeSHID_response = requests.get(MeSHID_query)
            
            found = False
            
            assert MeSHID_response.status_code == 200, f"Error in calling API for entry {entry}, please check connection or API key."

            ## Looping through the results to find the first entry with 'MH'
            MeSH_results = MeSHID_response.json()['result']

            for result in MeSH_results:
                if result['termType'] == 'MH' or 'MSH':
                    concept_url = result['sourceDescriptor']
                    if '/D' in concept_url and 'MSH' in concept_url:
                        MeSH_ID = concept_url[concept_url.find('/D') + 1:]
                        MeSH_IDs.append(MeSH_ID)
                        found =  True
                        break
                        
            if not found:
                MeSH_ID = "Not found"
                MeSH_IDs.append(MeSH_ID)
        
        time.sleep(wait_time)
    
    assert len(MeSH_IDs) == len(entries), f"The returned number of MeSH IDs is {len(MeSH_IDs)}, less than that of the inputs ({len(entries)})"
        
    if string_entered:
        MeSH_IDs = str(MeSH_IDs[0])
        
    return MeSH_IDs

In [38]:
# Example
entries = df_train['chemicals'][0]

CUID_testcases = UMLS_retrieve_CUID(entries, API_key)
MeSH_testcases = UMLS_retrieve_MeSHID(CUID_testcases, 'chemical', API_key)

MeSH_testcases

['D009270',
 'D003000',
 'D003000',
 '-1',
 'D008750',
 'D009270',
 'D009270',
 'D003000',
 '-1',
 'D009270',
 'D003000',
 '-1',
 'D009270',
 'D003000',
 'D003000',
 'D008750']

# Testing UMLS Approach to Mismatched Entities

In [36]:

mismatch_individual_chemicals

Unnamed: 0,article_code,title,abstract,chemicals,diseases,chemical_start_indices,chemical_end_indices,disease_start_indices,disease_end_indices,chemical_ids,disease_ids,CID_chemical,CID_disease,CID_chemical_name,CID_disease_name,mapped_chemical_ids,mapped_disease_ids,mismatched_chemicals,actual_chemical_names
0,227508,Naloxone reverses the antihypertensive effect ...,"In unanesthetized, spontaneously hypertensive ...","[naloxone, clonidine, clonidine, nalozone, alp...","[hypertensive, hypotensive, hypertensive, hype...","['0', '49', '181', '244', '306', '354', '364',...","['8', '58', '190', '252', '322', '362', '372',...","['93', '274', '469', '750']","['105', '285', '481', '762']","[D009270, D003000, D003000, -1, D008750, D0092...","[D006973, D007022, D006973, D006973]",[D008750],[D007022],[alpha-methyldopa],[hypotensive],"[D009270, D003000, D003000, -1, D008750, D0092...","[-1, -1, -1, -1]","[nalozone, 3h-naloxone, 3h-dihydroergocryptine]","[[naloxone, naloxone, narcan, narcanti, naloxo..."
2,435349,Suxamethonium infusion rate and observed fasci...,Suxamethonium chloride (Sch) was administered ...,"[suxamethonium, suxamethonium chloride, sch, sch]","[fasciculations, tetanic, fasciculations, fasc...","['0', '80', '104', '312']","['13', '102', '107', '315']","['41', '265', '395', '483', '523', '538', '561...","['55', '272', '409', '496', '536', '544', '568...","[D013390, D013390, D013390, D013390]","[D005207, D013746, D005207, D005207, D005207, ...",[D013390],[D005207],[suxamethonium],[fasciculations],"[D013390, D013390, -1, -1]","[D005207, -1, D005207, D005207, D005207, -1, D...","[sch, sch]","[[succinylcholine, succinylcholine, dicholine ..."
4,1378968,Effects of uninephrectomy and high protein fee...,Rats with lithium-induced nephropathy were sub...,"[lithium, lithium, lithium, lithium, lithium, ...","[chronic renal failure, nephropathy, renal fai...","['54', '111', '362', '520', '581', '608', '632...","['61', '118', '369', '527', '588', '615', '639...","['70', '127', '309', '975', '1000', '1027', '1...","['91', '138', '322', '986', '1012', '1045', '1...","[D008094, D008094, D008094, D008094, D008094, ...","[D007676, D007674, D051437, D011507, D006973, ...","[D008094, D008094, D008094]","[D006973, D011507, D007676]","[lithium, lithium, lithium]","[hypertension, proteinuria, chronic renal fail...","[D008094, D008094, D008094, D008094, D008094, ...","[D007676, -1, D051437, D011507, D006973, -1, D...","[li, li, li]","[[lithium, lithium, lithium-7, lithium 7], [li..."
10,2505783,Chloroacetaldehyde and its contribution to uro...,"Based on clinical data, indicating that chloro...","[chloroacetaldehyde, cyclophosphamide, ifosfam...","[hemorrhagic cystitis, bladder damage]","['0', '77', '97', '192', '212', '349', '423', ...","['18', '93', '107', '210', '215', '352', '426'...","['375', '476']","['395', '490']","[C004656, D003520, D007069, C004656, C004656, ...","[D006470|D003556, D001745]","[C004656, C004656]","[D003556, D006470]","[chloroacetaldehyde, chloroacetaldehyde]","[cystitis, bleeding]","[-1, D003520, D007069, -1, -1, -1, -1, -1, D01...","[D000096722, -1]","[chloroacetaldehyde, chloroacetaldehyde, caa, ...","[[-1], [cyclophosphamide, cyclophosphamide, cy..."
12,2572625,Clotiazepam-induced acute hepatitis.,We report the case of a patient who developed ...,"[clotiazepam, clotiazepam, thienodiazepine, cl...","[hepatitis, hepatitis, extensive hepatocellula...","['0', '185', '200', '228', '314', '353', '471'...","['11', '196', '215', '239', '329', '364', '482...","['26', '89', '104', '432', '500', '546']","['35', '98', '137', '441', '509', '560']","[C084599, C084599, C013295, C084599, D001569, ...","[D056486, D056486, D047508, D056486, D056486, ...",[C084599],[D056486],[clotiazepam],[hepatitis],"[-1, -1, -1, -1, D001569, -1, -1, -1, D001569]","[D006505, D006505, -1, D006505, D006505, -1]","[clotiazepam, clotiazepam, thienodiazepine, cl...","[[-1], [-1], [-1], [-1], [benzodiazepines, ben..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
489,18464113,Lamivudine for the prevention of hepatitis B v...,Hepatitis B virus (HBV) is one of the major ca...,"[lamivudine, hepatitis-b surface antigen, hbsa...","[hepatitis b, cancer, hepatitis b, liver disea...","['0', '67', '96', '576', '666', '724', '890', ...","['10', '94', '101', '586', '676', '734', '899'...","['33', '116', '167', '229', '254', '316', '458...","['44', '122', '178', '242', '260', '336', '464...","[D019259, D006514, D006514, D019259, D019259, ...","[D006509, D009369, D006509, D008107, D009369, ...",[D006514],[D006509],[hepatitis-b surface antigen],[hepatitis b],"[D019259, -1, D006514, D019259, D019259, D0192...","[D006509, D009369, D006509, D008107, D009369, ...","[hepatitis-b surface antigen, lamivudin]","[[lamivudine, lamivudine, 2',3'-dideoxy-3'-thi..."
490,18308784,Ginsenoside Rg1 restores the impairment of lea...,"Rg1, as a ginsenoside extracted from Panax gin...","[ginsenoside rg1, morphine, rg1, ginsenoside, ...","[impairment of learning, learning impairment, ...","['0', '71', '104', '114', '242', '329', '394',...","['15', '79', '107', '125', '245', '337', '397'...","['29', '181', '401']","['51', '200', '420']","[C035054, D009020, C035054, D036145, C035054, ...","[D007859, D007859, D007859]",[D009020],[D007859],[morphine],[impairment of learning],"[-1, D009020, -1, D036145, -1, D009020, -1, D0...","[-1, -1, -1]","[ginsenoside rg1, rg1, rg1, rg1, rg1, rg1, rg1...","[[-1], [morphine, morphine, morphia, morphine ..."
494,12617329,The effect of treatment with gum Arabic on gen...,In the present work we assessed the effect of ...,"[gum arabic, gentamicin, gum arabic, gentamici...","[nephrotoxicity, acute renal failure, nephroto...","['29', '43', '168', '213', '225', '340', '395'...","['39', '53', '178', '223', '227', '350', '405'...","['54', '182', '229', '552', '1106', '1378', '1...","['68', '201', '243', '566', '1122', '1392', '1...","[D006170, D005839, D006170, D005839, D005839, ...","[D007674, D058186, D007674, D007674, D007683, ...","[D005839, D006170]","[D058186, D058186]","[gentamicin, gum arabic]","[acute renal failure, acute renal failure]","[D006170, D005839, D006170, D005839, -1, D0061...","[-1, D058186, -1, -1, -1, -1, D007676]","[gm, gm, gsh, gm, gm, gsh, gm, gm, gm, gm, gm]","[[gum arabic, gum arabic, arabic, gum, acacia ..."
496,11961407,GLEPP1 receptor tyrosine phosphatase (Ptpro) i...,Glomerular epithelial protein 1 (GLEPP1) is a ...,"[tyrosine, pan, tyrosine, puromycin aminonucle...","[nephrosis, glomerular injury, nephrosis, prot...","['16', '52', '183', '646', '733', '760', '849'...","['24', '55', '191', '671', '758', '763', '852'...","['56', '451', '672', '904', '993', '1021']","['65', '468', '681', '915', '1004', '1039']","[D014443, D011692, D014443, D011692, D011692, ...","[D009401, D007674, D009401, D011507, D011507, ...","[D011692, D011692]","[D011507, D009401]","[pan, pan]","[proteinuria, nephrosis]","[D014443, -1, D014443, D011692, D011692, -1, -...","[D009401, -1, D009401, D011507, D011507, -1]","[pan, pan, pan, pan]","[[tyrosine, tyrosine, l-tyrosine, l tyrosine, ..."


In [44]:
# Get list of unique mismatched chemicals

all_chemicals = list(set([item for row in df_train['chemicals'] for item in row]))
unique_mismatched_chemicals = list(set([item for sublist in mismatch_individual_chemicals['mismatched_chemicals'] for item in sublist]))
print(len(all_chemicals))
print(len(unique_mismatched_chemicals))
unique_mismatched_chemicals

1008
398


['ifn',
 '3r-1-2-(3,4-dichlorophenyl)ethyl-1,4-diazabicyclo4.3.0nonane',
 'iva',
 'mpep',
 'testosterone heptylate',
 'nacl',
 'bromperidol',
 'chloroacetaldehyde',
 'cacl2',
 'appetite-suppressants',
 'hcfcs',
 'picloxydine',
 'neurotensin type-1 receptor antagonist',
 'dnr',
 'trp',
 "4'-0-tetrahydropyranyladriamycin",
 'p-coumaric acid',
 'alpha,beta-meatp',
 'dex',
 'hexafluorodiethyl ether',
 'fluoropyrimidines',
 '3-benzenesulfonyl-7-(4-methyl-piperazin-1-yl)1h-indole',
 'caa',
 'n-pyrimidinyl-2-phenoxyacetamides',
 'basd  ne',
 '2-methoxy-4-amino-5-chlorobenzoic acid 2-(diethylamino)ethyl ester',
 'zyban',
 'posaconazole',
 'ginsenoside rg1',
 'cpa',
 'be',
 'mfl',
 '5-hydroxytriptamine',
 'benzylpenicilloate',
 'pentose phosphate',
 '3-aminopropyl-diethoxy-methyl-phosphinic acid',
 'methyl beta-carboline-3-carboxylate',
 'op',
 'hydrogen cyanamide',
 'um-272',
 'pg-9',
 'lr132',
 'apraclonidine',
 'udca',
 'hepatitis b virus e antigen',
 'gm',
 'zdv',
 'co2',
 'amb',
 'estradio

In [42]:
# Find UMLS MeSH

entries = unique_mismatched_chemicals

CUID_testcases = UMLS_retrieve_CUID(entries, API_key)
MeSH_testcases = UMLS_retrieve_MeSHID(CUID_testcases, 'chemical', API_key)

MeSH_testcases

['D007372',
 '-1',
 'Not found',
 'Not found',
 'Not found',
 'D012965',
 'Not found',
 'Not found',
 'D002122',
 'Not found',
 'Not found',
 'Not found',
 '-1',
 '-1',
 'D014364',
 '-1',
 'Not found',
 'Not found',
 'D003915',
 'D005481',
 'Not found',
 'Not found',
 '-1',
 '-1',
 '-1',
 'Not found',
 'D016642',
 'Not found',
 'Not found',
 '-1',
 '-1',
 '-1',
 '-1',
 'Not found',
 '-1',
 '-1',
 'Not found',
 'D005469',
 '-1',
 'Not found',
 'Not found',
 '-1',
 'Not found',
 'D014580',
 'D006513',
 '-1',
 'D015215',
 '-1',
 'Not found',
 '-1',
 'Not found',
 'Not found',
 '-1',
 '-1',
 'D012964',
 '-1',
 'Not found',
 'Not found',
 'Not found',
 '-1',
 'D000431',
 '-1',
 '-1',
 'Not found',
 '-1',
 '-1',
 'D016191',
 '-1',
 'D006426',
 'D000069449',
 'Not found',
 'Not found',
 'Not found',
 'Not found',
 '-1',
 '-1',
 'Not found',
 'Not found',
 '-1',
 'Not found',
 'Not found',
 '-1',
 '-1',
 '-1',
 'D020888',
 'Not found',
 '-1',
 'Not found',
 'D001374',
 'Not found',
 'Not found

In [46]:
sum(1 for item in MeSH_testcases if item not in {'Not found', '-1'})

80

It only found 80 out of 398 unmatched chemicals

In [48]:
# Get list of unique mismatched diseases

all_diseases = list(set([item for row in df_train['diseases'] for item in row]))
unique_mismatched_diseases = list(set([item for sublist in mismatch_individual_diseases['mismatched_diseases'] for item in sublist]))
print(len(all_diseases))
print(len(unique_mismatched_diseases))

# Find UMLS MeSH

entries = unique_mismatched_diseases

CUID_testcases = UMLS_retrieve_CUID(entries, API_key)
MeSH_testcases = UMLS_retrieve_MeSHID(CUID_testcases, 'disease', API_key)

MeSH_testcases

1397
728


['-1',
 '-1',
 '-1',
 'Not found',
 'Not found',
 '-1',
 '-1',
 '-1',
 'D010319',
 '-1',
 '-1',
 'D020833',
 'Not found',
 'D008569',
 'Not found',
 'Not found',
 '-1',
 'D003967',
 'D003866',
 'Not found',
 'Not found',
 '-1',
 '-1',
 '-1',
 '-1',
 'Not found',
 '-1',
 'D020428',
 'Not found',
 '-1',
 'Not found',
 '-1',
 'Not found',
 'D006209',
 '-1',
 'Not found',
 '-1',
 '-1',
 'D003920',
 'Not found',
 '-1',
 'Not found',
 'Not found',
 '-1',
 'D007037',
 'D007681',
 '-1',
 '-1',
 'D003095',
 'D062787',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 'D008103',
 'Not found',
 'D006470',
 '-1',
 'D009128',
 '-1',
 'Not found',
 '-1',
 'D020300',
 'Not found',
 '-1',
 '-1',
 'Not found',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 '-1',
 'Not found',
 '-1',
 'Not found',
 'D011618',
 '-1',
 '-1',
 '-1',
 'Not found',
 '-1',
 'Not found',
 '-1',
 '-1',
 'D000787',
 '-1',
 'D009408',
 '-1',
 'Not found',
 '-1',
 '-1',
 'D007676',
 'Not found',
 '-1',
 'Not found',
 'D002546',
 'Not found',
 'D009157',
 

In [49]:
sum(1 for item in MeSH_testcases if item not in {'Not found', '-1'})

139

It only found 139 out of 728 unmatched diseases

Above query took about ~7 minutes, and it fails sometimes due to connection error