# MeSH ID Parsing Code

MeSH database download: https://www.nlm.nih.gov/databases/download/mesh.html

2024 Descriptor MeSH XML: https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/desc2024.xml

Some discoveries about XML structure
1. MeSH ID always first, followed by the entity name
2. Synonyms are known as "Entry Term", found in the XML near the end of each entity

In [1]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
import csv

In [39]:
# Global settings

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)

## Parse MeSH XML into Lookup Table

### 'D' MeSH IDs

In [None]:
url_d = 'https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/desc2024.xml'

response_d = requests.get(url_d)
response_d.raise_for_status()
root_d = ET.fromstring(response_d.content)

In [3]:
# Parsing function

def parse_mesh_xml(root):
    for record in root.findall('DescriptorRecord'):
        mesh_id = record.find('DescriptorUI').text
        
        # Find the primary name of the chemical/disease
        name = record.find('DescriptorName/String').text
        
        # Initialize a list to hold the entry terms (starting with the primary name)
        entry_terms = [name]
        
        # Look for entry terms (synonyms)
        for entry_term in record.findall('.//TermList/Term/String'):
            entry_terms.append(entry_term.text)
        
        # Store the MeSH ID along with all names and entry terms
        mesh_lookup[mesh_id] = entry_terms

    return mesh_lookup

In [4]:
mesh_lookup = {}

#mesh_lookup_table = parse_mesh_xml(root)
mesh_lookup_table_d = parse_mesh_xml(root_d)

for mesh_id, terms in list(mesh_lookup_table.items())[:25]:
    print(f"MeSH ID: {mesh_id}, Names: {' | '.join(terms)}")

NameError: name 'root_c' is not defined

In [13]:
# Save as csv

csv_file = 'mesh_lookup_table.csv'

with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['MeSH ID', 'Names/Entry Terms'])
    
    for mesh_id, terms in mesh_lookup_table.items():
        writer.writerow([mesh_id, ' | '.join(terms)])

print(f"CSV file '{csv_file}' has been created.")

CSV file 'mesh_lookup_table.csv' has been created.


### 'C' MeSH IDs

In [3]:
# url_c = 'https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/supp2024.xml'

# response_c = requests.get(url_c)
# response_c.raise_for_status()

In [4]:
# # Iterative parsing for "C" MeSH IDs

# def parse_mesh_xml_iteratively(xml_content):
#     mesh_lookup = {}
#     # Set up an incremental parser that processes elements one at a time
#     for event, elem in ET.iterparse(xml_content, events=('start', 'end')):
#         # We're interested in the 'SupplementalRecord' elements (for 'C' MeSH IDs)
#         if event == 'end' and elem.tag == 'SupplementalRecord':
#             # Get the MeSH ID for 'C' records
#             mesh_id = elem.find('SupplementalRecordUI').text
            
#             # Find the primary name of the chemical/disease
#             name = elem.find('SupplementalRecordName/String').text
            
#             # Initialize a list to hold the entry terms (starting with the primary name)
#             entry_terms = [name]
            
#             # Look for entry terms (synonyms)
#             for entry_term in elem.findall('.//TermList/Term/String'):
#                 entry_terms.append(entry_term.text)
            
#             # Store the MeSH ID along with all names and entry terms
#             mesh_lookup[mesh_id] = entry_terms

#             # Clear the element from memory to save resources
#             elem.clear()

#     return mesh_lookup


In [5]:
# mesh_lookup_table_c = parse_mesh_xml_iteratively(response_c.content)

: 

In [None]:
# for mesh_id, terms in list(mesh_lookup_table_c.items())[:25]:
#     print(f"MeSH ID: {mesh_id}, Names: {' | '.join(terms)}")

In [None]:
# # Save as csv

# csv_file = 'mesh_lookup_table_c.csv'

# with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
#     writer = csv.writer(file)
#     writer.writerow(['MeSH ID', 'Names/Entry Terms'])
    
#     for mesh_id, terms in mesh_lookup_table_c.items():
#         writer.writerow([mesh_id, ' | '.join(terms)])

# print(f"CSV file '{csv_file}' has been created.")

### csv to df

In [2]:
mesh_lookup_df_d = pd.read_csv('mesh_lookup_table.csv')
mesh_lookup_df_d.head(25)

Unnamed: 0,MeSH ID,Names/Entry Terms
0,D000001,Calcimycin | Calcimycin | 4-Benzoxazolecarboxy...
1,D000002,Temefos | Temefos | Temephos | Difos | Abate
2,D000003,Abattoirs | Abattoirs | Abattoir | Slaughterho...
3,D000004,Abbreviations as Topic | Abbreviations as Topi...
4,D000005,Abdomen | Abdomen | Abdomens
5,D000006,"Abdomen, Acute | Abdomen, Acute | Abdomens, Ac..."
6,D000007,Abdominal Injuries | Abdominal Injuries | Inju...
7,D000008,Abdominal Neoplasms | Abdominal Neoplasms | Ab...
8,D000009,Abdominal Muscles | Abdominal Muscles | Abdomi...
9,D000010,"Abducens Nerve | Abducens Nerve | Nerve, Abduc..."


In [3]:
mesh_lookup_df_d.shape

(30764, 2)

In [None]:
# mesh_lookup_df_c = pd.read_csv('mesh_lookup_table_c.csv')
# mesh_lookup_df_c.head(25)

In [4]:
mesh_lookup_df = mesh_lookup_df_d #+ mesh_lookup_df_c
mesh_lookup_df.head(25)

Unnamed: 0,MeSH ID,Names/Entry Terms
0,D000001,Calcimycin | Calcimycin | 4-Benzoxazolecarboxy...
1,D000002,Temefos | Temefos | Temephos | Difos | Abate
2,D000003,Abattoirs | Abattoirs | Abattoir | Slaughterho...
3,D000004,Abbreviations as Topic | Abbreviations as Topi...
4,D000005,Abdomen | Abdomen | Abdomens
5,D000006,"Abdomen, Acute | Abdomen, Acute | Abdomens, Ac..."
6,D000007,Abdominal Injuries | Abdominal Injuries | Inju...
7,D000008,Abdominal Neoplasms | Abdominal Neoplasms | Ab...
8,D000009,Abdominal Muscles | Abdominal Muscles | Abdomi...
9,D000010,"Abducens Nerve | Abducens Nerve | Nerve, Abduc..."


## Test MeSH ID Normalization

In [5]:
import ast

In [6]:
# Path for datasets

datapath = '../data/'

In [7]:
# Load datasets

df_train = pd.read_csv(f'{datapath}' + 'OfficialTrainingSet1.csv')
df_val = pd.read_csv(f'{datapath}' + 'OfficialValidationSet1.csv')
df_test = pd.read_csv(f'{datapath}' + 'OfficialTestSet1.csv')

print("Shape of train dataset:", df_train.shape)
print("Shape of validation dataset:", df_val.shape)
print("Shape of test dataset:", df_test.shape)

df_train.head(3)

Shape of train dataset: (500, 13)
Shape of validation dataset: (500, 13)
Shape of test dataset: (500, 13)


Unnamed: 0,article_code,title,abstract,chemicals,diseases,chemical_start_indices,chemical_end_indices,disease_start_indices,disease_end_indices,chemical_ids,disease_ids,CID_chemical,CID_disease
0,227508,Naloxone reverses the antihypertensive effect ...,"In unanesthetized, spontaneously hypertensive ...","['Naloxone', 'clonidine', 'clonidine', 'nalozo...","['hypertensive', 'hypotensive', 'hypertensive'...","['0', '49', '181', '244', '306', '354', '364',...","['8', '58', '190', '252', '322', '362', '372',...","['93', '274', '469', '750']","['105', '285', '481', '762']","['D009270', 'D003000', 'D003000', '-1', 'D0087...","['D006973', 'D007022', 'D006973', 'D006973']",['D008750'],['D007022']
1,354896,Lidocaine-induced cardiac asystole.,Intravenous administration of a single 50-mg b...,"['Lidocaine', 'lidocaine', 'lidocaine']","['cardiac asystole', 'depression', 'bradyarrhy...","['0', '90', '409']","['9', '99', '418']","['18', '142', '331']","['34', '152', '347']","['D008012', 'D008012', 'D008012']","['D006323', 'D003866', 'D001919']",['D008012'],['D006323']
2,435349,Suxamethonium infusion rate and observed fasci...,Suxamethonium chloride (Sch) was administered ...,"['Suxamethonium', 'Suxamethonium chloride', 'S...","['fasciculations', 'tetanic', 'Fasciculations'...","['0', '80', '104', '312']","['13', '102', '107', '315']","['41', '265', '395', '483', '523', '538', '561...","['55', '272', '409', '496', '536', '544', '568...","['D013390', 'D013390', 'D013390', 'D013390']","['D005207', 'D013746', 'D005207', 'D005207', '...",['D013390'],['D005207']


In [8]:
# Data transformation functions

def convert_col_to_list(string):
    """
    Converts all string columns that look like lists (col index 3 to end) into actual lists 
    """
    return ast.literal_eval(string)


def lowercase_cols(lst):
    """
    Converts chemicals and diseases column to lowercase
    """
    return [item.lower() for item in lst]


def map_cid_to_chemical_name(row):
    """
    Maps CID of chemical in the CID_chemical column into the actual name of the chemical
    """
    cid_chemicals = row['CID_chemical']
    chemical_ids = row['chemical_ids']
    chemicals = row['chemicals']
    
    chemical_names = []
    
    for cid in cid_chemicals:
        if cid in chemical_ids:
            idx = chemical_ids.index(cid)
            chemical_names.append(chemicals[idx])
        else:
            chemical_names.append('unknown')
    
    return chemical_names


def map_cid_to_disease_name(row):
    """
    Maps CID of disease in the CID_disease column into the actual name of the disease
    """
    cid_diseases = row['CID_disease']
    disease_ids = row['disease_ids']
    diseases = row['diseases']
    
    disease_names = []
    
    for cid in cid_diseases:
        if cid in disease_ids:
            idx = disease_ids.index(cid) 
            disease_names.append(diseases[idx]) 
        else:
            disease_names.append('unknown')
    
    return disease_names


# Function to handle "unknown" for chemical names
def map_cid_to_chemical_name_unknown(data):
    '''
    Addresses 'unknown' instances of CID_chemical_names caused by chemicals with pipe (|) notation
    '''
    chemical_id_map = {}
    for i, row in data.iterrows():
        for cid, chemical in zip(row['chemical_ids'], row['chemicals']):
            chemical_id_map[cid] = chemical
    
    # Function to map "unknown" to the correct chemical name if possible
    def resolve_unknown_chemical_name(cids):
        names = []
        for cid in cids:
            # Split combined IDs (separated by '|') and check for matches in the map
            split_ids = cid.split('|')
            name = ' | '.join([chemical_id_map.get(split_id, 'unknown') for split_id in split_ids])
            names.append(name)
        return names

    # Apply the function only to rows where CID_chemical_name has "unknown"
    data['CID_chemical_name'] = data.apply(lambda row: resolve_unknown_chemical_name(row['CID_chemical']) 
                                       if 'unknown' in row['CID_chemical_name'] else row['CID_chemical_name'], axis=1)
    return data

# Function to handle "Unknown" for disease names
def map_cid_to_disease_name_unknown(data):
    '''
    Addresses 'unknown' instances of CID_disease_names caused by diseases with pipe (|) notation
    '''
    disease_id_map = {}
    for i, row in data.iterrows():
        for cid, disease in zip(row['disease_ids'], row['diseases']):
            disease_id_map[cid] = disease
    
    # Function to map "unknown" to the correct disease name if possible
    def resolve_unknown_disease_name(cids):
        names = []
        for cid in cids:
            # Split combined IDs (separated by '|') and check for matches in the map
            split_ids = cid.split('|')
            name = ' | '.join([disease_id_map.get(split_id, 'unknown') for split_id in split_ids])
            names.append(name)
        return names

    # Apply the function only to rows where CID_disease_name has "Unknown"
    data['CID_disease_name'] = data.apply(lambda row: resolve_unknown_disease_name(row['CID_disease']) 
                                      if 'unknown' in row['CID_disease_name'] else row['CID_disease_name'], axis=1)
    return data

In [9]:
# Apply the data transformations functions to all three datasets

list_columns = ['chemicals', 'diseases', 'chemical_ids', 'disease_ids', 'CID_chemical', 'CID_disease']
for col in list_columns:
    df_train[col] = df_train[col].apply(convert_col_to_list) 
    df_val[col] = df_val[col].apply(convert_col_to_list) 
    df_test[col] = df_test[col].apply(convert_col_to_list) 

df_train['chemicals'] = df_train['chemicals'].apply(lowercase_cols)
df_train['diseases'] = df_train['diseases'].apply(lowercase_cols)
df_val['chemicals'] = df_val['chemicals'].apply(lowercase_cols)
df_val['diseases'] = df_val['diseases'].apply(lowercase_cols)
df_test['chemicals'] = df_test['chemicals'].apply(lowercase_cols)
df_test['diseases'] = df_test['diseases'].apply(lowercase_cols)

df_train['CID_chemical_name'] = df_train.apply(map_cid_to_chemical_name, axis=1)
df_train['CID_disease_name'] = df_train.apply(map_cid_to_disease_name, axis=1)
df_val['CID_chemical_name'] = df_val.apply(map_cid_to_chemical_name, axis=1)
df_val['CID_disease_name'] = df_val.apply(map_cid_to_disease_name, axis=1)
df_test['CID_chemical_name'] = df_test.apply(map_cid_to_chemical_name, axis=1)
df_test['CID_disease_name'] = df_test.apply(map_cid_to_disease_name, axis=1)

df_train = map_cid_to_chemical_name_unknown(df_train)
df_train = map_cid_to_disease_name_unknown(df_train)
df_val = map_cid_to_chemical_name_unknown(df_val)
df_val = map_cid_to_disease_name_unknown(df_val)
df_test = map_cid_to_chemical_name_unknown(df_test)
df_test = map_cid_to_disease_name_unknown(df_test)

df_train.head(3)

Unnamed: 0,article_code,title,abstract,chemicals,diseases,chemical_start_indices,chemical_end_indices,disease_start_indices,disease_end_indices,chemical_ids,disease_ids,CID_chemical,CID_disease,CID_chemical_name,CID_disease_name
0,227508,Naloxone reverses the antihypertensive effect ...,"In unanesthetized, spontaneously hypertensive ...","[naloxone, clonidine, clonidine, nalozone, alp...","[hypertensive, hypotensive, hypertensive, hype...","['0', '49', '181', '244', '306', '354', '364',...","['8', '58', '190', '252', '322', '362', '372',...","['93', '274', '469', '750']","['105', '285', '481', '762']","[D009270, D003000, D003000, -1, D008750, D0092...","[D006973, D007022, D006973, D006973]",[D008750],[D007022],[alpha-methyldopa],[hypotensive]
1,354896,Lidocaine-induced cardiac asystole.,Intravenous administration of a single 50-mg b...,"[lidocaine, lidocaine, lidocaine]","[cardiac asystole, depression, bradyarrhythmias]","['0', '90', '409']","['9', '99', '418']","['18', '142', '331']","['34', '152', '347']","[D008012, D008012, D008012]","[D006323, D003866, D001919]",[D008012],[D006323],[lidocaine],[cardiac asystole]
2,435349,Suxamethonium infusion rate and observed fasci...,Suxamethonium chloride (Sch) was administered ...,"[suxamethonium, suxamethonium chloride, sch, sch]","[fasciculations, tetanic, fasciculations, fasc...","['0', '80', '104', '312']","['13', '102', '107', '315']","['41', '265', '395', '483', '523', '538', '561...","['55', '272', '409', '496', '536', '544', '568...","[D013390, D013390, D013390, D013390]","[D005207, D013746, D005207, D005207, D005207, ...",[D013390],[D005207],[suxamethonium],[fasciculations]


### Match to `chemicals` and `diseases`

In [10]:
mesh_lookup_df.head()

Unnamed: 0,MeSH ID,Names/Entry Terms
0,D000001,Calcimycin | Calcimycin | 4-Benzoxazolecarboxy...
1,D000002,Temefos | Temefos | Temephos | Difos | Abate
2,D000003,Abattoirs | Abattoirs | Abattoir | Slaughterho...
3,D000004,Abbreviations as Topic | Abbreviations as Topi...
4,D000005,Abdomen | Abdomen | Abdomens


In [11]:
# Split the 'Names/Entry Terms' column into individual terms
mesh_lookup_expanded = mesh_lookup_df.assign(names=mesh_lookup_df['Names/Entry Terms'].str.split('|')).explode('names')

# Lowercase the names and strip any leading/trailing whitespace for consistency
mesh_lookup_expanded['names'] = mesh_lookup_expanded['names'].str.strip().str.lower()

# Now create a reverse lookup (term to MeSH ID)
term_to_mesh = pd.Series(mesh_lookup_expanded['MeSH ID'].values, index=mesh_lookup_expanded['names']).to_dict()

# term_to_mesh now behaves similarly to the original dictionary
term_to_mesh

{'calcimycin': 'D000001',
 '4-benzoxazolecarboxylic acid, 5-(methylamino)-2-((3,9,11-trimethyl-8-(1-methyl-2-oxo-2-(1h-pyrrol-2-yl)ethyl)-1,7-dioxaspiro(5.5)undec-2-yl)methyl)-, (6s-(6alpha(2s*,3s*),8beta(r*),9beta,11alpha))-': 'D000001',
 'a-23187': 'D000001',
 'a 23187': 'D000001',
 'antibiotic a23187': 'D000001',
 'a23187, antibiotic': 'D000001',
 'a23187': 'D000001',
 'temefos': 'D000002',
 'temephos': 'D000002',
 'difos': 'D000002',
 'abate': 'D000002',
 'abattoirs': 'D000003',
 'abattoir': 'D000003',
 'slaughterhouses': 'D000003',
 'slaughterhouse': 'D000003',
 'slaughter houses': 'D000003',
 'house, slaughter': 'D000003',
 'houses, slaughter': 'D000003',
 'slaughter house': 'D000003',
 'abbreviations as topic': 'D000004',
 'acronyms as topic': 'D000004',
 'abdomen': 'D000005',
 'abdomens': 'D000005',
 'abdomen, acute': 'D000006',
 'abdomens, acute': 'D000006',
 'acute abdomen': 'D000006',
 'acute abdomens': 'D000006',
 'abdominal injuries': 'D000007',
 'injuries, abdominal': 'D0

In [18]:
# # Lowercase lookup table
# mesh_lookup_lower = {mesh_id: [term.lower() for term in terms] for mesh_id, terms in mesh_lookup_table.items()}

# # Create reverse lookup from term to MeSH ID
# term_to_mesh = {}
# for mesh_id, terms in mesh_lookup_lower.items():
#     for term in terms:
#         term_to_mesh[term] = mesh_id

# term_to_mesh

In [12]:
# Function to map terms to MeSH IDs
def get_mesh_id(terms, lookup):
    mesh_ids = []
    for term in terms:
        term = term.lower()
        mesh_id = lookup.get(term, '-1') # Returning -1 if no match is found
        mesh_ids.append(mesh_id)
    return mesh_ids

In [13]:
# Apply the function to the chemicals and diseases columns to get their MeSH IDs
df_train['mapped_chemical_ids'] = df_train['chemicals'].apply(lambda x: get_mesh_id(x, term_to_mesh))
df_train['mapped_disease_ids'] = df_train['diseases'].apply(lambda x: get_mesh_id(x, term_to_mesh))

# Now, compare the generated MeSH IDs with the existing chemical_ids and disease_ids columns
df_train['chemical_ids_match'] = df_train['mapped_chemical_ids'] == df_train['chemical_ids']
df_train['disease_ids_match'] = df_train['mapped_disease_ids'] == df_train['disease_ids']

In [14]:
print(len(df_train))

500


In [15]:
df_train[['chemicals', 'chemical_ids', 'mapped_chemical_ids', 'chemical_ids_match']].head(10)

Unnamed: 0,chemicals,chemical_ids,mapped_chemical_ids,chemical_ids_match
0,"[naloxone, clonidine, clonidine, nalozone, alp...","[D009270, D003000, D003000, -1, D008750, D0092...","[D009270, D003000, D003000, -1, D008750, D0092...",True
1,"[lidocaine, lidocaine, lidocaine]","[D008012, D008012, D008012]","[D008012, D008012, D008012]",True
2,"[suxamethonium, suxamethonium chloride, sch, sch]","[D013390, D013390, D013390, D013390]","[D013390, D013390, -1, -1]",False
3,"[galanthamine hydrobromide, scopolamine, hyosc...","[D005702, D012601, D012601, D005702, D012601, ...","[D005702, D012601, D012601, D005702, D012601, ...",True
4,"[lithium, lithium, lithium, lithium, lithium, ...","[D008094, D008094, D008094, D008094, D008094, ...","[D008094, D008094, D008094, D008094, D008094, ...",False
5,"[fusidic acid, cyclosporin, cyclosporin, fusid...","[D005672, D016572, D016572, D005672, D005672, ...","[D005672, D016572, D016572, D005672, D005672, ...",True
6,"[cocaine, cocaine, cocaine]","[D003042, D003042, D003042]","[D003042, D003042, D003042]",True
7,"[sulpiride, sulpiride, antidepressant, sulpiri...","[D013469, D013469, D000928, D013469, D013469, ...","[D013469, D013469, D000928, D013469, D013469, ...",True
8,"[desferrioxamine, desferrioxamine, desferrioxa...","[D003676, D003676, D003676, D003676, -1, D003676]","[D003676, D003676, D003676, D003676, D000535, ...",False
9,"[magnesium, magnesium, magnesium, magnesium, a...","[D008274, D008274, D008274, D008274, D000109, ...","[D008274, D008274, D008274, D008274, D000109, ...",True


In [16]:
df_train_chemical_mismatches = df_train[df_train['chemical_ids_match'] == False]
df_train_chemical_mismatches[['chemicals', 'chemical_ids', 'mapped_chemical_ids', 'chemical_ids_match']]

Unnamed: 0,chemicals,chemical_ids,mapped_chemical_ids,chemical_ids_match
2,"[suxamethonium, suxamethonium chloride, sch, sch]","[D013390, D013390, D013390, D013390]","[D013390, D013390, -1, -1]",False
4,"[lithium, lithium, lithium, lithium, lithium, ...","[D008094, D008094, D008094, D008094, D008094, ...","[D008094, D008094, D008094, D008094, D008094, ...",False
8,"[desferrioxamine, desferrioxamine, desferrioxa...","[D003676, D003676, D003676, D003676, -1, D003676]","[D003676, D003676, D003676, D003676, D000535, ...",False
10,"[chloroacetaldehyde, cyclophosphamide, ifosfam...","[C004656, D003520, D007069, C004656, C004656, ...","[-1, D003520, D007069, -1, -1, -1, -1, -1, D01...",False
12,"[clotiazepam, clotiazepam, thienodiazepine, cl...","[C084599, C084599, C013295, C084599, D001569, ...","[-1, -1, -1, -1, D001569, -1, -1, -1, D001569]",False
...,...,...,...,...
490,"[ginsenoside rg1, morphine, rg1, ginsenoside, ...","[C035054, D009020, C035054, D036145, C035054, ...","[-1, D009020, -1, D036145, -1, D009020, -1, D0...",False
494,"[gum arabic, gentamicin, gum arabic, gentamici...","[D006170, D005839, D006170, D005839, D005839, ...","[D006170, D005839, D006170, D005839, -1, D0061...",False
495,"[zonisamide, zonisamide, zonisamide, zonisamid...","[C022189, C022189, C022189, C022189, C022189]","[D000078305, D000078305, D000078305, D00007830...",False
496,"[tyrosine, pan, tyrosine, puromycin aminonucle...","[D014443, D011692, D014443, D011692, D011692, ...","[D014443, -1, D014443, D011692, D011692, -1, -...",False


In [31]:
def calculate_unmatched_percentage(mapped_ids_column):
    # Flatten the list of lists for easier counting
    flat_mapped_ids = [item for sublist in mapped_ids_column for item in sublist]
    
    # Total number of IDs
    total_count = len(flat_mapped_ids)
    
    # Count how many are '-1'
    unmatched_count = flat_mapped_ids.count('-1')
    
    # Calculate the percentage
    if total_count == 0:
        return 0  # Prevent division by zero
    return (unmatched_count / total_count) * 100

# Calculate the percentage of '-1' mappings for chemicals
chemical_unmatched_percentage = calculate_unmatched_percentage(df_train['mapped_chemical_ids'])

print(f"Percentage of '-1' mappings for chemicals: {chemical_unmatched_percentage:.2f}%")

Percentage of '-1' mappings for chemicals: 24.57%


In [17]:
def get_mismatched_chemicals(row):
    mismatches = []
    for chemical, mapped_id in zip(row['chemicals'], row['mapped_chemical_ids']):
        if mapped_id == '-1':
            mismatches.append(chemical)
    return mismatches

df_train['mismatched_chemicals'] = df_train.apply(get_mismatched_chemicals, axis=1)

mismatch_individual_chemicals = df_train[df_train['mismatched_chemicals'].apply(lambda x: len(x) > 0)]
mismatch_individual_chemicals[['chemicals', 'chemical_ids', 'mapped_chemical_ids', 'mismatched_chemicals']]


Unnamed: 0,chemicals,chemical_ids,mapped_chemical_ids,mismatched_chemicals
0,"[naloxone, clonidine, clonidine, nalozone, alp...","[D009270, D003000, D003000, -1, D008750, D0092...","[D009270, D003000, D003000, -1, D008750, D0092...","[nalozone, 3h-naloxone, 3h-dihydroergocryptine]"
2,"[suxamethonium, suxamethonium chloride, sch, sch]","[D013390, D013390, D013390, D013390]","[D013390, D013390, -1, -1]","[sch, sch]"
4,"[lithium, lithium, lithium, lithium, lithium, ...","[D008094, D008094, D008094, D008094, D008094, ...","[D008094, D008094, D008094, D008094, D008094, ...","[li, li, li]"
10,"[chloroacetaldehyde, cyclophosphamide, ifosfam...","[C004656, D003520, D007069, C004656, C004656, ...","[-1, D003520, D007069, -1, -1, -1, -1, -1, D01...","[chloroacetaldehyde, chloroacetaldehyde, caa, ..."
12,"[clotiazepam, clotiazepam, thienodiazepine, cl...","[C084599, C084599, C013295, C084599, D001569, ...","[-1, -1, -1, -1, D001569, -1, -1, -1, D001569]","[clotiazepam, clotiazepam, thienodiazepine, cl..."
...,...,...,...,...
489,"[lamivudine, hepatitis-b surface antigen, hbsa...","[D019259, D006514, D006514, D019259, D019259, ...","[D019259, -1, D006514, D019259, D019259, D0192...","[hepatitis-b surface antigen, lamivudin]"
490,"[ginsenoside rg1, morphine, rg1, ginsenoside, ...","[C035054, D009020, C035054, D036145, C035054, ...","[-1, D009020, -1, D036145, -1, D009020, -1, D0...","[ginsenoside rg1, rg1, rg1, rg1, rg1, rg1, rg1..."
494,"[gum arabic, gentamicin, gum arabic, gentamici...","[D006170, D005839, D006170, D005839, D005839, ...","[D006170, D005839, D006170, D005839, -1, D0061...","[gm, gm, gsh, gm, gm, gsh, gm, gm, gm, gm, gm]"
496,"[tyrosine, pan, tyrosine, puromycin aminonucle...","[D014443, D011692, D014443, D011692, D011692, ...","[D014443, -1, D014443, D011692, D011692, -1, -...","[pan, pan, pan, pan]"


In [21]:
id_to_names = mesh_lookup_expanded.groupby('MeSH ID')['names'].apply(lambda x: list(x)).to_dict()

def get_actual_chemical_names(mismatched_chemicals, chemical_ids):
    actual_names = []
    for chem, chem_id in zip(mismatched_chemicals, chemical_ids):
        if chem_id != '-1' and chem_id in id_to_names:
            actual_names.append(id_to_names[chem_id])
        else:
            actual_names.append(['-1'])  # In case we can't find a match
    return actual_names

mismatch_individual_chemicals['actual_chemical_names'] = mismatch_individual_chemicals.apply(
    lambda row: get_actual_chemical_names(row['mismatched_chemicals'], row['chemical_ids']), axis=1)

mismatch_individual_chemicals[['chemicals', 'chemical_ids', 'mapped_chemical_ids', 'mismatched_chemicals', 'actual_chemical_names']]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mismatch_individual_chemicals['actual_chemical_names'] = mismatch_individual_chemicals.apply(


Unnamed: 0,chemicals,chemical_ids,mapped_chemical_ids,mismatched_chemicals,actual_chemical_names
0,"[naloxone, clonidine, clonidine, nalozone, alp...","[D009270, D003000, D003000, -1, D008750, D0092...","[D009270, D003000, D003000, -1, D008750, D0092...","[nalozone, 3h-naloxone, 3h-dihydroergocryptine]","[[naloxone, naloxone, narcan, narcanti, naloxo..."
2,"[suxamethonium, suxamethonium chloride, sch, sch]","[D013390, D013390, D013390, D013390]","[D013390, D013390, -1, -1]","[sch, sch]","[[succinylcholine, succinylcholine, dicholine ..."
4,"[lithium, lithium, lithium, lithium, lithium, ...","[D008094, D008094, D008094, D008094, D008094, ...","[D008094, D008094, D008094, D008094, D008094, ...","[li, li, li]","[[lithium, lithium, lithium-7, lithium 7], [li..."
10,"[chloroacetaldehyde, cyclophosphamide, ifosfam...","[C004656, D003520, D007069, C004656, C004656, ...","[-1, D003520, D007069, -1, -1, -1, -1, -1, D01...","[chloroacetaldehyde, chloroacetaldehyde, caa, ...","[[Unknown], [cyclophosphamide, cyclophosphamid..."
12,"[clotiazepam, clotiazepam, thienodiazepine, cl...","[C084599, C084599, C013295, C084599, D001569, ...","[-1, -1, -1, -1, D001569, -1, -1, -1, D001569]","[clotiazepam, clotiazepam, thienodiazepine, cl...","[[Unknown], [Unknown], [Unknown], [Unknown], [..."
...,...,...,...,...,...
489,"[lamivudine, hepatitis-b surface antigen, hbsa...","[D019259, D006514, D006514, D019259, D019259, ...","[D019259, -1, D006514, D019259, D019259, D0192...","[hepatitis-b surface antigen, lamivudin]","[[lamivudine, lamivudine, 2',3'-dideoxy-3'-thi..."
490,"[ginsenoside rg1, morphine, rg1, ginsenoside, ...","[C035054, D009020, C035054, D036145, C035054, ...","[-1, D009020, -1, D036145, -1, D009020, -1, D0...","[ginsenoside rg1, rg1, rg1, rg1, rg1, rg1, rg1...","[[Unknown], [morphine, morphine, morphia, morp..."
494,"[gum arabic, gentamicin, gum arabic, gentamici...","[D006170, D005839, D006170, D005839, D005839, ...","[D006170, D005839, D006170, D005839, -1, D0061...","[gm, gm, gsh, gm, gm, gsh, gm, gm, gm, gm, gm]","[[gum arabic, gum arabic, arabic, gum, acacia ..."
496,"[tyrosine, pan, tyrosine, puromycin aminonucle...","[D014443, D011692, D014443, D011692, D011692, ...","[D014443, -1, D014443, D011692, D011692, -1, -...","[pan, pan, pan, pan]","[[tyrosine, tyrosine, l-tyrosine, l tyrosine, ..."


In [23]:
mismatch_individual_chemicals[['chemicals', 'chemical_ids', 'mapped_chemical_ids', 'mismatched_chemicals', 'actual_chemical_names']].to_csv('mismatch_individual_chemicals.csv', index=False)

In [24]:
df_train[['diseases', 'disease_ids', 'mapped_disease_ids', 'disease_ids_match']].head(10)

Unnamed: 0,diseases,disease_ids,mapped_disease_ids,disease_ids_match
0,"[hypertensive, hypotensive, hypertensive, hype...","[D006973, D007022, D006973, D006973]","[-1, -1, -1, -1]",False
1,"[cardiac asystole, depression, bradyarrhythmias]","[D006323, D003866, D001919]","[-1, D003863, D001919]",False
2,"[fasciculations, tetanic, fasciculations, fasc...","[D005207, D013746, D005207, D005207, D005207, ...","[D005207, -1, D005207, D005207, D005207, -1, D...",False
3,[overdosage],[D062787],[-1],False
4,"[chronic renal failure, nephropathy, renal fai...","[D007676, D007674, D051437, D011507, D006973, ...","[D007676, -1, D051437, D011507, D006973, -1, D...",False
5,"[crohn's disease, crohn's disease, crohn's dis...","[D003424, D003424, D003424, D009325, D003424, ...","[D003424, D003424, D003424, D009325, D003424, ...",True
6,"[myocardial injury, schizophrenic, myocardial ...","[D009202, D012559, D009202, D009203, D007511, ...","[-1, -1, -1, D009203, D007511, D002037]",False
7,"[tardive dystonia, tardive dyskinesia, parkins...","[D004421, D004409, D010302, D004421, D004421]","[D000071057, D000071057, D020734, D004421, D00...",False
8,"[ocular and auditory toxicity, audiovisual tox...","[D014786|D006311, D014786|D006311, D014786|D00...","[-1, -1, -1, -1, -1, -1, -1, D000081015, -1, D...",False
9,"[myasthenia gravis, neuromuscular disease, qua...","[D009157, D009468, D011782, D011225, D009468, ...","[D009157, D009468, -1, D011225, -1, D010243, D...",False


In [25]:
df_train_disease_mismatches = df_train[df_train['disease_ids_match'] == False]
df_train_disease_mismatches[['diseases', 'disease_ids', 'mapped_disease_ids', 'disease_ids_match']]

Unnamed: 0,diseases,disease_ids,mapped_disease_ids,disease_ids_match
0,"[hypertensive, hypotensive, hypertensive, hype...","[D006973, D007022, D006973, D006973]","[-1, -1, -1, -1]",False
1,"[cardiac asystole, depression, bradyarrhythmias]","[D006323, D003866, D001919]","[-1, D003863, D001919]",False
2,"[fasciculations, tetanic, fasciculations, fasc...","[D005207, D013746, D005207, D005207, D005207, ...","[D005207, -1, D005207, D005207, D005207, -1, D...",False
3,[overdosage],[D062787],[-1],False
4,"[chronic renal failure, nephropathy, renal fai...","[D007676, D007674, D051437, D011507, D006973, ...","[D007676, -1, D051437, D011507, D006973, -1, D...",False
...,...,...,...,...
493,"[acute psychosis, psychosis, epileptic, psycho...","[D011605, D011605, D004827, D011605, D014277, ...","[-1, D011618, -1, D011618, D014277, -1, -1, D0...",False
494,"[nephrotoxicity, acute renal failure, nephroto...","[D007674, D058186, D007674, D007674, D007683, ...","[-1, D058186, -1, -1, -1, -1, D007676]",False
496,"[nephrosis, glomerular injury, nephrosis, prot...","[D009401, D007674, D009401, D011507, D011507, ...","[D009401, -1, D009401, D011507, D011507, -1]",False
497,"[aplastic anemia, aplastic anemia, agranulocyt...","[D000741, D000741, D000380, D001855, D000741]","[D000741, D000741, D000380, -1, D000741]",False


In [32]:
disease_unmatched_percentage = calculate_unmatched_percentage(df_train['mapped_disease_ids'])

print(f"Percentage of '-1' mappings for diseases: {disease_unmatched_percentage:.2f}%")


Percentage of '-1' mappings for diseases: 38.03%


In [20]:
def get_mismatched_diseases(row):
    mismatches = []
    for disease, mapped_id in zip(row['diseases'], row['mapped_disease_ids']):
        if mapped_id == '-1':
            mismatches.append(disease)
    return mismatches

df_train['mismatched_diseases'] = df_train.apply(get_mismatched_diseases, axis=1)

mismatch_individual_diseases = df_train[df_train['mismatched_diseases'].apply(lambda x: len(x) > 0)]
mismatch_individual_diseases[['diseases', 'disease_ids', 'mapped_disease_ids', 'mismatched_diseases']]


Unnamed: 0,diseases,disease_ids,mapped_disease_ids,mismatched_diseases
0,"[hypertensive, hypotensive, hypertensive, hype...","[D006973, D007022, D006973, D006973]","[-1, -1, -1, -1]","[hypertensive, hypotensive, hypertensive, hype..."
1,"[cardiac asystole, depression, bradyarrhythmias]","[D006323, D003866, D001919]","[-1, D003863, D001919]",[cardiac asystole]
2,"[fasciculations, tetanic, fasciculations, fasc...","[D005207, D013746, D005207, D005207, D005207, ...","[D005207, -1, D005207, D005207, D005207, -1, D...","[tetanic, twitch]"
3,[overdosage],[D062787],[-1],[overdosage]
4,"[chronic renal failure, nephropathy, renal fai...","[D007676, D007674, D051437, D011507, D006973, ...","[D007676, -1, D051437, D011507, D006973, -1, D...","[nephropathy, glomerulosclerosis, nephropathy]"
...,...,...,...,...
493,"[acute psychosis, psychosis, epileptic, psycho...","[D011605, D011605, D004827, D011605, D014277, ...","[-1, D011618, -1, D011618, D014277, -1, -1, D0...","[acute psychosis, epileptic, psychotic symptom..."
494,"[nephrotoxicity, acute renal failure, nephroto...","[D007674, D058186, D007674, D007674, D007683, ...","[-1, D058186, -1, -1, -1, -1, D007676]","[nephrotoxicity, nephrotoxicity, nephrotoxicit..."
496,"[nephrosis, glomerular injury, nephrosis, prot...","[D009401, D007674, D009401, D011507, D011507, ...","[D009401, -1, D009401, D011507, D011507, -1]","[glomerular injury, glomerulosclerosis]"
497,"[aplastic anemia, aplastic anemia, agranulocyt...","[D000741, D000741, D000380, D001855, D000741]","[D000741, D000741, D000380, -1, D000741]",[bone marrow suppression]


In [29]:
id_to_names = mesh_lookup_expanded.groupby('MeSH ID')['names'].apply(lambda x: list(x)).to_dict()

def get_actual_disease_names(mismatched_diseases, disease_ids):
    actual_names = []
    for dis, dis_id in zip(mismatched_diseases, disease_ids):
        if dis_id != '-1' and dis_id in id_to_names:
            actual_names.append(id_to_names[dis_id])
        else:
            actual_names.append(['-1'])  # In case we can't find a match
    return actual_names

mismatch_individual_diseases['actual_disease_names'] = mismatch_individual_diseases.apply(
    lambda row: get_actual_disease_names(row['mismatched_diseases'], row['disease_ids']), axis=1)

mismatch_individual_diseases[['diseases', 'disease_ids', 'mapped_disease_ids', 'mismatched_diseases', 'actual_disease_names']]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mismatch_individual_diseases['actual_disease_names'] = mismatch_individual_diseases.apply(


Unnamed: 0,diseases,disease_ids,mapped_disease_ids,mismatched_diseases,actual_disease_names
0,"[hypertensive, hypotensive, hypertensive, hype...","[D006973, D007022, D006973, D006973]","[-1, -1, -1, -1]","[hypertensive, hypotensive, hypertensive, hype...","[[hypertension, hypertension, blood pressure, ..."
1,"[cardiac asystole, depression, bradyarrhythmias]","[D006323, D003866, D001919]","[-1, D003863, D001919]",[cardiac asystole],"[[heart arrest, heart arrest, arrest, heart, a..."
2,"[fasciculations, tetanic, fasciculations, fasc...","[D005207, D013746, D005207, D005207, D005207, ...","[D005207, -1, D005207, D005207, D005207, -1, D...","[tetanic, twitch]","[[fasciculation, fasciculation, fasciculations..."
3,[overdosage],[D062787],[-1],[overdosage],"[[drug overdose, drug overdose, drug overdoses..."
4,"[chronic renal failure, nephropathy, renal fai...","[D007676, D007674, D051437, D011507, D006973, ...","[D007676, -1, D051437, D011507, D006973, -1, D...","[nephropathy, glomerulosclerosis, nephropathy]","[[kidney failure, chronic, kidney failure, chr..."
...,...,...,...,...,...
493,"[acute psychosis, psychosis, epileptic, psycho...","[D011605, D011605, D004827, D011605, D014277, ...","[-1, D011618, -1, D011618, D014277, -1, -1, D0...","[acute psychosis, epileptic, psychotic symptom...","[[psychoses, substance-induced, psychoses, sub..."
494,"[nephrotoxicity, acute renal failure, nephroto...","[D007674, D058186, D007674, D007674, D007683, ...","[-1, D058186, -1, -1, -1, -1, D007676]","[nephrotoxicity, nephrotoxicity, nephrotoxicit...","[[kidney diseases, kidney diseases, disease, k..."
496,"[nephrosis, glomerular injury, nephrosis, prot...","[D009401, D007674, D009401, D011507, D011507, ...","[D009401, -1, D009401, D011507, D011507, -1]","[glomerular injury, glomerulosclerosis]","[[nephrosis, nephrosis, nephroses], [kidney di..."
497,"[aplastic anemia, aplastic anemia, agranulocyt...","[D000741, D000741, D000380, D001855, D000741]","[D000741, D000741, D000380, -1, D000741]",[bone marrow suppression],"[[anemia, aplastic, anemia, aplastic, aplastic..."


In [30]:
mismatch_individual_diseases[['diseases', 'disease_ids', 'mapped_disease_ids', 'mismatched_diseases', 'actual_disease_names']].to_csv('mismatch_individual_diseases.csv', index=False)

### Match to `CID_chemical_name` and `CID_disease_name`

In [39]:
# def get_single_mesh_id(term, lookup):
#     if isinstance(term, list):
#         term = term[0]  # If it's a list, pick the first element
#     term = term.lower() if isinstance(term, str) else ''  # Ensure term is a string and lowercase
#     return lookup.get(term, '-1') 

In [34]:
df_train['mapped_CID_chemical_ids'] = df_train['CID_chemical_name'].apply(lambda x: get_mesh_id(x, term_to_mesh) if isinstance(x, list) else get_mesh_id([x], term_to_mesh))
df_train['mapped_CID_disease_ids'] = df_train['CID_disease_name'].apply(lambda x: get_mesh_id(x, term_to_mesh) if isinstance(x, list) else get_mesh_id([x], term_to_mesh))

df_train['CID_chemical_ids_match'] = df_train['mapped_CID_chemical_ids'] == df_train['CID_chemical']
df_train['CID_disease_ids_match'] = df_train['mapped_CID_disease_ids'] == df_train['CID_disease']

In [35]:
df_train[['CID_chemical_name', 'CID_chemical', 'mapped_CID_chemical_ids', 'CID_chemical_ids_match']].head(10)

Unnamed: 0,CID_chemical_name,CID_chemical,mapped_CID_chemical_ids,CID_chemical_ids_match
0,[alpha-methyldopa],[D008750],[D008750],True
1,[lidocaine],[D008012],[D008012],True
2,[suxamethonium],[D013390],[D013390],True
3,[scopolamine],[D012601],[D012601],True
4,"[lithium, lithium, lithium]","[D008094, D008094, D008094]","[D008094, D008094, D008094]",True
5,[fusidic acid],[D005672],[D005672],True
6,"[cocaine, cocaine]","[D003042, D003042]","[D003042, D003042]",True
7,[sulpiride],[D013469],[D013469],True
8,"[desferrioxamine, desferrioxamine, desferrioxa...","[D003676, D003676, D003676]","[D003676, D003676, D003676]",True
9,[magnesium],[D008274],[D008274],True


In [36]:
df_train[['CID_disease_name', 'CID_disease', 'mapped_CID_disease_ids', 'CID_disease_ids_match']].head(10)

Unnamed: 0,CID_disease_name,CID_disease,mapped_CID_disease_ids,CID_disease_ids_match
0,[hypotensive],[D007022],[-1],False
1,[cardiac asystole],[D006323],[-1],False
2,[fasciculations],[D005207],[D005207],True
3,[overdosage],[D062787],[-1],False
4,"[hypertension, proteinuria, chronic renal fail...","[D006973, D011507, D007676]","[D006973, D011507, D007676]",True
5,[nausea],[D009325],[D009325],True
6,"[myocardial infarction, bundle branch block]","[D009203, D002037]","[D009203, D002037]",True
7,[tardive dystonia],[D004421],[D000071057],False
8,"[pigmentary retinal deposits, visual toxicity,...","[D012164, D014786, D006319]","[-1, -1, -1]",False
9,[myasthenia gravis],[D009157],[D009157],True


In [37]:
cid_chemical_unmatched_percentage = calculate_unmatched_percentage(df_train['mapped_CID_chemical_ids'])
cid_disease_unmatched_percentage = calculate_unmatched_percentage(df_train['mapped_CID_disease_ids'])

print(f"Percentage of '-1' mappings for CID relationship chemicals: {cid_chemical_unmatched_percentage:.2f}%")
print(f"Percentage of '-1' mappings for CID relationship diseases: {cid_disease_unmatched_percentage:.2f}%")

Percentage of '-1' mappings for CID relationship chemicals: 12.62%
Percentage of '-1' mappings for CID relationship diseases: 29.87%


## Fuzzy Match

1. Search for exact match in the lookup table
2. If match is not found, algorithm calculates similarity score between input word and words in the lookup table
3. If the closest match is above a certain threshold, it maps to that word

In [46]:
from rapidfuzz import process 

all_names_in_lookup = mesh_lookup_expanded['names'].unique()

def match_term_fuzzy(term, lookup_dict, all_names, threshold=90):
    term = term.lower().strip()  # Ensure term is lowercased and stripped of spaces

    # Step 1: Try exact match first
    if term in lookup_dict:
        return lookup_dict[term]

    # Step 2: Apply fuzzy matching if no exact match is found
    match_info = process.extractOne(term, all_names)
    if match_info:
        match, score, _ = match_info  # Unpack the match, score, and index from extractOne()
        if score >= threshold:
            return lookup_dict.get(match, '-1')  # Return matched MeSH ID or '-1' if not found
    return '-1'  # Return '-1' if no match is found

def get_mesh_id_with_fuzzy_matching(terms, lookup_dict, all_names):
    return [match_term_fuzzy(term, lookup_dict, all_names) for term in terms]

In [47]:
df_train['fuzzy_mapped_chemical_ids'] = df_train['chemicals'].apply(lambda x: get_mesh_id_with_fuzzy_matching(x, term_to_mesh_exact, all_names_in_lookup))
df_train['fuzzy_mapped_disease_ids'] = df_train['diseases'].apply(lambda x: get_mesh_id_with_fuzzy_matching(x, term_to_mesh_exact, all_names_in_lookup))

df_train['chemical_ids_fuzzy_match'] = df_train['fuzzy_mapped_chemical_ids'] == df_train['chemical_ids']
df_train['disease_ids_fuzzy_match'] = df_train['fuzzy_mapped_disease_ids'] == df_train['disease_ids']

In [48]:
df_train[['chemicals', 'chemical_ids', 'fuzzy_mapped_chemical_ids', 'chemical_ids_fuzzy_match']].head()

Unnamed: 0,chemicals,chemical_ids,fuzzy_mapped_chemical_ids,chemical_ids_fuzzy_match
0,"[naloxone, clonidine, clonidine, nalozone, alp...","[D009270, D003000, D003000, -1, D008750, D0092...","[D009270, D003000, D003000, D010126, D008750, ...",False
1,"[lidocaine, lidocaine, lidocaine]","[D008012, D008012, D008012]","[D008012, D008012, D008012]",True
2,"[suxamethonium, suxamethonium chloride, sch, sch]","[D013390, D013390, D013390, D013390]","[D013390, D013390, D000326, D000326]",False
3,"[galanthamine hydrobromide, scopolamine, hyosc...","[D005702, D012601, D012601, D005702, D012601, ...","[D005702, D012601, D012601, D005702, D012601, ...",True
4,"[lithium, lithium, lithium, lithium, lithium, ...","[D008094, D008094, D008094, D008094, D008094, ...","[D008094, D008094, D008094, D008094, D008094, ...",False


In [49]:
df_train[['diseases', 'disease_ids', 'fuzzy_mapped_disease_ids', 'disease_ids_fuzzy_match']].head()

Unnamed: 0,diseases,disease_ids,fuzzy_mapped_disease_ids,disease_ids_fuzzy_match
0,"[hypertensive, hypotensive, hypertensive, hype...","[D006973, D007022, D006973, D006973]","[D000959, D004561, D000959, D000959]",False
1,"[cardiac asystole, depression, bradyarrhythmias]","[D006323, D003866, D001919]","[D001336, D003863, D001919]",False
2,"[fasciculations, tetanic, fasciculations, fasc...","[D005207, D013746, D005207, D005207, D005207, ...","[D005207, D012032, D005207, D005207, D005207, ...",False
3,[overdosage],[D062787],[-1],False
4,"[chronic renal failure, nephropathy, renal fai...","[D007676, D007674, D051437, D011507, D006973, ...","[D007676, D005922, D051437, D011507, D006973, ...",False


In [50]:
# Function to calculate the percentage of correct mappings for a list of chemicals
def calculate_correct_mapping_percentage(original_ids, mapped_ids):
    correct_matches = 0
    total_count = len(original_ids)
    
    # Compare individual chemicals/diseases with mapped IDs
    for original_id, mapped_id in zip(original_ids, mapped_ids):
        if original_id == mapped_id:
            correct_matches += 1
    
    if total_count == 0:
        return 0  # Avoid division by zero
    return (correct_matches / total_count) * 100

# Apply the function to both chemicals and diseases columns
df_train['chemical_mapping_accuracy'] = df_train.apply(lambda row: calculate_correct_mapping_percentage(row['chemical_ids'], row['fuzzy_mapped_chemical_ids']), axis=1)
df_train['disease_mapping_accuracy'] = df_train.apply(lambda row: calculate_correct_mapping_percentage(row['disease_ids'], row['fuzzy_mapped_disease_ids']), axis=1)

# Print overall accuracy for chemicals and diseases
overall_chemical_accuracy = df_train['chemical_mapping_accuracy'].mean()
overall_disease_accuracy = df_train['disease_mapping_accuracy'].mean()

print(f"Overall chemical mapping accuracy: {overall_chemical_accuracy:.2f}%")
print(f"Overall disease mapping accuracy: {overall_disease_accuracy:.2f}%")

# Optionally, display the first few rows to check individual accuracy
print(df_train[['chemicals', 'chemical_ids', 'fuzzy_mapped_chemical_ids', 'chemical_mapping_accuracy']].head())
print(df_train[['diseases', 'disease_ids', 'fuzzy_mapped_disease_ids', 'disease_mapping_accuracy']].head())

Overall chemical mapping accuracy: 81.40%
Overall disease mapping accuracy: 60.11%
                                           chemicals  \
0  [naloxone, clonidine, clonidine, nalozone, alp...   
1                  [lidocaine, lidocaine, lidocaine]   
2  [suxamethonium, suxamethonium chloride, sch, sch]   
3  [galanthamine hydrobromide, scopolamine, hyosc...   
4  [lithium, lithium, lithium, lithium, lithium, ...   

                                        chemical_ids  \
0  [D009270, D003000, D003000, -1, D008750, D0092...   
1                        [D008012, D008012, D008012]   
2               [D013390, D013390, D013390, D013390]   
3  [D005702, D012601, D012601, D005702, D012601, ...   
4  [D008094, D008094, D008094, D008094, D008094, ...   

                           fuzzy_mapped_chemical_ids  \
0  [D009270, D003000, D003000, D010126, D008750, ...   
1                        [D008012, D008012, D008012]   
2               [D013390, D013390, D000326, D000326]   
3  [D005702, D01260