In [3]:
import pandas as pd 
import os

In [4]:
mesh_df = pd.read_csv('mesh_lookup_table.csv')
dsstox_df = pd.read_csv('dsstox_lookup_table.csv')

In [5]:
dsstox_df.shape

(1218248, 2)

# Clean out DSSTOX

In [6]:
# Remove 'nan' rows

cleaned_chunks = []
chunk_size = 10000
chunk_count = 1

def filter_invalid_identifiers(row):
    # Split IDENTIFIER by ' | ' and check if the first term is 'nan' (case insensitive)
    identifiers = row['IDENTIFIER'].split(' | ')
    return identifiers[0].strip().lower() != 'nan'

for chunk in pd.read_csv('dsstox_lookup_table.csv', chunksize=chunk_size):
    # Filter out rows where the first term in IDENTIFIER is 'nan'
    print('Removing "nan" chemicals in chunk', chunk_count)
    chunk = chunk[chunk.apply(filter_invalid_identifiers, axis=1)]
    cleaned_chunks.append(chunk)
    chunk_count += 1

dsstox_df_clean = pd.concat(cleaned_chunks, ignore_index=True)

Removing "nan" chemicals in chunk 1
Removing "nan" chemicals in chunk 2
Removing "nan" chemicals in chunk 3
Removing "nan" chemicals in chunk 4
Removing "nan" chemicals in chunk 5
Removing "nan" chemicals in chunk 6
Removing "nan" chemicals in chunk 7
Removing "nan" chemicals in chunk 8
Removing "nan" chemicals in chunk 9
Removing "nan" chemicals in chunk 10
Removing "nan" chemicals in chunk 11
Removing "nan" chemicals in chunk 12
Removing "nan" chemicals in chunk 13
Removing "nan" chemicals in chunk 14
Removing "nan" chemicals in chunk 15
Removing "nan" chemicals in chunk 16
Removing "nan" chemicals in chunk 17
Removing "nan" chemicals in chunk 18
Removing "nan" chemicals in chunk 19
Removing "nan" chemicals in chunk 20
Removing "nan" chemicals in chunk 21
Removing "nan" chemicals in chunk 22
Removing "nan" chemicals in chunk 23
Removing "nan" chemicals in chunk 24
Removing "nan" chemicals in chunk 25
Removing "nan" chemicals in chunk 26
Removing "nan" chemicals in chunk 27
Removing "

In [8]:
dsstox_df_clean.head(5)

Unnamed: 0,PREFERRED_NAME,IDENTIFIER
0,"3-Pyridinecarboxylic acid, 2-(3-(trifluorometh...",2-[3-(Trifluoromethyl)phenoxy]pyridine-3-carbo...
1,"Octadecanoic acid, compd. with urea",Octadecanoic acid--urea (1/1) | C19H40N2O3 | O...
2,Furfuryl phenylacetate,(Furan-2-yl)methyl phenylacetate | C13H12O3 | ...
3,"Benzimidazole, 6-chloro-4-nitro-2-(trifluorome...",6-Chloro-4-nitro-2-(trifluoromethyl)-1H-benzim...
4,"Benzimidazole, 5-fluoro-2-(trifluoromethyl)-",6-Fluoro-2-(trifluoromethyl)-1H-benzimidazole ...
...,...,...
1072771,"1,5-Diphenyl-4-penten-1-one","1,5-Diphenylpent-4-en-1-one | C17H16O | 1,5-Di..."
1072772,N-(1-Methylethyl)-2-propanamine hydrochloride ...,N-(Propan-2-yl)propan-2-amine--hydrogen chlori...
1072773,2-Phenyl-4-(phenylmethylene)-5(4H)-oxazolone,"4-Benzylidene-2-phenyl-1,3-oxazol-5(4H)-one | ..."
1072774,4-(Hexadecylamino)-4-oxo-2-butenoic acid,4-(Hexadecylimino)-4-hydroxybut-2-enoic acid |...


In [10]:
# Remove rows that start with number

import re

def filter_starting_with_number(row):
    return not bool(re.match(r'^\d', row['PREFERRED_NAME']))

dsstox_df_clean = dsstox_df_clean[dsstox_df_clean.apply(filter_starting_with_number, axis=1)]

dsstox_df_clean.head(5)

Unnamed: 0,PREFERRED_NAME,IDENTIFIER
1,"Octadecanoic acid, compd. with urea",Octadecanoic acid--urea (1/1) | C19H40N2O3 | O...
2,Furfuryl phenylacetate,(Furan-2-yl)methyl phenylacetate | C13H12O3 | ...
3,"Benzimidazole, 6-chloro-4-nitro-2-(trifluorome...",6-Chloro-4-nitro-2-(trifluoromethyl)-1H-benzim...
4,"Benzimidazole, 5-fluoro-2-(trifluoromethyl)-",6-Fluoro-2-(trifluoromethyl)-1H-benzimidazole ...
5,"Benzimidazole, 5-bromo-2-(trifluoromethyl)-",6-Bromo-2-(trifluoromethyl)-1H-benzimidazole |...
...,...,...
1072756,"S-{2-[(1R,4S)-4-Methyl-2-oxocyclohexyl]-2-prop...","S-{2-[(1R,4S)-4-Methyl-2-oxocyclohexyl]propan-..."
1072761,(+)-Norcisapride,"4-Amino-5-chloro-2-methoxy-N-[(3S,4R)-3-methox..."
1072764,Bis(4-chlorobenzyl) ether,"1,1'-[Oxybis(methylene)]bis(4-chlorobenzene) |..."
1072767,(2S)-4-Carbamoyl-2-([[(9H-fluoren-9-yl)methoxy...,N-{[(9H-Fluoren-9-yl)methoxy](hydroxy)methylid...


In [11]:
# Remove rows with chemicals more than 50 characters long

def filter_name_length(row):
    return len(row['PREFERRED_NAME']) <= 50

dsstox_df_clean = dsstox_df_clean[dsstox_df_clean.apply(filter_name_length, axis=1)]

dsstox_df_clean.head(5)

Unnamed: 0,PREFERRED_NAME,IDENTIFIER
1,"Octadecanoic acid, compd. with urea",Octadecanoic acid--urea (1/1) | C19H40N2O3 | O...
2,Furfuryl phenylacetate,(Furan-2-yl)methyl phenylacetate | C13H12O3 | ...
4,"Benzimidazole, 5-fluoro-2-(trifluoromethyl)-",6-Fluoro-2-(trifluoromethyl)-1H-benzimidazole ...
5,"Benzimidazole, 5-bromo-2-(trifluoromethyl)-",6-Bromo-2-(trifluoromethyl)-1H-benzimidazole |...
6,"Benzimidazole-5-carbonitrile, 2-(trifluorometh...",2-(Trifluoromethyl)-1H-benzimidazole-6-carboni...
...,...,...
1072747,Sodium N-acetoacetylsulfanilate,Sodium 4-(3-oxobutanamido)benzene-1-sulfonate ...
1072749,(2E)-3-Methyl-2-heptene,(2E)-3-Methylhept-2-ene | C8H16 | (2E)-3-Methy...
1072750,Pentose,"Pentose | C5H10O5 | Pentose | 2,3,4,5-Tetrahyd..."
1072752,Homocarnosine,N-(4-Amino-1-hydroxybutylidene)-L-histidine | ...


In [14]:
dsstox_df_clean.to_csv('dsstox_lookup_table_v2.csv', index=False)

In [15]:
dsstox_df_clean = pd.read_csv('dsstox_lookup_table_v2.csv')

# MeSH Lookup + DSSTox

In [16]:
dsstox_dict = {}
chunk_count = 1

chunk_size = 10000  # Adjust based on available memory
for chunk in pd.read_csv('dsstox_lookup_table_v2.csv', chunksize=chunk_size):
    print('processing chunk', chunk_count)
    # Process each chunk
    for _, row in chunk.iterrows():
        preferred_name = row['PREFERRED_NAME'].strip().lower()  # Ensure lowercase for case-insensitive matching
        identifiers = set(map(str.strip, row['IDENTIFIER'].split(' | ')))  # Split identifiers into set of terms

        # Add to dsstox_dict or update if preferred_name already exists
        if preferred_name in dsstox_dict:
            dsstox_dict[preferred_name].update(identifiers)
        else:
            dsstox_dict[preferred_name] = identifiers
    chunk_count += 1

def extend_names(terms):
    # Split and lowercase mesh terms
    term_list = [term.strip() for term in terms.split(' | ')]
    official_name = term_list[0].lower()  # Lowercase for matching
    all_terms = set(term_list)  # Convert to set to avoid duplicates

    # Check if the official name exists in dsstox_dict
    if official_name in dsstox_dict:
        # Append identifiers from dsstox_dict while preserving the order
        all_terms.update(dsstox_dict[official_name])

    # Preserve the original official name at the beginning
    all_terms_ordered = [term_list[0]] + [term for term in all_terms if term.lower() != official_name]
    return ' | '.join(all_terms_ordered)

mesh_df['Names/Entry Terms'] = mesh_df['Names/Entry Terms'].apply(extend_names)

processing chunk 1
processing chunk 2
processing chunk 3
processing chunk 4
processing chunk 5
processing chunk 6
processing chunk 7
processing chunk 8
processing chunk 9
processing chunk 10
processing chunk 11
processing chunk 12
processing chunk 13
processing chunk 14
processing chunk 15
processing chunk 16
processing chunk 17
processing chunk 18
processing chunk 19
processing chunk 20
processing chunk 21
processing chunk 22
processing chunk 23
processing chunk 24
processing chunk 25
processing chunk 26
processing chunk 27
processing chunk 28


In [20]:
mesh_df.head(20)

Unnamed: 0,MeSH ID,Names/Entry Terms
0,D000001,"Calcimycin | 4-Benzoxazolecarboxylic acid, 5-(..."
1,D000002,Temefos | Difos | Temephos | Abate
2,D000003,Abattoirs | Slaughterhouses | Abattoir | Slaug...
3,D000004,Abbreviations as Topic | Acronyms as Topic
4,D000005,Abdomen | Abdomens
5,D000006,"Abdomen, Acute | Acute Abdomen | Acute Abdomen..."
6,D000007,"Abdominal Injuries | Injuries, Abdominal | Abd..."
7,D000008,"Abdominal Neoplasms | Neoplasm, Abdominal | Ne..."
8,D000009,"Abdominal Muscles | Muscles, Cremaster | Abdom..."
9,D000010,"Abducens Nerve | Nerve VIs, Cranial | Nervus A..."


In [19]:
mesh_df.iloc[0][1]

'Calcimycin | 4-Benzoxazolecarboxylic acid, 5-(methylamino)-2-[[(2R,3R,6S,8S,9R,11R)-3,9,11-trimethyl-8-[(1S)-1-methyl-2-oxo-2-(1H-pyrrol-2-yl)ethyl]-1,7-dioxaspiro[5.5]undec-2-yl]methyl]- | A-23187 | A 23187 | [6S-[6α(2S*,3S*),8β(R*),9β,11α]]-5-(Methylamino)-2-[[3,9,11-trimethyl-8-[1-methyl-2-oxo-2-(1H-pyrrol-2-yl)ethyl]-1,7-dioxaspiro[5,5]undec-2-yl]methyl]-4-benzoxazolecarboxylic acid | 4-Benzoxazolcarbonsaure, 5-(Methylamino)-2-[[3,9,11-trimethyl-8-[1-methyl-2-oxo-2-(1H-pyrrol-2-yl)ethyl]-1,7-dioxaspiro[5.5]undec-2-yl]methyl]-, [6S-[6α(2S*,3S*),8β(R*),9β,11α]]- | 4-Benzoxazolecarboxylic acid, 5-(methylamino)-2-[[3,9,11-trimethyl-8-[1-methyl-2-oxo-2-(1H-pyrrol-2-yl)ethyl]-1, | 5-(Methylamino)-2-({(2R,3R,6S,8S,9R,11R)-3,9,11-trimethyl-8-[(2S)-1-oxo-1-(1H-pyrrol-2-yl)propan-2-yl]-1,7-dioxaspiro[5.5]undecan-2-yl}methyl)-1,3-benzoxazole-4-carboxylic acid | C29H37N3O6 | Antibiotic A23187 | A23187 | 4-Benzoxazolecarboxylic acid, 5-(methylamino)-2-((3,9,11-trimethyl-8-(1-methyl-2-oxo-2-(1H

Calcimycin

Before:

"Calcimycin | Calcimycin | 4-Benzoxazolecarboxylic acid, 5-(methylamino)-2-((3,9,11-trimethyl-8-(1-methyl-2-oxo-2-(1H-pyrrol-2-yl)ethyl)-1,7-dioxaspiro(5.5)undec-2-yl)methyl)-, (6S-(6alpha(2S*,3S*),8beta(R*),9beta,11alpha))- | A-23187 | A 23187 | Antibiotic A23187 | A23187, Antibiotic | A23187"

After:

'Calcimycin | 4-Benzoxazolecarboxylic acid, 5-(methylamino)-2-[[(2R,3R,6S,8S,9R,11R)-3,9,11-trimethyl-8-[(1S)-1-methyl-2-oxo-2-(1H-pyrrol-2-yl)ethyl]-1,7-dioxaspiro[5.5]undec-2-yl]methyl]- | A-23187 | A 23187 | [6S-[6α(2S*,3S*),8β(R*),9β,11α]]-5-(Methylamino)-2-[[3,9,11-trimethyl-8-[1-methyl-2-oxo-2-(1H-pyrrol-2-yl)ethyl]-1,7-dioxaspiro[5,5]undec-2-yl]methyl]-4-benzoxazolecarboxylic acid | 4-Benzoxazolcarbonsaure, 5-(Methylamino)-2-[[3,9,11-trimethyl-8-[1-methyl-2-oxo-2-(1H-pyrrol-2-yl)ethyl]-1,7-dioxaspiro[5.5]undec-2-yl]methyl]-, [6S-[6α(2S*,3S*),8β(R*),9β,11α]]- | 4-Benzoxazolecarboxylic acid, 5-(methylamino)-2-[[3,9,11-trimethyl-8-[1-methyl-2-oxo-2-(1H-pyrrol-2-yl)ethyl]-1, | 5-(Methylamino)-2-({(2R,3R,6S,8S,9R,11R)-3,9,11-trimethyl-8-[(2S)-1-oxo-1-(1H-pyrrol-2-yl)propan-2-yl]-1,7-dioxaspiro[5.5]undecan-2-yl}methyl)-1,3-benzoxazole-4-carboxylic acid | C29H37N3O6 | Antibiotic A23187 | A23187 | 4-Benzoxazolecarboxylic acid, 5-(methylamino)-2-((3,9,11-trimethyl-8-(1-methyl-2-oxo-2-(1H-pyrrol-2-yl)ethyl)-1,7-dioxaspiro(5.5)undec-2-yl)methyl)-, (6S-(6alpha(2S*,3S*),8beta(R*),9beta,11alpha))- | 4-Benzoxazolecarboxylic acid, 5-(methylamino)-2-[[(2R,3R,8S,9R,11R)-3,9,11-trimethyl-8-[(1S)-1-methyl-2-oxo-2-(1H-pyrrol-2-yl)ethyl]-1,7-dioxaspiro[5.5]undec-2-yl]methyl]- | A23187, Antibiotic'

In [21]:
mesh_df.iloc[1][1]

'Temefos | Difos | Temephos | Abate'

In [22]:
mesh_df.to_csv('mesh_lookup_table_with_dsstox.csv', index=False)

# Checking Improvement

In [23]:
# Path for datasets

datapath = '../data/'

In [41]:
# Load datasets

df_train = pd.read_csv(f'{datapath}' + 'OfficialTrainingSet1.csv')
df_val = pd.read_csv(f'{datapath}' + 'OfficialValidationSet1.csv')
df_test = pd.read_csv(f'{datapath}' + 'OfficialTestSet1.csv')

print("Shape of train dataset:", df_train.shape)
print("Shape of validation dataset:", df_val.shape)
print("Shape of test dataset:", df_test.shape)

df_train.head(3)

Shape of train dataset: (500, 13)
Shape of validation dataset: (500, 13)
Shape of test dataset: (500, 13)


Unnamed: 0,article_code,title,abstract,chemicals,diseases,chemical_start_indices,chemical_end_indices,disease_start_indices,disease_end_indices,chemical_ids,disease_ids,CID_chemical,CID_disease
0,227508,Naloxone reverses the antihypertensive effect ...,"In unanesthetized, spontaneously hypertensive ...","['Naloxone', 'clonidine', 'clonidine', 'nalozo...","['hypertensive', 'hypotensive', 'hypertensive'...","['0', '49', '181', '244', '306', '354', '364',...","['8', '58', '190', '252', '322', '362', '372',...","['93', '274', '469', '750']","['105', '285', '481', '762']","['D009270', 'D003000', 'D003000', '-1', 'D0087...","['D006973', 'D007022', 'D006973', 'D006973']",['D008750'],['D007022']
1,354896,Lidocaine-induced cardiac asystole.,Intravenous administration of a single 50-mg b...,"['Lidocaine', 'lidocaine', 'lidocaine']","['cardiac asystole', 'depression', 'bradyarrhy...","['0', '90', '409']","['9', '99', '418']","['18', '142', '331']","['34', '152', '347']","['D008012', 'D008012', 'D008012']","['D006323', 'D003866', 'D001919']",['D008012'],['D006323']
2,435349,Suxamethonium infusion rate and observed fasci...,Suxamethonium chloride (Sch) was administered ...,"['Suxamethonium', 'Suxamethonium chloride', 'S...","['fasciculations', 'tetanic', 'Fasciculations'...","['0', '80', '104', '312']","['13', '102', '107', '315']","['41', '265', '395', '483', '523', '538', '561...","['55', '272', '409', '496', '536', '544', '568...","['D013390', 'D013390', 'D013390', 'D013390']","['D005207', 'D013746', 'D005207', 'D005207', '...",['D013390'],['D005207']


In [42]:
# Data transformation functions

def convert_col_to_list(string):
    """
    Converts all string columns that look like lists (col index 3 to end) into actual lists 
    """
    return ast.literal_eval(string)


def lowercase_cols(lst):
    """
    Converts chemicals and diseases column to lowercase
    """
    return [item.lower() for item in lst]


def map_cid_to_chemical_name(row):
    """
    Maps CID of chemical in the CID_chemical column into the actual name of the chemical
    """
    cid_chemicals = row['CID_chemical']
    chemical_ids = row['chemical_ids']
    chemicals = row['chemicals']
    
    chemical_names = []
    
    for cid in cid_chemicals:
        if cid in chemical_ids:
            idx = chemical_ids.index(cid)
            chemical_names.append(chemicals[idx])
        else:
            chemical_names.append('unknown')
    
    return chemical_names


def map_cid_to_disease_name(row):
    """
    Maps CID of disease in the CID_disease column into the actual name of the disease
    """
    cid_diseases = row['CID_disease']
    disease_ids = row['disease_ids']
    diseases = row['diseases']
    
    disease_names = []
    
    for cid in cid_diseases:
        if cid in disease_ids:
            idx = disease_ids.index(cid) 
            disease_names.append(diseases[idx]) 
        else:
            disease_names.append('unknown')
    
    return disease_names


# Function to handle "unknown" for chemical names
def map_cid_to_chemical_name_unknown(data):
    '''
    Addresses 'unknown' instances of CID_chemical_names caused by chemicals with pipe (|) notation
    '''
    chemical_id_map = {}
    for i, row in data.iterrows():
        for cid, chemical in zip(row['chemical_ids'], row['chemicals']):
            chemical_id_map[cid] = chemical
    
    # Function to map "unknown" to the correct chemical name if possible
    def resolve_unknown_chemical_name(cids):
        names = []
        for cid in cids:
            # Split combined IDs (separated by '|') and check for matches in the map
            split_ids = cid.split('|')
            name = ' | '.join([chemical_id_map.get(split_id, 'unknown') for split_id in split_ids])
            names.append(name)
        return names

    # Apply the function only to rows where CID_chemical_name has "unknown"
    data['CID_chemical_name'] = data.apply(lambda row: resolve_unknown_chemical_name(row['CID_chemical']) 
                                       if 'unknown' in row['CID_chemical_name'] else row['CID_chemical_name'], axis=1)
    return data

# Function to handle "Unknown" for disease names
def map_cid_to_disease_name_unknown(data):
    '''
    Addresses 'unknown' instances of CID_disease_names caused by diseases with pipe (|) notation
    '''
    disease_id_map = {}
    for i, row in data.iterrows():
        for cid, disease in zip(row['disease_ids'], row['diseases']):
            disease_id_map[cid] = disease
    
    # Function to map "unknown" to the correct disease name if possible
    def resolve_unknown_disease_name(cids):
        names = []
        for cid in cids:
            # Split combined IDs (separated by '|') and check for matches in the map
            split_ids = cid.split('|')
            name = ' | '.join([disease_id_map.get(split_id, 'unknown') for split_id in split_ids])
            names.append(name)
        return names

    # Apply the function only to rows where CID_disease_name has "Unknown"
    data['CID_disease_name'] = data.apply(lambda row: resolve_unknown_disease_name(row['CID_disease']) 
                                      if 'unknown' in row['CID_disease_name'] else row['CID_disease_name'], axis=1)
    return data

In [43]:
# Apply the data transformations functions to all three datasets

list_columns = ['chemicals', 'diseases', 'chemical_ids', 'disease_ids', 'CID_chemical', 'CID_disease']
for col in list_columns:
    df_train[col] = df_train[col].apply(convert_col_to_list) 
    df_val[col] = df_val[col].apply(convert_col_to_list) 
    df_test[col] = df_test[col].apply(convert_col_to_list) 

df_train['chemicals'] = df_train['chemicals'].apply(lowercase_cols)
df_train['diseases'] = df_train['diseases'].apply(lowercase_cols)
df_val['chemicals'] = df_val['chemicals'].apply(lowercase_cols)
df_val['diseases'] = df_val['diseases'].apply(lowercase_cols)
df_test['chemicals'] = df_test['chemicals'].apply(lowercase_cols)
df_test['diseases'] = df_test['diseases'].apply(lowercase_cols)

df_train['CID_chemical_name'] = df_train.apply(map_cid_to_chemical_name, axis=1)
df_train['CID_disease_name'] = df_train.apply(map_cid_to_disease_name, axis=1)
df_val['CID_chemical_name'] = df_val.apply(map_cid_to_chemical_name, axis=1)
df_val['CID_disease_name'] = df_val.apply(map_cid_to_disease_name, axis=1)
df_test['CID_chemical_name'] = df_test.apply(map_cid_to_chemical_name, axis=1)
df_test['CID_disease_name'] = df_test.apply(map_cid_to_disease_name, axis=1)

df_train = map_cid_to_chemical_name_unknown(df_train)
df_train = map_cid_to_disease_name_unknown(df_train)
df_val = map_cid_to_chemical_name_unknown(df_val)
df_val = map_cid_to_disease_name_unknown(df_val)
df_test = map_cid_to_chemical_name_unknown(df_test)
df_test = map_cid_to_disease_name_unknown(df_test)

df_train.head(3)

Unnamed: 0,article_code,title,abstract,chemicals,diseases,chemical_start_indices,chemical_end_indices,disease_start_indices,disease_end_indices,chemical_ids,disease_ids,CID_chemical,CID_disease,CID_chemical_name,CID_disease_name
0,227508,Naloxone reverses the antihypertensive effect ...,"In unanesthetized, spontaneously hypertensive ...","[naloxone, clonidine, clonidine, nalozone, alp...","[hypertensive, hypotensive, hypertensive, hype...","['0', '49', '181', '244', '306', '354', '364',...","['8', '58', '190', '252', '322', '362', '372',...","['93', '274', '469', '750']","['105', '285', '481', '762']","[D009270, D003000, D003000, -1, D008750, D0092...","[D006973, D007022, D006973, D006973]",[D008750],[D007022],[alpha-methyldopa],[hypotensive]
1,354896,Lidocaine-induced cardiac asystole.,Intravenous administration of a single 50-mg b...,"[lidocaine, lidocaine, lidocaine]","[cardiac asystole, depression, bradyarrhythmias]","['0', '90', '409']","['9', '99', '418']","['18', '142', '331']","['34', '152', '347']","[D008012, D008012, D008012]","[D006323, D003866, D001919]",[D008012],[D006323],[lidocaine],[cardiac asystole]
2,435349,Suxamethonium infusion rate and observed fasci...,Suxamethonium chloride (Sch) was administered ...,"[suxamethonium, suxamethonium chloride, sch, sch]","[fasciculations, tetanic, fasciculations, fasc...","['0', '80', '104', '312']","['13', '102', '107', '315']","['41', '265', '395', '483', '523', '538', '561...","['55', '272', '409', '496', '536', '544', '568...","[D013390, D013390, D013390, D013390]","[D005207, D013746, D005207, D005207, D005207, ...",[D013390],[D005207],[suxamethonium],[fasciculations]


## MeSH only table

In [29]:
mesh_lookup_df = pd.read_csv('mesh_lookup_table.csv')

In [32]:
# MeSH Lookup Table reversed so it finds MeSH ID from entity

mesh_lookup_expanded = mesh_lookup_df.assign(names=mesh_lookup_df['Names/Entry Terms'].str.split('|')).explode('names')
mesh_lookup_expanded['names'] = mesh_lookup_expanded['names'].str.strip().str.lower()
term_to_mesh = pd.Series(mesh_lookup_expanded['MeSH ID'].values, index=mesh_lookup_expanded['names']).to_dict()

In [31]:
# Function to map terms to MeSH IDs
def get_mesh_id(terms, lookup):
    mesh_ids = []
    for term in terms:
        term = term.lower()
        mesh_id = lookup.get(term, '-1') # Returning -1 if no match is found
        mesh_ids.append(mesh_id)
    return mesh_ids

In [33]:
# Apply the function to the chemicals and diseases columns to get their MeSH IDs
df_train['mapped_chemical_ids'] = df_train['chemicals'].apply(lambda x: get_mesh_id(x, term_to_mesh))
df_train['mapped_disease_ids'] = df_train['diseases'].apply(lambda x: get_mesh_id(x, term_to_mesh))

In [35]:
def get_mismatched_chemicals(row):
    mismatches = []
    for chemical, mapped_id in zip(row['chemicals'], row['mapped_chemical_ids']):
        if mapped_id == '-1':
            mismatches.append(chemical)
    return mismatches

df_train['mismatched_chemicals'] = df_train.apply(get_mismatched_chemicals, axis=1)

#mismatch_individual_chemicals = df_train[df_train['mismatched_chemicals'].apply(lambda x: len(x) > 0)]
#mismatch_individual_chemicals[['chemicals', 'chemical_ids', 'mapped_chemical_ids', 'mismatched_chemicals']]

df_train[['chemicals', 'chemical_ids', 'mapped_chemical_ids', 'mismatched_chemicals']]

Unnamed: 0,chemicals,chemical_ids,mapped_chemical_ids,mismatched_chemicals
0,"[naloxone, clonidine, clonidine, nalozone, alp...","[D009270, D003000, D003000, -1, D008750, D0092...","[D009270, D003000, D003000, -1, D008750, D0092...","[nalozone, 3h-naloxone, 3h-dihydroergocryptine]"
1,"[lidocaine, lidocaine, lidocaine]","[D008012, D008012, D008012]","[D008012, D008012, D008012]",[]
2,"[suxamethonium, suxamethonium chloride, sch, sch]","[D013390, D013390, D013390, D013390]","[D013390, D013390, -1, -1]","[sch, sch]"
3,"[galanthamine hydrobromide, scopolamine, hyosc...","[D005702, D012601, D012601, D005702, D012601, ...","[D005702, D012601, D012601, D005702, D012601, ...",[]
4,"[lithium, lithium, lithium, lithium, lithium, ...","[D008094, D008094, D008094, D008094, D008094, ...","[D008094, D008094, D008094, D008094, D008094, ...","[li, li, li]"
...,...,...,...,...
495,"[zonisamide, zonisamide, zonisamide, zonisamid...","[C022189, C022189, C022189, C022189, C022189]","[D000078305, D000078305, D000078305, D00007830...",[]
496,"[tyrosine, pan, tyrosine, puromycin aminonucle...","[D014443, D011692, D014443, D011692, D011692, ...","[D014443, -1, D014443, D011692, D011692, -1, -...","[pan, pan, pan, pan]"
497,"[ticlopidine, ticlopidine, ticlopidine, ticlop...","[D013988, D013988, D013988, D013988, D002118]","[D013988, D013988, D013988, D013988, D002118]",[]
498,"[morphine, scopolamine, cycloheximide, morphin...","[D009020, D012601, D003513, D009020, D009020, ...","[D009020, D012601, D003513, D009020, D009020, ...",[]


In [36]:
def get_mismatched_diseases(row):
    mismatches = []
    for disease, mapped_id in zip(row['diseases'], row['mapped_disease_ids']):
        if mapped_id == '-1':
            mismatches.append(disease)
    return mismatches

df_train['mismatched_diseases'] = df_train.apply(get_mismatched_diseases, axis=1)

#mismatch_individual_diseases = df_train[df_train['mismatched_diseases'].apply(lambda x: len(x) > 0)]
#mismatch_individual_diseases[['diseases', 'disease_ids', 'mapped_disease_ids', 'mismatched_diseases']]

df_train[['diseases', 'disease_ids', 'mapped_disease_ids', 'mismatched_diseases']]

Unnamed: 0,diseases,disease_ids,mapped_disease_ids,mismatched_diseases
0,"[hypertensive, hypotensive, hypertensive, hype...","[D006973, D007022, D006973, D006973]","[-1, -1, -1, -1]","[hypertensive, hypotensive, hypertensive, hype..."
1,"[cardiac asystole, depression, bradyarrhythmias]","[D006323, D003866, D001919]","[-1, D003863, D001919]",[cardiac asystole]
2,"[fasciculations, tetanic, fasciculations, fasc...","[D005207, D013746, D005207, D005207, D005207, ...","[D005207, -1, D005207, D005207, D005207, -1, D...","[tetanic, twitch]"
3,[overdosage],[D062787],[-1],[overdosage]
4,"[chronic renal failure, nephropathy, renal fai...","[D007676, D007674, D051437, D011507, D006973, ...","[D007676, -1, D051437, D011507, D006973, -1, D...","[nephropathy, glomerulosclerosis, nephropathy]"
...,...,...,...,...
495,"[visual hallucinations, seizures, visual hallu...","[D006212, D012640, D006212, D006212, D004827, ...","[D006212, D012640, D006212, D006212, D004827, ...",[]
496,"[nephrosis, glomerular injury, nephrosis, prot...","[D009401, D007674, D009401, D011507, D011507, ...","[D009401, -1, D009401, D011507, D011507, -1]","[glomerular injury, glomerulosclerosis]"
497,"[aplastic anemia, aplastic anemia, agranulocyt...","[D000741, D000741, D000380, D001855, D000741]","[D000741, D000741, D000380, -1, D000741]",[bone marrow suppression]
498,"[amnesia, amnesia, amnesia]","[D000647, D000647, D000647]","[D000647, D000647, D000647]",[]


In [38]:
def calculate_unmatched_percentage(mapped_ids_column):
    # Flatten the list of lists for easier counting
    flat_mapped_ids = [item for sublist in mapped_ids_column for item in sublist]
    
    # Total number of IDs
    total_count = len(flat_mapped_ids)
    
    # Count how many are '-1'
    unmatched_count = flat_mapped_ids.count('-1')
    
    # Calculate the percentage
    if total_count == 0:
        return 0  # Prevent division by zero
    return (unmatched_count / total_count) * 100

chemical_unmatched_percentage = calculate_unmatched_percentage(df_train['mapped_chemical_ids'])

print(f"Percentage of mappings (not '-1') among all individual chemicals: {100 - chemical_unmatched_percentage:.2f}%")

disease_unmatched_percentage = calculate_unmatched_percentage(df_train['mapped_disease_ids'])

print(f"Percentage of mappings (not '-1') among all individual diseases: {100 - disease_unmatched_percentage:.2f}%")

Percentage of mappings (not '-1') among all individual chemicals: 75.43%
Percentage of mappings (not '-1') among all individual diseases: 61.97%


## MeSH + DSSTox

In [40]:
mesh_lookup_df = pd.read_csv('mesh_lookup_table_with_dsstox.csv')

In [44]:
mesh_lookup_expanded = mesh_lookup_df.assign(names=mesh_lookup_df['Names/Entry Terms'].str.split('|')).explode('names')
mesh_lookup_expanded['names'] = mesh_lookup_expanded['names'].str.strip().str.lower()
term_to_mesh = pd.Series(mesh_lookup_expanded['MeSH ID'].values, index=mesh_lookup_expanded['names']).to_dict()

df_train['mapped_chemical_ids'] = df_train['chemicals'].apply(lambda x: get_mesh_id(x, term_to_mesh))
df_train['mapped_disease_ids'] = df_train['diseases'].apply(lambda x: get_mesh_id(x, term_to_mesh))

df_train['mismatched_chemicals'] = df_train.apply(get_mismatched_chemicals, axis=1)
df_train['mismatched_diseases'] = df_train.apply(get_mismatched_diseases, axis=1)

In [45]:
df_train[['chemicals', 'chemical_ids', 'mapped_chemical_ids', 'mismatched_chemicals']]

Unnamed: 0,chemicals,chemical_ids,mapped_chemical_ids,mismatched_chemicals
0,"[naloxone, clonidine, clonidine, nalozone, alp...","[D009270, D003000, D003000, -1, D008750, D0092...","[D009270, D003000, D003000, -1, D008750, D0092...","[nalozone, 3h-naloxone, 3h-dihydroergocryptine]"
1,"[lidocaine, lidocaine, lidocaine]","[D008012, D008012, D008012]","[D008012, D008012, D008012]",[]
2,"[suxamethonium, suxamethonium chloride, sch, sch]","[D013390, D013390, D013390, D013390]","[D013390, D013390, -1, -1]","[sch, sch]"
3,"[galanthamine hydrobromide, scopolamine, hyosc...","[D005702, D012601, D012601, D005702, D012601, ...","[D005702, D012601, D012601, D005702, D012601, ...",[]
4,"[lithium, lithium, lithium, lithium, lithium, ...","[D008094, D008094, D008094, D008094, D008094, ...","[D008094, D008094, D008094, D008094, D008094, ...",[]
...,...,...,...,...
495,"[zonisamide, zonisamide, zonisamide, zonisamid...","[C022189, C022189, C022189, C022189, C022189]","[D000078305, D000078305, D000078305, D00007830...",[]
496,"[tyrosine, pan, tyrosine, puromycin aminonucle...","[D014443, D011692, D014443, D011692, D011692, ...","[D014443, -1, D014443, D011692, D011692, -1, -...","[pan, pan, pan, pan]"
497,"[ticlopidine, ticlopidine, ticlopidine, ticlop...","[D013988, D013988, D013988, D013988, D002118]","[D013988, D013988, D013988, D013988, D002118]",[]
498,"[morphine, scopolamine, cycloheximide, morphin...","[D009020, D012601, D003513, D009020, D009020, ...","[D009020, D012601, D003513, D009020, D009020, ...",[]


In [46]:
df_train[['diseases', 'disease_ids', 'mapped_disease_ids', 'mismatched_diseases']]

Unnamed: 0,diseases,disease_ids,mapped_disease_ids,mismatched_diseases
0,"[hypertensive, hypotensive, hypertensive, hype...","[D006973, D007022, D006973, D006973]","[-1, -1, -1, -1]","[hypertensive, hypotensive, hypertensive, hype..."
1,"[cardiac asystole, depression, bradyarrhythmias]","[D006323, D003866, D001919]","[-1, D003863, D001919]",[cardiac asystole]
2,"[fasciculations, tetanic, fasciculations, fasc...","[D005207, D013746, D005207, D005207, D005207, ...","[D005207, -1, D005207, D005207, D005207, -1, D...","[tetanic, twitch]"
3,[overdosage],[D062787],[-1],[overdosage]
4,"[chronic renal failure, nephropathy, renal fai...","[D007676, D007674, D051437, D011507, D006973, ...","[D007676, -1, D051437, D011507, D006973, -1, D...","[nephropathy, glomerulosclerosis, nephropathy]"
...,...,...,...,...
495,"[visual hallucinations, seizures, visual hallu...","[D006212, D012640, D006212, D006212, D004827, ...","[D006212, D012640, D006212, D006212, D004827, ...",[]
496,"[nephrosis, glomerular injury, nephrosis, prot...","[D009401, D007674, D009401, D011507, D011507, ...","[D009401, -1, D009401, D011507, D011507, -1]","[glomerular injury, glomerulosclerosis]"
497,"[aplastic anemia, aplastic anemia, agranulocyt...","[D000741, D000741, D000380, D001855, D000741]","[D000741, D000741, D000380, -1, D000741]",[bone marrow suppression]
498,"[amnesia, amnesia, amnesia]","[D000647, D000647, D000647]","[D000647, D000647, D000647]",[]


In [47]:
chemical_unmatched_percentage = calculate_unmatched_percentage(df_train['mapped_chemical_ids'])

print(f"Percentage of mappings (not '-1') among all individual chemicals: {100 - chemical_unmatched_percentage:.2f}%")

disease_unmatched_percentage = calculate_unmatched_percentage(df_train['mapped_disease_ids'])

print(f"Percentage of mappings (not '-1') among all individual diseases: {100 - disease_unmatched_percentage:.2f}%")

Percentage of mappings (not '-1') among all individual chemicals: 79.75%
Percentage of mappings (not '-1') among all individual diseases: 62.90%
