In [1]:
import pandas as pd
from xml.etree import ElementTree as ET
from tqdm import tqdm 
from collections import defaultdict 
import json

In [2]:
# Path of the XML export to read (relative to the notebook's working directory).
xml_file = "./data/full database.xml"

# Parse the whole document into memory and keep a handle on its root element;
# the extraction cell iterates over the root's direct children.
tree = ET.parse(source=xml_file)
root = tree.getroot()

In [3]:
# XML namespace prefix (Clark notation) applied to every tag looked up below.
ns = '{http://www.drugbank.ca}'

# One row per (name, drug record): (name, dbid, description, indication, smiles,
# absorption, distribution, metabolism, excretion, toxicity).
drug_info_list = []
# Lower-cased name -> every lower-cased name that shares a drug record with it
# (the name itself included).
name_synonyms = defaultdict(list)

# Iterate the root directly (no enumerate): the index was never used, and tqdm
# can read len(root) to display a total and an ETA.
for drug in tqdm(root):
    # findtext() returns None for a missing element and Element.text is None for
    # an empty one, so fall back to "" before normalising.
    primary_name = (drug.findtext(ns + "name") or "").strip().lower()
    synonyms_obj = drug.findall(f"{ns}synonyms/{ns}synonym[@language='english']")
    synonyms = [(synonym.text or "").strip().lower() for synonym in synonyms_obj]

    name_set = set(synonyms)
    name_set.add(primary_name)
    name_set.discard("")  # blank names carry no information

    # Sort so that row order and synonym order are reproducible between runs
    # (plain set iteration order depends on per-process string hashing).
    names = sorted(name_set)

    for alias in names:
        known = name_synonyms[alias]
        # A name can occur in several drug records; append only unseen entries
        # so the stored lists stay free of duplicates.
        known.extend([other for other in names if other not in known])

    dbid = drug.findtext(ns + "drugbank-id[@primary='true']")

    description = drug.findtext(f"{ns}description")
    indication = drug.findtext(f"{ns}indication")
    smiles = drug.findtext(f"{ns}calculated-properties/{ns}property[{ns}kind='SMILES']/{ns}value")

    absorption = drug.findtext(f"{ns}absorption")
    distribution = drug.findtext(f"{ns}volume-of-distribution")
    metabolism = drug.findtext(f"{ns}metabolism")
    excretion = drug.findtext(f"{ns}route-of-elimination")
    toxicity = drug.findtext(f"{ns}toxicity")

    info_tuple = (dbid, description, indication, smiles, absorption, distribution, metabolism, excretion, toxicity)

    # Emit the same record once under each of its names.
    drug_info_list.extend((alias, *info_tuple) for alias in names)

15235it [00:13, 1154.45it/s]


In [4]:
# Persist the synonym map. The context manager guarantees the handle is flushed
# and closed even if serialisation raises, instead of relying on garbage
# collection of an anonymous file object.
with open("data/name_synonyms.json", "w") as synonyms_file:
    json.dump(name_synonyms, synonyms_file)

# Build the flat per-name table and write it tab-separated.
drug_columns = ["name", "dbid", "description", "indication", "smiles", "absorption", "distribution", "metabolism", "excretion", "toxicity"]
drug_df = pd.DataFrame(drug_info_list, columns=drug_columns)
drug_df.to_csv("data/drugbank.csv", index=False, sep='\t')