In [None]:
import xml.etree.ElementTree as ET

tree = ET.parse("../full database.xml")
root = tree.getroot()

print(root.tag)
print(len(root))


In [8]:
ns = {"db": "http://www.drugbank.ca"}


In [10]:
drug = root[0]

for child in drug:
    print(child.tag.split("}")[-1])



drugbank-id
drugbank-id
drugbank-id
name
description
cas-number
unii
state
groups
general-references
synthesis-reference
indication
pharmacodynamics
mechanism-of-action
toxicity
metabolism
absorption
half-life
protein-binding
route-of-elimination
volume-of-distribution
clearance
classification
salts
synonyms
products
international-brands
mixtures
packagers
manufacturers
prices
categories
affected-organisms
dosages
atc-codes
ahfs-codes
pdb-entries
patents
food-interactions
drug-interactions
sequences
experimental-properties
external-identifiers
external-links
pathways
reactions
snp-effects
snp-adverse-drug-reactions
targets
enzymes
carriers
transporters


In [11]:
def get_drugbank_id(drug):
    for dbid in drug.findall("db:drugbank-id", ns):
        if dbid.attrib.get("primary") == "true":
            return dbid.text
    return None



In [12]:
def get_name(drug):
    elem = drug.find("db:name", ns)
    return elem.text if elem is not None else None


In [13]:
def is_approved(drug):
    groups = drug.find("db:groups", ns)
    if groups is None:
        return False
    return any(g.text == "approved" for g in groups.findall("db:group", ns))


In [19]:
# Simplified Molecular Input Line Entry System
def get_smiles(drug):
    props = drug.find("db:calculated-properties", ns)
    if props is None:
        return None

    for prop in props.findall("db:property", ns):
        kind = prop.find("db:kind", ns)
        value = prop.find("db:value", ns)
        if kind is not None and kind.text == "SMILES":
            return value.text
    return None


In [15]:
def get_targets(drug):
    genes = []
    targets = drug.find("db:targets", ns)
    if targets is None:
        return genes

    for target in targets.findall("db:target", ns):
        gene = target.find("db:gene-name", ns)
        if gene is not None:
            genes.append(gene.text)
    return genes


In [16]:
import pandas as pd

rows = []

for drug in root.findall("db:drug", ns):
    if not is_approved(drug):
        continue

    rows.append({
        "drugbank_id": get_drugbank_id(drug),
        "name": get_name(drug),
        "smiles": get_smiles(drug),
        "targets": get_targets(drug)
    })

df = pd.DataFrame(rows)
print(df.head())


  drugbank_id                 name smiles targets
0     DB00001            Lepirudin   None      []
1     DB00002            Cetuximab   None      []
2     DB00003         Dornase alfa   None      []
3     DB00004  Denileukin diftitox   None      []
4     DB00005           Etanercept   None      []


In [17]:
def is_small_molecule(drug):
    return drug.attrib.get("type") == "small molecule"


In [18]:
rows = []

for drug in root.findall("db:drug", ns):
    if not is_approved(drug):
        continue
    if not is_small_molecule(drug):
        continue

    rows.append({
        "drugbank_id": get_drugbank_id(drug),
        "name": get_name(drug),
        "smiles": get_smiles(drug),
        "targets": get_targets(drug)
    })

df = pd.DataFrame(rows)
print(df.head())


  drugbank_id          name  \
0     DB00006   Bivalirudin   
1     DB00014     Goserelin   
2     DB00027  Gramicidin D   
3     DB00035  Desmopressin   
4     DB00050    Cetrorelix   

                                              smiles targets  
0  CC[C@H](C)[C@H](NC(=O)[C@H](CCC(O)=O)NC(=O)[C@...      []  
1  CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...      []  
2  CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...      []  
3  NC(=O)CC[C@@H]1NC(=O)[C@H](CC2=CC=CC=C2)NC(=O)...      []  
4  CC(C)C[C@H](NC(=O)[C@@H](CCCNC(N)=O)NC(=O)[C@H...      []  


In [20]:
df.shape

(3018, 4)