# DrugBank XML Preprocessing
<br>

### Creates TSV (tab-separated values) tables + 1 JSON file (for synonyms/aliases)

- aliases.json
- drugbank.tsv
- drugbank-slim.tsv
- drugbank-indication.tsv
- proteins.tsv
<br>

### Revision History
- Updated the synonym code to catch missing synonyms
- Updated the drug selection to include ALL small-molecule drugs (not only approved ones)



In [None]:
import os
import csv
import gzip
import collections
import re
import io
import json
import xml.etree.ElementTree as ET
import numpy as np

import requests
import pandas as pd
import xmltodict


In [None]:
# Resolve the DrugBank input/output locations relative to the working directory.
print(os.getcwd())
FDAPath = '{}/../data/source/'.format(os.getcwd())
DBPath = '{}DrugBank/'.format(FDAPath)

# `download` is the folder holding the DrugBank XML; `savepath` is the folder
# the generated tables are written to by the cells below.
download = '{}drugbank_all_full_database'.format(DBPath)
savepath = '{}DrugBank_CSVs'.format(DBPath)

# The XML file is expected under the name drugbank.xml inside `download`.
xml_path = os.path.join(download, 'drugbank.xml')
print(xml_path)

# Parse the whole document up front; `root` is its top-level element.
with open(xml_path, encoding="utf8") as xml_file:
    tree = ET.parse(xml_file)
root = tree.getroot()

In [None]:
ns = '{http://www.drugbank.ca}'
inchikey_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChIKey']/{ns}value"
inchi_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChI']/{ns}value"

# Namespace-qualified ElementPath queries, expanded once instead of per drug.
group_query = "{ns}groups/{ns}group".format(ns=ns)
atc_query = "{ns}atc-codes/{ns}atc-code".format(ns=ns)
category_query = "{ns}categories/{ns}category".format(ns=ns)
brand_query = "{ns}international-brands/{ns}international-brand".format(ns=ns)
english_synonym_query = "{ns}synonyms/{ns}synonym[@language='English']".format(ns=ns)
product_name_query = "{ns}products/{ns}product/{ns}name".format(ns=ns)
inchi_query = inchi_template.format(ns=ns)
inchikey_query = inchikey_template.format(ns=ns)

# Build one ordered record per child of the XML root.
rows = []
for drug in root:
    # Every child of the root element is expected to be a <drug> record.
    assert drug.tag == ns + 'drug'

    name = drug.findtext(ns + "name")

    # Alias candidates: the text of each international-brand element, each
    # English synonym and each product name, plus the drug's own name.
    alias_elements = (
        drug.findall(brand_query)
        + drug.findall(english_synonym_query)
        + drug.findall(product_name_query)
    )
    alias_names = {element.text for element in alias_elements}
    alias_names.add(name)

    row = collections.OrderedDict([
        ('type', drug.get('type')),
        ('drugbank_id', drug.findtext(ns + "drugbank-id[@primary='true']")),
        ('name', name),
        ('description', drug.findtext(ns + "description")),
        ('indication', drug.findtext(ns + 'indication')),
        ('groups', [group.text for group in drug.findall(group_query)]),
        ('atc_codes', [code.get('code') for code in drug.findall(atc_query)]),
        ('categories', [entry.findtext(ns + 'category')
                        for entry in drug.findall(category_query)]),
        ('inchi', drug.findtext(inchi_query)),
        ('inchikey', drug.findtext(inchikey_query)),
        ('aliases', sorted(alias_names)),
    ])
    rows.append(row)

In [None]:
# alternate parsing - 20210506 rkc
#
# Second pass over the XML that builds `newrows`. It reads brand names from
# international-brand/name, keeps synonyms in every language, and records the
# calculated SMILES string of each drug.

ns = '{http://www.drugbank.ca}'
inchikey_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChIKey']/{ns}value"
inchi_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChI']/{ns}value"
# `kind` is matched as a child element of <property>, exactly like the InChI
# templates above. An attribute test ([@kind='SMILES']) never matches because
# the property kind is not stored as an attribute.
smiles_template = "{ns}calculated-properties/{ns}property[{ns}kind='SMILES']/{ns}value"

newrows = list()
for drugnew in root:
    row = collections.OrderedDict()
    # Every child of the root element is expected to be a <drug> record.
    assert drugnew.tag == ns + 'drug'
    row['type'] = drugnew.get('type')
    row['drugbank_id'] = drugnew.findtext(ns + "drugbank-id[@primary='true']")
    row['name'] = drugnew.findtext(ns + "name")
    row['description'] = drugnew.findtext(ns + "description")
    row['indication'] = drugnew.findtext(ns+'indication')
    row['groups'] = [group.text for group in
        drugnew.findall("{ns}groups/{ns}group".format(ns = ns))]
    row['atc_codes'] = [code.get('code') for code in
        drugnew.findall("{ns}atc-codes/{ns}atc-code".format(ns = ns))]
    row['categories'] = [x.findtext(ns + 'category') for x in
        drugnew.findall("{ns}categories/{ns}category".format(ns = ns))]
    row['inchi'] = drugnew.findtext(inchi_template.format(ns = ns))
    row['inchikey'] = drugnew.findtext(inchikey_template.format(ns = ns))

    # Calculated SMILES string, or None when the drug has no such property.
    row['SMILES'] = drugnew.findtext(smiles_template.format(ns = ns))

    # Add drug aliases
    alias_elements = (
        # /{ns}name selects the brand name itself rather than its container
        drugnew.findall("{ns}international-brands/{ns}international-brand/{ns}name".format(ns = ns)) +
        # synonyms in every language (no English-only restriction)
        drugnew.findall("{ns}synonyms/{ns}synonym".format(ns = ns)) +
        drugnew.findall("{ns}products/{ns}product/{ns}name".format(ns = ns))
    )
    # Elements without text are skipped: a None entry would make sorted() fail.
    aliases = {elem.text for elem in alias_elements if elem.text}
    if row['name']:
        aliases.add(row['name'])
    row['aliases'] = sorted(aliases)

    newrows.append(row)

# Tabulate the records from `newrows`, then report the Python class of the
# frame and of four of its columns.
drugbanknew_df = pd.DataFrame.from_dict(newrows)

pd.options.display.max_colwidth = 500
print("drugbanknew_df class is: ", type(drugbanknew_df))
for message, column in (
    ("drugbanknew_df.groups class is: ", drugbanknew_df.groups),
    ("drugbanknew_df.atc_codes class is: ", drugbanknew_df.atc_codes),
    ("drugbanknew_df.categories class is: ", drugbanknew_df.categories),
    ("drugbanknew_df.aliases class is: ", drugbanknew_df.aliases),
):
    print(message, type(column))
print(drugbanknew_df.shape)

# Show the first two records and, separately, the record at position 5.
display(drugbanknew_df.iloc[0:2])
display(drugbanknew_df.iloc[5])

# Count drugs per type, keep only the small molecules, then count again.
type_counts = drugbanknew_df['type'].value_counts()
display(pd.DataFrame(type_counts))
is_small_molecule = drugbanknew_df['type'] == 'small molecule'
drugbanknew_df = drugbanknew_df[is_small_molecule]
display(pd.DataFrame(drugbanknew_df['type'].value_counts()))


In [None]:

# Build a table of drugs that have an experimental logP, together with their
# calculated SMILES and calculated logP, from an xmltodict view of the XML.


def _as_list(node):
    """Return *node* as a list: [] for None, [node] for a single item.

    xmltodict yields a list for repeated elements but a bare item for a
    lone element, so both shapes are normalised here.
    """
    if node is None:
        return []
    if isinstance(node, list):
        return node
    return [node]


def _primary_drugbank_id(id_node):
    """Return the text of the drugbank-id entry flagged primary="true", or None.

    Entries carrying attributes are dicts ('@primary', '#text', using
    xmltodict's default key prefixes); entries without attributes are plain
    strings and are ignored.
    """
    for entry in _as_list(id_node):
        if isinstance(entry, dict) and entry.get('@primary') == 'true':
            return entry.get('#text')
    return None


def _property_value(container, kind):
    """Return the value of the first property of *kind* in *container*, or None.

    *container* is an experimental-properties / calculated-properties node;
    it may be None (empty element) or hold one or many 'property' entries.
    """
    if not isinstance(container, dict):
        return None
    for prop in _as_list(container.get('property')):
        if isinstance(prop, dict) and prop.get('kind') == kind:
            return prop.get('value')
    return None


with open(xml_path, encoding="utf8") as db:
    doc = xmltodict.parse(db.read())

columns = ["DrugBankID", "SMILES", "expLogP", "calcLogP"]
values = []
for item in _as_list(doc['drugbank']['drug']):
    # Only drugs with an experimental logP are recorded.
    exp_logp = _property_value(item.get('experimental-properties'), 'logP')
    if exp_logp is None:
        continue

    drug_id = _primary_drugbank_id(item.get('drugbank-id'))
    if drug_id is None:
        # No primary identifier to key the record on.
        continue

    # Looked up per drug, so a missing SMILES or calculated logP is stored as
    # None instead of silently reusing the previous drug's value.
    calc = item.get('calculated-properties')
    values.append((
        drug_id,
        _property_value(calc, 'SMILES'),
        exp_logp,
        _property_value(calc, 'logP'),
    ))

# Naming the columns in the constructor also works when `values` is empty.
values_df = pd.DataFrame(values, columns=columns)


In [None]:
# (rows, columns) of the logP table assembled in the previous cell.
values_df.shape


In [None]:
# Write the drugbank_id -> alias-list mapping to aliases.json.
aliasoutf = '/'.join((savepath, 'aliases.json'))
print(aliasoutf)

# The mapping is taken from newrows (the alternate parsing pass), not rows.
alias_dict = {entry['drugbank_id']: entry['aliases'] for entry in newrows}
print(len(alias_dict))
print(type(alias_dict))

with open(aliasoutf, 'w') as fp:
    fp.write(json.dumps(alias_dict, indent=2, sort_keys=True))

In [None]:
import itertools

# Preview the first five (drugbank_id, aliases) entries of the mapping.
first_five = itertools.islice(alias_dict.items(), 5)
out = {drug_id: names for drug_id, names in first_five}
display(out)

In [None]:
def collapse_list_values(row):
    """Replace every list value in *row* with a '|'-joined string, in place.

    None entries inside a list are skipped and other entries are converted
    with str(), so a missing XML text or attribute no longer makes the join
    raise TypeError. Values that are not lists are left untouched.

    Returns the same mapping object that was passed in.
    """
    for key, value in row.items():
        if isinstance(value, list):
            # Only existing keys are reassigned, so the mapping keeps its
            # size while it is being iterated.
            row[key] = '|'.join(str(item) for item in value if item is not None)
    return row

# Flatten the list-valued fields of every record into '|'-separated strings.
rows = [collapse_list_values(record) for record in rows]

In [None]:
# Show the container type of `rows` and its first two records.
display(type(rows))
display(rows[0:2])



In [None]:
# Keep only the identifier, name and indication text of each drug.
columns = ['drugbank_id', 'name', 'indication']
drugbank_df = pd.DataFrame.from_dict(rows).loc[:, columns]
drugbank_df.head(n=10)

In [None]:
# Save the indication table as a tab-separated file without the index column.
path = os.path.join(savepath, 'drugbank-indication.tsv')
drugbank_df.to_csv(path, index=False, sep='\t')

In [None]:
# Report the output path and the table dimensions, one per line.
print(path, drugbank_df.shape, sep='\n')


In [None]:
# Main drug table: identifiers, classification fields, structure keys and
# the free-text description.
columns = [
    'drugbank_id', 'name', 'type', 'groups', 'atc_codes',
    'categories', 'inchikey', 'inchi', 'description',
]
drugbank_df = pd.DataFrame.from_dict(rows).loc[:, columns]

type_counts = drugbank_df['type'].value_counts()
display(pd.DataFrame(type_counts))
display(drugbank_df.head(n=10))


In [None]:
# Slim table: every small-molecule drug. Filters on the 'approved' group and
# on a non-missing InChI are not applied.
is_small_molecule = drugbank_df['type'] == 'small molecule'
drugbank_slim_df = drugbank_df[is_small_molecule]

display(drugbank_slim_df.shape)
display(drugbank_slim_df.head())

In [None]:
# Write the full table, then the slim table, as tab-separated files and
# report each output path with the table dimensions.
for filename, frame in (
    ('drugbank.tsv', drugbank_df),
    ('drugbank-slim.tsv', drugbank_slim_df),
):
    path = os.path.join(savepath, filename)
    frame.to_csv(path, sep='\t', index=False)
    print(path)
    print(frame.shape)


In [None]:
# Collect one row per (drug, protein) pair for the four protein categories.
# A protein is kept only when it resolves to exactly one UniProtKB identifier.

# Loop-invariant, namespace-qualified queries.
organism_query = '{}organism'.format(ns)
known_action_query = '{}known-action'.format(ns)
action_query = '{ns}actions/{ns}action'.format(ns=ns)
uniprot_query = (
    "{ns}polypeptide/{ns}external-identifiers/"
    "{ns}external-identifier[{ns}resource='UniProtKB']/{ns}identifier".format(ns=ns))

protein_rows = list()
for drug in root:
    drugbank_id = drug.findtext(ns + "drugbank-id[@primary='true']")
    for category in ['target', 'enzyme', 'carrier', 'transporter']:
        # e.g. 'targets/target' for category 'target'
        proteins = drug.findall('{ns}{cat}s/{ns}{cat}'.format(ns=ns, cat=category))
        for protein in proteins:
            uniprot_ids = [polypep.text for polypep in protein.findall(uniprot_query)]
            if len(uniprot_ids) != 1:
                # Zero or several UniProtKB identifiers: skip this protein.
                continue
            actions = protein.findall(action_query)
            row = {'drugbank_id': drugbank_id, 'category': category}
            row['organism'] = protein.findtext(organism_query)
            row['known_action'] = protein.findtext(known_action_query)
            # Actions without text are skipped so the join cannot hit None.
            row['actions'] = '|'.join(action.text for action in actions if action.text)
            row['uniprot_id'] = uniprot_ids[0]
            #ref_text = protein.findtext("{ns}references[@format='textile']".format(ns=ns))
            #pmids = re.findall(r'pubmed/([0-9]+)', ref_text)
            #row['pubmed_ids'] = '|'.join(pmids)
            protein_rows.append(row)

protein_df = pd.DataFrame.from_dict(protein_rows)

In [None]:
# Show the protein rows at positions 15 through 23.
display(protein_df.iloc[15:24])

In [None]:
# Restrict the protein table to the four columns that get exported.
columns = ['drugbank_id', 'category', 'uniprot_id', 'organism']
entrez_df = protein_df.loc[:, columns]

In [None]:
# Preview the first rows of the export table.
entrez_df.head()

In [None]:
# Save the protein table tab-separated (not comma-separated), without the
# index column, then report the output path and the table dimensions.
path = os.path.join(savepath, 'proteins.tsv')
entrez_df.to_csv(path, index=False, sep='\t')

print(path, entrez_df.shape, sep='\n')


In [None]:
# Number of distinct drugbank_id values in the exported protein table.
len(set(entrez_df.drugbank_id))

In [None]:
# Number of distinct uniprot_id values in the exported protein table.
len(set(entrez_df.uniprot_id))

In [None]:
# Total number of rows in the exported protein table.
len(entrez_df)

next steps for existing fields

In [None]:
# parse out atc_codes, groups and indications into one-hot encoded versions



### fields to consider adding (see excel examples if needed)

targets
enzymes
carriers
transporters

pathways
	drugs
	enzymes

reactions

drug-interactions
	drug-interaction
		drugbank-id
		name
		description

food-interactions
	food-interaction

ahfs-codes
protein-binding

absorption
toxicity
metabolism
mechanism-of-action
pharmacodynamics
indication

average-mass
monoisotopic-mass

calculated-properties
	kind, value
	Molecular Weight, 2180.2853
	SMILES, CC[C@H]...
	Molecular Formula, C98H138N24O33
	Polar Surface Area (PSA), 901.57

external-identifiers

classification
	direct-parent
	kingdom
	superclass
	class
	subclass

category
	category-name
	mesh-id

product
	route
	approved
	dosage-form