In [1]:
# This jupyter notebook is borrowed from https://github.com/dhimmel/drugbank
import os, sys
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
import pickle as pkl
import collections
import re

## Load DrugBank data
### Parse XML file

In [2]:
tree = ET.parse('/home/oyj/data/DrugTarget/DrugBank/full database.xml')
root  = tree.getroot()

#### Drug information

In [3]:
ns = '{http://www.drugbank.ca}'
inchikey_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChIKey']/{ns}value"
inchi_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChI']/{ns}value"
smiles_template = "{ns}calculated-properties/{ns}property[{ns}kind='SMILES']/{ns}value"

rows = list()
for i, drug in enumerate(root):
    row = collections.OrderedDict()
    assert drug.tag == ns + 'drug'
    # drug info
    row['type'] = drug.get('type')  # Type
    row['drugbank_id'] = drug.findtext(ns + "drugbank-id[@primary='true']")  # ID
    row['name'] = drug.findtext(ns + "name")  # Generic Name
    # row['description'] = drug.findtext(ns + "description")  # Background
    row['groups'] = [group.text for group in drug.findall("{ns}groups/{ns}group".format(ns = ns))]  # Group: Clinical status
    row['atc_codes'] = [code.get('code') for code in
        drug.findall("{ns}atc-codes/{ns}atc-code".format(ns = ns))]  # ATC Codes
    row['categories'] = [x.findtext(ns + 'category') for x in
        drug.findall("{ns}categories/{ns}category".format(ns = ns))]  # Drug Categories
    row['inchi'] = drug.findtext(inchi_template.format(ns = ns))
    row['inchikey'] = drug.findtext(inchikey_template.format(ns = ns))
    row['smiles'] = drug.findtext(smiles_template.format(ns = ns))
    
    # Add drug aliases
    aliases = {
        elem.text for elem in 
        drug.findall("{ns}international-brands/{ns}international-brand".format(ns = ns)) +
        drug.findall("{ns}synonyms/{ns}synonym[@language='English']".format(ns = ns)) +
        drug.findall("{ns}international-brands/{ns}international-brand".format(ns = ns)) +
        drug.findall("{ns}products/{ns}product/{ns}name".format(ns = ns))

    }
    aliases.add(row['name'])
    row['aliases'] = sorted(aliases)

    rows.append(row)

In [4]:
def collapse_list_values(row):
    for key, value in row.items():
        if isinstance(value, list):
            row[key] = '|'.join(value)
    return row

rows = list(map(collapse_list_values, rows))

In [5]:
columns = ['drugbank_id', 'name', 'type', 'groups', 'atc_codes', 'categories', 'inchikey', 'inchi', 'smiles', 'aliases']
drugbank_df = pd.DataFrame.from_dict(rows)[columns]
drugbank_df.head()

Unnamed: 0,drugbank_id,name,type,groups,atc_codes,categories,inchikey,inchi,smiles,aliases
0,DB00001,Lepirudin,biotech,approved|withdrawn,B01AE02,"Amino Acids, Peptides, and Proteins|Anticoagul...",,,,Lepirudin|Refludan
1,DB00002,Cetuximab,biotech,approved,L01FE01,"Amino Acids, Peptides, and Proteins|Antibodies...",,,,Cetuximab|Erbitux
2,DB00003,Dornase alfa,biotech,approved,R05CB13,"Amino Acids, Peptides, and Proteins|Cough and ...",,,,\n |Dornase alfa|Pulmozyme|Pulmozyme 1mg/ml
3,DB00004,Denileukin diftitox,biotech,approved|investigational,L01XX29,"ADP Ribose Transferases|Amino Acids, Peptides,...",,,,Denileukin diftitox|Ontak
4,DB00005,Etanercept,biotech,approved|investigational,L04AB01,"Agents reducing cytokine levels|Amino Acids, P...",,,,\n |Benepali|Brenzys|Enbrel|Erelzi|Etaner...


In [6]:
drugbank_slim_df = drugbank_df[
    drugbank_df.groups.map(lambda x: 'approved' in x) &
    drugbank_df.smiles.map(lambda x: x is not None) &
    drugbank_df.type.map(lambda x: x == 'small molecule')
]
print(drugbank_df.shape)
print(drugbank_slim_df.shape)

drugbank_df.to_csv('/home/oyj/data/DrugTarget/DrugBank/Processed_drugbank_whole_info.csv', index=False)
drugbank_slim_df.to_csv('/home/oyj/data/DrugTarget/DrugBank/Processed_drugbank_sm_approved.csv', index=False)

(15235, 10)
(2744, 10)


#### Protein information

In [7]:
protein_rows = list()
for i, drug in enumerate(root):
    drugbank_id = drug.findtext(ns + "drugbank-id[@primary='true']")
    for category in ['target', 'enzyme', 'carrier', 'transporter']:
        proteins = drug.findall('{ns}{cat}s/{ns}{cat}'.format(ns=ns, cat=category))
        for protein in proteins:
            row = {'drugbank_id': drugbank_id, 'category': category}
            row['organism'] = protein.findtext('{}organism'.format(ns))
            row['known_action'] = protein.findtext('{}known-action'.format(ns))
            actions = protein.findall('{ns}actions/{ns}action'.format(ns=ns))
            row['actions'] = '|'.join(action.text for action in actions)
            uniprot_ids = [polypep.text for polypep in protein.findall(
                "{ns}polypeptide/{ns}external-identifiers/{ns}external-identifier[{ns}resource='UniProtKB']/{ns}identifier".format(ns=ns))]            
            # if len(uniprot_ids) != 1:
            #     continue
            row['uniprot_id'] = "|".join(uniprot_ids)
            # ref_text = protein.findtext("{ns}references[@format='textile']".format(ns=ns))
            # pmids = re.findall(r'pubmed/([0-9]+)', ref_text)
            # row['pubmed_ids'] = '|'.join(pmids)
            protein_rows.append(row)

protein_df = pd.DataFrame.from_dict(protein_rows)

In [8]:
protein_df.head(3)

Unnamed: 0,drugbank_id,category,organism,known_action,actions,uniprot_id
0,DB00001,target,Humans,yes,inhibitor,P00734
1,DB00002,target,Humans,yes,binder,P00533
2,DB00002,target,Humans,unknown,binder,O75015


In [9]:
protein_df.shape

(29279, 6)

In [10]:
# set(drugbank_df.drugbank_id.tolist()) - set(protein_df.drugbank_id.tolist())

In [11]:
len(root)

15235

In [12]:
protein_df.to_csv('/home/oyj/data/DrugTarget/DrugBank/Processed_DTI_info.csv', index=False)

In [13]:
# Read our uniprot to entrez_gene mapping
# import requests
# import io, gzip

# response = requests.get('http://git.dhimmel.com/uniprot/data/map/GeneID.tsv.gz', stream=True)
# text = io.TextIOWrapper(gzip.GzipFile(fileobj=response.raw))
# uniprot_df = pd.read_table(text, engine='python')
# uniprot_df.rename(columns={'uniprot': 'uniprot_id', 'GeneID': 'entrez_gene_id'}, inplace=True)

# # merge uniprot mapping with protein_df
# entrez_df = protein_df.merge(uniprot_df, how='inner')