# Drug Bank XML Pre Processing
<br>

### Creates TSV (tab separated variable) Tables + 1 JSON (for synonyms)

- aliases.json
- drugbank.tsv
- drugbank-slim.tsv
- drugbank-indication.tsv
- proteins.tsv
<br>

### Revision History
- updated synonym code to catch missing synonyms
- update drug selection to choose ALL Small Molecule Drugs (not only approved)



In [1]:
import os
import csv
import gzip
import collections
import re
import io
import json
import xml.etree.ElementTree as ET
import numpy as np

import requests
import pandas as pd
import xmltodict


In [2]:
# set up path directories
print(os.getcwd())
FDAPath = os.getcwd() + '/../data/source/'
DBPath = FDAPath + "DrugBank/"

download = DBPath + 'drugbank_all_full_database'
savepath = DBPath + 'DrugBank_CSVs'

xml_path = os.path.join(download, 'drugbank.xml') # just renamed above to match the xsd file
print(xml_path)

with open(xml_path,encoding="utf8") as xml_file:
    tree = ET.parse(xml_file) # read the XML File (tree)
root = tree.getroot()         # get root of the XML Tree

/home/sasdemo05/Python
/home/sasdemo05/Python/../data/source/DrugBank/drugbank_all_full_database/drugbank.xml


In [3]:
ns = '{http://www.drugbank.ca}'
inchikey_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChIKey']/{ns}value"
inchi_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChI']/{ns}value"

rows = list()
for i, drug in enumerate(root):
    row = collections.OrderedDict()
    assert drug.tag == ns + 'drug'
    row['type'] = drug.get('type')
    row['drugbank_id'] = drug.findtext(ns + "drugbank-id[@primary='true']")
    row['name'] = drug.findtext(ns + "name")
    row['description'] = drug.findtext(ns + "description")
    row['indication'] = drug.findtext(ns+'indication')
    row['groups'] = [group.text for group in
        drug.findall("{ns}groups/{ns}group".format(ns = ns))]
    row['atc_codes'] = [code.get('code') for code in
        drug.findall("{ns}atc-codes/{ns}atc-code".format(ns = ns))]
    row['categories'] = [x.findtext(ns + 'category') for x in
        drug.findall("{ns}categories/{ns}category".format(ns = ns))]
    row['inchi'] = drug.findtext(inchi_template.format(ns = ns))
    row['inchikey'] = drug.findtext(inchikey_template.format(ns = ns))
    
    # Add drug aliases
    aliases = {
        elem.text for elem in 
        drug.findall("{ns}international-brands/{ns}international-brand".format(ns = ns)) +
        drug.findall("{ns}synonyms/{ns}synonym[@language='English']".format(ns = ns)) +
        drug.findall("{ns}international-brands/{ns}international-brand".format(ns = ns)) +
        drug.findall("{ns}products/{ns}product/{ns}name".format(ns = ns))

    }
    aliases.add(row['name'])
    row['aliases'] = sorted(aliases)

    rows.append(row)

In [4]:
# alternate parsing - 20210506 rkc

ns = '{http://www.drugbank.ca}'
inchikey_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChIKey']/{ns}value"
inchi_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChI']/{ns}value"
aliases = {}
newrows = list()
for i, drugnew in enumerate(root):
    row = collections.OrderedDict()
    assert drugnew.tag == ns + 'drug'
    row['type'] = drugnew.get('type')
    row['drugbank_id'] = drugnew.findtext(ns + "drugbank-id[@primary='true']")
    row['name'] = drugnew.findtext(ns + "name")
    row['description'] = drugnew.findtext(ns + "description")
    row['indication'] = drugnew.findtext(ns+'indication')
    row['groups'] = [group.text for group in
        drugnew.findall("{ns}groups/{ns}group".format(ns = ns))]
    row['atc_codes'] = [code.get('code') for code in
        drugnew.findall("{ns}atc-codes/{ns}atc-code".format(ns = ns))]
    row['categories'] = [x.findtext(ns + 'category') for x in
        drugnew.findall("{ns}categories/{ns}category".format(ns = ns))]
    row['inchi'] = drugnew.findtext(inchi_template.format(ns = ns))
    row['inchikey'] = drugnew.findtext(inchikey_template.format(ns = ns))

    row['SMILES'] = drugnew.findall("{ns}calculated-properties/{ns}property[@kind='SMILES']".format(ns = ns))
    
    # Add drug aliases
    aliases = {
        elem.text for elem in 
        # added /{ns}name to correct missing names here
        drugnew.findall("{ns}international-brands/{ns}international-brand/{ns}name".format(ns = ns)) +
        # why use only english ?   remove this limit
        #drugnew.findall("{ns}synonyms/{ns}synonym[@language='English']".format(ns = ns)) +
        drugnew.findall("{ns}synonyms/{ns}synonym".format(ns = ns)) +
        # why have the international-brands in 2x ?   remove this
        #drugnew.findall("{ns}international-brands/{ns}international-brand".format(ns = ns)) +
        drugnew.findall("{ns}products/{ns}product/{ns}name".format(ns = ns))

    }
    aliases.add(row['name'])
    row['aliases'] = sorted(aliases)
    
    newrows.append(row)

drugbanknew_df = pd.DataFrame.from_dict(newrows)

pd.options.display.max_colwidth = 500
print("drugbanknew_df class is: ", drugbanknew_df.__class__)
print("drugbanknew_df.groups class is: ", drugbanknew_df.groups.__class__)
print("drugbanknew_df.atc_codes class is: ", drugbanknew_df.atc_codes.__class__)
print("drugbanknew_df.categories class is: ", drugbanknew_df.categories.__class__)
print("drugbanknew_df.aliases class is: ", drugbanknew_df.aliases.__class__)
print(drugbanknew_df.shape)
#display(drugbanknew_df.head(8))
display(drugbanknew_df.iloc[:2])
display(drugbanknew_df.iloc[5])

display(pd.DataFrame(drugbanknew_df['type'].value_counts()))
drugbanknew_df = drugbanknew_df[drugbanknew_df.type == 'small molecule']
display(pd.DataFrame(drugbanknew_df['type'].value_counts()))


drugbanknew_df class is:  <class 'pandas.core.frame.DataFrame'>
drugbanknew_df.groups class is:  <class 'pandas.core.series.Series'>
drugbanknew_df.atc_codes class is:  <class 'pandas.core.series.Series'>
drugbanknew_df.categories class is:  <class 'pandas.core.series.Series'>
drugbanknew_df.aliases class is:  <class 'pandas.core.series.Series'>
(13580, 12)


Unnamed: 0,type,drugbank_id,name,description,indication,groups,atc_codes,categories,inchi,inchikey,SMILES,aliases
0,biotech,DB00001,Lepirudin,"Lepirudin is identical to natural hirudin except for substitution of leucine for isoleucine at the N-terminal end of the molecule and the absence of a sulfate group on the tyrosine at position 63. It is produced via yeast cells. Bayer ceased the production of lepirudin (Refludan) effective May 31, 2012.",For the treatment of heparin-induced thrombocytopenia,[approved],[B01AE02],"[Amino Acids, Peptides, and Proteins, Anticoagulants, Antithrombin Proteins, Antithrombins, Blood and Blood Forming Organs, Cardiovascular Agents, Enzyme Inhibitors, Fibrin Modulating Agents, Hematologic Agents, Peptides, Protease Inhibitors, Proteins, Serine Protease Inhibitors, Serpins, Thrombin Inhibitors]",,,[],"[Hirudin variant-1, Lepirudin, Lepirudin recombinant, Refludan]"
1,biotech,DB00002,Cetuximab,"Cetuximab is an epidermal growth factor receptor binding FAB. Cetuximab is composed of the Fv (variable; antigen-binding) regions of the 225 murine EGFr monoclonal antibody specific for the N-terminal portion of human EGFr with human IgG1 heavy and kappa light chain constant (framework) regions. Cetuximab is marketed under the brand Erbitux® by Eli Lilly and Company. In the United States, a regimen of cetuximab costs approximately $30,790 for an eight-week course.","Cetuximab, used in combination with irinotecan, is indicated for the treatment of EGFR-expressing, metastatic colorectal carcinoma in patients who are refractory to irinotecan-based chemotherapy. Cetuximab administered as a single agent is indicated for the treatment of EGFR-expressing, metastatic colorectal carcinoma in patients who are intolerant to irinotecan-based chemotherapy.",[approved],[L01XC06],"[Amino Acids, Peptides, and Proteins, Antibodies, Antibodies, Monoclonal, Antibodies, Monoclonal, Humanized, Antineoplastic Agents, Antineoplastic Agents, Immunological, Antineoplastic and Immunomodulating Agents, Blood Proteins, Epidermal Growth Factor Receptor Antagonist, Globulins, HER1 Antagonists, Immunoglobulins, Immunoproteins, Narrow Therapeutic Index Drugs, Proteins, Serum Globulins]",,,[],"[Cetuximab, Cetuximabum, Cétuximab, Erbitux]"


type                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                small molecule
drugbank_id                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          

Unnamed: 0,type
small molecule,11414
biotech,2166


Unnamed: 0,type
small molecule,11414


In [5]:

with open(xml_path, encoding="utf8") as db:
    doc = xmltodict.parse(db.read())

    values = []
for item in doc['drugbank']['drug']:
    logp = None
    try:
        ID = item['drugbank-id']['#text']
        prop = item['experimental-properties']
        calc = item['calculated-properties']
        if prop is not None:
            # Iterate through the list of propterties
            prop_list = prop['property']
            if type(prop_list) is not list:
                prop_list = [prop_list]
            for p in prop_list:
                if p['kind'] == logP:
                    logp = p['value']
        if logp is not None:
            for prop in calc['property']:
                if prop['kind'] == SMILES:
                    smiles = prop['value']
                if prop['kind'] == logP:
                    logp_exp = prop['value']
            values.append((ID, smiles, logp, logp_exp))
    except:
        pass

values_df = pd.DataFrame(values)
columns = ["DrugBankID", "SMILES", "expLogP", "calcLogP"]
#values_df.columns = columns


In [6]:
values_df.shape


(0, 0)

In [7]:
# save aliases.json file
aliasoutf = savepath+'/aliases.json'
print(aliasoutf)
alias_dict = {row['drugbank_id']: row['aliases'] for row in newrows} # switch from rows to newrows
print(len(alias_dict))
print(alias_dict.__class__)

with open(aliasoutf, 'w') as fp:
    json.dump(alias_dict, fp, indent=2, sort_keys=True)

/home/sasdemo05/Python/../data/source/DrugBank/DrugBank_CSVs/aliases.json
13580
<class 'dict'>


In [8]:
import itertools
out = dict(itertools.islice(alias_dict.items(), 5)) 
display(out)

{'DB00001': ['Hirudin variant-1',
  'Lepirudin',
  'Lepirudin recombinant',
  'Refludan'],
 'DB00002': ['Cetuximab', 'Cetuximabum', 'Cétuximab', 'Erbitux'],
 'DB00003': ['Deoxyribonuclease (human clone 18-1 protein moiety)',
  'Dornasa alfa',
  'Dornase alfa',
  'Dornase alfa, recombinant',
  'Dornase alpha',
  'Pulmozyme',
  'Pulmozyme 1mg/ml',
  'Recombinant deoxyribonuclease (DNAse)',
  'Viscozyme'],
 'DB00004': ['Denileukin',
  'Denileukin diftitox',
  'Interleukin-2/diptheria toxin fusion protein',
  'Ontak'],
 'DB00005': ['Brenzys',
  'Davictrel',
  'Enbrel',
  'Erelzi',
  'Etanercept',
  'Recombinant human TNF',
  'TNFR-Immunoadhesin',
  'Tunex',
  'etanercept-szzs',
  'etanercept-ykro',
  'rhu TNFR:Fc',
  'rhu-TNFR:Fc']}

In [9]:
def collapse_list_values(row):
    for key, value in row.items():
        if isinstance(value, list):
            row[key] = '|'.join(value)
    return row

rows = list(map(collapse_list_values, rows))

In [10]:
display(rows.__class__)
display(rows[:2])



list

[OrderedDict([('type', 'biotech'),
              ('drugbank_id', 'DB00001'),
              ('name', 'Lepirudin'),
              ('description',
               'Lepirudin is identical to natural hirudin except for substitution of leucine for isoleucine at the N-terminal end of the molecule and the absence of a sulfate group on the tyrosine at position 63. It is produced via yeast cells. Bayer ceased the production of lepirudin (Refludan) effective May 31, 2012.'),
              ('indication',
               'For the treatment of heparin-induced thrombocytopenia'),
              ('groups', 'approved'),
              ('atc_codes', 'B01AE02'),
              ('categories',
               'Amino Acids, Peptides, and Proteins|Anticoagulants|Antithrombin Proteins|Antithrombins|Blood and Blood Forming Organs|Cardiovascular Agents|Enzyme Inhibitors|Fibrin Modulating Agents|Hematologic Agents|Peptides|Protease Inhibitors|Proteins|Serine Protease Inhibitors|Serpins|Thrombin Inhibitors'),
         

In [11]:
columns = ['drugbank_id', 'name', 'indication']
drugbank_df = pd.DataFrame.from_dict(rows)[columns]
drugbank_df.head(10)

Unnamed: 0,drugbank_id,name,indication
0,DB00001,Lepirudin,For the treatment of heparin-induced thrombocytopenia
1,DB00002,Cetuximab,"Cetuximab, used in combination with irinotecan, is indicated for the treatment of EGFR-expressing, metastatic colorectal carcinoma in patients who are refractory to irinotecan-based chemotherapy. Cetuximab administered as a single agent is indicated for the treatment of EGFR-expressing, metastatic colorectal carcinoma in patients who are intolerant to irinotecan-based chemotherapy."
2,DB00003,Dornase alfa,Used as adjunct therapy in the treatment of cystic fibrosis.
3,DB00004,Denileukin diftitox,For treatment of cutaneous T-cell lymphoma
4,DB00005,Etanercept,Etanercept is indicated for the treatment of moderately to severely active rheumatoid arthritis in adults and chronic moderate to severe plaque psoriasis in adults. It is also used to manage signs and symptoms of polyarticular idiopathic arthritis in those aged 4 to 17 after insufficient response to one or more disease-modifying anti-rheumatic drugs. Etanercept is also used to improve psoriatic arthritis and ankylosing spondylitis.
5,DB00006,Bivalirudin,"For treatment of heparin-induced thrombocytopenia and for the prevention of thrombosis. Bivalirudin is indicated for use in patients undergoing percutaneous coronary intervention (PCI), in patients at moderate to high risk acute coronary syndromes due to unstable angina or non-ST segment elevation in whom a PCI is planned."
6,DB00007,Leuprolide,"Leuprolide is indicated for the palliative treatment of advanced prostate cancer[L13781, L13790] as well as for the treatment of pediatric patients with central precocious puberty (CPP).[L13784, L13787] In combination with oral [norethisterone] (also known as norethindrone), leuprolide is also indicated for the initial treatment of the symptoms of endometriosis.[L10310] Finally, in combination with iron supplementation, leuprolide is indicated for the preoperative hematological improvement o..."
7,DB00008,Peginterferon alfa-2a,Peginterferon alfa-2a is indicated for the treatment of HCV in combination with other antiviral drugs in patients over 5 years of age with compensated liver disease [FDA Label]. May be used as a monotherapy in patients with contraindications to or significant intolerance to other anti-viral therapies.\r\n\r\nPeginterferon alfa-2a is also indicated as a monotherapy for adult patients with HBeAg positive and HBeAg negative chronic hepatitis B infection who have compensated liver disease and ev...
8,DB00009,Alteplase,"For management of acute myocardial infarction, acute ischemic strok and for lysis of acute pulmonary emboli"
9,DB00010,Sermorelin,"For the treatment of dwarfism, prevention of HIV-induced weight loss"


In [12]:
# write drugbank indications
path = os.path.join(savepath, 'drugbank-indication.tsv')
drugbank_df.to_csv(path, sep='\t', index=False)

In [13]:
print(path)
print(drugbank_df.shape)


/home/sasdemo05/Python/../data/source/DrugBank/DrugBank_CSVs/drugbank-indication.tsv
(13580, 3)


In [14]:
columns = ['drugbank_id', 'name', 'type', 'groups', 'atc_codes', 'categories', 'inchikey', 'inchi', 'description']
drugbank_df = pd.DataFrame.from_dict(rows)[columns]
display(pd.DataFrame(drugbank_df['type'].value_counts()))
display(drugbank_df.head(10))


Unnamed: 0,type
small molecule,11414
biotech,2166


Unnamed: 0,drugbank_id,name,type,groups,atc_codes,categories,inchikey,inchi,description
0,DB00001,Lepirudin,biotech,approved,B01AE02,"Amino Acids, Peptides, and Proteins|Anticoagulants|Antithrombin Proteins|Antithrombins|Blood and Blood Forming Organs|Cardiovascular Agents|Enzyme Inhibitors|Fibrin Modulating Agents|Hematologic Agents|Peptides|Protease Inhibitors|Proteins|Serine Protease Inhibitors|Serpins|Thrombin Inhibitors",,,"Lepirudin is identical to natural hirudin except for substitution of leucine for isoleucine at the N-terminal end of the molecule and the absence of a sulfate group on the tyrosine at position 63. It is produced via yeast cells. Bayer ceased the production of lepirudin (Refludan) effective May 31, 2012."
1,DB00002,Cetuximab,biotech,approved,L01XC06,"Amino Acids, Peptides, and Proteins|Antibodies|Antibodies, Monoclonal|Antibodies, Monoclonal, Humanized|Antineoplastic Agents|Antineoplastic Agents, Immunological|Antineoplastic and Immunomodulating Agents|Blood Proteins|Epidermal Growth Factor Receptor Antagonist|Globulins|HER1 Antagonists|Immunoglobulins|Immunoproteins|Narrow Therapeutic Index Drugs|Proteins|Serum Globulins",,,"Cetuximab is an epidermal growth factor receptor binding FAB. Cetuximab is composed of the Fv (variable; antigen-binding) regions of the 225 murine EGFr monoclonal antibody specific for the N-terminal portion of human EGFr with human IgG1 heavy and kappa light chain constant (framework) regions. Cetuximab is marketed under the brand Erbitux® by Eli Lilly and Company. In the United States, a regimen of cetuximab costs approximately $30,790 for an eight-week course."
2,DB00003,Dornase alfa,biotech,approved,R05CB13,"Amino Acids, Peptides, and Proteins|Cough and Cold Preparations|Decreased Respiratory Secretion Viscosity|Deoxyribonucleases|Endodeoxyribonucleases|Endonucleases|Enzymes|Enzymes and Coenzymes|Esterases|Expectorants|Hydrolases|Proteins|Recombinant Human Deoxyribonuclease 1",,,"Dornase alfa is a biosynthetic form of human deoxyribunuclease I (DNase I) enzyme. It is produced in genetically modified Chinese hamster ovary (CHO) cells using recombinant DNA technology. The 260-amino acid sequence of dornase alfa is identical to the endogenous human enzyme. Dornase alfa cleaves extracellular DNA to 5´-phosphodinucleotide and 5´-phosphooligonucleotide end products without affecting intracellular DNA. In individuals with cystic fibrosis, extracellular DNA, which is an extr..."
3,DB00004,Denileukin diftitox,biotech,approved|investigational,L01XX29,"ADP Ribose Transferases|Amino Acids, Peptides, and Proteins|Antineoplastic Agents|Antineoplastic and Immunomodulating Agents|Bacterial Toxins|Biological Factors|CD25-directed Cytotoxin|Cytokines|Enzymes|Enzymes and Coenzymes|Glycosyltransferases|Intercellular Signaling Peptides and Proteins|Interleukins|Lymphokines|Narrow Therapeutic Index Drugs|Pentosyltransferases|Peptides|Proteins|Recombinant Proteins|Toxins, Biological|Transferases",,,A recombinant DNA-derived cytotoxic protein composed of the amino acid sequences for diphtheria toxin fragments A and B (Met 1-Thr 387)-His followed by the sequences for interleukin-2 (IL-2; Ala 1-Thr 133). It is produced in an E. coli expression system.
4,DB00005,Etanercept,biotech,approved|investigational,L04AB01,"Agents reducing cytokine levels|Amino Acids, Peptides, and Proteins|Anti-Inflammatory Agents|Antibodies|Antirheumatic Agents|Biological Products|Biologics for Rheumatoid Arthritis Treatment|Complex Mixtures|Dermatologicals|Disease-modifying Antirheumatic Agents|Immunoglobulin Constant Regions|Immunoglobulin Fc Fragments|Immunoglobulin Fragments|Immunoglobulin Isotypes|Immunologic Factors|Immunoproteins|Immunosuppressive Agents|Membrane Proteins|Peptide Fragments|Peptides|Proteins|Receptors, ...",,,"Dimeric fusion protein consisting of the extracellular ligand-binding portion of the human 75 kilodalton (p75) tumor necrosis factor receptor (TNFR) linked to the Fc portion of human IgG1. The Fc component of etanercept contains the CH2 domain, the CH3 domain and hinge region, but not the CH1 domain of IgG1. Etanercept is produced by recombinant DNA technology in a Chinese hamster ovary (CHO) mammalian cell expression system. It consists of 934 amino acids."
5,DB00006,Bivalirudin,small molecule,approved|investigational,B01AE06,"Amino Acids, Peptides, and Proteins|Anticoagulants|Antithrombin Proteins|Antithrombins|Blood and Blood Forming Organs|Enzyme Inhibitors|Hematologic Agents|Peptides|Protease Inhibitors|Proteins|Serine Protease Inhibitors|Serpins|Thrombin Inhibitors",OIRCOABEOLEUMC-GEJPAHFPSA-N,InChI=1S/C98H138N24O33/c1-5-52(4)82(96(153)122-39-15-23-70(122)92(149)114-60(30-34-79(134)135)85(142)111-59(29-33-78(132)133)86(143)116-64(43-55-24-26-56(123)27-25-55)89(146)118-67(97(154)155)40-51(2)3)119-87(144)61(31-35-80(136)137)112-84(141)58(28-32-77(130)131)113-88(145)63(42-54-18-10-7-11-19-54)117-90(147)66(45-81(138)139)110-76(129)50-107-83(140)65(44-71(100)124)109-75(128)49-106-73(126)47-104-72(125)46-105-74(127)48-108-91(148)68-21-13-38-121(68)95(152)62(20-12-36-103-98(101)102)115-9...,"Bivalirudin is a synthetic 20 residue peptide (thrombin inhibitor) which reversibly inhibits thrombin. Once bound to the active site, thrombin cannot activate fibrinogen into fibrin, the crucial step in the formation of thrombus. It is administered intravenously. Because it can cause blood stagnation, it is important to monitor changes in hematocrit, activated partial thromboplastin time, international normalized ratio and blood pressure."
6,DB00007,Leuprolide,small molecule,approved|investigational,L02AE51|L02AE02,"Adrenal Cortex Hormones|Agents Causing Muscle Toxicity|Amino Acids, Peptides, and Proteins|Antineoplastic Agents|Antineoplastic Agents, Hormonal|Antineoplastic and Immunomodulating Agents|Drugs causing inadvertant photosensitivity|Drugs that are Mainly Renally Excreted|Endocrine Therapy|Fertility Agents|Fertility Agents, Female|Gonadotropin Releasing Hormone Receptor Agonist|Gonadotropin Releasing Hormone Receptor Agonists|Gonadotropin-releasing hormone agonist|Gonadotropins|Hormones|Hormone...",GFIJNRVAKGFPGQ-LIJARHBVSA-N,"InChI=1S/C59H84N16O12/c1-6-63-57(86)48-14-10-22-75(48)58(87)41(13-9-21-64-59(60)61)68-51(80)42(23-32(2)3)69-52(81)43(24-33(4)5)70-53(82)44(25-34-15-17-37(77)18-16-34)71-56(85)47(30-76)74-54(83)45(26-35-28-65-39-12-8-7-11-38(35)39)72-55(84)46(27-36-29-62-31-66-36)73-50(79)40-19-20-49(78)67-40/h7-8,11-12,15-18,28-29,31-33,40-48,65,76-77H,6,9-10,13-14,19-27,30H2,1-5H3,(H,62,66)(H,63,86)(H,67,78)(H,68,80)(H,69,81)(H,70,82)(H,71,85)(H,72,84)(H,73,79)(H,74,83)(H4,60,61,64)/t40-,41-,42-,43+,44-,45-...","Leuprolide is a synthetic 9-residue peptide analogue of gonadotropin-releasing hormone (GnRH). Unlike the endogenous decapeptide GnRH, leuprolide contains a single D-amino acid (D-leucyl) residue, which helps to increase its circulating half-life from three to four minutes to approximately three hours.[A203222] As a GnRH mimic, leuprolide is capable of binding to the GnRH receptor (GnRHR) and inducing downstream modulation of both gonadotropin hormone and sex steroid levels. Prolonged activa..."
7,DB00008,Peginterferon alfa-2a,biotech,approved|investigational,L03AB11|L03AB61,"Adjuvants, Immunologic|Alcohols|Alfa Interferons|Amino Acids, Peptides, and Proteins|Anti-Infective Agents|Antineoplastic Agents|Antineoplastic and Immunomodulating Agents|Antiviral Agents|Biological Factors|Cardiotoxic antineoplastic agents|Compounds used in a research, industrial, or household setting|Cytochrome P-450 CYP1A2 Inhibitors|Cytochrome P-450 CYP1A2 Inhibitors (strength unknown)|Cytochrome P-450 Enzyme Inhibitors|Cytokines|Drug Carriers|Ethylene Glycols|Glycols|Hepatotoxic Agents...",,,"Peginterferon alfa-2a is a form of recombinant interferon used as part of combination therapy to treat chronic Hepatitis C, an infectious liver disease caused by infection with Hepatitis C Virus (HCV). HCV is a single-stranded RNA virus that is categorized into nine distinct genotypes, with genotype 1 being the most common in the United States, and affecting 72% of all chronic HCV patients [L852]. Treatment options for chronic Hepatitis C have advanced significantly since 2011, with the deve..."
8,DB00009,Alteplase,biotech,approved,B01AD02|S01XA13,"Agents causing angioedema|Amino Acids, Peptides, and Proteins|Anticoagulants|Biological Factors|Blood and Blood Forming Organs|Blood Proteins|Cardiovascular Agents|Endopeptidases|Enzymes|Enzymes and Coenzymes|Fibrin Modulating Agents|Fibrinolytic Agents|Hematologic Agents|Hydrolases|Ophthalmologicals|Peptide Hydrolases|Plasminogen Activators|Proteins|Sensory Organs|Serine Endopeptidases|Serine Proteases|Tissue Plasminogen Activator|Tissue Plasminogen Activator, antagonists & inhibitors",,,"Human tissue plasminogen activator, purified, glycosylated, 527 residues purified from CHO cells"
9,DB00010,Sermorelin,biotech,approved|withdrawn,V04CD03|H01AC04,"Amino Acids, Peptides, and Proteins|Anterior Pituitary Lobe Hormones and Analogues|Diagnostic Agents|Growth Hormone-Releasing Hormone|Hormones|Hormones, Hormone Substitutes, and Hormone Antagonists|Hypothalamic Hormones|Nerve Tissue Proteins|Neuropeptides|Peptide Hormones|Peptides|Pituitary and Hypothalamic Hormones and Analogues|Pituitary Hormone-Releasing Hormones|Proteins|Somatropin and Somatropin Agonists|Systemic Hormonal Preparations, Excl. Sex Hormones and Insulins|Tests for Pituitary...",,,Sermorelin acetate is the acetate salt of an amidated synthetic 29-amino acid peptide (GRF 1-29 NH 2 ) that corresponds to the amino-terminal segment of the naturally occurring human growth hormone-releasing hormone (GHRH or GRF) consisting of 44 amino acid residues


In [15]:
drugbank_slim_df = drugbank_df[
    #drugbank_df.groups.map(lambda x: 'approved' in x) &
    #drugbank_df.inchi.map(lambda x: x is not None) &
    drugbank_df.type.map(lambda x: x == 'small molecule')
]

display(drugbank_slim_df.shape)
display(drugbank_slim_df.head())

(11414, 9)

Unnamed: 0,drugbank_id,name,type,groups,atc_codes,categories,inchikey,inchi,description
5,DB00006,Bivalirudin,small molecule,approved|investigational,B01AE06,"Amino Acids, Peptides, and Proteins|Anticoagulants|Antithrombin Proteins|Antithrombins|Blood and Blood Forming Organs|Enzyme Inhibitors|Hematologic Agents|Peptides|Protease Inhibitors|Proteins|Serine Protease Inhibitors|Serpins|Thrombin Inhibitors",OIRCOABEOLEUMC-GEJPAHFPSA-N,InChI=1S/C98H138N24O33/c1-5-52(4)82(96(153)122-39-15-23-70(122)92(149)114-60(30-34-79(134)135)85(142)111-59(29-33-78(132)133)86(143)116-64(43-55-24-26-56(123)27-25-55)89(146)118-67(97(154)155)40-51(2)3)119-87(144)61(31-35-80(136)137)112-84(141)58(28-32-77(130)131)113-88(145)63(42-54-18-10-7-11-19-54)117-90(147)66(45-81(138)139)110-76(129)50-107-83(140)65(44-71(100)124)109-75(128)49-106-73(126)47-104-72(125)46-105-74(127)48-108-91(148)68-21-13-38-121(68)95(152)62(20-12-36-103-98(101)102)115-9...,"Bivalirudin is a synthetic 20 residue peptide (thrombin inhibitor) which reversibly inhibits thrombin. Once bound to the active site, thrombin cannot activate fibrinogen into fibrin, the crucial step in the formation of thrombus. It is administered intravenously. Because it can cause blood stagnation, it is important to monitor changes in hematocrit, activated partial thromboplastin time, international normalized ratio and blood pressure."
6,DB00007,Leuprolide,small molecule,approved|investigational,L02AE51|L02AE02,"Adrenal Cortex Hormones|Agents Causing Muscle Toxicity|Amino Acids, Peptides, and Proteins|Antineoplastic Agents|Antineoplastic Agents, Hormonal|Antineoplastic and Immunomodulating Agents|Drugs causing inadvertant photosensitivity|Drugs that are Mainly Renally Excreted|Endocrine Therapy|Fertility Agents|Fertility Agents, Female|Gonadotropin Releasing Hormone Receptor Agonist|Gonadotropin Releasing Hormone Receptor Agonists|Gonadotropin-releasing hormone agonist|Gonadotropins|Hormones|Hormone...",GFIJNRVAKGFPGQ-LIJARHBVSA-N,"InChI=1S/C59H84N16O12/c1-6-63-57(86)48-14-10-22-75(48)58(87)41(13-9-21-64-59(60)61)68-51(80)42(23-32(2)3)69-52(81)43(24-33(4)5)70-53(82)44(25-34-15-17-37(77)18-16-34)71-56(85)47(30-76)74-54(83)45(26-35-28-65-39-12-8-7-11-38(35)39)72-55(84)46(27-36-29-62-31-66-36)73-50(79)40-19-20-49(78)67-40/h7-8,11-12,15-18,28-29,31-33,40-48,65,76-77H,6,9-10,13-14,19-27,30H2,1-5H3,(H,62,66)(H,63,86)(H,67,78)(H,68,80)(H,69,81)(H,70,82)(H,71,85)(H,72,84)(H,73,79)(H,74,83)(H4,60,61,64)/t40-,41-,42-,43+,44-,45-...","Leuprolide is a synthetic 9-residue peptide analogue of gonadotropin-releasing hormone (GnRH). Unlike the endogenous decapeptide GnRH, leuprolide contains a single D-amino acid (D-leucyl) residue, which helps to increase its circulating half-life from three to four minutes to approximately three hours.[A203222] As a GnRH mimic, leuprolide is capable of binding to the GnRH receptor (GnRHR) and inducing downstream modulation of both gonadotropin hormone and sex steroid levels. Prolonged activa..."
13,DB00014,Goserelin,small molecule,approved,L02AE03,"Adrenal Cortex Hormones|Amino Acids, Peptides, and Proteins|Antineoplastic Agents|Antineoplastic Agents, Hormonal|Antineoplastic and Immunomodulating Agents|Drugs that are Mainly Renally Excreted|Endocrine Therapy|Gonadotropin Releasing Hormone Receptor Agonist|Gonadotropin Releasing Hormone Receptor Agonists|Gonadotropin-releasing hormone agonist|Gonadotropins and Antigonadotropins|Hormones|Hormones and Related Agents|Hormones, Hormone Substitutes, and Hormone Antagonists|Hyperglycemia-Asso...",BLCLNMBMMGCOAS-URPVMXJPSA-N,"InChI=1S/C59H84N18O14/c1-31(2)22-40(49(82)68-39(12-8-20-64-57(60)61)56(89)77-21-9-13-46(77)55(88)75-76-58(62)90)69-54(87)45(29-91-59(3,4)5)74-50(83)41(23-32-14-16-35(79)17-15-32)70-53(86)44(28-78)73-51(84)42(24-33-26-65-37-11-7-6-10-36(33)37)71-52(85)43(25-34-27-63-30-66-34)72-48(81)38-18-19-47(80)67-38/h6-7,10-11,14-17,26-27,30-31,38-46,65,78-79H,8-9,12-13,18-25,28-29H2,1-5H3,(H,63,66)(H,67,80)(H,68,82)(H,69,87)(H,70,86)(H,71,85)(H,72,81)(H,73,84)(H,74,83)(H,75,88)(H4,60,61,64)(H3,62,76,90)...","Goserelin is a synthetic hormone. In men, it stops the production of the hormone testosterone, which may stimulate the growth of cancer cells. In women, goserelin decreases the production of the hormone estradiol (which may stimulate the growth of cancer cells) to levels similar to a postmenopausal state. When the medication is stopped, hormone levels return to normal."
25,DB00027,Gramicidin D,small molecule,approved,R02AB30,"Amino Acids, Peptides, and Proteins|Anti-Bacterial Agents|Anti-Infective Agents|Anti-Infective Agents, Local|Membrane Proteins|P-glycoprotein inhibitors|P-glycoprotein substrates|Peptides|Peptides, Cyclic|Pore Forming Cytotoxic Proteins|Proteins|Throat Preparations",NDAYQJDHGXTBJL-MWWSRJDJSA-N,"InChI=1S/C96H135N19O16/c1-50(2)36-71(105-79(118)48-102-93(128)80(54(9)10)103-49-117)86(121)104-58(17)84(119)113-82(56(13)14)95(130)115-83(57(15)16)96(131)114-81(55(11)12)94(129)112-78(43-62-47-101-70-33-25-21-29-66(62)70)92(127)108-74(39-53(7)8)89(124)111-77(42-61-46-100-69-32-24-20-28-65(61)69)91(126)107-73(38-52(5)6)88(123)110-76(41-60-45-99-68-31-23-19-27-64(60)68)90(125)106-72(37-51(3)4)87(122)109-75(85(120)97-34-35-116)40-59-44-98-67-30-22-18-26-63(59)67/h18-33,44-47,49-58,71-78,80-83,9...","Gramcidin D is a heterogeneous mixture of three antibiotic compounds, gramicidins A, B and C, making up 80%, 6%, and 14% respectively all of which are obtained from the soil bacterial species Bacillus brevis and called collectively gramicidin D. Gramcidins are 15 residue peptides with alternating D and L amino acids, which assemble inside of the hydrophobic interior of the cellular lipid bilayer to form a β-helix. Active against most Gram-positive bacteria and some Gram-negative organisms, G..."
33,DB00035,Desmopressin,small molecule,approved,H01BA02,"Agents that produce hypertension|Amino Acids, Peptides, and Proteins|Antidiuretic Agents|Arginine Vasopressin|Cardiovascular Agents|Coagulants|Drugs that are Mainly Renally Excreted|Factor VIII Activator|Hematologic Agents|Hemostatics|Hormones|Hormones, Hormone Substitutes, and Hormone Antagonists|Increased Coagulation Factor VIII Activity|Increased Coagulation Factor VIII Concentration|Natriuretic Agents|Nerve Tissue Proteins|Neuropeptides|Oligopeptides|Peptide Hormones|Peptides|Pituitary|P...",NFLWUMRGJYTJIN-PNIOQBSNSA-N,"InChI=1S/C46H64N14O12S2/c47-35(62)15-14-29-40(67)58-32(22-36(48)63)43(70)59-33(45(72)60-18-5-9-34(60)44(71)56-28(8-4-17-52-46(50)51)39(66)53-23-37(49)64)24-74-73-19-16-38(65)54-30(21-26-10-12-27(61)13-11-26)41(68)57-31(42(69)55-29)20-25-6-2-1-3-7-25/h1-3,6-7,10-13,28-34,61H,4-5,8-9,14-24H2,(H2,47,62)(H2,48,63)(H2,49,64)(H,53,66)(H,54,65)(H,55,69)(H,56,71)(H,57,68)(H,58,67)(H,59,70)(H4,50,51,52)/t28-,29+,30+,31+,32+,33+,34+/m1/s1","Desmopressin (dDAVP), a synthetic analogue of 8-arginine vasopressin (ADH), is an antidiuretic peptide drug modified by deamination of 1-cysteine and substitution of 8-L-arginine by 8-D-arginine. ADH is an endogenous pituitary hormone that has a crucial role in the control of the water content in the body. Upon release from the stimulation of increased plasma osmolarity or decreased circulating blood volume, ADH mainly acts on the cells of the distal part of the nephron and the collecting tu..."


In [16]:
# write drugbank tsv
path = os.path.join(savepath, 'drugbank.tsv')
drugbank_df.to_csv(path, sep='\t', index=False)
print(path)
print(drugbank_df.shape)

# write slim drugbank tsv
path = os.path.join(savepath, 'drugbank-slim.tsv')
drugbank_slim_df.to_csv(path, sep='\t', index=False)
print(path)
print(drugbank_slim_df.shape)


/home/sasdemo05/Python/../data/source/DrugBank/DrugBank_CSVs/drugbank.tsv
(13580, 9)
/home/sasdemo05/Python/../data/source/DrugBank/DrugBank_CSVs/drugbank-slim.tsv
(11414, 9)


In [17]:
protein_rows = list()
for i, drug in enumerate(root):
    drugbank_id = drug.findtext(ns + "drugbank-id[@primary='true']")
    for category in ['target', 'enzyme', 'carrier', 'transporter']:
        proteins = drug.findall('{ns}{cat}s/{ns}{cat}'.format(ns=ns, cat=category))
        for protein in proteins:
            row = {'drugbank_id': drugbank_id, 'category': category}
            row['organism'] = protein.findtext('{}organism'.format(ns))
            row['known_action'] = protein.findtext('{}known-action'.format(ns))
            actions = protein.findall('{ns}actions/{ns}action'.format(ns=ns))
            row['actions'] = '|'.join(action.text for action in actions)
            uniprot_ids = [polypep.text for polypep in protein.findall(
                "{ns}polypeptide/{ns}external-identifiers/{ns}external-identifier[{ns}resource='UniProtKB']/{ns}identifier".format(ns=ns))]            
            if len(uniprot_ids) != 1:
                continue
            row['uniprot_id'] = uniprot_ids[0]
            #ref_text = protein.findtext("{ns}references[@format='textile']".format(ns=ns))
            #pmids = re.findall(r'pubmed/([0-9]+)', ref_text)
            #row['pubmed_ids'] = '|'.join(pmids)
            protein_rows.append(row)

protein_df = pd.DataFrame.from_dict(protein_rows)

In [18]:
display(protein_df[15:24])

Unnamed: 0,actions,category,drugbank_id,known_action,organism,uniprot_id
15,ligand,target,DB00005,unknown,Humans,P31994
16,ligand,target,DB00005,unknown,Humans,P31995
17,ligand,target,DB00005,unknown,Humans,P08637
18,ligand,target,DB00005,unknown,Humans,O75015
19,antibody,target,DB00005,yes,Humans,P01374
20,inhibitor,target,DB00006,yes,Humans,P00734
21,inhibitor,enzyme,DB00006,unknown,Humans,P05164
22,agonist,target,DB00007,yes,Humans,P30968
23,agonist,target,DB00008,yes,Humans,P48551


In [19]:
columns = ['drugbank_id', 'category', 'uniprot_id', 'organism']
entrez_df = protein_df[columns]

In [20]:
entrez_df.head()

Unnamed: 0,drugbank_id,category,uniprot_id,organism
0,DB00001,target,P00734,Humans
1,DB00002,target,P00533,Humans
2,DB00002,target,O75015,Humans
3,DB00002,target,P02745,Humans
4,DB00002,target,P02746,Humans


In [21]:
path = os.path.join(savepath, 'proteins.tsv')
#entrez_df.to_csv(path, sep=',', index=False)
entrez_df.to_csv(path, sep='\t', index=False)

print(path)
print(entrez_df.shape)


/home/sasdemo05/Python/../data/source/DrugBank/DrugBank_CSVs/proteins.tsv
(26965, 4)


In [22]:
len(set(entrez_df.drugbank_id))

7861

In [23]:
len(set(entrez_df.uniprot_id))

4995

In [24]:
len(entrez_df)

26965

next steps for existing fields

In [25]:
# parse out atc_codes, groups and indications into one-hot encoded versions



### fields to consider adding (see excel examples if needed)

targets
enzymes
carriers
transporters

pathways
	drugs
	enzymes

reactions

drug-interactions
	drug-interaction
		drugbank-id
		name
		description

food-interactions
	food-interaction

ahfs-codes
protein-binding

absorption
toxicity
metabolism
mechanism-of-action
pharmacodynamics
indication

average-mass
monoisotopic-mass

calculated-properties
	kind, value
	Molecular Weight, 2180.2853
	SMILES, CC[C@H]...
	Molecular Formula, C98H138N24O33
	Polar Surface Area (PSA), 901.57

external-identifiers

classifiation
	direct-parent
	kingdom
	superclass
	class
	subclass

category
	category-name
	mesh-id

product
	route
	approved
	dosage-form