# XML Parsing example
SAX vs DOM
https://www.tutorialspoint.com/python/python_xml_processing.htm

## copy of drug_target_relationship_DrugBank with small changes
### - modified download path to my folder
### - added savepath
### - added display of file output path and shape (rows, columns) for each .tsv written
### - modified proteins.tsv write to use tabs rather than commas so it matches the file extension


In [2]:
import os
import csv
import gzip
import collections
import re
import io
import json
import xml.etree.ElementTree as ET
import numpy as np

import requests
import pandas as pd
import xmltodict


In [2]:
# load downloaded drugbank xml files and get parsed structure 
# download = 'C:/Users/ZLiu/anaconda/COVID19/data_06242020/drugbank_all_full_database.xml'

# to run on laptop
#download = 'C:/Users/rycarr/Documents/FDA/Data/DrugBank/drugbank_all_full_database'
#savepath = 'C:/Users/rycarr/Documents/FDA/Data/DrugBank'

# to run on RACE image

#download = '/home/sasdemo/FDAData/1_DrugBank/drugbank_all_full_database'
download = '/opt/sas/viya/config/data/cas/default/public/FDA/DrugBank/drugbank_all_full_database'
savepath = '/home/sasdemo/FDAData/1_DrugBank/DrugBank_CSVs'

# moved to CAS and got locked out :-/
# /opt/sas/viya/config/data/cas/default/public/FDA/DrugBank/drugbank_all_full_database/drugbank.xml

#xml_path = os.path.join(download, 'full database.xml')
xml_path = os.path.join(download, 'drugbank.xml') # just renamed above to match the xsd file
print(xml_path)

with open(xml_path,encoding="utf8") as xml_file:
    tree = ET.parse(xml_file) # read the XML File (tree)
root = tree.getroot()         # get root of the XML Tree

/opt/sas/viya/config/data/cas/default/public/FDA/DrugBank/drugbank_all_full_database/drugbank.xml


In [22]:
ns = '{http://www.drugbank.ca}'
inchikey_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChIKey']/{ns}value"
inchi_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChI']/{ns}value"

rows = list()
for i, drug in enumerate(root):
    row = collections.OrderedDict()
    assert drug.tag == ns + 'drug'
    row['type'] = drug.get('type')
    row['drugbank_id'] = drug.findtext(ns + "drugbank-id[@primary='true']")
    row['name'] = drug.findtext(ns + "name")
    row['description'] = drug.findtext(ns + "description")
    row['indication'] = drug.findtext(ns+'indication')
    row['groups'] = [group.text for group in
        drug.findall("{ns}groups/{ns}group".format(ns = ns))]
    row['atc_codes'] = [code.get('code') for code in
        drug.findall("{ns}atc-codes/{ns}atc-code".format(ns = ns))]
    row['categories'] = [x.findtext(ns + 'category') for x in
        drug.findall("{ns}categories/{ns}category".format(ns = ns))]
    row['inchi'] = drug.findtext(inchi_template.format(ns = ns))
    row['inchikey'] = drug.findtext(inchikey_template.format(ns = ns))
    
    # Add drug aliases
    aliases = {
        elem.text for elem in 
        drug.findall("{ns}international-brands/{ns}international-brand".format(ns = ns)) +
        drug.findall("{ns}synonyms/{ns}synonym[@language='English']".format(ns = ns)) +
        drug.findall("{ns}international-brands/{ns}international-brand".format(ns = ns)) +
        drug.findall("{ns}products/{ns}product/{ns}name".format(ns = ns))

    }
    aliases.add(row['name'])
    row['aliases'] = sorted(aliases)

    rows.append(row)

### fields to consider adding (see excel examples if needed)

targets
enzymes
carriers
transporters

pathways
	drugs
	enzymes

reactions

drug-interactions
	drug-interaction
		drugbank-id
		name
		description

food-interactions
	food-interaction

ahfs-codes
protein-binding

absorption
toxicity
metabolism
mechanism-of-action
pharmacodynamics
indication

average-mass
monoisotopic-mass

calculated-properties
	kind, value
	Molecular Weight, 2180.2853
	SMILES, CC[C@H]...
	Molecular Formula, C98H138N24O33
	Polar Surface Area (PSA), 901.57

external-identifiers

classifiation
	direct-parent
	kingdom
	superclass
	class
	subclass

category
	category-name
	mesh-id

product
	route
	approved
	dosage-form

In [68]:
# alternate parsing - 20210506 rkc

ns = '{http://www.drugbank.ca}'
inchikey_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChIKey']/{ns}value"
inchi_template = "{ns}calculated-properties/{ns}property[{ns}kind='InChI']/{ns}value"
aliases = {}
newrows = list()
for i, drugnew in enumerate(root):
    row = collections.OrderedDict()
    assert drugnew.tag == ns + 'drug'
    row['type'] = drugnew.get('type')
    row['drugbank_id'] = drugnew.findtext(ns + "drugbank-id[@primary='true']")
    row['name'] = drugnew.findtext(ns + "name")
    row['description'] = drugnew.findtext(ns + "description")
    row['indication'] = drugnew.findtext(ns+'indication')
    row['groups'] = [group.text for group in
        drugnew.findall("{ns}groups/{ns}group".format(ns = ns))]
    row['atc_codes'] = [code.get('code') for code in
        drugnew.findall("{ns}atc-codes/{ns}atc-code".format(ns = ns))]
    row['categories'] = [x.findtext(ns + 'category') for x in
        drugnew.findall("{ns}categories/{ns}category".format(ns = ns))]
    row['inchi'] = drugnew.findtext(inchi_template.format(ns = ns))
    row['inchikey'] = drugnew.findtext(inchikey_template.format(ns = ns))

    row['SMILES'] = drugnew.findall("{ns}calculated-properties/{ns}property[@kind='SMILES']".format(ns = ns))
    
    # Add drug aliases
    aliases = {
        elem.text for elem in 
        # added /{ns}name to correct missing names here
        drugnew.findall("{ns}international-brands/{ns}international-brand/{ns}name".format(ns = ns)) +
        # why use only english ?   remove this limit
        #drugnew.findall("{ns}synonyms/{ns}synonym[@language='English']".format(ns = ns)) +
        drugnew.findall("{ns}synonyms/{ns}synonym".format(ns = ns)) +
        # why have the international-brands in 2x ?   remove this
        #drugnew.findall("{ns}international-brands/{ns}international-brand".format(ns = ns)) +
        drugnew.findall("{ns}products/{ns}product/{ns}name".format(ns = ns))

    }
    aliases.add(row['name'])
    row['aliases'] = sorted(aliases)
    
    newrows.append(row)

drugbanknew_df = pandas.DataFrame.from_dict(newrows)

pandas.options.display.max_colwidth = 500
print("drugbanknew_df class is: ", drugbanknew_df.__class__)
print("drugbanknew_df.groups class is: ", drugbanknew_df.groups.__class__)
print("drugbanknew_df.atc_codes class is: ", drugbanknew_df.atc_codes.__class__)
print("drugbanknew_df.categories class is: ", drugbanknew_df.categories.__class__)
print("drugbanknew_df.aliases class is: ", drugbanknew_df.aliases.__class__)
print(drugbanknew_df.shape)
#display(drugbanknew_df.head(8))
display(drugbanknew_df.iloc[:2])
display(drugbanknew_df.iloc[5])

display(pandas.DataFrame(drugbanknew_df['type'].value_counts()))
drugbanknew_df = drugbanknew_df[drugbanknew_df.type == 'small molecule']
display(pandas.DataFrame(drugbanknew_df['type'].value_counts()))


drugbanknew_df class is:  <class 'pandas.core.frame.DataFrame'>
drugbanknew_df.groups class is:  <class 'pandas.core.series.Series'>
drugbanknew_df.atc_codes class is:  <class 'pandas.core.series.Series'>
drugbanknew_df.categories class is:  <class 'pandas.core.series.Series'>
drugbanknew_df.aliases class is:  <class 'pandas.core.series.Series'>
(13580, 12)


Unnamed: 0,type,drugbank_id,name,description,indication,groups,atc_codes,categories,inchi,inchikey,SMILES,aliases
0,biotech,DB00001,Lepirudin,"Lepirudin is identical to natural hirudin except for substitution of leucine for isoleucine at the N-terminal end of the molecule and the absence of a sulfate group on the tyrosine at position 63. It is produced via yeast cells. Bayer ceased the production of lepirudin (Refludan) effective May 31, 2012.",For the treatment of heparin-induced thrombocytopenia,[approved],[B01AE02],"[Amino Acids, Peptides, and Proteins, Anticoagulants, Antithrombin Proteins, Antithrombins, Blood and Blood Forming Organs, Cardiovascular Agents, Enzyme Inhibitors, Fibrin Modulating Agents, Hematologic Agents, Peptides, Protease Inhibitors, Proteins, Serine Protease Inhibitors, Serpins, Thrombin Inhibitors]",,,[],"[Hirudin variant-1, Lepirudin, Lepirudin recombinant, Refludan]"
1,biotech,DB00002,Cetuximab,"Cetuximab is an epidermal growth factor receptor binding FAB. Cetuximab is composed of the Fv (variable; antigen-binding) regions of the 225 murine EGFr monoclonal antibody specific for the N-terminal portion of human EGFr with human IgG1 heavy and kappa light chain constant (framework) regions. Cetuximab is marketed under the brand Erbitux® by Eli Lilly and Company. In the United States, a regimen of cetuximab costs approximately $30,790 for an eight-week course.","Cetuximab, used in combination with irinotecan, is indicated for the treatment of EGFR-expressing, metastatic colorectal carcinoma in patients who are refractory to irinotecan-based chemotherapy. Cetuximab administered as a single agent is indicated for the treatment of EGFR-expressing, metastatic colorectal carcinoma in patients who are intolerant to irinotecan-based chemotherapy.",[approved],[L01XC06],"[Amino Acids, Peptides, and Proteins, Antibodies, Antibodies, Monoclonal, Antibodies, Monoclonal, Humanized, Antineoplastic Agents, Antineoplastic Agents, Immunological, Antineoplastic and Immunomodulating Agents, Blood Proteins, Epidermal Growth Factor Receptor Antagonist, Globulins, HER1 Antagonists, Immunoglobulins, Immunoproteins, Narrow Therapeutic Index Drugs, Proteins, Serum Globulins]",,,[],"[Cetuximab, Cetuximabum, Cétuximab, Erbitux]"


type                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                small molecule
drugbank_id                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          

Unnamed: 0,type
small molecule,11414
biotech,2166


Unnamed: 0,type
small molecule,11414


In [52]:

with open(xml_path, encoding="utf8") as db:
    doc = xmltodict.parse(db.read())

    values = []
for item in doc['drugbank']['drug']:
    logp = None
    try:
        ID = item['drugbank-id']['#text']
        prop = item['experimental-properties']
        calc = item['calculated-properties']
        if prop is not None:
            # Iterate through the list of propterties
            prop_list = prop['property']
            if type(prop_list) is not list:
                prop_list = [prop_list]
            for p in prop_list:
                if p['kind'] == logP:
                    logp = p['value']
        if logp is not None:
            for prop in calc['property']:
                if prop['kind'] == SMILES:
                    smiles = prop['value']
                if prop['kind'] == logP:
                    logp_exp = prop['value']
            values.append((ID, smiles, logp, logp_exp))
    except:
        pass

values_df = pd.DataFrame(values)
columns = ["DrugBankID", "SMILES", "expLogP", "calcLogP"]
#values_df.columns = columns


KeyError: '{http://www.drugbank.ca}drugbank'

In [51]:
values_df.shape


(0, 0)

In [None]:
# parse out atc_codes, groups and indications into one-hot encoded versions



In [69]:
# save aliases.json file
aliasoutf = savepath+'/aliases.json'
print(aliasoutf)
alias_dict = {row['drugbank_id']: row['aliases'] for row in newrows} # switch from rows to newrows
print(len(alias_dict))
print(alias_dict.__class__)

with open(aliasoutf, 'w') as fp:
    json.dump(alias_dict, fp, indent=2, sort_keys=True)

/home/sasdemo/FDAData/1_DrugBank/DrugBank_CSVs/aliases.json
13580
<class 'dict'>


In [70]:
import itertools
out = dict(itertools.islice(alias_dict.items(), 5)) 
display(out)

{'DB00001': ['Hirudin variant-1',
  'Lepirudin',
  'Lepirudin recombinant',
  'Refludan'],
 'DB00002': ['Cetuximab', 'Cetuximabum', 'Cétuximab', 'Erbitux'],
 'DB00003': ['Deoxyribonuclease (human clone 18-1 protein moiety)',
  'Dornasa alfa',
  'Dornase alfa',
  'Dornase alfa, recombinant',
  'Dornase alpha',
  'Pulmozyme',
  'Pulmozyme 1mg/ml',
  'Recombinant deoxyribonuclease (DNAse)',
  'Viscozyme'],
 'DB00004': ['Denileukin',
  'Denileukin diftitox',
  'Interleukin-2/diptheria toxin fusion protein',
  'Ontak'],
 'DB00005': ['Brenzys',
  'Davictrel',
  'Enbrel',
  'Erelzi',
  'Etanercept',
  'Recombinant human TNF',
  'TNFR-Immunoadhesin',
  'Tunex',
  'etanercept-szzs',
  'etanercept-ykro',
  'rhu TNFR:Fc',
  'rhu-TNFR:Fc']}

In [5]:
def collapse_list_values(row):
    for key, value in row.items():
        if isinstance(value, list):
            row[key] = '|'.join(value)
    return row

rows = list(map(collapse_list_values, rows))

In [6]:
display(rows.__class__)
display(rows[:2])



list

[OrderedDict([('type', 'biotech'),
              ('drugbank_id', 'DB00001'),
              ('name', 'Lepirudin'),
              ('description',
               'Lepirudin is identical to natural hirudin except for substitution of leucine for isoleucine at the N-terminal end of the molecule and the absence of a sulfate group on the tyrosine at position 63. It is produced via yeast cells. Bayer ceased the production of lepirudin (Refludan) effective May 31, 2012.'),
              ('indication',
               'For the treatment of heparin-induced thrombocytopenia'),
              ('groups', 'approved'),
              ('atc_codes', 'B01AE02'),
              ('categories',
               'Amino Acids, Peptides, and Proteins|Anticoagulants|Antithrombin Proteins|Antithrombins|Blood and Blood Forming Organs|Cardiovascular Agents|Enzyme Inhibitors|Fibrin Modulating Agents|Hematologic Agents|Peptides|Protease Inhibitors|Proteins|Serine Protease Inhibitors|Serpins|Thrombin Inhibitors'),
         

In [7]:
columns = ['drugbank_id', 'name', 'indication']
drugbank_df = pandas.DataFrame.from_dict(rows)[columns]
drugbank_df.head(10)

Unnamed: 0,drugbank_id,name,indication
0,DB00001,Lepirudin,For the treatment of heparin-induced thrombocy...
1,DB00002,Cetuximab,"Cetuximab, used in combination with irinotecan..."
2,DB00003,Dornase alfa,Used as adjunct therapy in the treatment of cy...
3,DB00004,Denileukin diftitox,For treatment of cutaneous T-cell lymphoma
4,DB00005,Etanercept,Etanercept is indicated for the treatment of m...
5,DB00006,Bivalirudin,For treatment of heparin-induced thrombocytope...
6,DB00007,Leuprolide,Leuprolide is indicated for the palliative tre...
7,DB00008,Peginterferon alfa-2a,Peginterferon alfa-2a is indicated for the tre...
8,DB00009,Alteplase,"For management of acute myocardial infarction,..."
9,DB00010,Sermorelin,"For the treatment of dwarfism, prevention of H..."


In [8]:
# write drugbank indications
path = os.path.join(savepath, 'drugbank-indication.tsv')
drugbank_df.to_csv(path, sep='\t', index=False)

In [9]:
print(path)
print(drugbank_df.shape)


/home/sasdemo/FDAData/1_DrugBank/DrugBank_CSVs/drugbank-indication.tsv
(13580, 3)


In [10]:
columns = ['drugbank_id', 'name', 'type', 'groups', 'atc_codes', 'categories', 'inchikey', 'inchi', 'description']
drugbank_df = pandas.DataFrame.from_dict(rows)[columns]
display(pandas.DataFrame(drugbank_df['type'].value_counts()))
display(drugbank_df.head(10))


Unnamed: 0,type
small molecule,11414
biotech,2166


Unnamed: 0,drugbank_id,name,type,groups,atc_codes,categories,inchikey,inchi,description
0,DB00001,Lepirudin,biotech,approved,B01AE02,"Amino Acids, Peptides, and Proteins|Anticoagul...",,,Lepirudin is identical to natural hirudin exce...
1,DB00002,Cetuximab,biotech,approved,L01XC06,"Amino Acids, Peptides, and Proteins|Antibodies...",,,Cetuximab is an epidermal growth factor recept...
2,DB00003,Dornase alfa,biotech,approved,R05CB13,"Amino Acids, Peptides, and Proteins|Cough and ...",,,Dornase alfa is a biosynthetic form of human d...
3,DB00004,Denileukin diftitox,biotech,approved|investigational,L01XX29,"ADP Ribose Transferases|Amino Acids, Peptides,...",,,A recombinant DNA-derived cytotoxic protein co...
4,DB00005,Etanercept,biotech,approved|investigational,L04AB01,"Agents reducing cytokine levels|Amino Acids, P...",,,Dimeric fusion protein consisting of the extra...
5,DB00006,Bivalirudin,small molecule,approved|investigational,B01AE06,"Amino Acids, Peptides, and Proteins|Anticoagul...",OIRCOABEOLEUMC-GEJPAHFPSA-N,InChI=1S/C98H138N24O33/c1-5-52(4)82(96(153)122...,Bivalirudin is a synthetic 20 residue peptide ...
6,DB00007,Leuprolide,small molecule,approved|investigational,L02AE51|L02AE02,Adrenal Cortex Hormones|Agents Causing Muscle ...,GFIJNRVAKGFPGQ-LIJARHBVSA-N,InChI=1S/C59H84N16O12/c1-6-63-57(86)48-14-10-2...,Leuprolide is a synthetic 9-residue peptide an...
7,DB00008,Peginterferon alfa-2a,biotech,approved|investigational,L03AB11|L03AB61,"Adjuvants, Immunologic|Alcohols|Alfa Interfero...",,,Peginterferon alfa-2a is a form of recombinant...
8,DB00009,Alteplase,biotech,approved,B01AD02|S01XA13,"Agents causing angioedema|Amino Acids, Peptide...",,,"Human tissue plasminogen activator, purified, ..."
9,DB00010,Sermorelin,biotech,approved|withdrawn,V04CD03|H01AC04,"Amino Acids, Peptides, and Proteins|Anterior P...",,,Sermorelin acetate is the acetate salt of an a...


In [11]:
drugbank_slim_df = drugbank_df[
    #drugbank_df.groups.map(lambda x: 'approved' in x) &
    #drugbank_df.inchi.map(lambda x: x is not None) &
    drugbank_df.type.map(lambda x: x == 'small molecule')
]

display(drugbank_slim_df.shape)
display(drugbank_slim_df.head())

(11414, 9)

Unnamed: 0,drugbank_id,name,type,groups,atc_codes,categories,inchikey,inchi,description
5,DB00006,Bivalirudin,small molecule,approved|investigational,B01AE06,"Amino Acids, Peptides, and Proteins|Anticoagul...",OIRCOABEOLEUMC-GEJPAHFPSA-N,InChI=1S/C98H138N24O33/c1-5-52(4)82(96(153)122...,Bivalirudin is a synthetic 20 residue peptide ...
6,DB00007,Leuprolide,small molecule,approved|investigational,L02AE51|L02AE02,Adrenal Cortex Hormones|Agents Causing Muscle ...,GFIJNRVAKGFPGQ-LIJARHBVSA-N,InChI=1S/C59H84N16O12/c1-6-63-57(86)48-14-10-2...,Leuprolide is a synthetic 9-residue peptide an...
13,DB00014,Goserelin,small molecule,approved,L02AE03,"Adrenal Cortex Hormones|Amino Acids, Peptides,...",BLCLNMBMMGCOAS-URPVMXJPSA-N,InChI=1S/C59H84N18O14/c1-31(2)22-40(49(82)68-3...,"Goserelin is a synthetic hormone. In men, it s..."
25,DB00027,Gramicidin D,small molecule,approved,R02AB30,"Amino Acids, Peptides, and Proteins|Anti-Bacte...",NDAYQJDHGXTBJL-MWWSRJDJSA-N,InChI=1S/C96H135N19O16/c1-50(2)36-71(105-79(11...,Gramcidin D is a heterogeneous mixture of thre...
33,DB00035,Desmopressin,small molecule,approved,H01BA02,"Agents that produce hypertension|Amino Acids, ...",NFLWUMRGJYTJIN-PNIOQBSNSA-N,InChI=1S/C46H64N14O12S2/c47-35(62)15-14-29-40(...,"Desmopressin (dDAVP), a synthetic analogue of ..."


In [12]:
# write drugbank tsv
path = os.path.join(savepath, 'drugbank.tsv')
drugbank_df.to_csv(path, sep='\t', index=False)
print(path)
print(drugbank_df.shape)

# write slim drugbank tsv
path = os.path.join(savepath, 'drugbank-slim.tsv')
drugbank_slim_df.to_csv(path, sep='\t', index=False)
print(path)
print(drugbank_slim_df.shape)


/home/sasdemo/FDAData/1_DrugBank/DrugBank_CSVs/drugbank.tsv
(13580, 9)
/home/sasdemo/FDAData/1_DrugBank/DrugBank_CSVs/drugbank-slim.tsv
(11414, 9)


In [13]:
protein_rows = list()
for i, drug in enumerate(root):
    drugbank_id = drug.findtext(ns + "drugbank-id[@primary='true']")
    for category in ['target', 'enzyme', 'carrier', 'transporter']:
        proteins = drug.findall('{ns}{cat}s/{ns}{cat}'.format(ns=ns, cat=category))
        for protein in proteins:
            row = {'drugbank_id': drugbank_id, 'category': category}
            row['organism'] = protein.findtext('{}organism'.format(ns))
            row['known_action'] = protein.findtext('{}known-action'.format(ns))
            actions = protein.findall('{ns}actions/{ns}action'.format(ns=ns))
            row['actions'] = '|'.join(action.text for action in actions)
            uniprot_ids = [polypep.text for polypep in protein.findall(
                "{ns}polypeptide/{ns}external-identifiers/{ns}external-identifier[{ns}resource='UniProtKB']/{ns}identifier".format(ns=ns))]            
            if len(uniprot_ids) != 1:
                continue
            row['uniprot_id'] = uniprot_ids[0]
            #ref_text = protein.findtext("{ns}references[@format='textile']".format(ns=ns))
            #pmids = re.findall(r'pubmed/([0-9]+)', ref_text)
            #row['pubmed_ids'] = '|'.join(pmids)
            protein_rows.append(row)

protein_df = pandas.DataFrame.from_dict(protein_rows)

In [14]:
display(protein_df[15:24])

Unnamed: 0,actions,category,drugbank_id,known_action,organism,uniprot_id
15,ligand,target,DB00005,unknown,Humans,P31994
16,ligand,target,DB00005,unknown,Humans,P31995
17,ligand,target,DB00005,unknown,Humans,P08637
18,ligand,target,DB00005,unknown,Humans,O75015
19,antibody,target,DB00005,yes,Humans,P01374
20,inhibitor,target,DB00006,yes,Humans,P00734
21,inhibitor,enzyme,DB00006,unknown,Humans,P05164
22,agonist,target,DB00007,yes,Humans,P30968
23,agonist,target,DB00008,yes,Humans,P48551


In [15]:
columns = ['drugbank_id', 'category', 'uniprot_id', 'organism']
entrez_df = protein_df[columns]

In [16]:
entrez_df.head()

Unnamed: 0,drugbank_id,category,uniprot_id,organism
0,DB00001,target,P00734,Humans
1,DB00002,target,P00533,Humans
2,DB00002,target,O75015,Humans
3,DB00002,target,P02745,Humans
4,DB00002,target,P02746,Humans


In [17]:
path = os.path.join(savepath, 'proteins.tsv')
#entrez_df.to_csv(path, sep=',', index=False)
entrez_df.to_csv(path, sep='\t', index=False)

print(path)
print(entrez_df.shape)


/home/sasdemo/FDAData/1_DrugBank/DrugBank_CSVs/proteins.tsv
(26965, 4)


In [18]:
len(set(entrez_df.drugbank_id))

7861

In [19]:
len(set(entrez_df.uniprot_id))

4995

In [20]:
len(entrez_df)

26965