In [1]:
# import libraries
import pandas as pd
import requests
import xml.dom.minidom as m
import xml.etree.ElementTree as et
import time as time
import json

In [2]:
# Ask ESearch for the PubMed ids of 2019 Alzheimer (al) articles; the query
# string requests at most 1000 of them (retmax=1000).
r_al_ids = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=Alzheimers+AND+2019[pdat]&retmax=1000&retmode=xml")
# Stop here on an HTTP error instead of parsing an error body as a result set.
r_al_ids.raise_for_status()

# parse returned response
al_doc = m.parseString(r_al_ids.text)
al_idlist = al_doc.getElementsByTagName("Id")

# Collect the text of every <Id> element that was actually returned.
# Iterating over the node list (instead of indexing positions 0..999) avoids
# an IndexError when the search yields fewer than 1000 ids.
al_ids = [id_node.childNodes[0].wholeText for id_node in al_idlist]


In [3]:
# Fetch the full records for the collected ids with a single EFetch POST.
# The ids go in the request body; requests form-encodes a list value as one
# `id` field per element.
al_param = {'db': 'pubmed', 'retmode': 'xml', 'id' : al_ids}
r_al = requests.post("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi", data = al_param)
print(r_al.status_code)
# Fail fast on a non-2xx answer so later cells never write or parse an error
# body as if it were article XML.
r_al.raise_for_status()

200


In [4]:
# Keep a raw copy of the EFetch response on disk. Despite the file name,
# what gets written is the response body text, encoded as UTF-8 bytes.
with open("headers", mode="wb") as raw_out:
    raw_out.write(r_al.text.encode("utf-8"))

In [5]:
def _element_text(elem):
    """Return the text contained in ``elem``, without surrounding whitespace."""
    # itertext() yields the element's own text and that of its descendants
    # but, unlike et.tostring(..., method="text"), leaves out elem.tail, so
    # whitespace that follows the closing tag does not leak into the value.
    return "".join(elem.itertext()).strip()


def xml_to_dict(root, dict, query):
    """Read an XML tree of papers into a dictionary keyed by PMID.

    Each direct child of ``root`` is treated as one paper. For every paper
    that has a ``PMID`` element, an entry ``dict[pmid]`` is stored with the
    keys ``"title"``, ``"abstract"``, ``"query"`` and ``"mesh"``.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Element whose children are the paper records.
    dict : dict
        Dictionary to fill. It is updated in place and also returned.
    query : str
        Label stored unchanged under ``"query"`` in every entry.

    Returns
    -------
    dict
        The same object passed in as ``dict``.

    Notes
    -----
    * The title comes from ``ArticleTitle``, falling back to ``BookTitle``.
    * A missing title, abstract or mesh heading is stored as ``"N/A"``.
    * Papers without a ``PMID`` element are skipped, because there is no
      key to store them under.
    """
    for paper in root:
        pmid_elem = paper.find(".//PMID")
        if pmid_elem is None:
            continue
        pmid = _element_text(pmid_elem)

        # Explicit None checks: an Element without children is falsy, so
        # `a or b` would wrongly discard a title element that has only text.
        title_elem = paper.find(".//ArticleTitle")
        if title_elem is None:
            title_elem = paper.find(".//BookTitle")

        abstract_elem = paper.find(".//Abstract")

        # NOTE(review): find() returns only the first MeshHeading of the
        # paper; confirm whether all headings were meant to be collected.
        mesh_elem = paper.find(".//MeshHeading")

        # inner dictionary where dict[pmid] = {title, abstract, query, mesh}
        dict[pmid] = {
            "title": "N/A" if title_elem is None else _element_text(title_elem),
            "abstract": "N/A" if abstract_elem is None else _element_text(abstract_elem),
            "query": query,
            "mesh": "N/A" if mesh_elem is None else _element_text(mesh_elem),
        }
    return dict

In [6]:
# Parse the EFetch XML for the Alzheimer query and build the per-article
# dictionary, starting from an empty one.
al_root = et.fromstring(r_al.text)
al_dict = xml_to_dict(al_root, {}, "Alzheimer's")

# Persist the dictionary as indented JSON.
with open("Alzheimer.json", mode="w") as json_out:
    json.dump(al_dict, json_out, indent=4)

In [7]:
# Ask ESearch for the PubMed ids of 2019 cancer (cn) articles; the query
# string requests at most 1000 of them (retmax=1000).
r_cn_ids = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=cancer+AND+2019[pdat]&retmax=1000&retmode=xml")
# Stop here on an HTTP error instead of parsing an error body as a result set.
r_cn_ids.raise_for_status()

# parse returned response
cn_doc = m.parseString(r_cn_ids.text)
cn_idlist = cn_doc.getElementsByTagName("Id")

# Collect the text of every <Id> element that was actually returned.
# Iterating over the node list (instead of indexing positions 0..999) avoids
# an IndexError when the search yields fewer than 1000 ids.
cn_ids = [id_node.childNodes[0].wholeText for id_node in cn_idlist]

In [8]:
# Fetch the full records for the collected ids with a single EFetch POST.
# The ids go in the request body; requests form-encodes a list value as one
# `id` field per element.
cn_param = {'db': 'pubmed', 'retmode': 'xml', 'id': cn_ids}
r_cn = requests.post("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi", data = cn_param)

print(r_cn.status_code)
# Fail fast on a non-2xx answer so later cells never parse an error body as
# if it were article XML.
r_cn.raise_for_status()

200


In [9]:
# Parse the EFetch XML for the cancer query and build the per-article
# dictionary, starting from an empty one.
cn_root = et.fromstring(r_cn.text)
cn_dict = xml_to_dict(cn_root, {}, "Cancer")

# Persist the dictionary as indented JSON.
with open("Cancer.json", mode="w") as json_out:
    json.dump(cn_dict, json_out, indent=4)