In [14]:
# import module minidom and libraries requests, time, json.
import requests
import xml.dom.minidom as m
import time
import json

In [28]:
# get_id_of_disease() returns the list of PubMed IDs that match a disease search term.
def get_id_of_disease(disease, year=2019, retmax=1000):
    """Return the PubMed IDs of papers matching ``disease`` published in ``year``.

    Parameters
    ----------
    disease : str
        Plain-text search term (e.g. ``'Alzheimers'`` or ``'breast cancer'``).
        Pass it un-encoded; URL escaping is handled here.
    year : int or str, optional
        Publication year used in the ``[pdat]`` filter. Defaults to 2019.
    retmax : int, optional
        Maximum number of IDs requested from ESearch. Defaults to 1000.

    Returns
    -------
    list of str
        The text of every non-empty ``<Id>`` element in the ESearch response,
        in document order.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # Let requests build the query string so that spaces, '&', '#', ... in the
    # search term are escaped instead of silently corrupting the URL.
    r = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
        params={
            "db": "pubmed",
            "term": f"{disease} AND {year}[pdat]",
            "retmode": "xml",
            "retmax": retmax,
        },
        # never hang forever on a stalled connection.
        timeout=30,
    )
    # fail loudly on an HTTP error instead of trying to parse an error page as XML.
    r.raise_for_status()
    # pause after each request (kept from the original; presumably to stay
    # within the NCBI request-rate guidelines).
    time.sleep(1)
    # parse the raw bytes so the XML parser applies the encoding declared in the document.
    doc = m.parseString(r.content)
    # reference: https://stackoverflow.com/questions/317413/get-element-value-with-minidom-with-python
    # an empty <Id/> has no text child, so it is skipped rather than raising AttributeError.
    return [
        node.firstChild.data
        for node in doc.getElementsByTagName('Id')
        if node.firstChild is not None
    ]

In [29]:
# overlap_in_two_papers() reports which papers appear in the search results of both diseases.
def overlap_in_two_papers(disease1,disease2):
    """Report and return the PubMed IDs found for both ``disease1`` and ``disease2``.

    The ID lists are obtained with ``get_id_of_disease()`` and intersected.

    Returns
    -------
    None
        When the two result sets share no paper.
    str
        The single shared PubMed ID when there is exactly one.
    list of str
        The shared PubMed IDs, sorted, when there is more than one.

    The return shape deliberately differs by case, as in the original
    implementation, so existing callers that index with the single ID keep working.
    """
    ids1 = set(get_id_of_disease(disease1))
    ids2 = set(get_id_of_disease(disease2))
    # sort the intersection: iteration order of a set of strings is not stable
    # between interpreter runs, which made the returned list non-reproducible.
    overlap = sorted(ids1 & ids2)

    if not overlap:
        print('There is no overlap in the two sets of papers that I identified.')
        return None
    if len(overlap) == 1:
        print(f"There is a overlap in the two sets of papers that I identified, the Pubmed Id is {overlap[0]}.")
        return overlap[0]
    # more than one shared paper.
    print(f"There are overlaps in the two sets of papers that I identified, the Pubmed Ids are {overlap}.")
    return overlap

In [30]:
# Report which papers are returned for both the 'Alzheimers' and the 'cancer' queries.
overlap_in_two_papers(disease1='Alzheimers', disease2='cancer')

There is a overlap in the two sets of papers that I identified, the Pubmed Id is 32501203.


'32501203'

In [18]:
# pull_metadata() and its helpers pull the metadata for each paper of a specific disease.
def _node_text(node):
    """Return all text contained in ``node``, descending through nested markup such as <i>/<sub>."""
    if node.nodeType in (m.Node.TEXT_NODE, m.Node.CDATA_SECTION_NODE):
        return node.data
    # elements: concatenate the text of every descendant; comments etc. have no children -> "".
    return "".join(_node_text(child) for child in node.childNodes)


def _tag_text(doc, tag):
    """Return the text of every ``tag`` element in ``doc``, sections joined by a single space."""
    return " ".join(_node_text(elm) for elm in doc.getElementsByTagName(tag))


def _mesh_terms(doc):
    """Return the descriptor name of every ``MeshHeading`` element in ``doc``."""
    terms = []
    for heading in doc.getElementsByTagName('MeshHeading'):
        # look the descriptor up by tag name: relying on ``firstChild`` breaks as
        # soon as the heading starts with whitespace text.
        descriptors = heading.getElementsByTagName('DescriptorName')
        if not descriptors:
            continue
        name = _node_text(descriptors[0])
        if name:
            terms.append(name)
    return terms


def pull_metadata(disease):
    """Fetch title, abstract and MeSH terms for every paper found for ``disease``.

    The PubMed IDs come from ``get_id_of_disease(disease)``; each ID is then
    fetched individually from EFetch, with a one-second pause before each request.

    Parameters
    ----------
    disease : str
        Search term; it is also stored as the ``'Query'`` value of every record.

    Returns
    -------
    dict
        Maps each PubMed ID (str) to a dict with the keys ``'ArticleTitle'``
        (str), ``'AbstractText'`` (str), ``'Query'`` (the ``disease`` argument)
        and ``'Mesh'`` (list of str).

    Raises
    ------
    requests.HTTPError
        If EFetch answers with an error status.

    Notes
    -----
    The result is also bound to the module-level name ``paper`` (kept from the
    original so other cells that read ``paper`` keep working). Because it is
    bound before the loop, a partially filled dict remains reachable if a
    request fails part-way through.
    """
    global paper
    # run function get_id_of_disease(), to get the PubmedId list of a specific disease.
    IdList = get_id_of_disease(disease)
    # create dictionary paper, to save the metadata for each paper of this disease.
    paper = {}
    for PubmedId in IdList:
        # suspend execution for one second before each request.
        time.sleep(1)
        # send a POST request for this PubmedId; requests builds the query string.
        r = requests.post(
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
            params={"db": "pubmed", "retmode": "xml", "id": int(PubmedId)},
            # never hang forever on a stalled connection.
            timeout=30,
        )
        # fail loudly on an HTTP error instead of trying to parse an error page as XML.
        r.raise_for_status()
        # parse the raw bytes so the XML parser applies the encoding declared in the document.
        doc = m.parseString(r.content)

        # set the dictionary key as PubmedId, dictionary values include each paper's title, abstract, MeSH terms and query.
        # reference: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=xml&id=32501203
        paper[PubmedId] = {
            'ArticleTitle': _tag_text(doc, 'ArticleTitle'),
            'AbstractText': _tag_text(doc, 'AbstractText'),
            'Query': disease,
            'Mesh': _mesh_terms(doc)
        }
    # return the dictionary paper.
    return paper
    

In [20]:
# pull the metadata of each query into its own dictionary, then merge them into
# a new dictionary all_data so the Alzheimer's result is not mutated in place.
alzheimers_data = pull_metadata('Alzheimers')
cancer_data = pull_metadata('cancer')
all_data = {**alzheimers_data, **cancer_data}
# run function overlap_in_two_papers(), to report the overlap in the two sets of papers.
# Its result may be None, a single ID or a list of IDs, so it is not used as a dictionary key.
overlap_PubmedId = overlap_in_two_papers('Alzheimers','cancer')
# tag every paper that was actually pulled for both queries; this works for
# zero, one or many shared papers and only touches IDs present in all_data.
for shared_id in alzheimers_data.keys() & cancer_data.keys():
    all_data[shared_id]['Query'] = ['Alzheimers','cancer']

# save the dictionary all_data into a JSON file paper.json.
# 'w' (not 'a'): appending a second JSON document on a re-run would make the file invalid.
with open('paper.json','w',encoding='utf-8') as f:
    json.dump(all_data,f)


There is a overlap in the two sets of papers that I identified, the pubmed id is 32501203.
