### Import

In [115]:
import datetime
import json
import os
import time
import xml.etree.ElementTree as ET

import pandas as pd
import requests

In [2]:
from pymed import PubMed

from IPython.display import clear_output

### Settings

In [65]:
# HTTP headers sent with every direct `requests` call to the E-utilities endpoints.
headers = {
    'User-Agent': 'Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 69.0.3497.81 Safari / 537.36'
}

# Authors to look up, written as "<Lastname> <Firstname>"
# (get_items_pubmed splits the string on whitespace in that order).
authors_list = [
    'Danilov Gleb',
    'Shifrin Michael',
    'Potapov Alexander',
    'Shimansky Vadim'
]

# Client identification passed to pymed and to the esearch request.
tool = 'my_tool1'
email = 'tishankulov@nsi.ru'
# NOTE(review): a credential should not live in a notebook. The key is now read
# from the NCBI_API_KEY environment variable when it is set; the literal is kept
# only as a fallback so existing runs behave as before. Rotate the key and drop
# the fallback once the environment variable is configured.
api_key = os.environ.get('NCBI_API_KEY', '2839ed49187b099ec3d13cc079fc3ca0fc09')
# Used both as pymed's max_results and as the esearch page size (retmax).
max_results = 5000

# Directory where downloaded efetch responses are stored, one file per PMC id.
full_texts_path = 'fulltexts/'

# NCBI E-utilities endpoints used by the functions below.
url_pubmed_to_pmc = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'
url_full_text = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
url_article_ids = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'

### Functions

In [111]:
def get_items_pubmed(fullname):
    """
    Query PubMed through the pymed library for articles authored by <fullname>.

    Parameters
    ----------
    fullname : str
        Author name written as "<Lastname> <Firstname>". Only the first two
        whitespace-separated tokens are used to build the query.

    Returns
    -------
    pandas.DataFrame
        One row per article. The standard columns listed below always come
        first (and are present even when nothing is found); any additional
        keys returned by pymed are kept as extra columns after them.

    Raises
    ------
    IndexError
        If <fullname> contains fewer than two tokens.
    """
    columns = ['fullname', 'pubmed_id', 'title', 'abstract', 'keywords', 'journal', 'publication_date',
               'authors', 'affiliations', 'methods', 'conclusions', 'results', 'copyrights', 'doi']

    name_parts = fullname.split()
    lastname, firstname = name_parts[0], name_parts[1]
    query = '{0}, {1}[Author]'.format(lastname, firstname)

    pubmed = PubMed(tool=tool, email=email)

    # Collect plain dicts and build the DataFrame once at the end:
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    records = []
    for result in pubmed.query(query, max_results=max_results):
        record = result.toDict()
        authors = record.get('authors') or []
        record['fullname'] = fullname
        # Missing affiliations / name parts are treated as empty instead of
        # letting str.join fail on None.
        record['affiliations'] = '; '.join(author.get('affiliation') or '' for author in authors)
        record['authors'] = '; '.join(
            ' '.join(part for part in (author.get('lastname'), author.get('firstname')) if part)
            for author in authors
        )
        record['keywords'] = '; '.join(
            keyword for keyword in (record.get('keywords') or []) if keyword is not None
        )
        pubmed_id = record.get('pubmed_id')
        if pubmed_id:
            # The id field may span several lines; flatten it to one cell value.
            record['pubmed_id'] = pubmed_id.replace('\n', '; ')
        # The raw XML element is not wanted in the tabular output.
        record.pop('xml', None)
        records.append(record)

    items = pd.DataFrame(records)
    extra_columns = [column for column in items.columns if column not in columns]
    return items.reindex(columns=columns + extra_columns)

In [112]:
def add_pmc_id(items):
    """
    Add a 'pmc_id' column to <items> by asking elink for each row's PubMed id.

    Parameters
    ----------
    items : pandas.DataFrame
        Must contain a 'pubmed_id' column. The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same <items> object. Rows for which no PMC link was returned, or
        whose request failed, keep NaN in 'pmc_id'.
    """
    params = {
        'format': 'json',
        'dbfrom': 'pubmed',
        'linkname': 'pubmed_pmc',
        'api_key': api_key
    }

    if 'pmc_id' not in items.columns:
        # Object dtype so that ids can later be stored next to NaN placeholders.
        # `pd.np` no longer exists in current pandas, hence float('nan').
        placeholder = pd.Series([float('nan')] * len(items), index=items.index, dtype=object)
        items.insert(min(2, len(items.columns)), 'pmc_id', placeholder)

    # Positional access: works for any index, not only the default RangeIndex.
    pubmed_col = items.columns.get_loc('pubmed_id')
    pmc_col = items.columns.get_loc('pmc_id')

    for position in range(len(items)):
        params['id'] = items.iloc[position, pubmed_col]
        try:
            response = requests.get(url=url_pubmed_to_pmc, headers=headers, params=params)
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError):
            # ValueError covers a body that is not valid JSON.
            print('Problem has occured with Pubmed ID: {0}'.format(params['id']))
            continue

        # Walk linksets[0].linksetdbs[0].links[0] without assuming any level exists.
        linksets = data.get('linksets') if isinstance(data, dict) else None
        linksetdbs = linksets[0].get('linksetdbs') if linksets else None
        links = linksetdbs[0].get('links') if linksetdbs else None
        if links:
            items.iloc[position, pmc_col] = links[0]
    return items

In [486]:
def _collect_text(element):
    """Concatenate every non-empty text and tail found by element.iter(), each followed by a space."""
    pieces = []
    for node in element.iter():
        if node.text:
            pieces.append(node.text + ' ')
        if node.tail:
            pieces.append(node.tail + ' ')
    return ''.join(pieces)


def _find_text(parent, path):
    """Return the .text of the first element matching <path>, or None when there is no match."""
    found = parent.find(path)
    return found.text if found is not None else None


def parse_element_tree(root):
    """
    Extract article fields from a parsed efetch response.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Element whose descendants include 'article/front/article-meta' and,
        optionally, 'article/body'.

    Returns
    -------
    dict
        Always contains 'article-type', 'abstract_len' and 'full_text_len'.
        For each article-meta found it also holds one entry per article-id
        (keyed by its 'pub-id-type'), 'category' and 'title' (None when the
        element is absent) and, when present, 'copyright', 'license-type',
        'license' and 'abstract'. 'full_text' is added when the body has text.
    """
    item = {}

    article = root.find('article')                                                      # Article type
    item['article-type'] = article.get('article-type') if article is not None else None

    for meta in root.findall('article/front/article-meta'):                             # Article meta
        for article_id in meta.findall('article-id'):                                   # Article ids
            item[article_id.get('pub-id-type')] = article_id.text

        item['category'] = _find_text(meta, 'article-categories/subj-group/subject')    # Article category
        item['title'] = _find_text(meta, 'title-group/article-title')                   # Article title

        # TODO: authors, publication date

        # Elements must be compared with None: the truth value of an Element
        # reflects whether it has children, so `if element:` skipped every
        # leaf element (e.g. a copyright statement holding only text).
        copyright_statement = meta.find('permissions/copyright-statement')              # Copyright statement
        if copyright_statement is not None:
            item['copyright'] = copyright_statement.text
        license_element = meta.find('permissions/license')                              # License type
        if license_element is not None:
            item['license-type'] = license_element.get('license-type')

        for key, path in (('license', 'permissions/license/license-p'),                 # License text
                          ('abstract', 'abstract')):                                    # Abstract
            element = meta.find(path)
            if element is not None:
                text = item.get(key, '') + _collect_text(element)
                if text:
                    item[key] = text

        # TODO: journal meta, keywords

    body = root.find('article/body')                                                    # Full text
    if body is not None:
        text = item.get('full_text', '') + _collect_text(body)
        if text:
            item['full_text'] = text

    item['abstract_len'] = len(item.get('abstract', ''))
    item['full_text_len'] = len(item.get('full_text', ''))

    return item

In [450]:
def get_element_tree(pmc_id):
    """
    Load the stored response for <pmc_id> and parse it as XML.

    The file is looked up under `full_texts_path` using str(pmc_id) as its
    name. Returns the parsed root element, or None when no such file exists.
    """
    stored_path = os.path.join(full_texts_path, str(pmc_id))

    if os.path.exists(stored_path):
        with open(stored_path, 'r') as handle:
            xml_text = handle.read()
        return ET.fromstring(xml_text)

    return None

In [50]:
def get_article_ids(query):
    """
    Return the ids of all PMC articles that esearch finds for <query>.

    Results are fetched page by page (`max_results` ids per request). A page
    whose request fails is retried a few times before it is skipped, so a
    transient error no longer drops a whole page silently.

    Parameters
    ----------
    query : str
        Search term passed to esearch as 'term'.

    Returns
    -------
    list of int
        Article ids in the order returned. Empty when even the first page
        could not be retrieved.
    """
    params = {
        'db': 'pmc',
        'tool': tool,
        'email': email,
        'term': query,
        'retmax': max_results,
        'retstart': 0
    }
    max_attempts = 3
    article_ids = []
    count = None  # total number of hits; unknown until a page has been parsed

    while count is None or params['retstart'] < count:
        root = None
        for attempt in range(1, max_attempts + 1):
            try:
                response = requests.get(url=url_article_ids, headers=headers, params=params)
                response.raise_for_status()
                root = ET.fromstring(response.text)
                break
            except (requests.RequestException, ET.ParseError):
                print('Problem has occured at retstart: {0}'.format(params['retstart']))
                if attempt < max_attempts:
                    time.sleep(1)  # brief pause before retrying the same page

        if root is not None:
            for article_id in root.iter('Id'):
                article_ids.append(int(article_id.text))
            count_element = root.find('Count')
            if count_element is not None and count_element.text:
                count = int(count_element.text)

        if count is None:
            # Without a total there is no way to know how many pages exist.
            break
        # Reached after success, or after giving up on this page (best effort).
        params['retstart'] += params['retmax']

    return article_ids

In [96]:
def download_query_response(pmc_id, refresh=False):
    """
    Save the efetch response for article <pmc_id> to a file.

    The file is stored under `full_texts_path` and named str(pmc_id).

    Parameters
    ----------
    pmc_id : int or str
        PMC identifier of the article.
    refresh : bool
        When False (default) an already stored file is left untouched and no
        request is made. When True the article is downloaded again.

    Returns
    -------
    None
        Failures are reported with a printed message; nothing is written then.
    """
    filename = os.path.join(full_texts_path, str(pmc_id))
    if os.path.exists(filename) and not refresh:
        return

    params = {
        'db': 'pmc',
        'id': pmc_id,
        'api_key': api_key
    }

    try:
        response = requests.get(url=url_full_text, headers=headers, params=params)
        # Do not store HTTP error pages: once a file exists it counts as
        # downloaded and is never fetched again unless refresh=True.
        response.raise_for_status()
    except requests.RequestException:
        print('Problem has occured with PMC ID: {0}'.format(pmc_id))
        return

    # Create the storage directory on first use.
    os.makedirs(os.path.dirname(filename) or '.', exist_ok=True)
    with open(filename, 'w') as f:
        f.write(response.text)

In [113]:
def download_all_query_responses(query='neurosurgery', refresh=False):
    """
    Look up every article id matching <query> and store each response on disk.

    <refresh> is forwarded unchanged to download_query_response. Prints the
    number of ids found before downloading and the number of entries in
    `full_texts_path` afterwards.
    """
    found_ids = get_article_ids(query)
    print('{0} articles found, downloading...'.format(len(found_ids)))

    for found_id in found_ids:
        download_query_response(found_id, refresh)

    stored_entries = os.listdir(full_texts_path)
    print('{0} articles stored in the database'.format(len(stored_entries)))

### Main

In [51]:
# Fetch the PubMed records of the first configured author.
# The function is defined as get_items_pubmed; the former name get_items
# does not exist in this notebook and raised NameError on a fresh kernel.
items = get_items_pubmed(authors_list[0])
items.head(2)

Unnamed: 0,fullname,pubmed_id,title,abstract,keywords,journal,publication_date,authors,affiliations,methods,conclusions,results,copyrights,doi
0,Danilov Gleb,31349300,An Information Extraction Algorithm for Detect...,"Rich-in-morphology language, such as Russian, ...",Adverse Events; Electronic Health Records; Nat...,Studies in health technology and informatics,2019-07-28,Danilov Gleb; Shifrin Michael; Strunina Uliya;...,National Medical Research Center for Neurosurg...,,,,,10.3233/SHTI190051
1,Danilov Gleb,31121374,A Single-Center Retrospective Descriptive Coho...,Cerebrospinal fluid (CSF) leak remains a signi...,Cerebrospinal fluid leakage; Children; Craniot...,World neurosurgery,2019-05-24,Kushel Yury; Danilov Gleb; Tekoev Aslan; Cheld...,"2nd Neurosurgical Department, Burdenko Neurosu...",,,Postoperative wound CSF leakage was observed i...,Copyright © 2019 Elsevier Inc. All rights rese...,10.1016/j.wneu.2019.05.091


In [52]:
# Look up a PMC id for every PubMed record (one elink request per row).
# add_pmc_id modifies `items` in place and returns the same frame.
items = add_pmc_id(items)
items.head(2)

Unnamed: 0,fullname,pubmed_id,pmc_id,title,abstract,keywords,journal,publication_date,authors,affiliations,methods,conclusions,results,copyrights,doi
0,Danilov Gleb,31349300,,An Information Extraction Algorithm for Detect...,"Rich-in-morphology language, such as Russian, ...",Adverse Events; Electronic Health Records; Nat...,Studies in health technology and informatics,2019-07-28,Danilov Gleb; Shifrin Michael; Strunina Uliya;...,National Medical Research Center for Neurosurg...,,,,,10.3233/SHTI190051
1,Danilov Gleb,31121374,,A Single-Center Retrospective Descriptive Coho...,Cerebrospinal fluid (CSF) leak remains a signi...,Cerebrospinal fluid leakage; Children; Craniot...,World neurosurgery,2019-05-24,Kushel Yury; Danilov Gleb; Tekoev Aslan; Cheld...,"2nd Neurosurgical Department, Burdenko Neurosu...",,,Postoperative wound CSF leakage was observed i...,Copyright © 2019 Elsevier Inc. All rights rese...,10.1016/j.wneu.2019.05.091


In [121]:
# Create the target folder first so the export also works on a fresh checkout.
os.makedirs('output', exist_ok=True)
items.to_excel('output/pubmed-pymed.xlsx')

### Download Neurosurgery articles

In [114]:
# Download every PMC article matching 'neurosurgery'.
# With refresh=False, articles whose file already exists are not requested again.
download_all_query_responses(query='neurosurgery', refresh=False)

121183 articles found, downloading...
121183 articles stored in the database


### Handle query responses

In [None]:
# Names of all stored responses (each file is named after its PMC id).
filenames = os.listdir(full_texts_path)

### Debug

In [452]:
# Load one stored article for interactive inspection.
root = get_element_tree(6069828)

In [491]:
# Parse a second stored article; the result below shows a book review for
# which both abstract_len and full_text_len are 0.
# Note: this rebinds `root`, so later cells that use `root` see this article.
root = get_element_tree(496205)
item = parse_element_tree(root)
item

{'article-type': 'book-review',
 'pmc': '496205',
 'category': 'Book Reviews',
 'title': 'STROKES DUE TO VERTEBRO-BASILAR DISEASE',
 'abstract_len': 0,
 'full_text_len': 0}

In [498]:
# Parse the first 1000 stored responses into one table.
# Rows are collected in a list and the DataFrame is built once:
# DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
filenames = os.listdir(full_texts_path)
records = []
for filename in filenames[0:1000]:
    root = get_element_tree(filename)
    if root is None:
        # File disappeared between listing and reading; nothing to parse.
        continue
    records.append(parse_element_tree(root))
items = pd.DataFrame(records)
items

Unnamed: 0,abstract,abstract_len,article-type,category,full_text_len,pmc,pmid,title,publisher-id
0,\n \n As part of a prospective sur...,1555.0,research-article,Research Article,0.0,1000359,496443,The neuropsychiatric disorder in systemic lupu...,
1,,0.0,book-review,Book Reviews,0.0,1000485,,Clinical Aspects of Neuroimmunology,
2,"\n \n Two popliteal swellings, tho...",462.0,research-article,Research Article,0.0,1001222,6696518,Popliteal masses masquerading as popliteal cysts.,
3,,0.0,research-article,Research Article,0.0,1001788,4083935,Rheumatoid subluxations of the cervical spine.,
4,\n \n Cervical myelopathy is a rar...,1292.0,research-article,Research Article,0.0,1001789,4083936,Surgical treatment of cervical cord compressio...,
5,\n \n Material from 100 consecutiv...,958.0,research-article,Research Article,0.0,1001857,3954473,Amyloid in intervertebral discs: a histopathol...,
6,,0.0,research-article,Research Article,0.0,1002279,2309487,Perceived health quackery use among patients. ...,
7,\n \n \n \n Images...,167.0,research-article,Research Article,0.0,1002313,2305578,Intracranial tuberculoma developing during the...,
8,\n \n Multiple myeloma associated ...,1147.0,review-article,Research Article,0.0,1002325,2185597,"Syndrome of plasma cell dyscrasia, polyneuropa...",
9,\n \n \n \n Images...,129.0,research-article,Research Article,0.0,1002389,2161587,A 36-year-old woman with a pulsatile mass of t...,


In [451]:
# Scratch prototype of parse_element_tree working on the current global `root`.
# Optional elements are now checked against None: when this notebook is run top
# to bottom, `root` is the book review loaded above, which has no permissions,
# abstract or body, and the unguarded lookups raised AttributeError.
item = {}

item['article-type'] = root.find('article').get('article-type')                 # Article type

article_meta = root.findall('article/front/article-meta')                       # Article meta
for meta in article_meta:
    article_ids = meta.findall('article-id')                                    # Article ids
    for article_id in article_ids:
        item[article_id.get('pub-id-type')] = article_id.text

    item['category'] = meta.find('article-categories/subj-group/subject').text  # Article category
    item['title'] = meta.find('title-group/article-title').text                 # Article title

    # TODO: authors, publication date

    copyright_statement = meta.find('permissions/copyright-statement')          # Copyright statement
    if copyright_statement is not None:
        item['copyright'] = copyright_statement.text
    license_element = meta.find('permissions/license')                          # License type
    if license_element is not None:
        item['license-type'] = license_element.get('license-type')
    license = meta.find('permissions/license/license-p')                        # License text
    if license is not None:
        for element in license.iter():
            if element.text:
                item['license'] = item.get('license', '') + element.text + ' '
            if element.tail:
                item['license'] = item.get('license', '') + element.tail + ' '
    abstract = meta.find('abstract')                                            # Abstract
    if abstract is not None:
        for element in abstract.iter():
            if element.text:
                item['abstract'] = item.get('abstract', '') + element.text + ' '
            if element.tail:
                item['abstract'] = item.get('abstract', '') + element.tail + ' '

    # TODO: journal meta, keywords

body = root.find('article/body')                                                # Full text
if body is not None:
    for element in body.iter():
        if element.text:
            item['full_text'] = item.get('full_text', '') + element.text + ' '
        if element.tail:
            item['full_text'] = item.get('full_text', '') + element.tail + ' '

#item

In [343]:
# Collect the article-meta elements of the currently loaded `root`.
article_meta = root.findall('article/front/article-meta')
article_meta

[<Element 'article-meta' at 0x000001E11A85DE58>]

In [243]:
# Map each article-id's 'pub-id-type' attribute to its text, using the
# `article_meta` list built in a previous cell.
test = {}
for meta in article_meta:
    article_ids = meta.findall('article-id')
    for article_id in article_ids:
        test[article_id.get('pub-id-type')] = article_id.text
test

{'pmid': '30083313',
 'pmc': '6069828',
 'publisher-id': '383',
 'doi': '10.1186/s13756-018-0383-4'}

In [342]:
# Print the text and tail of every node inside each <license-p> element of the
# first article-meta. Relies on `article_meta` from a previous cell.
# Note: `license` here is an iterator over all descendants, not a license element.
license = article_meta[0].iter()
# print(license)
for element in license:
    #print(element.tag)
    if element.tag == 'license-p':
        #print(element.text, element.tail)
        for l in element.iter():
            # Nodes without text are skipped; the tail may still print as None.
            if l.text:
                print(l.text, l.tail)

Open Access This article is distributed under the terms of the Creative Commons Attribution 4.0 International License (
http://creativecommons.org/licenses/by/4.0/ ), which permits unrestricted use, distribution, and reproduction in any medium, provided you give appropriate credit to the original author(s) and the source, provide a link to the Creative Commons license, and indicate if changes were made. The Creative Commons Public Domain Dedication waiver (
http://creativecommons.org/publicdomain/zero/1.0/ ) applies to the data made available in this article, unless otherwise stated.


In [425]:
# Print the abstract text of the currently loaded `root`.
a = ''
article_meta = root.findall('article/front/article-meta')
for meta in article_meta:
    abstract = meta.find('abstract')
    if abstract is None:
        # Articles without an abstract are skipped instead of raising AttributeError.
        continue
    for element in abstract.iter():
        # text/tail may be None; str.join would raise TypeError on them.
        a += ' '.join([element.text or '', element.tail or ''])
print(a)


         
      
           
        Background 
          The impact of infection prevention and control (IPC) programs in limited resource countries such as Russia are largely unknown due to a lack of reliable data. The aim of this study is to evaluate the effect of an IPC program with respect to healthcare associated infection (HAI) prevention and to define the incidence of HAIs in a Russian ICU. 
        
           
        Methods 
          A pioneering IPC program was implemented in a neuro-ICU at Burdenko Neurosurgery Institute in 2010 and included hand hygiene, surveillance, contact precautions, patient isolation, and environmental cleaning measures. This prospective observational cohort study lasted from 2011 to 2016, included high-risk ICU patients, and evaluated the dynamics of incidence, etiological spectrum, and resistance profile of four types of HAIs, including subgroup analysis of device-associated infections. Survival analysis compared patients with and without HAIs