### Import

In [78]:
import pandas as pd
import datetime
import time
import requests
#import xmltodict
#import xml
import os
import json

In [80]:
from pymed import PubMed
import xml.etree.ElementTree as ET

### Settings

In [79]:
headers = {
    'User-Agent': 'Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 69.0.3497.81 Safari / 537.36'
}

authors_list = [
    'Danilov Gleb',
    'Shifrin Michael',
    'Potapov Alexander',
    'Shimansky Vadim'
]

tool = 'my_tool1'
email = 'tishankulov@nsi.ru'
max_results = 500

full_texts_path = 'fulltexts/'

url_pubmed_to_pmc = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'
url_full_text = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'

### Functions

In [4]:
def get_items(fullname):
    """
    Makes request to Pubmed database.
    Returns DataFrame with all articles authored by <fullname>.
    """
    items = pd.DataFrame(columns=['fullname', 'pubmed_id', 'title', 'abstract', 'keywords', 'journal', 'publication_date',
                                  'authors', 'affiliations', 'methods', 'conclusions', 'results', 'copyrights', 'doi'])
    
    lastname = fullname.split()[0]
    firstname = fullname.split()[1]
    query = '{0}, {1}[Author]'.format(lastname, firstname)
    
    pubmed = PubMed(tool=tool, email=email)
    results = pubmed.query(query, max_results=max_results)
    
    while True:
        try:
            result = next(results)
            result_dict = result.toDict()
            result_dict['fullname'] = fullname
            result_dict['affiliations'] = [author['affiliation'] for author in result_dict['authors']]
            result_dict['affiliations'] = '; '.join(result_dict['affiliations'])
            result_dict['authors'] = [' '.join([author['lastname'], author['firstname']]) for author in result_dict['authors']]
            result_dict['authors'] = '; '.join(result_dict['authors'])
            result_dict['keywords'] = '; '.join(result_dict['keywords'])
            result_dict['pubmed_id'] = result_dict['pubmed_id'].replace('\n', '; ')
            del result_dict['xml']
            items = items.append(result_dict, ignore_index=True)
        except StopIteration:
            break
    
    return items

In [28]:
def add_pmc_id(items):
    """Makes request to Pubmed to get PMC ID by Pubmed ID in <items>"""
    params = {
        'format': 'json',
        'dbfrom': 'pubmed',
        'linkname': 'pubmed_pmc',
        'tool': tool,
        'email': email
    }
    
    if not 'pmc_id' in items.columns:
        items.insert(2, 'pmc_id', pd.np.nan)
    
    for i in range(0, len(items)):
        params['id'] = items.loc[i, 'pubmed_id']
        try:
            response = requests.get(url=url_pubmed_to_pmc, headers=headers, params=params)
        except requests.RequestException:
            print('Problem has occured with Pubmed ID: {0}'.format(params['id']))
        else:
            data = response.json()
            if 'linksetdbs' in data['linksets'][0]:
                items.loc[i, 'pmc_id'] = data['linksets'][0]['linksetdbs'][0]['links'][0]
    return items

In [81]:
def get_full_text(pmc_id, download=True, refresh=False):
    """Returns full text of article with <pmc_id> identifier"""
    params = {
        'db': 'pmc',
        'id': pmc_id,
        'tool': tool,
        'email': email
    }
    data = ''
    full_text = ''
    
    filename = os.path.join(full_texts_path, str(pmc_id))
    if (os.path.exists(filename)) and (not refresh):
        with open(filename, 'r') as f:
            data = f.read()
    else:
        try:
            response = requests.get(url=url_full_text, headers=headers, params=params)
        except requests.RequestException:
            print('Problem has occured with PMC ID: {0}'.format(pmc_id))
        else:
            data = response.text
            if download:
                with open(filename, 'w+') as f:
                    f.write(data)
    if data:
        root = ET.fromstring(data)
        for p in root.iter('p'):
            full_text += p.text
    
    return full_text

### Main

In [8]:
items = get_items(authors_list[0])
items.head()

Unnamed: 0,fullname,pubmed_id,title,abstract,keywords,journal,publication_date,authors,affiliations,methods,conclusions,results,copyrights,doi
0,Danilov Gleb,31349300,An Information Extraction Algorithm for Detect...,"Rich-in-morphology language, such as Russian, ...",Adverse Events; Electronic Health Records; Nat...,Studies in health technology and informatics,2019-07-28,Danilov Gleb; Shifrin Michael; Strunina Uliya;...,National Medical Research Center for Neurosurg...,,,,,10.3233/SHTI190051
1,Danilov Gleb,31121374,A Single-Center Retrospective Descriptive Coho...,Cerebrospinal fluid (CSF) leak remains a signi...,Cerebrospinal fluid leakage; Children; Craniot...,World neurosurgery,2019-05-24,Kushel Yury; Danilov Gleb; Tekoev Aslan; Cheld...,"2nd Neurosurgical Department, Burdenko Neurosu...",,,Postoperative wound CSF leakage was observed i...,Copyright © 2019 Elsevier Inc. All rights rese...,10.1016/j.wneu.2019.05.091
2,Danilov Gleb,30942728,Prediction of Postoperative Hospital Stay with...,Electronic Health Records (EHRs) conceal a hid...,Deep Learning; Electronic Health Records; Neur...,Studies in health technology and informatics,2019-04-04,Danilov Gleb; Kotik Konstantin; Shifrin Michae...,National Medical Research Center for Neurosurg...,,,,,
3,Danilov Gleb,30468924,Pineal Cyst-Related Aqueductal Stenosis as Cau...,Pineal cysts (PCs) are histologically benign l...,Cerebral aqueduct; Headache; Magnetic resonanc...,World neurosurgery,2018-11-24,Pitskhelauri David I; Konovalov Alexander N; A...,"Department of Neurooncology, Burdenko National...",,The results indicate a statistically significa...,In 25 patients (82%) we observed clinical impr...,Copyright © 2018 Elsevier Inc. All rights rese...,10.1016/j.wneu.2018.11.096
4,Danilov Gleb,30096505,Great Hospitals of the Russian Federation: Nat...,In 1929 the surgeon N. N. Burdenko and the neu...,Education; N. N. Burdenko Neurosurgery Center;...,World neurosurgery,2018-08-11,Potapov Alexander; Likhterman Leonid; Danilov ...,Federal State Autonomous Institution N. N. Bur...,,,,Copyright © 2018 The Authors. Published by Els...,10.1016/j.wneu.2018.07.280


In [27]:
items = add_pmc_id(items)
items.tail()

Unnamed: 0,fullname,pubmed_id,pmc_id,title,abstract,keywords,journal,publication_date,authors,affiliations,methods,conclusions,results,copyrights,doi
5,Danilov Gleb,30083313; 18171185; 11716632; 27408261; 401411...,6069828.0,Implementing an infection control and preventi...,The impact of infection prevention and control...,Cross infection; Drug resistance; Infection co...,Antimicrobial resistance and infection control,2018-08-08,Ershova Ksenia; Savin Ivan; Kurdyumova Nataliy...,1Center for Data-Intensive Biotechnology and B...,,,,,10.1186/s13756-018-0383-4
6,Danilov Gleb,29968664,,Diagnostic Games as a Teaching Tool.,Diagnostic games were developed in the 70s of ...,Digital transformation; clinical decision maki...,Studies in health technology and informatics,2018-07-04,Shifrin Michael; Danilov Gleb; Shklovskiy-Kord...,N.N. Burdenko National Medical Research Centre...,,,,,
7,Danilov Gleb,29557532,,Cisterna magna arachnoid membrane suturing dec...,A pseudomeningocele and an incisional cerebros...,Arachnoid suturing; CSF leak; Pseudomeningocel...,Acta neurochirurgica,2018-03-21,Pitskhelauri David; Kudieva Elina; Moshchev Dm...,"Burdenko Neurosurgery Center, Department of Ne...",,,In the patients who underwent arachnoid membra...,,10.1007/s00701-018-3507-y
8,Danilov Gleb,29413730,,Healthcare-associated ventriculitis and mening...,To define the incidence of healthcare-associat...,Bacterial; Cross infection; Infection control;...,Journal of critical care,2018-02-08,Savin Ivan; Ershova Ksenia; Kurdyumova Nataliy...,"Burdenko Neurosurgery Institute, 16 4th Tversk...",,We first reported HAVM incidence in a neuro-IC...,"2286 patients of all ages were included, 216 o...",Copyright © 2018 Elsevier Inc. All rights rese...,10.1016/j.jcrc.2018.01.022
9,Danilov Gleb,25991140,,Electronic patient records system as a monitor...,Data collected in electronic patient records (...,,Studies in health technology and informatics,2015-05-21,Shifrin Michael; Kurdumova Natalia; Danilov Gl...,"N. N. Burdenko Neurosurgery Institute, Moscow,...",,,,,


In [121]:
items.to_excel('output/pubmed-pymed.xlsx')

### Debug

In [91]:
test = get_full_text(6069828)
len(test)

17372

In [92]:
test[:2000]

'The impact of infection prevention and control (IPC) programs in limited resource countries such as Russia are largely unknown due to a lack of reliable data. The aim of this study is to evaluate the effect of an IPC program with respect to healthcare associated infection (HAI) prevention and to define the incidence of HAIs in a Russian ICU.A pioneering IPC program was implemented in a neuro-ICU at Burdenko Neurosurgery Institute in 2010 and included hand hygiene, surveillance, contact precautions, patient isolation, and environmental cleaning measures. This prospective observational cohort study lasted from 2011 to 2016, included high-risk ICU patients, and evaluated the dynamics of incidence, etiological spectrum, and resistance profile of four types of HAIs, including subgroup analysis of device-associated infections. Survival analysis compared patients with and without HAIs.We included 2038 high-risk patients. By 2016, HAI cumulative incidence decreased significantly for respirato

In [65]:
params = {
    'db': 'pmc',
    'id': 6069828,
    'tool': tool,
    'email': email
}

response = requests.get(url=url_full_text, headers=headers, params=params)

In [88]:
print(response.text[:1000])

<?xml version="1.0" ?>
<!DOCTYPE pmc-articleset PUBLIC "-//NLM//DTD ARTICLE SET 2.0//EN" "https://dtd.nlm.nih.gov/ncbi/pmc/articleset/nlm-articleset-2.0.dtd">
<pmc-articleset><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
  <?properties open_access?>
  <front>
    <journal-meta>
      <journal-id journal-id-type="nlm-ta">Antimicrob Resist Infect Control</journal-id>
      <journal-id journal-id-type="iso-abbrev">Antimicrob Resist Infect Control</journal-id>
      <journal-title-group>
        <journal-title>Antimicrobial Resistance and Infection Control</journal-title>
      </journal-title-group>
      <issn pub-type="epub">2047-2994</issn>
      <publisher>
        <publisher-name>BioMed Central</publisher-name>
        <publisher-loc>London</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="pmid">30083313</article-id>
      <article-id pub-id-typ