### Import

In [499]:
import datetime
import time
import requests
import os
import json

In [500]:
from pymed import PubMed
from IPython.display import clear_output
import xml.etree.ElementTree as ET
import pandas as pd

### Settings

In [65]:
headers = {
    'User-Agent': 'Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 69.0.3497.81 Safari / 537.36'
}

authors_list = [
    'Danilov Gleb',
    'Shifrin Michael',
    'Potapov Alexander',
    'Shimansky Vadim'
]

tool = 'my_tool1'
email = 'tishankulov@nsi.ru'
api_key = '2839ed49187b099ec3d13cc079fc3ca0fc09'
max_results = 5000

full_texts_path = 'fulltexts/'

url_pubmed_to_pmc = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'
url_full_text = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
url_article_ids = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'

### Functions

In [111]:
def get_items_pubmed(fullname):
    """
    Makes request to Pubmed database using pymed library.
    Returns DataFrame with all articles authored by <fullname>.
    """
    items = pd.DataFrame(columns=['fullname', 'pubmed_id', 'title', 'abstract', 'keywords', 'journal', 'publication_date',
                                  'authors', 'affiliations', 'methods', 'conclusions', 'results', 'copyrights', 'doi'])
    
    lastname = fullname.split()[0]
    firstname = fullname.split()[1]
    query = '{0}, {1}[Author]'.format(lastname, firstname)
    
    pubmed = PubMed(tool=tool, email=email)
    results = pubmed.query(query, max_results=max_results)
    
    while True:
        try:
            result = next(results)
            result_dict = result.toDict()
            result_dict['fullname'] = fullname
            result_dict['affiliations'] = [author['affiliation'] for author in result_dict['authors']]
            result_dict['affiliations'] = '; '.join(result_dict['affiliations'])
            result_dict['authors'] = [' '.join([author['lastname'], author['firstname']]) for author in result_dict['authors']]
            result_dict['authors'] = '; '.join(result_dict['authors'])
            result_dict['keywords'] = '; '.join(result_dict['keywords'])
            result_dict['pubmed_id'] = result_dict['pubmed_id'].replace('\n', '; ')
            del result_dict['xml']
            items = items.append(result_dict, ignore_index=True)
        except StopIteration:
            break
    
    return items

In [112]:
def add_pmc_id(items):
    """Add PMC ID to <items> DataFrame"""
    params = {
        'format': 'json',
        'dbfrom': 'pubmed',
        'linkname': 'pubmed_pmc',
        'api_key': api_key
    }
    
    if not 'pmc_id' in items.columns:
        items.insert(2, 'pmc_id', pd.np.nan)
    
    for i in range(0, len(items)):
        params['id'] = items.loc[i, 'pubmed_id']
        try:
            response = requests.get(url=url_pubmed_to_pmc, headers=headers, params=params)
        except requests.RequestException:
            print('Problem has occured with Pubmed ID: {0}'.format(params['id']))
        else:
            data = response.json()
            if 'linksetdbs' in data['linksets'][0]:
                items.loc[i, 'pmc_id'] = data['linksets'][0]['linksetdbs'][0]['links'][0]
    return items

In [640]:
def parse_pub_date(meta, pub_type):
    pub_date = None
    pub_dates = meta.findall('pub-date')
    for pub_record in pub_dates:
        if pub_record.attrib['pub-type'] == pub_type:
            day = pub_record.find('day')
            month = pub_record.find('month')
            year = pub_record.find('year')
            if year is not None:
                pub_date = year.text
                if month is not None:
                    pub_date += '-' + month.text
                    if day is not None:
                        pub_date += '-' + day.text
    return pub_date

In [657]:
def parse_element_tree(root):
    item = {}
    authors = []
    
    item['article-type'] = root.find('article').get('article-type')                             # Article type
    article_meta = root.findall('article/front/article-meta')                                   # Article meta
    for meta in article_meta:
        article_ids = meta.findall('article-id')                                                # Article ids
        for article_id in article_ids:
            item[article_id.get('pub-id-type')] = article_id.text
            
        category = meta.find('article-categories/subj-group/subject')                           # Article category
        if category is not None:
            item['category'] = category.text
        title = meta.find('title-group/article-title')                                          # Article title
        if title is not None:
            item['title'] = title.text
            item['title'] = ' '.join(item['title'].split())
        
        contributors = meta.findall('contrib-group/contrib')                                    # Authors
        for contributor in contributors:
            for contributor_meta in contributor.iter('name'):
                surname = contributor_meta.find('surname')
                given_name = contributor_meta.find('given-names')
                if surname is not None and given_name is not None:
                    authors.append(' '.join([given_name.text, surname.text]))
        if len(authors) > 0:
            item['authors'] = ', '.join(authors)
            item['authors'] = ' '.join(item['authors'].split())
        
        item['pub_date'] = parse_pub_date(meta, 'epub')                                         # Publication date
        if item.get('pub_date', None) is None:
            item['pub_date'] = parse_pub_date(meta, 'ppub')
    
        copyright = meta.find('permissions/copyright-statement')                                # Copyright statement
        if copyright is not None:
            item['copyright'] = copyright.text
        license_type = meta.find('permissions/license')                                         # License type
        if license_type is not None:
            item['license-type'] = license_type.get('license-type')           
        license = meta.find('permissions/license/license-p')                                    # License text
        if license is not None:
            for element in license.iter():
                if element.text:
                    item['license'] = item.get('license', '') + element.text + ' '
                if element.tail:
                    item['license'] = item.get('license', '') + element.tail + ' '
            item['license'] = ' '.join(item['license'].split())
        
        abstract = meta.find('abstract')                                                        # Abstract
        if abstract is not None:
            for element in abstract.iter():
                if element.text:
                    item['abstract'] = item.get('abstract', '') + element.text + ' '
                if element.tail:
                    item['abstract'] = item.get('abstract', '') + element.tail + ' '
            item['abstract'] = ' '.join(item['abstract'].split())

                        
        pass                                                                                    # Journal meta
        pass                                                                                    # Keywords
    
    body = root.find('article/body')
    if body is not None:
        for element in body.iter():
            if element.text:
                item['full_text'] = item.get('full_text', '') + element.text + ' '
            if element.tail:
                item['full_text'] = item.get('full_text', '') + element.tail + ' '
        item['full_text'] = ' '.join(item['full_text'].split())
                
    item['abstract_len'] = len(item.get('abstract', ''))
    item['full_text_len'] = len(item.get('full_text', ''))
    
            
    return item

In [450]:
def get_element_tree(pmc_id):
    """Returns parsed ElementTree object of article with <pmc_id> identifier"""
    filename = os.path.join(full_texts_path, str(pmc_id))
    
    if not os.path.exists(filename):
        return None
    
    with open(filename, 'r') as f:
        data = f.read()        
    root = ET.fromstring(data)
    
    return root

In [542]:
def handle_query_responses():
    filenames = os.listdir(full_texts_path)
    print('{0} query responses found. Starting...'.format(len(filenames)))
    time.sleep(1)
    
    items = pd.DataFrame(columns=['pmid', 'pmc', 'publisher-id', 'doi', 'abstract_len', 'full_text_len', 'file_size',
                                  'title', 'article-type', 'category', 'authors', 'pub_date',
                                  'copyright', 'license-type', 'license', 'abstract', 'full_text'])
    for i in range(0, len(filenames)):
        clear_output(wait=True)
        print('Working on {0}'.format(filenames[i]))
        root = get_element_tree(filenames[i])
        item = parse_element_tree(root)
        item['file_size'] = os.path.getsize(os.path.join(full_texts_path, filenames[i]))
        items = items.append(item, ignore_index=True)
        if i % 1000 == 0:
            items.to_csv('database/pmc.csv', sep='|', index=False)
        print('Done {0} of {1}'.format(i+1, len(filenames)))
        
    return items

In [50]:
def get_article_ids(query):
    """Returns all article ids found by <query_words>"""
    params = {
        'db': 'pmc',
        'tool': tool,
        'email': email,
        'term': query,
        'retmax': max_results,
        'retstart': 0
    }
    article_ids = []
    count = params['retmax'] + 1
    
    while params['retstart'] < count:
        try:
            response = requests.get(url=url_article_ids, headers=headers, params=params)
        except requests.RequestException:
            print('Problem has occured at retstart: {0}'.format(params['retstart']))
        else:
            root = ET.fromstring(response.text)
            for article_id in root.iter('Id'):
                article_ids.append(int(article_id.text))
            count = int(root.find('Count').text)
        finally:
            params['retstart'] += params['retmax']

    return article_ids

In [96]:
def download_query_response(pmc_id, refresh=False):
    """Saves article query response with <pmc_id> identifier to file"""
    params = {
        'db': 'pmc',
        'id': pmc_id,
        'api_key': api_key
    }
    
    filename = os.path.join(full_texts_path, str(pmc_id))
    if (os.path.exists(filename)) and (not refresh):
        pass
    else:
        try:
            response = requests.get(url=url_full_text, headers=headers, params=params)
        except requests.RequestException:
            print('Problem has occured with PMC ID: {0}'.format(pmc_id))
        else:
            data = response.text
            with open(filename, 'w+') as f:
                f.write(data)

In [113]:
def download_all_query_responses(query='neurosurgery', refresh=False):
    """Downloads all query responses got by <query>"""
    article_ids = get_article_ids(query)
    print('{0} articles found, downloading...'.format(len(article_ids)))
    
    for pmc_id in article_ids:
        download_query_response(pmc_id, refresh)
        
    files_count = len(os.listdir(full_texts_path))
    print('{0} articles stored in the database'.format(files_count))

### Main

In [51]:
items = get_items(authors_list[0])
items.head(2)

Unnamed: 0,fullname,pubmed_id,title,abstract,keywords,journal,publication_date,authors,affiliations,methods,conclusions,results,copyrights,doi
0,Danilov Gleb,31349300,An Information Extraction Algorithm for Detect...,"Rich-in-morphology language, such as Russian, ...",Adverse Events; Electronic Health Records; Nat...,Studies in health technology and informatics,2019-07-28,Danilov Gleb; Shifrin Michael; Strunina Uliya;...,National Medical Research Center for Neurosurg...,,,,,10.3233/SHTI190051
1,Danilov Gleb,31121374,A Single-Center Retrospective Descriptive Coho...,Cerebrospinal fluid (CSF) leak remains a signi...,Cerebrospinal fluid leakage; Children; Craniot...,World neurosurgery,2019-05-24,Kushel Yury; Danilov Gleb; Tekoev Aslan; Cheld...,"2nd Neurosurgical Department, Burdenko Neurosu...",,,Postoperative wound CSF leakage was observed i...,Copyright © 2019 Elsevier Inc. All rights rese...,10.1016/j.wneu.2019.05.091


In [52]:
items = add_pmc_id(items)
items.head(2)

Unnamed: 0,fullname,pubmed_id,pmc_id,title,abstract,keywords,journal,publication_date,authors,affiliations,methods,conclusions,results,copyrights,doi
0,Danilov Gleb,31349300,,An Information Extraction Algorithm for Detect...,"Rich-in-morphology language, such as Russian, ...",Adverse Events; Electronic Health Records; Nat...,Studies in health technology and informatics,2019-07-28,Danilov Gleb; Shifrin Michael; Strunina Uliya;...,National Medical Research Center for Neurosurg...,,,,,10.3233/SHTI190051
1,Danilov Gleb,31121374,,A Single-Center Retrospective Descriptive Coho...,Cerebrospinal fluid (CSF) leak remains a signi...,Cerebrospinal fluid leakage; Children; Craniot...,World neurosurgery,2019-05-24,Kushel Yury; Danilov Gleb; Tekoev Aslan; Cheld...,"2nd Neurosurgical Department, Burdenko Neurosu...",,,Postoperative wound CSF leakage was observed i...,Copyright © 2019 Elsevier Inc. All rights rese...,10.1016/j.wneu.2019.05.091


In [121]:
items.to_excel('output/pubmed-pymed.xlsx')

### Download Neurosurgery articles

In [114]:
download_all_query_responses(query='neurosurgery', refresh=False)

121183 articles found, downloading...
121183 articles stored in the database


### Handle query responses

In [545]:
items = handle_query_responses()
items.head()

Working on 99430
Done 121183 of 121183


Unnamed: 0,pmid,pmc,publisher-id,doi,abstract_len,full_text_len,file_size,title,article-type,category,...,manuscript,pii,sici,other,medline,pmcid,coden,art-access-id,publisher-manuscript,pmc-scan
0,496443.0,1000359,,,1490,0,4635,,research-article,,...,,,,,,,,,,
1,,1000485,,,0,0,1523,,book-review,,...,,,,,,,,,,
2,6696518.0,1001222,,,359,0,2839,,research-article,,...,,,,,,,,,,
3,4083935.0,1001788,,,0,0,1587,,research-article,,...,,,,,,,,,,
4,4083936.0,1001789,,,1189,0,4061,,research-article,,...,,,,,,,,,,


In [546]:
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121183 entries, 0 to 121182
Data columns (total 27 columns):
pmid                    110744 non-null object
pmc                     121183 non-null object
publisher-id            56677 non-null object
doi                     87377 non-null object
abstract_len            121183 non-null object
full_text_len           121183 non-null object
file_size               121183 non-null object
title                   7820 non-null object
article-type            121183 non-null object
category                19 non-null object
authors                 0 non-null object
pub_date                0 non-null object
copyright               1164 non-null object
license-type            39082 non-null object
license                 28246 non-null object
abstract                99930 non-null object
full_text               50771 non-null object
manuscript              13759 non-null object
pii                     1867 non-null object
sici                   

In [547]:
items.to_csv('database/pmc_all.csv', sep='|')

### Debug

In [608]:
root = get_element_tree(6069828)

In [656]:
root = get_element_tree(15521)
item = parse_element_tree(root)
item

{'article-type': 'research-article',
 'pmid': '9990066',
 'pmc': '15521',
 'publisher-id': '4665',
 'category': 'Biological Sciences',
 'title': 'Multinuclear solid-state three-dimensional MRI of bone and synthetic calcium phosphates',
 'authors': 'Yaotang Wu, David A. Chesler, Melvin J. Glimcher, Leoncio Garrido, Jinxi Wang, Hong J. Jiang, Jerome L. Ackerman',
 'pub_date': '1999-2-16',
 'abstract': 'Multinuclear three-dimensional solid-state MRI of bone, tooth, and synthetic calcium phosphates is demonstrated in vitro and in vivo with a projection reconstruction technique based on acquisition of free induction decays in the presence of fixed amplitude magnetic field gradients. Phosphorus-31 solid-state MRI provides direct images of the calcium phosphate constituents of bone substance and is a quantitative measurement of the true volumetric bone mineral density of the bone. Proton solid-state MRI shows the density of bone matrix including its organic constituents, which consist princip

In [501]:
filenames = os.listdir(full_texts_path)
items = pd.DataFrame()
for filename in filenames[0:10]:
    root = get_element_tree(filename)
    item = parse_element_tree(root)
    items = items.append(item, ignore_index=True)
items

Unnamed: 0,abstract,abstract_len,article-type,category,full_text_len,pmc,pmid,title
0,\n \n As part of a prospective sur...,1555.0,research-article,Research Article,0.0,1000359,496443.0,The neuropsychiatric disorder in systemic lupu...
1,,0.0,book-review,Book Reviews,0.0,1000485,,Clinical Aspects of Neuroimmunology
2,"\n \n Two popliteal swellings, tho...",462.0,research-article,Research Article,0.0,1001222,6696518.0,Popliteal masses masquerading as popliteal cysts.
3,,0.0,research-article,Research Article,0.0,1001788,4083935.0,Rheumatoid subluxations of the cervical spine.
4,\n \n Cervical myelopathy is a rar...,1292.0,research-article,Research Article,0.0,1001789,4083936.0,Surgical treatment of cervical cord compressio...
5,\n \n Material from 100 consecutiv...,958.0,research-article,Research Article,0.0,1001857,3954473.0,Amyloid in intervertebral discs: a histopathol...
6,,0.0,research-article,Research Article,0.0,1002279,2309487.0,Perceived health quackery use among patients. ...
7,\n \n \n \n Images...,167.0,research-article,Research Article,0.0,1002313,2305578.0,Intracranial tuberculoma developing during the...
8,\n \n Multiple myeloma associated ...,1147.0,review-article,Research Article,0.0,1002325,2185597.0,"Syndrome of plasma cell dyscrasia, polyneuropa..."
9,\n \n \n \n Images...,129.0,research-article,Research Article,0.0,1002389,2161587.0,A 36-year-old woman with a pulsatile mass of t...


In [343]:
article_meta = root.findall('article/front/article-meta')
article_meta

[<Element 'article-meta' at 0x000001E11A85DE58>]

In [595]:
authors = []
article_meta = root.findall('article/front/article-meta')
for meta in article_meta:
    contributors = meta.findall('contrib-group/contrib')
    for contributor in contributors:
        for contributor_meta in contributor.iter('name'):
            surname = contributor_meta.find('surname')
            given_name = contributor_meta.find('given-names')
            if surname is not None and given_name is not None:
                authors.append(' '.join([given_name.text, surname.text]))
    authors = ', '.join(authors)
print(authors)

Ksenia Ershova, Ivan Savin, Nataliya Kurdyumova, Darren Wong, Gleb Danilov, Michael Shifrin, Irina Alexandrova, Ekaterina Sokolova, Nadezhda Fursova, Vladimir Zelman, Olga Ershova


In [629]:
item = {}
article_meta = root.findall('article/front/article-meta')
for meta in article_meta:
    pub_dates = meta.findall('pub-date')
    for pub_date in pub_dates:
        if pub_date.attrib['pub-type'] == 'epub':
            day = pub_date.find('day')
            month = pub_date.find('month')
            year = pub_date.find('year')
            if year is not None:
                item['pub_date'] = year.text
                if month is not None:
                    item['pub_date'] += '-' + month.text
                    if day is not None:
                        item['pub_date'] += '-' + day.text
    if item.get('pub_date', None) is None:
        for pub_date in pub_dates:
            if pub_date.attrib['pub-type'] == 'epub':
                day = pub_date.find('day')
                month = pub_date.find('month')
                year = pub_date.find('year')
                if year is not None:
                    item['pub_date'] = year.text
                    if month is not None:
                        item['pub_date'] += '-' + month.text
                        if day is not None:
                            item['pub_date'] += '-' + day.text
        
            
print(date)

2018-7-31
