### Import

In [499]:
import datetime
import time
import requests
import os
import json

In [691]:
from pymed import PubMed
from IPython.display import clear_output
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np

### Settings

In [65]:
headers = {
    'User-Agent': 'Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 69.0.3497.81 Safari / 537.36'
}

authors_list = [
    'Danilov Gleb',
    'Shifrin Michael',
    'Potapov Alexander',
    'Shimansky Vadim'
]

tool = 'my_tool1'
email = 'tishankulov@nsi.ru'
api_key = '2839ed49187b099ec3d13cc079fc3ca0fc09'
max_results = 5000

full_texts_path = 'fulltexts/'

url_pubmed_to_pmc = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'
url_full_text = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
url_article_ids = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'

### Functions

In [111]:
def get_items_pubmed(fullname):
    """
    Makes request to Pubmed database using pymed library.
    Returns DataFrame with all articles authored by <fullname>.
    """
    items = pd.DataFrame(columns=['fullname', 'pubmed_id', 'title', 'abstract', 'keywords', 'journal', 'publication_date',
                                  'authors', 'affiliations', 'methods', 'conclusions', 'results', 'copyrights', 'doi'])
    
    lastname = fullname.split()[0]
    firstname = fullname.split()[1]
    query = '{0}, {1}[Author]'.format(lastname, firstname)
    
    pubmed = PubMed(tool=tool, email=email)
    results = pubmed.query(query, max_results=max_results)
    
    while True:
        try:
            result = next(results)
            result_dict = result.toDict()
            result_dict['fullname'] = fullname
            result_dict['affiliations'] = [author['affiliation'] for author in result_dict['authors']]
            result_dict['affiliations'] = '; '.join(result_dict['affiliations'])
            result_dict['authors'] = [' '.join([author['lastname'], author['firstname']]) for author in result_dict['authors']]
            result_dict['authors'] = '; '.join(result_dict['authors'])
            result_dict['keywords'] = '; '.join(result_dict['keywords'])
            result_dict['pubmed_id'] = result_dict['pubmed_id'].replace('\n', '; ')
            del result_dict['xml']
            items = items.append(result_dict, ignore_index=True)
        except StopIteration:
            break
    
    return items

In [112]:
def add_pmc_id(items):
    """Add PMC ID to <items> DataFrame"""
    params = {
        'format': 'json',
        'dbfrom': 'pubmed',
        'linkname': 'pubmed_pmc',
        'api_key': api_key
    }
    
    if not 'pmc_id' in items.columns:
        items.insert(2, 'pmc_id', pd.np.nan)
    
    for i in range(0, len(items)):
        params['id'] = items.loc[i, 'pubmed_id']
        try:
            response = requests.get(url=url_pubmed_to_pmc, headers=headers, params=params)
        except requests.RequestException:
            print('Problem has occured with Pubmed ID: {0}'.format(params['id']))
        else:
            data = response.json()
            if 'linksetdbs' in data['linksets'][0]:
                items.loc[i, 'pmc_id'] = data['linksets'][0]['linksetdbs'][0]['links'][0]
    return items

In [821]:
def parse_pub_date(subroot, pub_type):
    """Returns parsed publication date from given ET <subroot>"""
    pub_date = np.NaN
    pub_dates = subroot.findall('pub-date')
    for pub_record in pub_dates:
        if (pub_record.attrib.get('pub-type', '') == pub_type or
            pub_record.attrib.get('date-type', '') == 'pub'):
            day = pub_record.find('day')
            month = pub_record.find('month')
            year = pub_record.find('year')
            if year is not None:
                pub_date = year.text
                if month is not None:
                    pub_date += '-' + month.text
                    if day is not None:
                        pub_date += '-' + day.text
    if pub_date is np.NaN:
        pub_date = parse_pub_date(subroot, 'ppub')
    return pub_date

In [822]:
def parse_authors(subroot, path):
    """Returns parsed authors string from given ET <subroot>"""
    authors = []
    contributors = subroot.findall(path)
    for contributor in contributors:
        for contributor_meta in contributor.iter('name'):
            surname = contributor_meta.find('surname')
            given_name = contributor_meta.find('given-names')
            if (surname is not None) and (given_name is not None):
                surname = surname.text if surname.text is not None else ''
                given_name = given_name.text if given_name.text is not None else ''
                authors.append(' '.join([given_name, surname]))
    if len(authors) > 0:
        authors = ', '.join(authors)
        authors = ' '.join(authors.split())
    if authors:
        return authors
    return np.NaN

In [823]:
def parse_keywords(subroot):
    """Returns parsed keywords string from given ET <subroot>"""
    keywords = []
    kwds = subroot.find('kwd-group')
    if kwds is not None:
        for kwd in kwds.iter('kwd'):
            if kwd is not None and kwd.text is not None:
                keywords.append(kwd.text)
    if len(keywords) > 0:
        keywords = '; '.join(keywords)
        return keywords
    return np.NaN

In [824]:
def parse_issue(subroot):
    """Returns parsed journal issue from given ET <subroot>"""
    journal_issue = {}
    volume = subroot.find('volume')                            # Volume
    if volume is not None:
        journal_issue['volume'] = volume.text
        
    elocation_id = subroot.find('elocation-id')                # Elocation-id
    if elocation_id is not None:
        journal_issue['elocation-id'] = elocation_id.text
        
    issue = subroot.find('issue')                              # Issue
    if issue is not None:
        journal_issue['issue'] = issue.text
        
    fpage = subroot.find('fpage')                              # Pages
    fpage = fpage.text if fpage is not None else None
    lpage = subroot.find('lpage')
    lpage = lpage.text if lpage is not None else None
    
    if isinstance(fpage, str) and isinstance(lpage, str):
        journal_issue['pages'] = '-'.join([fpage, lpage])
        
    return journal_issue

In [825]:
def extract_text(subroot, path):
    """Returns extracted text between xml tags"""
    text = ''
    section = subroot.find(path)
    if section is not None:
        for element in section.iter():
            if element.text:
                text = text + element.text + ' '
            if element.tail:
                text = text + element.tail + ' '
        text = ' '.join(text.split())
    if text:
        return text
    return np.NaN

In [826]:
def parse_element_tree(root):
    """Returns parsed dict with all values found"""
    item = {}
    
    item['article-type'] = root.find('article').get('article-type')                             # Article type
    article_meta = root.findall('article/front/article-meta')                                   # Article meta
    for meta in article_meta:
        article_ids = meta.findall('article-id')                                                # Article ids
        for article_id in article_ids:
            item[article_id.get('pub-id-type')] = article_id.text
            
        category = meta.find('article-categories/subj-group/subject')                           # Article category
        if category is not None:
            item['category'] = category.text
            
        item['title'] = extract_text(meta, 'title-group/article-title')                         # Article title
        item['authors'] = parse_authors(meta, 'contrib-group/contrib')                          # Authors
        item['pub_date'] = parse_pub_date(meta, 'epub')                                         # Publication date
    
        copyright = meta.find('permissions/copyright-statement')                                # Copyright statement
        if copyright is not None:
            item['copyright'] = copyright.text
        license_type = meta.find('permissions/license')                                         # License type
        if license_type is not None:
            item['license-type'] = license_type.get('license-type')
        item['license'] = extract_text(meta, 'permissions/license/license-p')                   # License text
        
        item['keywords'] = parse_keywords(meta)                                                 # Keywords
        item['abstract'] = extract_text(meta, 'abstract')                                       # Abstract
        item.update(parse_issue(meta))                                                          # Journal issue
    
    pass                                                                                    # Journal meta
    item['full_text'] = extract_text(root, 'article/body')
    
    item['abstract_len'] = len(item['abstract']) if item['abstract'] is not np.NaN else 0
    item['full_text_len'] = len(item['full_text']) if item['full_text'] is not np.NaN else 0
            
    return item

In [450]:
def get_element_tree(pmc_id):
    """Returns parsed ElementTree object of article with <pmc_id> identifier"""
    filename = os.path.join(full_texts_path, str(pmc_id))
    
    if not os.path.exists(filename):
        return None
    
    with open(filename, 'r') as f:
        data = f.read()        
    root = ET.fromstring(data)
    
    return root

In [829]:
def handle_query_responses():
    filenames = os.listdir(full_texts_path)
    print('{0} query responses found. Starting...'.format(len(filenames)))
    time.sleep(1)
    
    items = pd.DataFrame(columns=['pmid', 'pmc', 'publisher-id', 'doi', 'abstract_len', 'full_text_len', 'file_size',
                                  'title', 'article-type', 'category', 'authors', 'pub_date', 'keywords',
                                  'volume', 'elocation-id', 'issue', 'pages',
                                  'copyright', 'license-type', 'license', 'abstract', 'full_text'])
    for i in range(0, len(filenames)):
        clear_output(wait=True)
        print('Done {0} of {1}. Working on {2}'.format(i+1, len(filenames), filenames[i]))
        root = get_element_tree(filenames[i])
        item = parse_element_tree(root)
        item['file_size'] = os.path.getsize(os.path.join(full_texts_path, filenames[i]))
        items = items.append(item, ignore_index=True)
        if i % 5000 == 0:
            items.to_csv('database/pmc.csv', sep='|', index=False)
        
    return items

In [790]:
def get_article_ids(query):
    """Returns all article ids found by <query_words>"""
    params = {
        'db': 'pmc',
        #'tool': tool,
        #'email': email,
        'api_key': api_key,
        'term': query,
        'retmax': max_results,
        'retstart': 0
    }
    article_ids = []
    count = params['retmax'] + 1
    
    while params['retstart'] < count:
        try:
            response = requests.get(url=url_article_ids, headers=headers, params=params)
        except requests.RequestException:
            print('Problem has occured at retstart: {0}'.format(params['retstart']))
        else:
            root = ET.fromstring(response.text)
            for article_id in root.iter('Id'):
                article_ids.append(int(article_id.text))
            count = int(root.find('Count').text)
        finally:
            params['retstart'] += params['retmax']

    return article_ids

In [96]:
def download_query_response(pmc_id, refresh=False):
    """Saves article query response with <pmc_id> identifier to file"""
    params = {
        'db': 'pmc',
        'id': pmc_id,
        'api_key': api_key
    }
    
    filename = os.path.join(full_texts_path, str(pmc_id))
    if (os.path.exists(filename)) and (not refresh):
        pass
    else:
        try:
            response = requests.get(url=url_full_text, headers=headers, params=params)
        except requests.RequestException:
            print('Problem has occured with PMC ID: {0}'.format(pmc_id))
        else:
            data = response.text
            with open(filename, 'w+') as f:
                f.write(data)

In [113]:
def download_all_query_responses(query='neurosurgery', refresh=False):
    """Downloads all query responses got by <query>"""
    article_ids = get_article_ids(query)
    print('{0} articles found, downloading...'.format(len(article_ids)))
    
    for pmc_id in article_ids:
        download_query_response(pmc_id, refresh)
        
    files_count = len(os.listdir(full_texts_path))
    print('{0} articles stored in the database'.format(files_count))

### Main

In [51]:
items = get_items(authors_list[0])
items.head(2)

Unnamed: 0,fullname,pubmed_id,title,abstract,keywords,journal,publication_date,authors,affiliations,methods,conclusions,results,copyrights,doi
0,Danilov Gleb,31349300,An Information Extraction Algorithm for Detect...,"Rich-in-morphology language, such as Russian, ...",Adverse Events; Electronic Health Records; Nat...,Studies in health technology and informatics,2019-07-28,Danilov Gleb; Shifrin Michael; Strunina Uliya;...,National Medical Research Center for Neurosurg...,,,,,10.3233/SHTI190051
1,Danilov Gleb,31121374,A Single-Center Retrospective Descriptive Coho...,Cerebrospinal fluid (CSF) leak remains a signi...,Cerebrospinal fluid leakage; Children; Craniot...,World neurosurgery,2019-05-24,Kushel Yury; Danilov Gleb; Tekoev Aslan; Cheld...,"2nd Neurosurgical Department, Burdenko Neurosu...",,,Postoperative wound CSF leakage was observed i...,Copyright © 2019 Elsevier Inc. All rights rese...,10.1016/j.wneu.2019.05.091


In [52]:
items = add_pmc_id(items)
items.head(2)

Unnamed: 0,fullname,pubmed_id,pmc_id,title,abstract,keywords,journal,publication_date,authors,affiliations,methods,conclusions,results,copyrights,doi
0,Danilov Gleb,31349300,,An Information Extraction Algorithm for Detect...,"Rich-in-morphology language, such as Russian, ...",Adverse Events; Electronic Health Records; Nat...,Studies in health technology and informatics,2019-07-28,Danilov Gleb; Shifrin Michael; Strunina Uliya;...,National Medical Research Center for Neurosurg...,,,,,10.3233/SHTI190051
1,Danilov Gleb,31121374,,A Single-Center Retrospective Descriptive Coho...,Cerebrospinal fluid (CSF) leak remains a signi...,Cerebrospinal fluid leakage; Children; Craniot...,World neurosurgery,2019-05-24,Kushel Yury; Danilov Gleb; Tekoev Aslan; Cheld...,"2nd Neurosurgical Department, Burdenko Neurosu...",,,Postoperative wound CSF leakage was observed i...,Copyright © 2019 Elsevier Inc. All rights rese...,10.1016/j.wneu.2019.05.091


In [121]:
items.to_excel('output/pubmed-pymed.xlsx')

### Download Neurosurgery articles

In [114]:
download_all_query_responses(query='neurosurgery', refresh=False)

121183 articles found, downloading...
121183 articles stored in the database


### Handle query responses

In [794]:
items = handle_query_responses()
items.head()

Done 121183 of 121183. Working on 99430


Unnamed: 0,pmid,pmc,publisher-id,doi,abstract_len,full_text_len,file_size,title,article-type,category,...,manuscript,pii,sici,other,medline,pmcid,coden,art-access-id,publisher-manuscript,pmc-scan
0,496443.0,1000359,,,1460,0,4635,The neuropsychiatric disorder in systemic lupu...,research-article,Research Article,...,,,,,,,,,,
1,,1000485,,,0,0,1523,Clinical Aspects of Neuroimmunology,book-review,Book Reviews,...,,,,,,,,,,
2,6696518.0,1001222,,,329,0,2839,Popliteal masses masquerading as popliteal cysts.,research-article,Research Article,...,,,,,,,,,,
3,4083935.0,1001788,,,0,0,1587,Rheumatoid subluxations of the cervical spine.,research-article,Research Article,...,,,,,,,,,,
4,4083936.0,1001789,,,1159,0,4061,Surgical treatment of cervical cord compressio...,research-article,Research Article,...,,,,,,,,,,


In [795]:
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121183 entries, 0 to 121182
Data columns (total 28 columns):
pmid                    110744 non-null object
pmc                     121183 non-null object
publisher-id            56677 non-null object
doi                     87377 non-null object
abstract_len            121183 non-null object
full_text_len           121183 non-null object
file_size               121183 non-null object
title                   120936 non-null object
article-type            121183 non-null object
category                121169 non-null object
authors                 117241 non-null object
pub_date                121183 non-null object
keywords                61401 non-null object
copyright               74689 non-null object
license-type            39082 non-null object
license                 49738 non-null object
abstract                99856 non-null object
full_text               50393 non-null object
manuscript              13759 non-null object
pii  

In [796]:
items.to_csv('database/pmc_all.csv', sep='|')

### Debug

In [801]:
#root = get_element_tree(6069828)
root = get_element_tree(6070712)

In [828]:
root = get_element_tree(6070712)
item = parse_element_tree(root)
item

{'article-type': 'research-article',
 'pmid': '29976640',
 'pmc': '6070712',
 'doi': '10.21873/cgp.20093',
 'category': 'Research Article',
 'title': 'A Comprehensive Method for Detecting Fusion Genes in Paediatric Brain Tumours',
 'authors': 'AKIHIDE KONDO, YUZABURO SHIMIZU, SATOSHI ADACHI, IKUKO OGINO, MARIO SUZUKI, OSAMU AKIYAMA, HAJIME ARAI',
 'pub_date': '2018-7-5',
 'copyright': 'Copyright 2018, International Institute of Anticancer Research',
 'license': nan,
 'keywords': 'Paediatric brain tumours; BRAF; KIAA1549; fusion gene; pyrosequencing',
 'abstract': 'Background: Fusion genes driving tumourigenesis have drawn the attention of researchers and oncologists. Despite the importance of such molecular alterations, there are no comprehensive reproducible methods for detecting fusion genes. Materials and Methods: Nineteen paediatric brain tumours of five types, namely pilocytic astrocytoma, oligodendroglioma, anaplastic astrocytoma, glioblastoma and, ganglioglioma, were examined to

In [501]:
filenames = os.listdir(full_texts_path)
items = pd.DataFrame()
for filename in filenames[0:10]:
    root = get_element_tree(filename)
    item = parse_element_tree(root)
    items = items.append(item, ignore_index=True)
items

Unnamed: 0,abstract,abstract_len,article-type,category,full_text_len,pmc,pmid,title
0,\n \n As part of a prospective sur...,1555.0,research-article,Research Article,0.0,1000359,496443.0,The neuropsychiatric disorder in systemic lupu...
1,,0.0,book-review,Book Reviews,0.0,1000485,,Clinical Aspects of Neuroimmunology
2,"\n \n Two popliteal swellings, tho...",462.0,research-article,Research Article,0.0,1001222,6696518.0,Popliteal masses masquerading as popliteal cysts.
3,,0.0,research-article,Research Article,0.0,1001788,4083935.0,Rheumatoid subluxations of the cervical spine.
4,\n \n Cervical myelopathy is a rar...,1292.0,research-article,Research Article,0.0,1001789,4083936.0,Surgical treatment of cervical cord compressio...
5,\n \n Material from 100 consecutiv...,958.0,research-article,Research Article,0.0,1001857,3954473.0,Amyloid in intervertebral discs: a histopathol...
6,,0.0,research-article,Research Article,0.0,1002279,2309487.0,Perceived health quackery use among patients. ...
7,\n \n \n \n Images...,167.0,research-article,Research Article,0.0,1002313,2305578.0,Intracranial tuberculoma developing during the...
8,\n \n Multiple myeloma associated ...,1147.0,review-article,Research Article,0.0,1002325,2185597.0,"Syndrome of plasma cell dyscrasia, polyneuropa..."
9,\n \n \n \n Images...,129.0,research-article,Research Article,0.0,1002389,2161587.0,A 36-year-old woman with a pulsatile mass of t...


In [798]:
article_meta = root.findall('article/front/article-meta')
article_meta

[<Element 'article-meta' at 0x000001E14189F778>]

In [595]:
authors = []
article_meta = root.findall('article/front/article-meta')
for meta in article_meta:
    contributors = meta.findall('contrib-group/contrib')
    for contributor in contributors:
        for contributor_meta in contributor.iter('name'):
            surname = contributor_meta.find('surname')
            given_name = contributor_meta.find('given-names')
            if surname is not None and given_name is not None:
                authors.append(' '.join([given_name.text, surname.text]))
    authors = ', '.join(authors)
print(authors)

Ksenia Ershova, Ivan Savin, Nataliya Kurdyumova, Darren Wong, Gleb Danilov, Michael Shifrin, Irina Alexandrova, Ekaterina Sokolova, Nadezhda Fursova, Vladimir Zelman, Olga Ershova


In [709]:
body = root.find('article/body')
if body is not None:
    for element in body.iter():
        if element.text:
            item['full_text'] = item.get('full_text', '') + element.text + ' '
        if element.tail:
            item['full_text'] = item.get('full_text', '') + element.tail + ' '
    item['full_text'] = ' '.join(item['full_text'].split())

In [775]:
article_meta = root.findall('article/front/article-meta')
for meta in article_meta:
    keywords = meta.find('kwd-group')
    for keyword in keywords.iter('kwd'):
        print(keyword.text)

Cross infection
Intensive care unit
Infection control
Drug resistance
Survival analysis


In [805]:
article_meta = root.findall('article/front/article-meta')
for meta in article_meta:
    volume = meta.find('volume')
    if volume is not None:
        print(volume.text)
    elocation_id = meta.find('elocation-id')
    if elocation_id is not None:
        print(elocation_id.text)
    issue = meta.find('issue')
    if issue is not None:
        print(issue.text)
    fpage = meta.find('fpage')
    if fpage is not None:
        print(fpage.text)
    lpage = meta.find('lpage')
    if lpage is not None:
        print(lpage.text)
    pages = '-'.join([fpage.text, lpage.text])
    print(pages)

15
4
343
348
343-348
