### Import

In [1]:
import datetime
import time
import requests
import os
import json

In [4]:
#from pymed import PubMed
from IPython.display import clear_output
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np

### Settings

In [50]:
headers = {
    'User-Agent': 'Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 69.0.3497.81 Safari / 537.36'
}

tool = 'my_tool1'
email = 'tishankulov@nsi.ru'
api_key = '2839ed49187b099ec3d13cc079fc3ca0fc09'
max_results = 5000

dbs = ['pmc', 'pubmed']
for db in dbs:
    if not os.path.exists(db):
        os.makedirs(db)

files_stoplist = ['6796246', '4669991', '4212306', '4070603', '4912513', '2799065', '5042924', '6032109', '5042923',
                  '6555104', '5724662', '5493079', '6117636', '5933288', '6763540', '6493311', '6737605', '5637785']

url_pubmed_to_pmc = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'
url_fetch = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
url_article_ids = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'

### Functions

In [111]:
def get_items_pubmed(fullname):
    """
    Makes request to Pubmed database using pymed library.
    Returns DataFrame with all articles authored by <fullname>.
    Deprecated.
    """
    items = pd.DataFrame(columns=['fullname', 'pubmed_id', 'title', 'abstract', 'keywords', 'journal', 'publication_date',
                                  'authors', 'affiliations', 'methods', 'conclusions', 'results', 'copyrights', 'doi'])
    
    lastname = fullname.split()[0]
    firstname = fullname.split()[1]
    query = '{0}, {1}[Author]'.format(lastname, firstname)
    
    pubmed = PubMed(tool=tool, email=email)
    results = pubmed.query(query, max_results=max_results)
    
    while True:
        try:
            result = next(results)
            result_dict = result.toDict()
            result_dict['fullname'] = fullname
            result_dict['affiliations'] = [author['affiliation'] for author in result_dict['authors']]
            result_dict['affiliations'] = '; '.join(result_dict['affiliations'])
            result_dict['authors'] = [' '.join([author['lastname'], author['firstname']]) for author in result_dict['authors']]
            result_dict['authors'] = '; '.join(result_dict['authors'])
            result_dict['keywords'] = '; '.join(result_dict['keywords'])
            result_dict['pubmed_id'] = result_dict['pubmed_id'].replace('\n', '; ')
            del result_dict['xml']
            items = items.append(result_dict, ignore_index=True)
        except StopIteration:
            break
    
    return items

In [6]:
def add_pmc_id(items):
    """Add PMC ID to <items> DataFrame"""
    params = {
        'format': 'json',
        'dbfrom': 'pubmed',
        'linkname': 'pubmed_pmc',
        'api_key': api_key
    }
    
    if not 'pmc_id' in items.columns:
        items.insert(2, 'pmc_id', pd.np.nan)
    
    for i in range(0, len(items)):
        params['id'] = items.loc[i, 'pubmed_id']
        try:
            response = requests.get(url=url_pubmed_to_pmc, headers=headers, params=params)
        except requests.RequestException:
            print('Problem has occured with Pubmed ID: {0}'.format(params['id']))
        else:
            data = response.json()
            if 'linksetdbs' in data['linksets'][0]:
                items.loc[i, 'pmc_id'] = data['linksets'][0]['linksetdbs'][0]['links'][0]
    return items

In [7]:
def parse_pub_date(subroot, pub_type):
    """Returns parsed publication date from given ET <subroot>"""
    pub_date = np.NaN
    pub_dates = subroot.findall('pub-date')
    for pub_record in pub_dates:
        if (pub_record.attrib.get('pub-type', '') == pub_type or
            pub_record.attrib.get('date-type', '') == 'pub'):
            day = pub_record.find('day')
            month = pub_record.find('month')
            year = pub_record.find('year')
            if year is not None:
                pub_date = year.text
                if month is not None:
                    pub_date += '-' + month.text
                    if day is not None:
                        pub_date += '-' + day.text
    if pub_date is np.NaN:
        pub_date = parse_pub_date(subroot, 'ppub')
    return pub_date

In [8]:
def parse_authors(subroot, path):
    """Returns parsed authors string from given ET <subroot>"""
    authors = []
    contributors = subroot.findall(path)
    for contributor in contributors:
        for contributor_meta in contributor.iter('name'):
            surname = contributor_meta.find('surname')
            given_name = contributor_meta.find('given-names')
            if (surname is not None) and (given_name is not None):
                surname = surname.text if surname.text is not None else ''
                given_name = given_name.text if given_name.text is not None else ''
                authors.append(' '.join([given_name, surname]))
    if len(authors) > 0:
        authors = ', '.join(authors)
        authors = ' '.join(authors.split())
    if authors:
        return authors
    return np.NaN

In [9]:
def parse_keywords(subroot):
    """Returns parsed keywords string from given ET <subroot>"""
    keywords = []
    kwds = subroot.find('kwd-group')
    if kwds is not None:
        for kwd in kwds.iter('kwd'):
            if kwd is not None and kwd.text is not None:
                keywords.append(kwd.text)
    if len(keywords) > 0:
        keywords = '; '.join(keywords)
        return keywords
    return np.NaN

In [10]:
def parse_issue(subroot):
    """Returns parsed journal issue dict from given ET <subroot>"""
    journal_issue = {}
    volume = subroot.find('volume')                            # Volume
    if volume is not None:
        journal_issue['volume'] = volume.text
        
    elocation_id = subroot.find('elocation-id')                # Elocation-id
    if elocation_id is not None:
        journal_issue['elocation-id'] = elocation_id.text
        
    issue = subroot.find('issue')                              # Issue
    if issue is not None:
        journal_issue['issue'] = issue.text
        
    fpage = subroot.find('fpage')                              # Pages
    fpage = fpage.text if fpage is not None else None
    lpage = subroot.find('lpage')
    lpage = lpage.text if lpage is not None else None
    
    if isinstance(fpage, str) and isinstance(lpage, str):
        journal_issue['pages'] = '-'.join([fpage, lpage])
        
    return journal_issue

In [11]:
def parse_journal_meta(subroot, path):
    """Returns parsed journal meta dict from given ET <subroot>"""
    journal_meta = {}
    meta = subroot.find(path)
    if meta is not None:
        journal_ids = meta.findall('journal-id')                                            # Journal ids
        for journal_id in journal_ids:
            journal_meta['journal-id_' + journal_id.attrib.get('journal-id-type')] = journal_id.text
            
        issns = meta.findall('issn')                                                        # ISSNs
        for issn in issns:
            journal_meta['issn_' + issn.attrib.get('pub-type')] = issn.text
        
        journal_title = meta.find('journal-title-group/journal-title')                      # Journal title
        if (journal_title is not None) and (journal_title.text is not None):
            journal_meta['journal_title'] = journal_title.text
            
        publisher = meta.find('publisher')                                                  # Publisher
        if publisher is not None:
            publisher_name = publisher.find('publisher-name')                               # Publisher's name
            if (publisher_name is not None) and (publisher_name.text is not None):
                journal_meta['publisher_name'] = publisher_name.text
                
            publisher_loc = publisher.find('publisher-loc')                                 # Publisher's location
            if (publisher_loc is not None) and (publisher_loc.text is not None):
                journal_meta['publisher_loc'] = publisher_loc.text
    return journal_meta

In [12]:
def extract_text(subroot, path):
    """Returns extracted text between xml tags"""
    text = ''
    section = subroot.find(path)
    if section is not None:
        for element in section.iter():
            if element.text:
                text = text + element.text + ' '
            if element.tail:
                text = text + element.tail + ' '
        text = ' '.join(text.split())
    if text:
        return text
    return np.NaN

In [13]:
def parse_element_tree(root):
    """Returns parsed dict with all values found"""
    item = {}
    
    item['article-type'] = root.find('article').get('article-type')                             # Article type
    article_meta = root.findall('article/front/article-meta')                                   # Article meta
    for meta in article_meta:
        article_ids = meta.findall('article-id')                                                # Article ids
        for article_id in article_ids:
            item[article_id.get('pub-id-type')] = article_id.text
            
        category = meta.find('article-categories/subj-group/subject')                           # Article category
        if category is not None:
            item['category'] = category.text
            
        item['title'] = extract_text(meta, 'title-group/article-title')                         # Article title
        item['authors'] = parse_authors(meta, 'contrib-group/contrib')                          # Authors
        item['pub_date'] = parse_pub_date(meta, 'epub')                                         # Publication date
    
        copyright = meta.find('permissions/copyright-statement')                                # Copyright statement
        if copyright is not None:
            item['copyright'] = copyright.text
        license_type = meta.find('permissions/license')                                         # License type
        if license_type is not None:
            item['license-type'] = license_type.get('license-type')
        item['license'] = extract_text(meta, 'permissions/license/license-p')                   # License text
        
        item['keywords'] = parse_keywords(meta)                                                 # Keywords
        item['abstract'] = extract_text(meta, 'abstract')                                       # Abstract
        item.update(parse_issue(meta))                                                          # Journal issue
    
    item.update(parse_journal_meta(root, 'article/front/journal-meta'))                         # Journal meta
    item['full_text'] = extract_text(root, 'article/body')
    item['abstract_len'] = len(item['abstract']) if item['abstract'] is not np.NaN else 0
    item['full_text_len'] = len(item['full_text']) if item['full_text'] is not np.NaN else 0
            
    return item

In [73]:
def get_element_tree(pmc_id, db):
    """Returns parsed ElementTree object of article with <pmc_id> identifier"""
    filename = os.path.join(db, str(pmc_id))
    
    if not os.path.exists(filename):
        return None
    
    with open(filename, 'r') as f:
        data = f.read()        
    root = ET.fromstring(data)
    
    return root

In [74]:
def handle_query_responses(db):
    """Returns articles DataFrame"""
    filenames = os.listdir(db)
    print('{0} query responses found. Starting...'.format(len(filenames)))
    time.sleep(1)
    
    items = pd.DataFrame(columns=['pmid', 'pmc', 'publisher-id', 'doi', 'abstract_len', 'full_text_len', 'file_size',
                                  'title', 'article-type', 'category', 'authors', 'pub_date', 'keywords',
                                  'volume', 'elocation-id', 'issue', 'pages', 'issn_epub', 'issn_ppub',
                                  'journal-id_nlm-ta', 'journal_title', 'publisher_name', 'publisher_loc',
                                  'copyright', 'license-type', 'license', 'abstract', 'full_text'])
    for i in range(0, len(filenames)):
        clear_output(wait=True)
        
        #if filenames[i] in files_stoplist:
        #    continue
        print('Done {0} of {1}. Working on {2}'.format(i+1, len(filenames), filenames[i]))
        
        root = get_element_tree(filenames[i], db)
        item = parse_element_tree(root)
        item['file_size'] = os.path.getsize(os.path.join(db, filenames[i]))
        items = items.append(item, ignore_index=True)
        if i % 5000 == 0:
            items.to_csv('database/pmc.csv', sep='|', index=False)
        
    return items

In [63]:
def get_article_ids(query, db):
    """Returns all article ids found by <query>"""
    params = {
        'db': db,
        'api_key': api_key,
        'term': query,
        'retmax': max_results,
        'retstart': 0,
        'retmode': 'xml'
    }
    article_ids = []
    count = params['retmax'] + 1
    
    while params['retstart'] < count:
        try:
            response = requests.get(url=url_article_ids, headers=headers, params=params)
        except requests.RequestException:
            print('Problem has occured at retstart: {0}'.format(params['retstart']))
        else:
            root = ET.fromstring(response.text)
            for article_id in root.iter('Id'):
                article_ids.append(int(article_id.text))
            count = int(root.find('Count').text)
        finally:
            params['retstart'] += params['retmax']

    return article_ids

In [66]:
def download_query_response(pmc_id, db, refresh=False):
    """Saves article query response with <pmc_id> identifier to file"""
    params = {
        'db': db,
        'id': pmc_id,
        'api_key': api_key,
        'retmode': 'xml'
    }
    
    filename = os.path.join(db, str(pmc_id))
    if (os.path.exists(filename)) and (not refresh):
        pass
    else:
        try:
            response = requests.get(url=url_fetch, headers=headers, params=params)
        except requests.RequestException:
            print('Problem has occured with PMC ID: {0}'.format(pmc_id))
        else:
            data = response.text
            with open(filename, 'w+', encoding='utf-8') as f:
                f.write(data)

In [69]:
def download_all_query_responses(query, db, refresh=False):
    """Downloads all query responses got by <query>"""
    article_ids = get_article_ids(query, db)
    print('{0} articles found, downloading...'.format(len(article_ids)))
    
    for pmc_id in article_ids:
        download_query_response(pmc_id, db, refresh)
        
    files_count = len(os.listdir(db))
    print('{0} articles stored in the database'.format(files_count))

### Main

### Download Neurosurgery articles

In [77]:
download_all_query_responses(query='text classification neural network', db='pubmed', refresh=False)

231 articles found, downloading...
231 articles stored in the database


### Handle query responses

In [78]:
#items = handle_query_responses('pubmed')
#items.head()

In [867]:
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121183 entries, 0 to 121182
Data columns (total 51 columns):
pmid                         110744 non-null object
pmc                          121183 non-null object
publisher-id                 56677 non-null object
doi                          87377 non-null object
abstract_len                 121183 non-null object
full_text_len                121183 non-null object
file_size                    121183 non-null object
title                        120936 non-null object
article-type                 121183 non-null object
category                     121169 non-null object
authors                      117241 non-null object
pub_date                     121183 non-null object
keywords                     61401 non-null object
volume                       120858 non-null object
elocation-id                 24254 non-null object
issue                        99546 non-null object
pages                        95680 non-null object
issn_epub  

In [868]:
items.to_csv('database/pmc_all.csv', sep='|')

### Debug

In [830]:
root = get_element_tree(6069828)
#root = get_element_tree(6070712)

In [878]:
root = get_element_tree(6070712)
item = parse_element_tree(root)
item

{'article-type': 'research-article',
 'pmid': '29976640',
 'pmc': '6070712',
 'doi': '10.21873/cgp.20093',
 'category': 'Research Article',
 'title': 'A Comprehensive Method for Detecting Fusion Genes in Paediatric Brain Tumours',
 'authors': 'AKIHIDE KONDO, YUZABURO SHIMIZU, SATOSHI ADACHI, IKUKO OGINO, MARIO SUZUKI, OSAMU AKIYAMA, HAJIME ARAI',
 'pub_date': '2018-7-5',
 'copyright': 'Copyright 2018, International Institute of Anticancer Research',
 'license': nan,
 'keywords': 'Paediatric brain tumours; BRAF; KIAA1549; fusion gene; pyrosequencing',
 'abstract': 'Background: Fusion genes driving tumourigenesis have drawn the attention of researchers and oncologists. Despite the importance of such molecular alterations, there are no comprehensive reproducible methods for detecting fusion genes. Materials and Methods: Nineteen paediatric brain tumours of five types, namely pilocytic astrocytoma, oligodendroglioma, anaplastic astrocytoma, glioblastoma and, ganglioglioma, were examined to

In [798]:
article_meta = root.findall('article/front/article-meta')
article_meta

[<Element 'article-meta' at 0x000001E14189F778>]

In [709]:
body = root.find('article/body')
if body is not None:
    for element in body.iter():
        if element.text:
            item['full_text'] = item.get('full_text', '') + element.text + ' '
        if element.tail:
            item['full_text'] = item.get('full_text', '') + element.tail + ' '
    item['full_text'] = ' '.join(item['full_text'].split())

In [894]:
filter_full_text = (items.full_text.isnull()) & (items.file_size > 20000)
items[filter_full_text]

Unnamed: 0,pmid,pmc,publisher-id,doi,abstract_len,full_text_len,file_size,title,article-type,category,...,journal-id_,pmcid,journal-id_doi,journal-id_coden,coden,art-access-id,journal-id_issn,publisher-manuscript,journal-id_archive,pmc-scan
11022,12042091,1462953,,10.1089/089771502753754037,1541,0,23509,Clinical Trials in Head Injury,research-article,Article,...,,,,,,,,,,
11316,15720813,1490313,04535,,1440,0,47216,Gene Expression Profile of Glioblastoma Multif...,research-article,Research Article,...,,,,,,,,,,
11463,16207475,1501886,05160,,1140,0,26640,Potential Applications of Flat-Panel Volumetri...,research-article,Research Article,...,,,,,,,,,,
12194,15159284,1575049,,10.1038/sj.bjp.0705839,1514,0,22329,Presynaptic cannabinoid CB 1 receptors are inv...,research-article,Papers,...,,,,,,,,,,
12196,15210585,1575121,,10.1038/sj.bjp.0705876,1723,0,21397,Peripheral GABA B agonists stimulate gastric a...,research-article,Papers,...,,,,,,,,,,
12208,15451776,1575931,,10.1038/sj.bjp.0705987,1803,0,25186,Sympathectomy reveals α 1A - and α 1D -adrenoc...,research-article,Papers,...,,,,,,,,,,
12550,14742253,1602258,,,1359,0,20204,Targeting the Tie2/Tek Receptor in Astrocytomas,research-article,Regular Articles,...,,,,,,,,,,
12551,15743799,1602359,,,1681,0,48363,"Up-Regulation of Angiopoietin-2, Matrix Metall...",research-article,Original Research Paper,...,,,,,,,,,,
12638,16436655,1606491,,10.2353/ajpath.2006.050126,1451,0,23781,Ca 2+ -Dependent Cytoprotective Effects of Urs...,research-article,Original Research Paper,...,,,,,,,,,,
12859,15277233,1618572,,,1548,0,32737,Vascular Gene Expression in Nonneoplastic and ...,research-article,Regular Articles,...,,,,,,,,,,


In [895]:
items[items.full_text.notnull()].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50393 entries, 5230 to 121144
Data columns (total 51 columns):
pmid                         49659 non-null object
pmc                          50393 non-null object
publisher-id                 37249 non-null object
doi                          49030 non-null object
abstract_len                 50393 non-null object
full_text_len                50393 non-null object
file_size                    50393 non-null object
title                        50389 non-null object
article-type                 50393 non-null object
category                     50393 non-null object
authors                      50173 non-null object
pub_date                     50393 non-null object
keywords                     36146 non-null object
volume                       50323 non-null object
elocation-id                 21797 non-null object
issue                        32402 non-null object
pages                        28406 non-null object
issn_epub           