### Import

In [1]:
import datetime
import time
import requests
import os
import json

In [2]:
from IPython.display import clear_output
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np

### Settings

In [3]:
headers = {
    'User-Agent': 'Mozilla / 5.0(Windows NT 10.0; Win64; x64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 69.0.3497.81 Safari / 537.36'
}

api_key = '2839ed49187b099ec3d13cc079fc3ca0fc09'
max_results = 5000
retmode = 'xml'

dbs = ['pmc', 'pubmed', 'database']
for db in dbs:
    if not os.path.exists(db):
        os.makedirs(db)

files_stoplist = ['6796246', '4669991', '4212306', '4070603', '4912513', '2799065', '5042924', '6032109', '5042923',
                  '6555104', '5724662', '5493079', '6117636', '5933288', '6763540', '6493311', '6737605', '5637785']

url_pubmed_to_pmc = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'
url_fetch = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
url_search = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'

### Functions

In [4]:
def parse_pub_date(subroot, pub_type):
    """Returns parsed publication date from given ET <subroot>"""
    pub_date = np.NaN
    pub_dates = subroot.findall('pub-date')
    for pub_record in pub_dates:
        if (pub_record.attrib.get('pub-type', '') == pub_type or
            pub_record.attrib.get('date-type', '') == 'pub'):
            day = pub_record.find('day')
            month = pub_record.find('month')
            year = pub_record.find('year')
            if year is not None:
                pub_date = year.text
                if month is not None:
                    pub_date += '-' + month.text
                    if day is not None:
                        pub_date += '-' + day.text
    if pub_date is np.NaN:
        pub_date = parse_pub_date(subroot, 'ppub')
    return pub_date

In [5]:
def parse_authors(subroot, path):
    """Returns parsed authors string from given ET <subroot>"""
    authors = []
    contributors = subroot.findall(path)
    for contributor in contributors:
        for contributor_meta in contributor.iter('name'):
            surname = contributor_meta.find('surname')
            given_name = contributor_meta.find('given-names')
            if (surname is not None) and (given_name is not None):
                surname = surname.text if surname.text is not None else ''
                given_name = given_name.text if given_name.text is not None else ''
                authors.append(' '.join([given_name, surname]))
    if len(authors) > 0:
        authors = ', '.join(authors)
        authors = ' '.join(authors.split())
    if authors:
        return authors
    return np.NaN

In [6]:
def parse_keywords(subroot):
    """Returns parsed keywords string from given ET <subroot>"""
    keywords = []
    kwds = subroot.find('kwd-group')
    if kwds is not None:
        for kwd in kwds.iter('kwd'):
            if kwd is not None and kwd.text is not None:
                keywords.append(kwd.text)
    if len(keywords) > 0:
        keywords = '; '.join(keywords)
        return keywords
    return np.NaN

In [7]:
def parse_issue(subroot):
    """Returns parsed journal issue dict from given ET <subroot>"""
    journal_issue = {}
    volume = subroot.find('volume')                            # Volume
    if volume is not None:
        journal_issue['volume'] = volume.text
        
    elocation_id = subroot.find('elocation-id')                # Elocation-id
    if elocation_id is not None:
        journal_issue['elocation-id'] = elocation_id.text
        
    issue = subroot.find('issue')                              # Issue
    if issue is not None:
        journal_issue['issue'] = issue.text
        
    fpage = subroot.find('fpage')                              # Pages
    fpage = fpage.text if fpage is not None else None
    lpage = subroot.find('lpage')
    lpage = lpage.text if lpage is not None else None
    
    if isinstance(fpage, str) and isinstance(lpage, str):
        journal_issue['pages'] = '-'.join([fpage, lpage])
        
    return journal_issue

In [8]:
def parse_journal_meta(subroot, path):
    """Returns parsed journal meta dict from given ET <subroot>"""
    journal_meta = {}
    meta = subroot.find(path)
    if meta is not None:
        journal_ids = meta.findall('journal-id')                                            # Journal ids
        for journal_id in journal_ids:
            journal_meta['journal-id_' + journal_id.attrib.get('journal-id-type')] = journal_id.text
            
        issns = meta.findall('issn')                                                        # ISSNs
        for issn in issns:
            journal_meta['issn_' + issn.attrib.get('pub-type')] = issn.text
        
        journal_title = meta.find('journal-title-group/journal-title')                      # Journal title
        if (journal_title is not None) and (journal_title.text is not None):
            journal_meta['journal_title'] = journal_title.text
            
        publisher = meta.find('publisher')                                                  # Publisher
        if publisher is not None:
            publisher_name = publisher.find('publisher-name')                               # Publisher's name
            if (publisher_name is not None) and (publisher_name.text is not None):
                journal_meta['publisher_name'] = publisher_name.text
                
            publisher_loc = publisher.find('publisher-loc')                                 # Publisher's location
            if (publisher_loc is not None) and (publisher_loc.text is not None):
                journal_meta['publisher_loc'] = publisher_loc.text
    return journal_meta

In [9]:
def extract_text(subroot, path):
    """Returns extracted text between xml tags"""
    text = ''
    section = subroot.find(path)
    if section is not None:
        for element in section.iter():
            if element.text:
                text = text + element.text + ' '
            if element.tail:
                text = text + element.tail + ' '
        text = ' '.join(text.split())
    if text:
        return text
    return np.NaN

In [10]:
def parse_element_tree_pmc(root):
    """Returns parsed dict with all values found"""
    item = {}
    
    item['article-type'] = root.find('article').get('article-type')                             # Article type
    article_meta = root.findall('article/front/article-meta')                                   # Article meta
    for meta in article_meta:
        article_ids = meta.findall('article-id')                                                # Article ids
        for article_id in article_ids:
            item[article_id.get('pub-id-type')] = article_id.text
            
        category = meta.find('article-categories/subj-group/subject')                           # Article category
        if category is not None:
            item['category'] = category.text
            
        item['title'] = extract_text(meta, 'title-group/article-title')                         # Article title
        item['authors'] = parse_authors(meta, 'contrib-group/contrib')                          # Authors
        item['pub_date'] = parse_pub_date(meta, 'epub')                                         # Publication date
    
        copyright = meta.find('permissions/copyright-statement')                                # Copyright statement
        if copyright is not None:
            item['copyright'] = copyright.text
        license_type = meta.find('permissions/license')                                         # License type
        if license_type is not None:
            item['license-type'] = license_type.get('license-type')
        item['license'] = extract_text(meta, 'permissions/license/license-p')                   # License text
        
        item['keywords'] = parse_keywords(meta)                                                 # Keywords
        item['abstract'] = extract_text(meta, 'abstract')                                       # Abstract
        item.update(parse_issue(meta))                                                          # Journal issue
    
    item.update(parse_journal_meta(root, 'article/front/journal-meta'))                         # Journal meta
    item['full_text'] = extract_text(root, 'article/body')
    item['abstract_len'] = len(item['abstract']) if item['abstract'] is not np.NaN else 0
    item['full_text_len'] = len(item['full_text']) if item['full_text'] is not np.NaN else 0
            
    return item

In [11]:
def parse_element_tree_pubmed(root):
    """Returns parsed dict with all values found"""
    item = {}
    
    pubmed_data = root.find('PubmedArticle/PubmedData')
    article_meta = root.find('PubmedArticle/MedlineCitation/Article')
    
    article_ids = pubmed_data.findall('ArticleIdList/ArticleId')                                # Article ids
    if article_ids is not None:
        for article_id in article_ids:
            if article_id.get('IdType') == 'pubmed':
                item['pmid'] = article_id.text
            else:
                item[article_id.get('IdType')] = article_id.text
    
    title = article_meta.find('ArticleTitle')                                                   # Title
    if title is not None:
        item['title'] = title.text
        
    abstract = article_meta.find('Abstract/AbstractText')                                       # Abstract
    if abstract is not None:
        item['abstract'] = abstract.text
    else:
        item['abstract'] = np.NaN
        
    item['abstract_len'] = len(item['abstract']) if item['abstract'] is not np.NaN else 0
    return item

In [12]:
def get_element_tree(filename):
    """Returns parsed ElementTree object of an article with <pmc_id> identifier"""
    if not os.path.exists(filename):
        return None
    
    with open(filename, 'r') as f:
        data = f.read()        
    root = ET.fromstring(data)
    
    return root

In [13]:
def handle_query_responses(db, article_ids):
    """Returns articles DataFrame"""
    filenames = [os.path.join(db, str(article_id)) for article_id in article_ids]
    print('{0} query responses found. Starting...'.format(len(filenames)))
    time.sleep(1)
    
    items = pd.DataFrame(columns=['pmid', 'pmc', 'publisher-id', 'doi', 'abstract_len', 'full_text_len', 'file_size',
                                  'title', 'article-type', 'category', 'authors', 'pub_date', 'keywords',
                                  'volume', 'elocation-id', 'issue', 'pages', 'issn_epub', 'issn_ppub',
                                  'journal-id_nlm-ta', 'journal_title', 'publisher_name', 'publisher_loc',
                                  'copyright', 'license-type', 'license', 'abstract', 'full_text'])
    for i in range(0, len(filenames)):
        clear_output(wait=True)
        
        #if filenames[i] in files_stoplist:
        #    continue
        print('Done {0} of {1}. Working on {2}'.format(i+1, len(filenames), filenames[i]))
        
        root = get_element_tree(filenames[i])
        if db == 'pmc':
            item = parse_element_tree_pmc(root)
        elif db == 'pubmed':
            item = parse_element_tree_pubmed(root)
        item['file_size'] = os.path.getsize(filenames[i])
        items = items.append(item, ignore_index=True)
        if i % 5000 == 0:
            items.to_csv('database/pmc.csv', sep='|', index=False)
        
    return items

In [14]:
def get_article_ids(query, db):
    """Returns all article ids found by <query>"""
    params = {
        'db': db,
        'api_key': api_key,
        'term': query,
        'retmax': max_results,
        'retstart': 0,
        'retmode': retmode
    }
    article_ids = []
    count = params['retmax'] + 1
    
    while params['retstart'] < count:
        try:
            response = requests.get(url=url_search, headers=headers, params=params)
        except requests.RequestException:
            print('Problem has occured at retstart: {0}'.format(params['retstart']))
        else:
            root = ET.fromstring(response.text)
            for article_id in root.iter('Id'):
                article_ids.append(int(article_id.text))
            count = int(root.find('Count').text)
        finally:
            params['retstart'] += params['retmax']

    return article_ids

In [15]:
def download_query_response(article_id, db, refresh=False):
    """Saves article query response with <article_id> identifier to file"""
    params = {
        'db': db,
        'id': article_id,
        'api_key': api_key,
        'retmode': retmode
    }
    
    filename = os.path.join(db, str(article_id))
    if (os.path.exists(filename)) and (not refresh):
        pass
    else:
        try:
            response = requests.get(url=url_fetch, headers=headers, params=params)
        except requests.RequestException:
            print('Problem has occured with PMC ID: {0}'.format(article_id))
        else:
            data = response.text
            with open(filename, 'w+', encoding='utf-8') as f:
                f.write(data)

In [63]:
def download_all_query_responses(query, db, refresh=False):
    """Downloads all query responses got by <query>"""
    article_ids = get_article_ids(query, db)
    
    article_ids = [str(article_id) for article_id in article_ids]
    files_list = os.listdir(db)
    intersection = list(set(article_ids) & set(files_list))
    
    print('{0} articles found in {1} with query specified.'.format(len(article_ids), db))
    print('{0} articles are already stored in the database.'.format(len(intersection)))
    print('{0} articles will be downloaded.'.format(len(article_ids) - len(intersection)))
    
    for article_id in article_ids:
        download_query_response(article_id, db, refresh)
    
    files_count = len(os.listdir(db))
    print('Total {0} articles stored in the database'.format(files_count))
    return article_ids

### Main

### Download Neurosurgery articles

In [64]:
db = 'pubmed'
query = 'text classification neural network'
article_ids = download_all_query_responses(query=query, db=db, refresh=False)

232 articles found in pubmed with query specified.
232 articles are already stored in the database.
0 articles will be downloaded.
Total 232 articles stored in the database


### Handle query responses

In [132]:
items = handle_query_responses(db, article_ids)
items.head()

Done 231 of 231. Working on pubmed\7949911


Unnamed: 0,pmid,pmc,publisher-id,doi,abstract_len,full_text_len,file_size,title,article-type,category,...,journal_title,publisher_name,publisher_loc,copyright,license-type,license,abstract,full_text,pii,mid
0,32590229,,,10.1016/j.ijmedinf.2020.104225,450,,8217,Clinical questionnaire filling based on questi...,,,...,,,,,,,Electronic Health Records (EHR) are the founda...,,S1386-5056(19)31088-3,
1,32584774,,,10.1109/TNNLS.2020.3002798,1980,,6098,Automated Social Text Annotation With Joint Mu...,,,...,,,,,,,Automated social text annotation is the task o...,,,
2,32570656,,,10.3233/SHTI200439,951,,5973,Predicting Diagnosis Code from Medication List...,,,...,,,,,,,Automated coding and classification systems pl...,,SHTI200439,
3,32558750,,,10.1097/MAO.0000000000002710,147,,10105,Predicting Postoperative Cochlear Implant Perf...,,,...,,,,,,,To predict postoperative cochlear implant perf...,,,
4,32547807,PMC7278512,,10.4258/hir.2020.26.2.104,392,,12809,Analysis of Adverse Drug Reactions Identified ...,,,...,,,,,,,Electronic Health Records (EHRs)-based surveil...,,hir-26-2-104,


In [133]:
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231 entries, 0 to 230
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   pmid               231 non-null    object
 1   pmc                95 non-null     object
 2   publisher-id       0 non-null      object
 3   doi                222 non-null    object
 4   abstract_len       231 non-null    object
 5   full_text_len      0 non-null      object
 6   file_size          231 non-null    object
 7   title              231 non-null    object
 8   article-type       0 non-null      object
 9   category           0 non-null      object
 10  authors            0 non-null      object
 11  pub_date           0 non-null      object
 12  keywords           0 non-null      object
 13  volume             0 non-null      object
 14  elocation-id       0 non-null      object
 15  issue              0 non-null      object
 16  pages              0 non-null      object
 1

In [868]:
items.to_csv('database/pmc_all.csv', sep='|')

### Debug

In [100]:
filename = 'pubmed/7949911'
root = get_element_tree(filename)
item = {}
parse_element_tree_pubmed(root)

In [102]:
article_meta = root.find('PubmedArticle/MedlineCitation/Article')
#root.find('PubmedArticle/MedlineCitation/Article/ArticleTitle').text
pubmed_data = root.find('PubmedArticle/PubmedData')