# Caching abstract section headings

Tong Shu Li<br>
Created on Wednesday 2015-07-22<br>
Last updated: 2015-08-21

For the abstract-level chemical-induced disease relation extraction task, we want to format the text into sections ([like PMID 20003049](http://www.ncbi.nlm.nih.gov/pubmed/?term=20003049%5Buid%5D)) to make it easier for the workers to read. Previously this was done by querying PubMed for the paper directly, and using the returned information to format the text.

Since the input text is supposed to be treated as free text, we cannot use the PubMed querying method to determine the section headings. Instead, we will query PubMed for the section names of every abstract in the training and development data, and use the cached section headings to parse any new input. We will of course miss some section headings this way, but we assume that papers using headings outside the cache will be infrequent.

In [1]:
import requests
import xml.etree.cElementTree as ET

In [2]:
from src.lingpipe.file_util import read_file

### Preprocess the training and development data to grab all the PMIDs:

In [3]:
def parse_pmids(fname):
    """Collect the distinct PMIDs that appear in a pipe-delimited data file.

    A line contributes its first field when it splits on '|' into exactly
    three fields and the middle field is 'a' or 't'. Every other line is
    ignored.

    Returns a set of PMID strings.
    """
    split_lines = (line.split('|') for line in read_file(fname))
    return {
        fields[0]
        for fields in split_lines
        if len(fields) == 3 and fields[1] in ('a', 't')
    }

In [4]:
# Gather the set of PMIDs referenced by the training set file.
fname = "data/training/CDR_TrainingSet.txt"
trainingset = parse_pmids(fname)

In [5]:
# Number of distinct PMIDs parsed from the training set file.
len(trainingset)

500

In [6]:
# Gather the set of PMIDs referenced by the development set file.
fname = "data/development/CDR_DevelopmentSet.txt"
developmentset = parse_pmids(fname)

In [7]:
# Number of distinct PMIDs parsed from the development set file.
len(developmentset)

500

### Query PubMed for the section names:

In [8]:
def query_ncbi(tool, settings, timeout=30):
    """Send a GET request to an NCBI E-utilities tool and return the body text.

    Args:
        tool: Name of the E-utilities endpoint, e.g. "efetch.fcgi".
        settings: Mapping sent as the URL query parameters.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body decoded as text.

    Raises:
        requests.HTTPError: If the response status is anything other than
            200 OK.
        requests.Timeout: If the server does not answer within `timeout`.
    """
    # NCBI serves E-utilities over HTTPS; plain http relies on a redirect.
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{0}".format(tool)

    # requests never times out on its own, so always pass an explicit limit.
    resp = requests.get(url, params=settings, timeout=timeout)

    # An `assert` would vanish under `python -O`; raise a real error instead.
    resp.raise_for_status()
    if resp.status_code != requests.codes.ok:
        raise requests.HTTPError(
            "Unexpected status {0} from {1}".format(resp.status_code, resp.url),
            response=resp,
        )

    return resp.text

def get_pubmed_article_xml_tree(pubmed_id):
    """Fetch the PubMed record for `pubmed_id` and parse the reply as XML.

    The record is requested through the "efetch.fcgi" tool, and the returned
    text is handed to ET.fromstring; the resulting root Element is returned.
    """
    efetch_params = {"db": "pubmed", "id": pubmed_id, "rettype": "abstract"}
    raw_xml = query_ncbi("efetch.fcgi", efetch_params)
    return ET.fromstring(raw_xml)

def parse_article_xml_tree(article_xml_tree):
    """Extract the title text and the Abstract element from an article tree.

    Args:
        article_xml_tree: Root Element of a parsed article record.

    Returns:
        A tuple (title, abstract). `title` is the `.text` of the first
        "ArticleTitle" element, or None when the tree has no such element.
        `abstract` is the first "Abstract" element, or False when the
        article has a title only.
    """
    # Default to None so a record without an ArticleTitle no longer raises
    # UnboundLocalError at the return statement.
    title_element = next(article_xml_tree.iter("ArticleTitle"), None)
    # NOTE: Element.text holds only the text before the first child element,
    # so a title containing inline markup is cut short here (kept as-is).
    article_title = None if title_element is None else title_element.text

    # iter(tag) walks the tree in the same document order as a manual
    # tag-filtering loop; False is kept as the "no abstract" marker.
    abstract_element = next(article_xml_tree.iter("Abstract"), False)

    return (article_title, abstract_element)

def get_section_labels(abstract_xml_tree):
    """Gather the distinct section labels used inside an abstract element.

    Looks at the "Label" attribute of every "AbstractText" element in the
    tree (e.g. the background/methods/etc headings of pmid 24885308).
    Elements without a label, or labelled "UNLABELLED", contribute nothing.

    Returns a set of label strings.
    """
    labels = (node.get("Label") for node in abstract_xml_tree.iter("AbstractText"))
    return {
        label
        for label in labels
        if not (label is None or label == "UNLABELLED")
    }

def get_abstract_information(pubmed_id):
    """Return the set of section labels used by one PubMed abstract.

    Args:
        pubmed_id: Identifier of the article to fetch.

    Returns:
        The set of section labels found in the article's abstract, or an
        empty set when the article has no abstract.
    """
    article_xml_tree = get_pubmed_article_xml_tree(pubmed_id)
    _, abstract_xml_tree = parse_article_xml_tree(article_xml_tree)

    # Check for the "no abstract" markers explicitly. The truth value of an
    # Element reflects whether it has children and is deprecated, so it
    # must not be used to decide whether an abstract was found.
    if abstract_xml_tree is None or abstract_xml_tree is False:
        return set()

    return get_section_labels(abstract_xml_tree)

---

### Query all section names and write to file:

In [9]:
def get_all_section_labels(dataset):
    """Return the union of section labels over every PMID in `dataset`.

    Args:
        dataset: Iterable of PubMed ids; each one is looked up through
            get_abstract_information.

    Returns:
        A set containing every section label seen across the dataset.
    """
    res = set()
    # The position of each PMID is never needed, so iterate directly.
    for pmid in dataset:
        res |= get_abstract_information(pmid)

    return res

In [10]:
# Section labels gathered for every PMID in the training set.
train_labels = get_all_section_labels(trainingset)

In [11]:
# Section labels gathered for every PMID in the development set.
dev_labels = get_all_section_labels(developmentset)

In [12]:
# Union of the labels seen in either dataset.
all_section_labels = train_labels | dev_labels

In [13]:
# Number of distinct section labels across both datasets.
len(all_section_labels)

77

In [14]:
# Cache the distinct section labels on disk, one per line, in sorted order.
# sorted() accepts the set directly, and writelines() avoids keeping the
# unused return value of each individual write() call.
with open("data/all_uniq_section_names.txt", "w") as fout:
    fout.writelines("{0}\n".format(name) for name in sorted(all_section_labels))