# Parsing and Pre-Processing GENIA Text Data

The data is given in XML format, with the part of speech (POS) of each word identified. Biological entities are additionally labelled with their entity type. We parse this data file, tag each word with its annotated POS, and assign each word a label indicating whether or not it is a biological entity.

In [35]:
import bs4
from bs4 import BeautifulSoup
# Relative path of the XML data file that is parsed below.
data_file_path = './data/test.xml'

Here we parse the XML data file to get the POS of each word and to determine the biological semantic class of each word (if it has one). Words with a biological meaning will be labelled 1, and non-biological words will be labelled -1.

In [37]:
def get_words_part_of_speech(sentence: bs4.element.Tag, words_pos_dict: dict) -> None:
    """
    Records the part of speech of every word found in the given sentence.

    Each <w> element contributes its stripped text as a key of words_pos_dict
    and the value of its 'c' attribute to the set stored under that key.
    The dictionary is modified in place; nothing is returned.

    Args:
        sentence (bs4.element.Tag): The sentence whose <w> elements are read.
        words_pos_dict (dict): Maps word text to the set of POS tags seen for it.
    """
    for token in sentence.find_all('w'):
        surface = token.text.strip()
        # A word can carry different tags in different contexts, so the
        # tags are accumulated in a set instead of being overwritten.
        words_pos_dict.setdefault(surface, set()).add(token['c'])

def get_named_bio_entities(sentence: bs4.element.Tag, named_bio_entities_dict: dict) -> None:
    """
    Extracts named biological entities in given sentence.
    Updates named_bio_entities_dict (passed by reference) to contain these entities.

    Each <cons> element contributes its stripped 'lex' attribute as a key and
    its 'sem' attribute to the set stored under that key. A <cons> element
    that lacks either attribute is skipped instead of raising KeyError.

    Args:
        sentence (bs4.element.Tag): The sentence to extract the named biological entities from.
        named_bio_entities_dict (dict): The dictionary to update with the named biological entities.
    """
    for ent in sentence.find_all('cons'):
        lex = ent.get('lex')
        sem = ent.get('sem')
        # Indexing a tag with a missing attribute raises KeyError, which would
        # abort the whole extraction on the first incomplete <cons> element.
        # NOTE(review): presumably such elements are nested fragments without
        # a semantic class of their own - confirm that skipping them is wanted.
        if lex is None or sem is None:
            continue
        named_bio_entities_dict.setdefault(lex.strip(), set()).add(sem)

def extract_data_from_xml(data_file_path: str):
    """
    Parses the xml file and collects, over every <sentence> of every <article>,
    the POS tags of each word and the named biological entities.

    The collection itself is done by get_words_part_of_speech and
    get_named_bio_entities, which fill two dictionaries of sets.
    Note that punctuation marks are kept in the data as was done in the presented paper.
    However, they are not attached to any of the words.

    NOTE(review): both dictionaries are local to this function and are
    discarded when it returns; the list of (text, label) tuples this function
    is meant to produce is not built yet.

    Args:
        data_file_path (str): The path to the xml file.

    Returns:
        None.
    """
    words_pos_dict = dict() # dict of set
    named_bio_entities_dict = dict() # dict of set
    # Read raw bytes so the XML parser decodes the document itself instead of
    # relying on the platform's default text encoding.
    with open(data_file_path, "rb") as f:
        xml = f.read()
    # The file is fully read at this point, so parsing happens after it is closed.
    soup = BeautifulSoup(xml, 'xml')
    for art in soup.find_all('article'):
        for sent in art.find_all('sentence'):
            get_words_part_of_speech(sent, words_pos_dict)
            get_named_bio_entities(sent, named_bio_entities_dict)


In [34]:
# Run the extraction on the XML file configured in data_file_path.
extract_data_from_xml(data_file_path)

{'IL-2': {'NN'}, 'gene': {'NN'}, 'expression': {'NN'}, 'and': {'CC'}, 'NF-kappa': {'NN'}, 'B': {'*', 'NN'}, 'activation': {'NN'}, 'through': {'IN'}, 'CD28': {'*', 'NN'}, 'requires': {'VBZ'}, 'reactive': {'JJ'}, 'oxygen': {'NN'}, 'production': {'NN'}, 'by': {'IN'}, '5-lipoxygenase': {'NN'}, '.': {'.'}, 'Activation': {'NN'}, 'of': {'IN'}, 'the': {'DT'}, 'surface': {'NN'}, 'receptor': {'NN'}, 'provides': {'VBZ'}, 'a': {'DT'}, 'major': {'JJ'}, 'costimulatory': {'JJ', 'NN'}, 'signal': {'NN'}, 'for': {'IN'}, 'T': {'NN'}, 'cell': {'NN'}, 'resulting': {'VBG'}, 'in': {'IN', 'FW'}, 'enhanced': {'VBN'}, 'interleukin-2': {'NN'}, '(': {'('}, ')': {')'}, 'proliferation': {'NN'}, 'In': {'IN'}, 'primary': {'JJ'}, 'lymphocytes': {'NNS'}, 'we': {'PRP'}, 'show': {'VBP'}, 'that': {'IN', 'WDT', 'DT'}, 'ligation': {'NN'}, 'leads': {'VBZ'}, 'to': {'TO'}, 'rapid': {'JJ'}, 'intracellular': {'JJ'}, 'formation': {'NN'}, 'intermediates': {'NNS'}, 'ROIs': {'NNS'}, 'which': {'WDT'}, 'are': {'VBP'}, 'required': {'VB