# GENIA Data Preprocessing

Authors: Samuel Sarria Hurtado and Paul Sheridan

Last update: 2023-10-02

Description: Preprocess the GENIA Term corpus version 3.02 dataset.

Inputs:
* Raw data (XML): GENIAcorpus3.02.xml

Outputs:
* Document Ids (CSV): GENIAcorpus3.02-doc-ids.csv
* Lex/sem mapping (TSV): GENIAcorpus3.02-keywords.tsv
* Preprocessed documents (JSON): GENIAcorpus3.02-preprocessed.json

## Load Libraries

In [5]:
import json
import numpy as np
import re
import unicodedata
import inflect
import bs4
from bs4 import BeautifulSoup
import pandas as pd

## Load Helper Functions

In [6]:
def remove_non_ascii(words):
    """Strip non-ASCII characters from each string in a list of tokenized words.

    Every word is NFKD-normalized first, so characters that decompose into an
    ASCII base plus combining marks keep their base; anything that still has
    no ASCII form is dropped. Returns a new list with one entry per input word.
    """
    return [
        unicodedata.normalize('NFKD', token)
        .encode('ascii', 'ignore')
        .decode('utf-8', 'ignore')
        for token in words
    ]

def to_lowercase(words):
    """Return a new list holding the lowercase form of every tokenized word."""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Delete punctuation from each tokenized word and drop words left empty.

    Any character that is neither a word character nor whitespace is removed.
    Words that consist only of such characters disappear from the result.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

def replace_numbers(words):
    """Spell out every digit-only token in a list of tokenized words.

    Tokens for which ``str.isdigit()`` is true are converted with the
    ``inflect`` engine's ``number_to_words``; all other tokens are passed
    through unchanged. Returns a new list.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def normalize(words):
    """Run the full cleaning pipeline over a list of tokenized words.

    The steps are applied in this order: non-ASCII removal, lowercasing,
    punctuation removal, and spelling out of digit-only tokens.
    """
    pipeline = (remove_non_ascii, to_lowercase, remove_punctuation, replace_numbers)
    for step in pipeline:
        words = step(words)
    return words

def take_out_str(arr, s):
    """Delete every occurrence of substring ``s`` from the strings in ``arr``.

    ``arr`` is modified in place and nothing is returned. Only elements whose
    exact type is ``str`` are touched; all other elements are left as they are.
    """
    for position in range(len(arr)):
        element = arr[position]
        if type(element) is str and s in element:
            arr[position] = element.replace(s, '')

## Preprocess the Raw Data

### Read in the Raw Data

In [29]:
infile_path = '../0-data-raw/GENIAcorpus3.02.xml'
# Read the whole corpus inside a context manager so the file handle is
# closed as soon as its contents are in memory (the handle was previously
# left open for the lifetime of the session).
with open(infile_path, 'r') as infile:
    contents = infile.read()
# Parse the raw text with BeautifulSoup's XML parser.
soup = BeautifulSoup(contents, 'xml')

### Data Cleaning

Here we eliminate tags, and other elements in the dataset, that might be troublesome. These are:
* Coordinated lex attributes.
* Cons tags lacking a sem attribute. Note that these are all removed as a by-product of deleting the cons tags with a coordinated lex attribute.
* Cons tags without a lex attribute
* All instances of the words "(ABSTRACT TRUNCATED AT 250 WORDS)".
* All instances of the words "(ABSTRACT TRUNCATED AT 400 WORDS)".

In [30]:
# Collect every cons tag, and count those that lack a sem attribute:
all_cons = soup.find_all('cons')
total_cons = len(all_cons)
all_cons = np.array(all_cons, dtype=object)
no_sem = soup.find_all('cons', sem=False)
no_sem_total = len(no_sem)

# Excise cons tags whose sem attribute is coordinated, i.e. contains '(':
coordinated_arr = [
    con for con in all_cons
    if '(' in con.attrs.get('sem', '')
]
total_coordinated = len(coordinated_arr)
for coordinated in coordinated_arr:
    coordinated.decompose()

# Excise cons without lex attributes:
no_lex = soup.find_all('cons', lex=False)
no_lex_total = len(no_lex)
for lexless in no_lex:
    lexless.decompose()

# Excise "truncated at 250 words" abstract sentences:
truncated_at_250_words_instances = soup.find_all(
    'sentence', string='(ABSTRACT TRUNCATED AT 250 WORDS)')
truncated_at_250_words_total = len(truncated_at_250_words_instances)
for marker in truncated_at_250_words_instances:
    marker.decompose()

# Excise "truncated at 400 words" abstract sentences:
truncated_at_400_words_instances = soup.find_all(
    'sentence', string='(ABSTRACT TRUNCATED AT 400 WORDS)')
truncated_at_400_words_total = len(truncated_at_400_words_instances)
for marker in truncated_at_400_words_instances:
    marker.decompose()

# Summarize results (every proportion is relative to the initial cons count):
print('Total number of cons:', total_cons)
print('Cons tags without sem attribute:', no_sem_total)
print('Proportion of cons tags without sem attribute:', no_sem_total/total_cons)
print('Cons tags without lex attribute:', no_lex_total)
print('Proportion of cons tags without lex attribute:', no_lex_total/total_cons)
print('Number of cons tags with a coordinated sem attribute:', total_coordinated)
print('Proportion of cons tags with a coordinated sem attribute:', total_coordinated/total_cons)
print('Number of \'(ABSTRACT TRUNCATED AT 250 WORDS)\' instance:', truncated_at_250_words_total)
print('Associated proportion:', truncated_at_250_words_total/total_cons)
print('Number of \'(ABSTRACT TRUNCATED AT 400 WORDS)\' instance:', truncated_at_400_words_total)
print('Associated proportion:', truncated_at_400_words_total/total_cons)

Total number of cons: 97876
Cons tags without sem attribute: 5154
Proportion of cons tags without sem attribute: 0.05265846581388696
Cons tags without lex attribute: 2
Proportion of cons tags without lex attribute: 2.0434018554088846e-05
Number of cons tags with a coordinated sem attribute: 1597
Proportion of cons tags with a coordinated sem attribute: 0.016316563815439944
Number of '(ABSTRACT TRUNCATED AT 250 WORDS)' instance: 24
Associated proportion: 0.0002452082226490662
Number of '(ABSTRACT TRUNCATED AT 400 WORDS)' instance: 3
Associated proportion: 3.0651027831133274e-05


### Test
Test that the above code cell worked. If it worked, all quantities printed (except for the total number of cons tags) should be 0.

In [31]:
# Re-query the cleaned tree. Every count printed below, apart from the total
# number of cons tags, is expected to be 0 if the cleaning cell did its job.
cons = soup.find_all('cons')
new_total_cons = len(cons)

new_no_sem = soup.find_all('cons', sem=False)
new_no_sem_total = len(new_no_sem)

new_no_lex = soup.find_all('cons', lex=False)
new_no_lex_total = len(new_no_lex)

truncated_at_250_words_instances = soup.find_all('sentence', string='(ABSTRACT TRUNCATED AT 250 WORDS)')
new_truncated_at_250_words_total = len(truncated_at_250_words_instances)

truncated_at_400_words_instances = soup.find_all('sentence', string='(ABSTRACT TRUNCATED AT 400 WORDS)')
new_truncated_at_400_words_total = len(truncated_at_400_words_instances)

# Count the remaining cons tags whose sem attribute is coordinated (contains '(').
new_coordinated_counter = sum(
    1 for con in cons if '(' in con.attrs.get('sem', '')
)

print('Total of con tags now:', new_total_cons)
print('Total of con tags without sem attribute now:', new_no_sem_total)
# Report the lex count here; the sem count used to be printed a second time,
# so this check could never reveal a leftover lex-less tag.
print('Total of con tags without lex attribute now:', new_no_lex_total)
print('Total instances of abstracts truncated at 250 words now:', new_truncated_at_250_words_total)
print('Total instances of abstracts truncated at 400 words now:', new_truncated_at_400_words_total)
print('Total instances of cons tags with coordinated sem attributes:', new_coordinated_counter)

Total of con tags now: 90969
Total of con tags without sem attribute now: 0
Total of con tags without lex attribute now: 0
Total instances of abstracts truncated at 250 words now: 0
Total instances of abstracts truncated at 400 words now: 0
Total instances of cons tags with coordinated sem attributes: 0


### Clean up Lex Attributes
Some lex text contains spaces. This is probably due to annotation errors. We replace spaces with underscores and postfix all lexes with the tag "_lex".

In [32]:
# Create data frame of lexes and sems (one row per remaining cons tag):
lex_and_sem = [con.attrs for con in cons]
lex_and_sem_df = pd.DataFrame(lex_and_sem)

# Report the lexes that contain spaces before they are rewritten:
contains_space = lex_and_sem_df['lex'].str.contains(' ', regex=False, na=False)
for spaced_lex in lex_and_sem_df.loc[contains_space, 'lex']:
    print(spaced_lex)

# Substitute underscores for spaces and add the '_lex' postfix to every lex.
# Assigning the whole column avoids chained assignment
# (``df['lex'][i] = ...``), which pandas does not guarantee to write back to
# the data frame and which is a no-op under copy-on-write.
lex_and_sem_df['lex'] = (
    lex_and_sem_df['lex'].str.replace(' ', '_', regex=False) + '_lex'
)

lex_and_sem_df = lex_and_sem_df.drop_duplicates(subset=['lex'])
lex_and_sem_df = lex_and_sem_df.reset_index(drop=True)

# Snapshot of the de-duplicated lexes, taken before quotes are stripped.
unique_lex = np.array(lex_and_sem_df['lex'])

# Strip double quotes from the lexes and store the result explicitly.
# ``take_out_str`` returns None, so the former
# ``lex_and_sem_df.replace(lex_and_sem_df['lex'], new_lex_col)`` call had
# nothing to substitute; quote removal only happened as an incidental
# in-place side effect.
new_lex_col = lex_and_sem_df['lex'].str.replace('"', '', regex=False)
lex_and_sem_df['lex'] = new_lex_col

IL-2 receptor alpha
IL-2R alpha
CD30 ligand
proteolytic digestion
cytoplasmic component
IL-7R alpha
IL-7R alpha
HSP70B promoter


## Output Files Creation
Write the three output files:
* A CSV file containing the document IDs and their respective document indices
* A TSV file containing a mapping from each lex to each sem
* A JSON file containing the preprocessed GENIA Term corpus data. Each element in this JSON file is a list of strings. Some are keywords (if they are in the lex list), some aren't. The words won't be in the same order as the raw data.

In [28]:
# Collect the text of every bibliomisc tag:
doc_id_tags = soup.find_all('bibliomisc')
doc_ids = [tag.get_text() for tag in doc_id_tags]

# Pair each document ID with its position in the list and save as CSV:
csv_outfile_path = '../0-data-preprocessed/GENIAcorpus3.02-doc-ids.csv'
index = range(len(doc_ids))
doc_ids_df = pd.DataFrame({'index': index, 'ID': doc_ids})
doc_ids_df.to_csv(path_or_buf=csv_outfile_path, index=False)

# Show the table that was just written:
display(doc_ids_df)

Unnamed: 0,index,ID
0,0,MEDLINE:95369245
1,1,MEDLINE:95333264
2,2,MEDLINE:95343554
3,3,MEDLINE:95347379
4,4,MEDLINE:95280913
...,...,...
1995,1995,MEDLINE:96011839
1996,1996,MEDLINE:96009598
1997,1997,MEDLINE:95403454
1998,1998,MEDLINE:95385995


In [33]:
# Save the lex-to-sem mapping as a tab-separated file (no row index column):
genia_keywords_outfile_path = '../0-data-preprocessed/GENIAcorpus3.02-keywords.tsv'
lex_and_sem_df.to_csv(
    path_or_buf=genia_keywords_outfile_path,
    sep='\t',
    index=False,
)

# Show the table that was just written:
display(lex_and_sem_df)

Unnamed: 0,lex,sem
0,IL-2_gene_expression_lex,G#other_name
1,IL-2_gene_lex,G#DNA_domain_or_region
2,NF-kappa_B_activation_lex,G#other_name
3,NF-kappa_B_lex,G#protein_molecule
4,CD28_lex,G#protein_molecule
...,...,...
31782,gp160-induced_AP-1_complex_lex,G#protein_complex
31783,protein_synthesis-independent_lex,G#other_name
31784,calcium_channel_blocker_lex,G#other_organic_compound
31785,anti-CD3-induced_interleukin-2_secretion_lex,G#other_name


In [35]:
# Build one string per article: its keywords (lexes) followed by its
# normalized free text.
xml_articles = soup.find_all('article')

keywords = []
stopwords = []
for article in xml_articles:
    # Every sentence element found anywhere below this article.
    article_sentences = [
        node for node in article.descendants if node.name == 'sentence'
    ]

    article_keywords = []
    article_text = []
    for sentence in article_sentences:
        for node in sentence.descendants:
            if type(node) is bs4.element.Tag:
                # A tag inside a sentence contributes its lex attribute, with
                # spaces turned into underscores and the '_lex' postfix added.
                article_keywords.append(node.attrs['lex'].replace(' ', '_') + '_lex')
            elif node.parent.name == 'sentence':
                # Non-tag nodes are kept only when they sit directly under the
                # sentence; text nested inside a tag is skipped.
                article_text.append(node)

    # Normalize the free text, trim surrounding whitespace, drop empty pieces.
    trimmed = [piece.strip() for piece in normalize(article_text)]
    keywords.append(article_keywords)
    stopwords.append([piece for piece in trimmed if len(piece) > 0])

# Keywords first, then free text, for every article.
combined = [lexes + text for lexes, text in zip(keywords, stopwords)]

# Collapse each article into a single space-separated string.
genia = [' '.join(doc) for doc in combined]

# Remove double quotes from the finished document strings (in place).
take_out_str(genia, '\"')

In [36]:
# Serialize the list of document strings and save it as a JSON array:
genia_terms_outfile_path = '../0-data-preprocessed/GENIAcorpus3.02-preprocessed.json'
serialized_genia = json.dumps(genia)
with open(genia_terms_outfile_path, 'w') as outfile:
    outfile.write(serialized_genia)