In [31]:
%load_ext autoreload
%autoreload 2

%pylab

import os
import pandas as pd

# Use this module to load the data in a convenient format
from scienceie_loader import load_tokenized_data, load_data_with_char_offsets, get_entity_span_from_B_index

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Using matplotlib backend: MacOSX
Populating the interactive namespace from numpy and matplotlib


In [12]:
# Every dataset lives under <current working directory>/original_datasets.
data_root = os.path.join(os.getcwd(), 'original_datasets')

# One directory per split, resolved against data_root in train / dev / test order.
data_train, data_dev, data_test = [
    os.path.join(data_root, split_subdir)
    for split_subdir in (
        'scienceie2017_train/train2',
        'scienceie2017_dev/dev',
        'semeval_articles_test',
    )
]

In [52]:
# Load the training, dev and test sets in tokenized format.
# THIS IS THE RECOMMENDED FORMAT TO USE FOR MOST CASES AS IT'S EASIER TO WORK WITH.

# train_docs, dev_docs and test_docs are lists, where each entry is a document.
# Each document is a list of (token, label) tuples.
# The labels are BIO tags, where 'B' and 'I' tags also include the type of entity
# (e.g. 'B-Process', 'I-Task').

# train_rels, dev_rels and test_rels are also lists, where each entry corresponds to a document.
# The entry for each document is a list of relations of the form
# [label, entity1_start_token_index, entity2_start_token_index].
# Relations therefore refer to each entity by the index of its first token.

# load_tokenized_data returns three values; the third is not used in this cell.
train_docs, train_rels, _ = load_tokenized_data(data_train)
dev_docs, dev_rels, _ = load_tokenized_data(data_dev)
test_docs, test_rels, _ = load_tokenized_data(data_test)

# Number of documents in each split.
print(f'number of training documents: {len(train_docs)}')
print(f'number of dev documents: {len(dev_docs)}')
print(f'number of test documents: {len(test_docs)}')

# Total relations per split. The built-in sum() is used instead of np.sum(): `np` is only
# available here through the %pylab magic, and np.sum([]) would report 0.0 rather than 0
# for a split without documents.
print(f'Total number of training relations = {sum(len(doc_rels) for doc_rels in train_rels)}')
print(f'Total number of dev relations = {sum(len(doc_rels) for doc_rels in dev_rels)}')
print(f'Total number of test relations = {sum(len(doc_rels) for doc_rels in test_rels)}')

number of training documents: 350
number of dev documents: 50
number of test documents: 100
Total number of training relations = 680
Total number of dev relations = 168
Total number of test relations = 196


In [34]:
# Show the first training document: a list of (token, BIO label) tuples,
# e.g. ('coalescence', 'B-Process').
train_docs[0]

[('Within', 'O'),
 ('a', 'O'),
 ('coalescence', 'B-Process'),
 ('approach', 'I-Process'),
 ('as', 'O'),
 ('successfully', 'O'),
 ('applied', 'O'),
 ('earlier', 'O'),
 ('in', 'O'),
 ('the', 'O'),
 ('light-quark', 'B-Task'),
 ('sector', 'I-Task'),
 (',', 'O'),
 ('we', 'O'),
 ('have', 'O'),
 ('evaluated', 'B-Process'),
 ('transverse-momentum', 'I-Process'),
 ('dependencies', 'I-Process'),
 ('of', 'O'),
 ('charmed', 'B-Material'),
 ('hadrons', 'I-Material'),
 ('in', 'O'),
 ('central', 'B-Process'),
 ('heavy-ion', 'I-Process'),
 ('reactions', 'I-Process'),
 ('at', 'O'),
 ('RHIC.', 'O'),
 ('For', 'O'),
 ('the', 'O'),
 ('charm-quark', 'B-Material'),
 ('distributions', 'I-Task'),
 ('at', 'O'),
 ('hadronization', 'B-Process'),
 ('we', 'O'),
 ('have', 'O'),
 ('considered', 'O'),
 ('two', 'O'),
 ('limiting', 'O'),
 ('scenarios', 'O'),
 (',', 'O'),
 ('i.e.', 'O'),
 (',', 'O'),
 ('no', 'O'),
 ('reinteractions', 'O'),
 ('(', 'O'),
 ('using', 'O'),
 ('spectra', 'B-Material'),
 ('from', 'O'),
 ('PYTHI

In [23]:
# Relations of the example document (train_docs[0]); each one has the form
# [label, entity1_start_token_index, entity2_start_token_index].
train_rels[0]

[['Hyponym-of', 184, 194]]

In [35]:
# The first relation of the example document stores, after its label, the start-token
# index of each linked entity (positions 1 and 2). Print what the loader helper returns
# for the first entity, then for the second: the (token, label) pairs of that entity.
example_doc = train_docs[0]
example_rel = train_rels[0][0]
for entity_slot in (1, 2):
    print(get_entity_span_from_B_index(example_doc, example_rel[entity_slot]))

[('baryons', 'B-Material')]
[('complimentary', 'B-Material'), ('probe', 'I-Material')]


In [15]:
# THIS FORMAT REQUIRES YOU TO TOKENISE THE TEXT YOURSELF, SO IT IS ONLY RECOMMENDED
# IF YOUR SEQUENCE LABELLING METHOD NEEDS A SPECIAL KIND OF TOKENISATION.

# Each call returns a list with one entry per document. Judging by the first training
# document, an entry holds, in order:
#   [0] the raw text of the document,
#   [1] the entity annotations as (entity_id, entity_type, start_char, end_char) tuples,
#   [2] the relations as (label, entity_id, entity_id) tuples that refer to those entity IDs.

train_docs_chars = load_data_with_char_offsets(data_train)
dev_docs_chars = load_data_with_char_offsets(data_dev)
test_docs_chars = load_data_with_char_offsets(data_test)

# Report the size of each split, one line per split.
print(
    f'number of training documents: {len(train_docs_chars)}',
    f'number of dev documents: {len(dev_docs_chars)}',
    f'number of test documents: {len(test_docs_chars)}',
    sep='\n',
)

number of training documents: 350
number of dev documents: 50
number of test documents: 100


In [42]:
# Raw text of the first training document (element 0 of its entry).
train_docs_chars[0][0]

'Within a coalescence approach as successfully applied earlier in the light-quark sector, we have evaluated transverse-momentum dependencies of charmed hadrons in central heavy-ion reactions at RHIC. For the charm-quark distributions at hadronization we have considered two limiting scenarios, i.e., no reinteractions (using spectra from PYTHIA) and complete thermalization with transverse flow of the bulk matter. The resulting J/ψ (mT-)spectra differ in slope by up to a factor of\xa02 (harder for pQCD c-quarks), and the integrated yield is about a factor of\xa03 larger in the thermal case. For D-mesons, we found that the difference in the slope parameters of the pT-spectra in the two scenarios is less pronounced, but their elliptic flow is about a factor of\xa02 larger for pT⩾1.5\xa0GeV in the thermalized case. The elliptic flow pattern of D-mesons was found to be essentially preserved in the single-electron decay spectra, rendering the latter a very promising observable to address the s

In [43]:
# Entity annotations of the first training document (element 1 of its entry): a list of
# (entity_id, entity_type, start_char, end_char) tuples. The offsets index into the raw
# text with an exclusive end, e.g. T1 (9, 29) covers 'coalescence approach'.
train_docs_chars[0][1]

[('T1', 'Process', 9, 29),
 ('T2', 'Process', 97, 139),
 ('T3', 'Process', 162, 189),
 ('T4', 'Task', 207, 232),
 ('T5', 'Process', 349, 372),
 ('T6', 'Process', 378, 393),
 ('T7', 'Process', 520, 536),
 ('T8', 'Process', 620, 654),
 ('T9', 'Process', 724, 737),
 ('T10', 'Task', 662, 672),
 ('T11', 'Process', 812, 833),
 ('T12', 'Task', 891, 920),
 ('T13', 'Process', 998, 1018),
 ('T14', 'Process', 1156, 1182),
 ('T15', 'Material', 1132, 1151),
 ('T16', 'Material', 324, 331),
 ('T17', 'Material', 496, 509),
 ('T18', 'Material', 837, 845),
 ('T19', 'Material', 143, 158),
 ('T20', 'Process', 236, 249),
 ('T21', 'Material', 592, 600),
 ('T22', 'Material', 1097, 1104),
 ('T23', 'Process', 1190, 1193),
 ('T24', 'Task', 69, 87),
 ('T25', 'Material', 213, 218),
 ('T26', 'Process', 1026, 1029)]

In [44]:
# Relations of the first training document (element 2 of its entry): a list of
# (label, entity_id, entity_id) tuples whose IDs refer to the document's entity
# annotations, e.g. ('Hyponym-of', 'T22', 'T15').
train_docs_chars[0][2]

[('Hyponym-of', 'T22', 'T15')]