## Data Loading

In [1]:
import os
import json
from glob import glob
import numpy as np
import pandas as pd
from collections import namedtuple
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
from nltk.tokenize import RegexpTokenizer

In [17]:
train_docs_dir = 'C:/Users/DELL/Desktop/MeasEval/data/eval'

train_texts = ' '.join([open(txt_file, 'r').read() for txt_file in glob(os.path.join(train_docs_dir,'*.txt'))])
trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(train_texts)

sentenizer = PunktSentenceTokenizer(trainer.get_params())
tokenizer = RegexpTokenizer(r'\w+|\(|\)|\[|\]|[-–.,]|\S+')

In [18]:
annotations_dir = 'C:/Users/DELL/Desktop/MeasEval/data/eval/tsv'
docs_dir = 'C:/Users/DELL/Desktop/MeasEval/data/eval/text'

Sentence = namedtuple('Sentence', ['start', 'end', 'text'])

tsv_files = glob(os.path.join(annotations_dir,'*.tsv')) if annotations_dir is not None else []
txt_files = glob(os.path.join(docs_dir,'*.txt'))

docs = {'docId': [], 'text': []}
for txt_file in txt_files:
    docs['text'].append(open(txt_file, 'r', encoding='UTF-8').read())
    docs['docId'].append(os.path.split(txt_file)[1].split('.')[0])
#     docs['docId'].append(txt_file.split('/')[-1].split('.')[0])

docs = pd.DataFrame(docs)

docs.loc[:, 'texts'] = docs.text.apply(
    lambda x: [
        Sentence(start, end, text) for (start, end), text in zip(
            sentenizer.span_tokenize(x), sentenizer.tokenize(x)
        )
    ]
)

In [4]:
docs

Unnamed: 0,docId,text,texts
0,S0012821X12004384-1610,"The brief peak in Apectodinium, AOM and low sa...","[(0, 104, The brief peak in Apectodinium, AOM ..."
1,S0012821X12004384-990,Correspondence analysis (first two axes) for: ...,"[(0, 291, Correspondence analysis (first two a..."
2,S0012821X13007309-1649,Lithology in the CTBI includes interbedded sha...,"[(0, 103, Lithology in the CTBI includes inter..."
3,S0016236113008041-3127,Thermodynamic modelling using MTDATA software ...,"[(0, 241, Thermodynamic modelling using MTDATA..."
4,S0016236113008041-3257,SEM–EDS analysis of sorbent sampled from the c...,"[(0, 155, SEM–EDS analysis of sorbent sampled ..."
...,...,...,...
130,S2213671113000738-667,"Next, we investigated the expression of major ...","[(0, 118, Next, we investigated the expression..."
131,S2213671113000738-787,"For in vivo studies, the fixed frozen brains w...","[(0, 133, For in vivo studies, the fixed froze..."
132,S2213671113000908-643,"(F) After 25 days of differentiation, cells ex...","[(0, 91, (F) After 25 days of differentiation,..."
133,S2213671113000921-756,(B) IF analysis of day 8 SOX2-GFP AFE cultures...,"[(0, 77, (B) IF analysis of day 8 SOX2-GFP AFE..."


In [19]:
if annotations_dir is not None:
    annotations = pd.read_csv(tsv_files[0], sep='\t')
    for tsv_file in tsv_files[1:]:
        annotations = annotations.append(pd.read_csv(tsv_file, sep='\t'), ignore_index=True)
    annotations = annotations.replace(np.nan, '{}')
    annotations.loc[:, 'other'] = annotations.other.apply(lambda x: eval(x))
else:
    annotations = pd.DataFrame(columns=['docId', 'annotSet', 'annotType', 'startOffset', 'endOffset', 'annotId', 'text', 'other'])

In [6]:
annotations

Unnamed: 0,docId,annotSet,annotType,startOffset,endOffset,annotId,text,other
0,S0012821X12004384-1610,1,Quantity,90,98,T1-1,2617.4 m,{'unit': 'm'}
1,S0012821X12004384-1610,1,MeasuredEntity,4,14,T3-1,brief peak,{'HasQuantity': 'T1-1'}
2,S0012821X12004384-1610,1,Qualifier,15,30,T4-1,in Apectodinium,{'Qualifies': 'T3-1'}
3,S0012821X12004384-1610,2,Quantity,669,688,T1-2,2619.6 and 2614.7 m,"{'mods': ['IsList'], 'unit': 'm'}"
4,S0012821X12004384-1610,2,MeasuredEntity,638,649,T3-2,other peaks,{'HasQuantity': 'T1-2'}
...,...,...,...,...,...,...,...,...
1485,S2213671113000921-756,1,Quantity,69,75,T1-1,200 μm,{'unit': 'μm'}
1486,S2213671113000921-756,1,MeasuredEntity,48,57,T2-1,scale bar,{'HasQuantity': 'T1-1'}
1487,S2213671113001306-1385,1,Quantity,310,323,T1-1,beyond 1 week,"{'mods': ['IsRange'], 'unit': 'week'}"
1488,S2213671113001306-1385,1,MeasuredEntity,293,308,T2-1,follow-up exams,{'HasQuantity': 'T1-1'}


## Q, ME, MP, Qlr Tagging

In [7]:
# annotating tokens in each sentence for each doc

# examples = [exmaple]
# annotated_sent = [annotated_tokens[]]
# example = {
#     'docId': doc.docId,
#     'sentAnnot': annotated_sent
# }

Token = namedtuple('Token', ['start', 'end', 'text'])
AnnotatedToken = namedtuple('AnnotatedToken', ['start', 'end', 'text', 'label'])
only_labels='Quantity+MeasuredProperty+MeasuredEntity+Qualifier'

examples = []
for doc in docs.itertuples():
    doc_annotations = annotations[(annotations.docId == doc.docId) & (annotations.annotType.isin(only_labels.split('+')))]
#     print(doc)
#     print(doc_annotations)
    
    annotated_sent = []
    for sent in doc.texts:
        tokens = [
            Token(sent.start + start, sent.start + end, token)
            for (start, end), token in zip(
                tokenizer.span_tokenize(sent.text),
                tokenizer.tokenize(sent.text)
            )
        ]
        
#         print(sent)
#         for t in tokens:
#             print(t)
            
        # retrieve labels for each sentence
        AnnotatedSpan = namedtuple('AnnotatedSpan', ['start', 'end', 'label'])
        df = doc_annotations[(doc_annotations.startOffset >= sent.start) & (doc_annotations.endOffset <= sent.end)]
        sent_labels = [AnnotatedSpan(r.startOffset, r.endOffset, r.annotType) for r in df.itertuples()]
#         for l in sent_labels:
#             print("sent_labels", l)
            
        annotated_tokens = []
        for token in tokens:
            # initialise the annotation tag for the current token as 'O'
            tag = 'O'
            for sl in sent_labels:
                # check whether the current token is within the range of the current annotated span
                isInCurrRange = set(range(token.start, token.end)).intersection(set(range(sl.start, sl.end)))
                if isInCurrRange:
                    # check whether it's in the beginning (B) of the span
                    if token.start == sl.start:
                        tag = 'B-' + sl.label
                    # otherwise, it's in the internal (I) of the span
                    else:
                        tag = 'I-' + sl.label
            # append the annotated tag for the current token
            annotated_tokens.append(AnnotatedToken(token.start, token.end, token.text, tag))
        
#         for a in annotated_tokens:
#             print('annotated_tokens', a)
        
        if len(annotated_tokens) == len(tokens):
            annotated_sent.append(annotated_tokens)
    
#     for anns in annotated_sent:
#         for annt in anns:
#             print('annotated_tokens', annt)
            
    example = {
        'docId': doc.docId,
        'sentAnnot': annotated_sent
    }
    examples.append(example)

In [8]:
examples

[{'docId': 'S0012821X12004384-1610',
  'sentAnnot': [[AnnotatedToken(start=0, end=3, text='The', label='O'),
    AnnotatedToken(start=4, end=9, text='brief', label='B-MeasuredEntity'),
    AnnotatedToken(start=10, end=14, text='peak', label='I-MeasuredEntity'),
    AnnotatedToken(start=15, end=17, text='in', label='B-Qualifier'),
    AnnotatedToken(start=18, end=30, text='Apectodinium', label='I-Qualifier'),
    AnnotatedToken(start=30, end=31, text=',', label='O'),
    AnnotatedToken(start=32, end=35, text='AOM', label='O'),
    AnnotatedToken(start=36, end=39, text='and', label='O'),
    AnnotatedToken(start=40, end=43, text='low', label='O'),
    AnnotatedToken(start=44, end=52, text='salinity', label='O'),
    AnnotatedToken(start=53, end=67, text='dinoflagellate', label='O'),
    AnnotatedToken(start=68, end=73, text='cysts', label='O'),
    AnnotatedToken(start=74, end=75, text='(', label='O'),
    AnnotatedToken(start=75, end=85, text='Deflandrea', label='O'),
    AnnotatedToken

In [9]:
# write the annotated NER training data into a file
f = open("test.txt", "w", encoding='UTF-8')

for example in examples:
    annotated_sent = example['sentAnnot']
    for sent in annotated_sent:
        for token in sent:
            f.write(token.text + '\t' + token.label + '\n')
        f.write('\n')

f.close()

In [11]:
# count the number of sentences
num = 0
for example in examples:
    annotated_sent = example['sentAnnot']
    for sent in annotated_sent:
        num += 1
num

842

In [20]:
# split the data into test(2/3) and dev(1/3) set
# dev: 281; test: 561

import random

sample_nums = random.sample(range(842), 281)
sample_nums

[700,
 214,
 500,
 177,
 350,
 76,
 42,
 711,
 616,
 679,
 651,
 115,
 332,
 492,
 404,
 235,
 483,
 551,
 219,
 317,
 296,
 161,
 192,
 7,
 377,
 423,
 566,
 253,
 175,
 327,
 585,
 442,
 714,
 339,
 228,
 322,
 81,
 320,
 345,
 299,
 557,
 59,
 502,
 370,
 588,
 653,
 131,
 703,
 708,
 140,
 127,
 26,
 550,
 270,
 236,
 134,
 833,
 418,
 337,
 149,
 74,
 568,
 23,
 91,
 60,
 619,
 768,
 390,
 336,
 282,
 314,
 344,
 659,
 522,
 108,
 147,
 733,
 58,
 4,
 279,
 757,
 343,
 284,
 101,
 464,
 295,
 394,
 232,
 738,
 486,
 342,
 265,
 193,
 610,
 530,
 433,
 388,
 169,
 791,
 10,
 732,
 373,
 528,
 841,
 8,
 511,
 541,
 447,
 773,
 400,
 107,
 417,
 799,
 817,
 564,
 17,
 278,
 748,
 378,
 495,
 64,
 549,
 543,
 89,
 387,
 539,
 690,
 729,
 470,
 624,
 69,
 480,
 454,
 251,
 231,
 92,
 721,
 202,
 52,
 123,
 199,
 482,
 467,
 672,
 266,
 620,
 163,
 246,
 716,
 654,
 38,
 521,
 481,
 114,
 277,
 494,
 828,
 201,
 212,
 715,
 226,
 821,
 29,
 837,
 602,
 809,
 466,
 647,
 135,
 674,
 243,

In [21]:
sents = []
for example in examples:
    annotated_sent = example['sentAnnot']
    for sent in annotated_sent:
        sents.append(sent)

dev = []
test = []
for i in range(842):
    if i in sample_nums:
        dev.append(sents[i])
    else:
        test.append(sents[i])

In [22]:
len(test)

561

In [23]:
# write the annotated NER dev data into a file
f = open("dev.txt", "w", encoding='UTF-8')

for sent in dev:
    for token in sent:
        f.write(token.text + '\t' + token.label + '\n')
    f.write('\n')

f.close()

In [24]:
# write the annotated NER test data into a file
f = open("test.txt", "w", encoding='UTF-8')

for sent in test:
    for token in sent:
        f.write(token.text + '\t' + token.label + '\n')
    f.write('\n')

f.close()

## Unit Tagging

In [20]:
# annotating tokens in each sentence for each doc

# examples = [exmaple]
# annotated_sent = [annotated_tokens[]]
# example = {
#     'docId': doc.docId,
#     'sentAnnot': annotated_sent
# }
import re

Token = namedtuple('Token', ['start', 'end', 'text'])
AnnotatedToken = namedtuple('AnnotatedToken', ['start', 'end', 'text', 'label'])

examples = []
for doc in docs.itertuples():
    doc_annotations = annotations[(annotations.docId == doc.docId) & (annotations.annotType == 'Quantity')]
#     print(doc)
#     print(doc_annotations)
    
    # get rows containing 'unit' in 'other' column
    idx_list = []
    for index, row in doc_annotations.reset_index(drop=True).iterrows():
        if row['other'].get('unit') != None:
            idx_list.append(index)
    doc_annotations = doc_annotations.iloc[idx_list]
#     print(doc_annotations)

#     # add column 'sentIdx' representing the index of a sentence where each quantity of the doc belongs to
#     sent_idx = []
#     for index, row in doc_annotations.iterrows():
#         # check whether the current quantity is within the range of the current sentence span
#         for i in range(len(doc.texts)):
#             sent = doc.texts[i]
#             isInRange = set(range(row.startOffset, row.endOffset)).intersection(set(range(sent.start, sent.end)))
#             if isInRange:
#                 sent_idx.append(i)
#     doc_annotations.loc[:, 'sentIdx'] = sent_idx
    
    # add columns 'unitStart' and 'unitEnd' in doc_annotations to mark the span of units
    unit_start = []
    for index, row in doc_annotations.iterrows():
        if re.compile(row['other'].get('unit')).search(row.text) != None:
            # 'unit' span is within the range of 'quantity' span: append its start offset
            s = re.compile(row['other'].get('unit')).search(row.text).span()[0] + row.startOffset
            unit_start.append(s)
        else:
            # 'unit' span is not within the range of 'quantity' span: ignore it 
            unit_start.append(-1)

            print(row)
            print(doc)
            print(doc_annotations)
    doc_annotations.loc[:, 'unitStart'] = unit_start
    
    unit_end = []
    for index, row in doc_annotations.iterrows():
        if re.compile(row['other'].get('unit')).search(row.text) != None:
            e = re.compile(row['other'].get('unit')).search(row.text).span()[1] + row.startOffset
            unit_end.append(e)
        else:
            unit_end.append(-1)
    doc_annotations.loc[:, 'unitEnd'] = unit_end
    
    # labeling each sentence
    annotated_sent = []
    for sent in doc.texts:
        # get the span of each token within the current sentence
        tokens = [
            Token(sent.start + start, sent.start + end, token)
            for (start, end), token in zip(
                tokenizer.span_tokenize(sent.text),
                tokenizer.tokenize(sent.text)
            )
        ]
        
#         print(sent)
#         for t in tokens:
#             print(t)
            
        # retrieve labels for each sentence
        AnnotatedSpan = namedtuple('AnnotatedSpan', ['start', 'end', 'label'])
        df = doc_annotations[(doc_annotations.unitStart >= sent.start) & (doc_annotations.unitEnd <= sent.end)]
        sent_labels = [AnnotatedSpan(r.unitStart, r.unitEnd, 'unit') for r in df.itertuples()]
#         for l in sent_labels:
#             print("sent_labels", l)
            
        annotated_tokens = []
        for token in tokens:
            # initialise the annotation tag for the current token as 'O'
            tag = 'O'
            for sl in sent_labels:
                # check whether the current token is within the range of the current annotated span
                isInCurrRange = set(range(token.start, token.end)).intersection(set(range(sl.start, sl.end)))
                if isInCurrRange:
                    # choose not to use BIO tag due to some corner examples
                    # unit is within the span of a token (e.g. quantity='5≤2θ/°≤80', unit='°')
                    tag = 'unit'
#                     # check whether it's in the beginning (B) of the span
#                     if token.start == sl.start:
#                         tag = 'B-' + sl.label
#                     # otherwise, it's in the internal (I) of the span
#                     else:
#                         tag = 'I-' + sl.label
            # append the annotated tag for the current token
            annotated_tokens.append(AnnotatedToken(token.start, token.end, token.text, tag))
        
#         for a in annotated_tokens:
#             print('annotated_tokens', a)
        
        if len(annotated_tokens) == len(tokens):
            annotated_sent.append(annotated_tokens)
    
#     for anns in annotated_sent:
#         for annt in anns:
#             print('annotated_tokens', annt)
            
    example = {
        'docId': doc.docId,
        'sentAnnot': annotated_sent
    }
    examples.append(example)

docId          S0019103512003995-1237
annotSet                            4
annotType                    Quantity
startOffset                       607
endOffset                         613
annotId                          T1-4
text                           17,500
other                   {'unit': 'R'}
Name: 199, dtype: object
Pandas(Index=12, docId='S0019103512003995-1237', text='(a) The C II 1334.5 Å line of HD209458 (solid line, Linsky et al., 2010) fitted with a Voigt profile (see Table 1) and adjusted for absorption by the ISM (dotted line). We assumed that the column density of ground state C+ in the ISM is 2.23 × 1019 m−2. The relative velocity of the ISM with respect to Earth is −6.6 km s−1 and the effective thermal velocity along the LOS to the star is 12.3 km s−1 (Wood et al., 2005). (b) The C II 1335.7 Å line of HD209458 fitted with a Voigt profile. Absorption by the ISM was assumed to be negligible. The model profiles were convolved to a spectral resolution of R = 17,500.',

In [15]:
u = '% (w/v)'
q = '1.2% (w/v)'

re.compile(u).search(q).span()

AttributeError: 'NoneType' object has no attribute 'span'

In [21]:
# split the data into test(2/3) and dev(1/3) set
# dev: 281; test: 561

import random

sample_nums = random.sample(range(842), 281)

sents = []
for example in examples:
    annotated_sent = example['sentAnnot']
    for sent in annotated_sent:
        sents.append(sent)

dev = []
test = []
for i in range(842):
    if i in sample_nums:
        dev.append(sents[i])
    else:
        test.append(sents[i])

In [22]:
# write the annotated NER dev data into a file
f = open("dev.txt", "w", encoding='UTF-8')

for sent in dev:
    for token in sent:
        f.write(token.text + '\t' + token.label + '\n')
    f.write('\n')

f.close()

# write the annotated NER test data into a file
f = open("test.txt", "w", encoding='UTF-8')

for sent in test:
    for token in sent:
        f.write(token.text + '\t' + token.label + '\n')
    f.write('\n')

f.close()