In [2]:
import os
import json
from glob import glob
import numpy as np
import pandas as pd
from collections import namedtuple
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
from nltk.tokenize import RegexpTokenizer

In [169]:
train_docs_dir = 'C:/Users/DELL/Desktop/MeasEval/data/trial'

train_texts = ' '.join([open(txt_file, 'r').read() for txt_file in glob(os.path.join(train_docs_dir,'*.txt'))])
trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(train_texts)

sentenizer = PunktSentenceTokenizer(trainer.get_params())
tokenizer = RegexpTokenizer(r'\w+|\(|\)|\[|\]|[-–.,]|\S+')

In [170]:
annotations_dir = 'C:/Users/DELL/Desktop/MeasEval/data/trial/tsv'
docs_dir = 'C:/Users/DELL/Desktop/MeasEval/data/trial/txt'

Sentence = namedtuple('Sentence', ['start', 'end', 'text'])

tsv_files = glob(os.path.join(annotations_dir,'*.tsv')) if annotations_dir is not None else []
txt_files = glob(os.path.join(docs_dir,'*.txt'))

docs = {'docId': [], 'text': []}
for txt_file in txt_files:
    docs['text'].append(open(txt_file, 'r', encoding='UTF-8').read())
    docs['docId'].append(os.path.split(txt_file)[1].split('.')[0])
#     docs['docId'].append(txt_file.split('/')[-1].split('.')[0])

docs = pd.DataFrame(docs)

docs.loc[:, 'texts'] = docs.text.apply(
    lambda x: [
        Sentence(start, end, text) for (start, end), text in zip(
            sentenizer.span_tokenize(x), sentenizer.tokenize(x)
        )
    ]
)

In [4]:
docs

Unnamed: 0,docId,text,texts
0,S0012821X12004384-1302,Correspondence analysis (CA) and statistical d...,"[(0, 175, Correspondence analysis (CA) and sta..."
1,S0012821X12004384-1405,Dinoflagellate cysts have been used extensivel...,"[(0, 302, Dinoflagellate cysts have been used ..."
2,S0012821X12004384-1415,"Dinoflagellate cyst assemblage 1 (DA1, from 26...","[(0, 64, Dinoflagellate cyst assemblage 1 (DA1..."
3,S0012821X12004384-1594,"Alternatively, an increase in regional precipi...","[(0, 195, Alternatively, an increase in region..."
4,S0012821X12004384-1599,Our results provide the first evidence that th...,"[(0, 129, Our results provide the first eviden..."
...,...,...,...
60,S0022000014000026-17824,The figure also demonstrates the capability of...,"[(0, 157, The figure also demonstrates the cap..."
61,S0022000014000026-18167,"As the conclusion of our paper, we now assess ...","[(0, 242, As the conclusion of our paper, we n..."
62,S0022399913003358-1044,Self-reported hearing difficulty was the stron...,"[(0, 77, Self-reported hearing difficulty was ..."
63,S0022459611006116-1160,The ternary magnesium nitride fluorides were p...,"[(0, 183, The ternary magnesium nitride fluori..."


In [171]:
if annotations_dir is not None:
    annotations = pd.read_csv(tsv_files[0], sep='\t')
    for tsv_file in tsv_files[1:]:
        annotations = annotations.append(pd.read_csv(tsv_file, sep='\t'), ignore_index=True)
    annotations = annotations.replace(np.nan, '{}')
    annotations.loc[:, 'other'] = annotations.other.apply(lambda x: eval(x))
else:
    annotations = pd.DataFrame(columns=['docId', 'annotSet', 'annotType', 'startOffset', 'endOffset', 'annotId', 'text', 'other'])

In [6]:
annotations

Unnamed: 0,docId,annotSet,annotType,startOffset,endOffset,annotId,text,other
0,S0012821X12004384-1302,1,Quantity,553,557,T91-1,Five,{'mods': ['IsCount']}
1,S0012821X12004384-1302,1,MeasuredEntity,558,565,T101-1,samples,{'HasQuantity': 'T91-1'}
2,S0012821X12004384-1302,2,Quantity,588,637,T12-2,"2619.60, 2617.35, 2617.44, 2614.73, and 2614.71 m","{'mods': ['IsList'], 'unit': 'm'}"
3,S0012821X12004384-1302,2,MeasuredEntity,558,565,T102-2,samples,{'HasQuantity': 'T12-2'}
4,S0012821X12004384-1302,2,Qualifier,571,584,T152-2,below the CIE,{'Qualifies': 'T102-2'}
...,...,...,...,...,...,...,...,...
827,S0022459611006116-1195,2,MeasuredProperty,192,201,T32-2,step size,{'HasQuantity': 'T42-2'}
828,S0022459611006116-1195,2,MeasuredEntity,139,152,T52-2,Initial scans,{'HasProperty': 'T32-2'}
829,S0022459611006116-1195,3,Quantity,230,236,T13-3,50 min,{'unit': 'min'}
830,S0022459611006116-1195,3,MeasuredProperty,217,226,T23-3,scan time,{'HasQuantity': 'T13-3'}


## Q, ME, MP, Qlr tagging

In [9]:
# annotating tokens in each sentence for each doc

# examples = [exmaple]
# annotated_sent = [annotated_tokens[]]
# example = {
#     'docId': doc.docId,
#     'sentAnnot': annotated_sent
# }

Token = namedtuple('Token', ['start', 'end', 'text'])
AnnotatedToken = namedtuple('AnnotatedToken', ['start', 'end', 'text', 'label'])
only_labels='Quantity+MeasuredProperty+MeasuredEntity+Qualifier'

examples = []
for doc in docs.itertuples():
    doc_annotations = annotations[(annotations.docId == doc.docId) & (annotations.annotType.isin(only_labels.split('+')))]
#     print(doc)
#     print(doc_annotations)
    
    annotated_sent = []
    for sent in doc.texts:
        tokens = [
            Token(sent.start + start, sent.start + end, token)
            for (start, end), token in zip(
                tokenizer.span_tokenize(sent.text),
                tokenizer.tokenize(sent.text)
            )
        ]
        
#         print(sent)
#         for t in tokens:
#             print(t)
            
        # retrieve labels for each sentence
        AnnotatedSpan = namedtuple('AnnotatedSpan', ['start', 'end', 'label'])
        df = doc_annotations[(doc_annotations.startOffset >= sent.start) & (doc_annotations.endOffset <= sent.end)]
        sent_labels = [AnnotatedSpan(r.startOffset, r.endOffset, r.annotType) for r in df.itertuples()]
#         for l in sent_labels:
#             print("sent_labels", l)
            
        annotated_tokens = []
        for token in tokens:
            # initialise the annotation tag for the current token as 'O'
            tag = 'O'
            for sl in sent_labels:
                # check whether the current token is within the range of the current annotated span
                isInCurrRange = set(range(token.start, token.end)).intersection(set(range(sl.start, sl.end)))
                if isInCurrRange:
                    # check whether it's in the beginning (B) of the span
                    if token.start == sl.start:
                        tag = 'B-' + sl.label
                    # otherwise, it's in the internal (I) of the span
                    else:
                        tag = 'I-' + sl.label
            # append the annotated tag for the current token
            annotated_tokens.append(AnnotatedToken(token.start, token.end, token.text, tag))
        
#         for a in annotated_tokens:
#             print('annotated_tokens', a)
        
        if len(annotated_tokens) == len(tokens):
            annotated_sent.append(annotated_tokens)
    
#     for anns in annotated_sent:
#         for annt in anns:
#             print('annotated_tokens', annt)
            
    example = {
        'docId': doc.docId,
        'sentAnnot': annotated_sent
    }
    examples.append(example)

In [10]:
examples

[{'docId': 'S0012821X12004384-1302',
  'sentAnnot': [[AnnotatedToken(start=0, end=14, text='Correspondence', label='O'),
    AnnotatedToken(start=15, end=23, text='analysis', label='O'),
    AnnotatedToken(start=24, end=25, text='(', label='O'),
    AnnotatedToken(start=25, end=27, text='CA', label='O'),
    AnnotatedToken(start=27, end=28, text=')', label='O'),
    AnnotatedToken(start=29, end=32, text='and', label='O'),
    AnnotatedToken(start=33, end=44, text='statistical', label='O'),
    AnnotatedToken(start=45, end=54, text='diversity', label='O'),
    AnnotatedToken(start=55, end=63, text='analysis', label='O'),
    AnnotatedToken(start=64, end=68, text='were', label='O'),
    AnnotatedToken(start=69, end=76, text='carried', label='O'),
    AnnotatedToken(start=77, end=80, text='out', label='O'),
    AnnotatedToken(start=81, end=83, text='on', label='O'),
    AnnotatedToken(start=84, end=87, text='the', label='O'),
    AnnotatedToken(start=88, end=101, text='palynological', lab

In [13]:
# write the annotated NER training data into a file
f = open("trial.txt", "w", encoding='UTF-8')

for example in examples:
    annotated_sent = example['sentAnnot']
    for sent in annotated_sent:
        for token in sent:
            f.write(token.text + '\t' + token.label + '\n')
        f.write('\n')

f.close()

In [14]:
# count the number of sentences
num = 0
for example in examples:
    annotated_sent = example['sentAnnot']
    for sent in annotated_sent:
        num += 1
num

519

## unit tagging

In [225]:
# annotating tokens in each sentence for each doc

# examples = [exmaple]
# annotated_sent = [annotated_tokens[]]
# example = {
#     'docId': doc.docId,
#     'sentAnnot': annotated_sent
# }
import re

Token = namedtuple('Token', ['start', 'end', 'text'])
AnnotatedToken = namedtuple('AnnotatedToken', ['start', 'end', 'text', 'label'])

examples = []
for doc in docs.itertuples():
    doc_annotations = annotations[(annotations.docId == doc.docId) & (annotations.annotType == 'Quantity')]
#     print(doc)
#     print(doc_annotations)
    
    # get rows containing 'unit' in 'other' column
    idx_list = []
    for index, row in doc_annotations.reset_index(drop=True).iterrows():
        if row['other'].get('unit') != None:
            idx_list.append(index)
    doc_annotations = doc_annotations.iloc[idx_list]
#     print(doc_annotations)

#     # add column 'sentIdx' representing the index of a sentence where each quantity of the doc belongs to
#     sent_idx = []
#     for index, row in doc_annotations.iterrows():
#         # check whether the current quantity is within the range of the current sentence span
#         for i in range(len(doc.texts)):
#             sent = doc.texts[i]
#             isInRange = set(range(row.startOffset, row.endOffset)).intersection(set(range(sent.start, sent.end)))
#             if isInRange:
#                 sent_idx.append(i)
#     doc_annotations.loc[:, 'sentIdx'] = sent_idx
    
    # add columns 'unitStart' and 'unitEnd' in doc_annotations to mark the span of units
    unit_start = []
    for index, row in doc_annotations.iterrows():
        if re.compile(row['other'].get('unit')).search(row.text) != None:
            # 'unit' span is within the range of 'quantity' span: append its start offset
            s = re.compile(row['other'].get('unit')).search(row.text).span()[0] + row.startOffset
            unit_start.append(s)
        else:
            # 'unit' span is not within the range of 'quantity' span: ignore it 
            unit_start.append(-1)

            print(row)
            print(doc)
            print(doc_annotations)
    doc_annotations.loc[:, 'unitStart'] = unit_start
        
    # !!!!!!!!!
    unit_end = []
    for index, row in doc_annotations.iterrows():
        if re.compile(row['other'].get('unit')).search(row.text) != None:
            e = re.compile(row['other'].get('unit')).search(row.text).span()[1] + row.startOffset
            unit_end.append(e)
        else:
            unit_end.append(-1)
    doc_annotations.loc[:, 'unitEnd'] = unit_end
    #!!!!!!!!!
    
    # labeling each sentence
    annotated_sent = []
    for sent in doc.texts:
        # get the span of each token within the current sentence
        tokens = [
            Token(sent.start + start, sent.start + end, token)
            for (start, end), token in zip(
                tokenizer.span_tokenize(sent.text),
                tokenizer.tokenize(sent.text)
            )
        ]
        
#         print(sent)
#         for t in tokens:
#             print(t)
            
        # retrieve labels for each sentence
        AnnotatedSpan = namedtuple('AnnotatedSpan', ['start', 'end', 'label'])
        df = doc_annotations[(doc_annotations.unitStart >= sent.start) & (doc_annotations.unitEnd <= sent.end)]
        sent_labels = [AnnotatedSpan(r.unitStart, r.unitEnd, 'unit') for r in df.itertuples()]
#         for l in sent_labels:
#             print("sent_labels", l)
            
        annotated_tokens = []
        for token in tokens:
            # initialise the annotation tag for the current token as 'O'
            tag = 'O'
            #!!!!!!!!
            for sl in sent_labels:
                # check whether the current token is within the range of the current annotated span
                isInCurrRange = set(range(token.start, token.end)).intersection(set(range(sl.start, sl.end)))
                if isInCurrRange:
                    # choose not to use BIO tag due to some corner examples
                    # unit is within the span of a token (e.g. quantity='5≤2θ/°≤80', unit='°')
                    tag = 'unit'
#                     # check whether it's in the beginning (B) of the span
#                     if token.start == sl.start:
#                         tag = 'B-' + sl.label
#                     # otherwise, it's in the internal (I) of the span
#                     else:
#                         tag = 'I-' + sl.label
            #!!!!!!!!
            # append the annotated tag for the current token
            annotated_tokens.append(AnnotatedToken(token.start, token.end, token.text, tag))
        
#         for a in annotated_tokens:
#             print('annotated_tokens', a)
        
        if len(annotated_tokens) == len(tokens):
            annotated_sent.append(annotated_tokens)
    
#     for anns in annotated_sent:
#         for annt in anns:
#             print('annotated_tokens', annt)
            
    example = {
        'docId': doc.docId,
        'sentAnnot': annotated_sent
    }
    examples.append(example)

cnt 0
docId          S0016236113008041-3171
annotSet                            2
annotType                    Quantity
startOffset                       325
endOffset                         327
annotId                        T102-2
text                               22
other                 {'unit': 'ppm'}
Name: 238, dtype: object
Pandas(Index=17, docId='S0016236113008041-3171', text='Fig. 6 shows the concentrations of minor elements in the solid sorbent, where increasing bed inventory resulted in increasing values observed for Co, Ni, Cu, Mo, Cd, and Sn. In the case of Cu and Sn, values were obtained that were lower than that of the blank for a bed inventory of 4.5 kg, which then increased to values of 22 and 0.22 ppm respectively for 13 kg. In the case of Gd, Dy and U, small decreases in concentration were found for an increasing inventory, however values are low at <1 ppm and the changes observed are small. The remaining elements (Zr, Le, Ce, Pt, and Nd) recorded an increase in co

docId          S0016236113008041-3290
annotSet                            2
annotType                    Quantity
startOffset                       203
endOffset                         207
annotId                         T52-2
text                             0.63
other                 {'unit': 'ppm'}
Name: 332, dtype: object
Pandas(Index=21, docId='S0016236113008041-3290', text='Fig. 10 shows how gaseous major element concentrations were affected by flue gas SO2 concentration. Concentrations of all were very low at <1 ppm, with Fe and Si present in the highest concentrations at 0.63 and 0.51 ppm respectively. For several elements, 1000 ppm SO2 resulted in either the highest concentrations, as was the case for Na, Al, and Si, or the lowest concentrations, as was the case for Fe, when compared to 0 and 2000 ppm SO2.', texts=[Sentence(start=0, end=4, text='Fig.'), Sentence(start=5, end=99, text='10 shows how gaseous major element concentrations were affected by flue gas SO2 concentratio

In [226]:
examples

[{'docId': 'S0012821X12004384-1302',
  'sentAnnot': [[AnnotatedToken(start=0, end=14, text='Correspondence', label='O'),
    AnnotatedToken(start=15, end=23, text='analysis', label='O'),
    AnnotatedToken(start=24, end=25, text='(', label='O'),
    AnnotatedToken(start=25, end=27, text='CA', label='O'),
    AnnotatedToken(start=27, end=28, text=')', label='O'),
    AnnotatedToken(start=29, end=32, text='and', label='O'),
    AnnotatedToken(start=33, end=44, text='statistical', label='O'),
    AnnotatedToken(start=45, end=54, text='diversity', label='O'),
    AnnotatedToken(start=55, end=63, text='analysis', label='O'),
    AnnotatedToken(start=64, end=68, text='were', label='O'),
    AnnotatedToken(start=69, end=76, text='carried', label='O'),
    AnnotatedToken(start=77, end=80, text='out', label='O'),
    AnnotatedToken(start=81, end=83, text='on', label='O'),
    AnnotatedToken(start=84, end=87, text='the', label='O'),
    AnnotatedToken(start=88, end=101, text='palynological', lab

In [227]:
# write the annotated NER training data into a file
f = open("trial.txt", "w", encoding='UTF-8')

for example in examples:
    annotated_sent = example['sentAnnot']
    for sent in annotated_sent:
        for token in sent:
            f.write(token.text + '\t' + token.label + '\n')
        f.write('\n')

f.close()

## Useless 

In [100]:
# sentenizer.span_tokenize()
# x = 'Five samples from below the CIE at 2619.60, 2617.35, 2617.44, 2614.73, and 2614.71 m (indicated in Fig.'
# print(sentenizer.span_tokenize(x))
# l = []
# for (start, end), text in zip(sentenizer.span_tokenize(x), sentenizer.tokenize(x)):
#     l.append(Sentence(start, end, text))
# l
# docs.loc[:, 'texts_unit'] = docs.text.apply(
#     lambda x: [
#         Sentence(start, end, text) for (start, end), text in zip(
#             sentenizer.span_tokenize(x), sentenizer.tokenize(x)
#         )
#     ]
# )

# for sent in doc.texts:
#     tokens = [
#         Token(sent.start + start, sent.start + end, token)
#         for (start, end), token in zip(
#             tokenizer.span_tokenize(sent.text),
#             tokenizer.tokenize(sent.text)
#         )
#     ]
unit = "m per second"
quantity = "10 m per second m"
sent_text = "the speed of the car is 10 m per second m"

tk_unit = tokenizer.tokenize(unit)
print(tk_unit)

q_tokens = [
        Token(start, end, token)
        for (start, end), token in zip(
            tokenizer.span_tokenize(quantity),
            tokenizer.tokenize(quantity)
        )
    ]

mark = -1
cnt = len(tk_unit)
cnt2 = 0
# for t_u in tk_unit:
#     flag = 0
#     for t_q in q_tokens:
#         if t_u == t_q.text:
#             cnt2 += 1
#             flag = 1
#         else:
#             continue
position = 0
flag = 0
match = False
for i in range(len(q_tokens)):
    while match == False:
        for j in range(len(tk_unit)):
            if position == j:
                if q_tokens[i].text == tk_unit[j]:
                    # the current token in the quantity matchs the current token in the unit 
                    flag = 1
                    cnt2 += 1
                    position = j+1
                    match = True
                    break
                else:
                    # the current quantity token doesn't match the current unit token - not the exact match
                    # reset all parameters
                    flag = 0
                    cnt2 = 0
                    position = 0
    #             break
    if flag == 1 and cnt2 == cnt:
        mark = q_tokens[i-(cnt-1)].start
        break

# for (l, r), t in zip(tokenizer.span_tokenize(quantity),tokenizer.tokenize(quantity)):
#     q_tokens.append(Token(l, r, t))
#     for t_u in tk_unit:
#         if t_u == t:
#             cnt -= 1
#         else:
#             break
#     if cnt == 0:
#         mark = l
#         break

s = -1
e = -1
for i in range(len(q_tokens)):
    if q_tokens[i].start == mark:
        s = mark
        end_tk = q_tokens[i+cnt-1]
        e = end_tk.end
print(s)
print(e)
print(q_tokens)

['m', 'per', 'second']
0
2
[Token(start=0, end=2, text='10'), Token(start=3, end=4, text='m'), Token(start=5, end=8, text='per'), Token(start=9, end=15, text='second'), Token(start=16, end=17, text='m')]


In [84]:
annotations

Unnamed: 0,docId,annotSet,annotType,startOffset,endOffset,annotId,text,other
0,S0012821X12004384-1302,1,Quantity,553,557,T91-1,Five,{'mods': ['IsCount']}
1,S0012821X12004384-1302,1,MeasuredEntity,558,565,T101-1,samples,{'HasQuantity': 'T91-1'}
2,S0012821X12004384-1302,2,Quantity,588,637,T12-2,"2619.60, 2617.35, 2617.44, 2614.73, and 2614.71 m","{'mods': ['IsList'], 'unit': 'm'}"
3,S0012821X12004384-1302,2,MeasuredEntity,558,565,T102-2,samples,{'HasQuantity': 'T12-2'}
4,S0012821X12004384-1302,2,Qualifier,571,584,T152-2,below the CIE,{'Qualifies': 'T102-2'}
...,...,...,...,...,...,...,...,...
827,S0022459611006116-1195,2,MeasuredProperty,192,201,T32-2,step size,{'HasQuantity': 'T42-2'}
828,S0022459611006116-1195,2,MeasuredEntity,139,152,T52-2,Initial scans,{'HasProperty': 'T32-2'}
829,S0022459611006116-1195,3,Quantity,230,236,T13-3,50 min,{'unit': 'min'}
830,S0022459611006116-1195,3,MeasuredProperty,217,226,T23-3,scan time,{'HasQuantity': 'T13-3'}


In [103]:
import re

unit = "m per second"
quantity = "10 m m per second m"
sent_text = "the speed of the car is 10 m per second m"

# stra = 'bread is good'
p = re.compile(unit)
result1 = p.search(quantity).span()
print(result1[0])
result2 = p.search(quantity).group()
print(result2)


5
m per second


In [102]:
quantity = "10 m m per second m"
q_tokens = [
        Token(start, end, token)
        for (start, end), token in zip(
            tokenizer.span_tokenize(quantity),
            tokenizer.tokenize(quantity)
        )
    ]
print(q_tokens)

[Token(start=0, end=2, text='10'), Token(start=3, end=4, text='m'), Token(start=5, end=6, text='m'), Token(start=7, end=10, text='per'), Token(start=11, end=17, text='second'), Token(start=18, end=19, text='m')]
