In [1]:
import glob
import pandas as pd
from nltk.tokenize import word_tokenize

## Reading Files

In [2]:
train_test_split = pd.read_csv('../data/PE/train-test-split.csv', sep=';')

In [3]:
essays_txt = []
for file in sorted(glob.glob("../data/PE/*.txt")):
    essay = open(file).readlines()
    essays_txt.append(essay)
    
essays_ann = []
for file in sorted(glob.glob("../data/PE/*.ann")):
    essay = open(file).readlines()
    essays_ann.append(essay)
    
len(glob.glob("../data/PE/*.txt")), len(glob.glob("../data/PE/*.ann"))

(402, 402)

### Getting Arg Segments Boundaries

In [4]:
essays_segments = []

for essay in essays_ann:    
    segments = []
    
    for line in essay:
        if line[0] == 'T':
            _, label_s_e, text = line.rstrip().split('\t')
            label, start, end = label_s_e.split()
            segments.append((label, int(start), int(end), text))
            
    segments.sort(key = lambda element : element[1])
    essays_segments.append(segments)

### Getting BIO segments for each essay

In [5]:
essays_BIO = []
for text, segments in zip(essays_txt, essays_segments):
    B_I, O = [], []
    text_str = ''.join(text)
    
    #first non Arg segment
    assert segments[0][1] != 0
    O.append(text_str[:segments[0][1]])
    
    # looping through arg segments of an essay
    for i, seg in enumerate(segments):
        _, start, end, seg_text = seg
        assert text_str[start:end] == seg_text
        
        B_I.append(text_str[start:end])
        
        # O text segment starts from end of this Arg segment till start of next Arg segment
        '''TODO: I need to check for cases where there are two adjacent Arg segments'''
        if i+1 < len(segments):
            O.append(text_str[end: segments[i+1][1]])
    
    #last non Arg segment if exists
    if segments[-1][2] < len(text_str):
        O.append(text_str[segments[-1][2]:])
    
    essays_BIO.append([B_I, O])

### Count BIO tokens in each essay using multiple tokenization teqchniques  (srt.split, nltk, stanford)

In [8]:
BIO_counts = []
for BI, O in essays_BIO:
    B_count = len(BI)
    
    I_count = 0
    for text in BI:
#         I_count += len(text.split()) - 1
        I_count += len(word_tokenize(text)) - 1
    
    O_count = 0
    for text in O:
#         O_count += len(text.split())
        O_count += len(word_tokenize(text))
    
    BIO_counts.append((B_count, I_count, O_count))

In [11]:
# special handling of counting '\n' needed?
new_line_counts = []
for BI, O in essays_BIO:
    new_line = 0
    for text in O:
        new_line += len([c for c in text if c == '\n'])
    new_line_counts.append(new_line)
    

#### Stanford's CoreNLP

In [6]:
import os
os.environ["CORENLP_HOME"] = '/home/research/interns/talhindi/tools/stanford-corenlp-4.0.0'

import stanza
from stanza.server import CoreNLPClient
client = CoreNLPClient(annotators=['tokenize'], timeout=30000, memory='16G')
client.start()

Starting server with command: java -Xmx16G -cp /home/research/interns/talhindi/tools/stanford-corenlp-4.0.0/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 30000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-2e1aa8ee117243b4.props -preload tokenize


In [14]:
# saving stanford's counts because they are heavy to compute
BIO_counts_stanford = []
for BI, O in essays_BIO:
    B_count = len(BI)
    
    I_count = 0
    for text in BI:
        ann = client.annotate(text)
        sent_tokens = 0
        for sent in ann.sentence:
            sent_tokens += len(sent.token)
        I_count += sent_tokens + len(ann.sentencelessToken) 
    
    O_count = 0
    for text in O:
        ann = client.annotate(text)
        sent_tokens = 0
        for sent in ann.sentence:
            sent_tokens += len(sent.token)
        O_count += sent_tokens + len(ann.sentencelessToken) 
        
    
    BIO_counts_stanford.append((B_count, I_count, O_count))

In [None]:
client.stop()

### Counting Total number of Segments from each type in Training and Test sets

In [10]:
# nltk
train_B, train_I, train_O = 0, 0, 0
test_B, test_I, test_O = 0, 0, 0

for group, counts in zip(train_test_split.SET, BIO_counts):
    if group == "TRAIN":
        train_B += counts[0]
        train_I += counts[1]
        train_O += counts[2]
    else:
        test_B += counts[0]
        test_I += counts[1]
        test_O += counts[2]

print('\tB\t  I\t  O')        
print('train: {}\t{}\t{}'.format(train_B, train_I, train_O))
print('test:  {}\t{}\t{}'.format(test_B, test_I, test_O))

	B	  I	  O
train: 4823	74716	37937
test:  1266	18565	9375


In [12]:
# nltk without 
train_B, train_I, train_O = 0, 0, 0
test_B, test_I, test_O = 0, 0, 0

for group, counts, nl_counts in zip(train_test_split.SET, BIO_counts, new_line_counts):
    if group == "TRAIN":
        train_B += counts[0]
        train_I += counts[1]
        train_O += counts[2] + nl_counts
    else:
        test_B += counts[0]
        test_I += counts[1]
        test_O += counts[2]  + nl_counts

print('\tB\t  I\t  O')        
print('train: {}\t{}\t{}'.format(train_B, train_I, train_O))
print('test:  {}\t{}\t{}'.format(test_B, test_I, test_O))

	B	  I	  O
train: 4823	74716	39810
test:  1266	18565	9841


In [114]:
# corenlp, only sentencelesstokens
train_B, train_I, train_O = 0, 0, 0
test_B, test_I, test_O = 0, 0, 0

for group, counts, nl_counts in zip(train_test_split.SET, BIO_counts_stanford, new_line_counts):
    if group == "TRAIN":
        train_B += counts[0]
        train_I += counts[1]
        train_O += counts[2] + nl_counts
    else:
        test_B += counts[0]
        test_I += counts[1]
        test_O += counts[2]  + nl_counts

print('\tB\t  I\t  O')        
print('train: {}\t{}\t{}'.format(train_B, train_I, train_O))
print('test:  {}\t{}\t{}'.format(test_B, test_I, test_O))

	B	  I	  O
train: 4823	75220	34765
test:  1266	18680	8519


In [15]:
# core
train_B, train_I, train_O = 0, 0, 0
test_B, test_I, test_O = 0, 0, 0

for group, counts in zip(train_test_split.SET, BIO_counts_stanford):
    if group == "TRAIN":
        train_B += counts[0]
        train_I += counts[1]
        train_O += counts[2]
    else:
        test_B += counts[0]
        test_I += counts[1]
        test_O += counts[2]

print('\tB\t  I\t  O')        
print('train: {}\t{}\t{}'.format(train_B, train_I, train_O))
print('test:  {}\t{}\t{}'.format(test_B, test_I, test_O))

	B	  I	  O
train: 4823	80043	38037
test:  1266	19946	9399


In [16]:
train_B, train_I, train_O = 0, 0, 0
test_B, test_I, test_O = 0, 0, 0

for group, counts, nl_counts in zip(train_test_split.SET, BIO_counts_stanford, new_line_counts):
    if group == "TRAIN":
        train_B += counts[0]
        train_I += counts[1]
        train_O += counts[2] + nl_counts
    else:
        test_B += counts[0]
        test_I += counts[1]
        test_O += counts[2]  + nl_counts

print('\tB\t  I\t  O')        
print('train: {}\t{}\t{}'.format(train_B, train_I, train_O))
print('test:  {}\t{}\t{}'.format(test_B, test_I, test_O))

	B	  I	  O
train: 4823	80043	39910
test:  1266	19946	9865
