In [35]:
import os
import glob

## Reading from merged essay files

In [29]:
def token_to_sent_labels(file):
    """Group token-per-line annotations into labelled sentences.

    Every non-blank line is expected to hold exactly two whitespace-separated
    fields, ``token label``. A blank line ends the current sentence.

    Args:
        file: Iterable of text lines (e.g. the list returned by ``readlines()``).

    Returns:
        A list of ``(sentence, flag)`` tuples in input order. ``sentence`` is
        the sentence's tokens joined by single spaces. ``flag`` is 1 when at
        least one token label is ``'Arg-I'``, ``'I-claim'`` or ``'I-premise'``,
        and 0 otherwise.

    Notes:
        * A line that does not split into exactly two fields is printed with
          its index and then skipped, so it never contributes a token.
        * A final sentence that is not followed by a blank line is still
          returned.
        * Consecutive blank lines do not produce empty sentences.
    """
    inside_labels = ('Arg-I', 'I-claim', 'I-premise')

    labeled_sentences = []
    tokens, token_labels = [], []

    def _emit():
        # Nothing collected since the last boundary: there is no sentence.
        if not tokens:
            return
        flag = 1 if any(lab in inside_labels for lab in token_labels) else 0
        labeled_sentences.append((' '.join(tokens), flag))
        tokens.clear()
        token_labels.clear()

    for i, line in enumerate(file):
        fields = line.split()

        if not fields:
            # Blank (or whitespace-only) line marks a sentence boundary.
            _emit()
            continue

        if len(fields) != 2:
            # Report the malformed line and skip it instead of reusing the
            # token/label left over from the previous line.
            print(i, line)
            continue

        token, label = fields
        tokens.append(token)
        token_labels.append(label)

    # Input may end without a closing blank line; keep that last sentence.
    _emit()

    return labeled_sentences
        

In [30]:
# Read the merged train/test files under ../data/SG2017_tok/.
# `with` closes each handle as soon as its lines have been read.
path = '../data/SG2017_tok/'
with open(os.path.join(path, 'train.txt')) as handle:
    train = handle.readlines()
with open(os.path.join(path, 'test.txt')) as handle:
    test = handle.readlines()

# The split name is printed first so anything token_to_sent_labels prints
# for that split shows up underneath it.
print('train')
train_sent = token_to_sent_labels(train)
print('test')
test_sent = token_to_sent_labels(test)

train
test


In [31]:
# Read the merged train/test files under ../data/SG2017_claim/.
# `with` closes each handle as soon as its lines have been read.
path = '../data/SG2017_claim/'
with open(os.path.join(path, 'train.txt')) as handle:
    train = handle.readlines()
with open(os.path.join(path, 'test.txt')) as handle:
    test = handle.readlines()

# The split name is printed first so anything token_to_sent_labels prints
# for that split shows up underneath it.
print('train')
train_sent = token_to_sent_labels(train)
print('test')
test_sent = token_to_sent_labels(test)

train
test


## Reading from separated essay files

In [113]:
def essay_list_token_to_essay_list_sent(file_list):
    """Turn per-essay token rows into per-essay lists of labelled sentences.

    Each row is expected to hold four tab-separated fields:
    ``sent_id``, ``token_id``, ``token``, ``label``. Consecutive rows that
    share a ``sent_id`` form one sentence.

    Args:
        file_list: List of essays, each essay being a list of row strings.

    Returns:
        A list with one entry per essay. Each entry is a list of
        ``(sentence, flag)`` tuples in row order. ``sentence`` is the
        sentence's tokens joined by single spaces. ``flag`` is 1 when at least
        one kept token is labelled ``'Arg-I'``, ``'I-claim'`` or
        ``'I-premise'``, and 0 otherwise.

    Notes:
        * Rows that do not have exactly four fields are printed as
          ``file_id, row_index, fields`` and skipped; they never contribute a
          token.
        * ``'_NEW_LINE_'`` and ``'_NEW_LINE__NEW_LINE_'`` tokens are left out
          of the sentence text and of the label check wherever they occur in
          a sentence, including at its start.
        * The first sentence starts at the essay's first valid row, whatever
          its ``sent_id`` is.
        * The sentence still open at the end of an essay is always emitted, so
          an essay without any valid row yields ``[('', 0)]`` and every essay
          entry is non-empty.
    """
    inside_labels = ('Arg-I', 'I-claim', 'I-premise')
    layout_tokens = ('_NEW_LINE__NEW_LINE_', '_NEW_LINE_')

    def _label_sentence(tokens, token_labels):
        # strip() keeps the text identical to the space-appended form when a
        # token is empty or carries its own leading whitespace.
        text = ' '.join(tokens).strip()
        flag = 1 if any(lab in inside_labels for lab in token_labels) else 0
        return (text, flag)

    labeled_sentences = []

    for file_id, file in enumerate(file_list):
        essay_labeled_sentences = []
        tokens, token_labels = [], []

        # No sentence has been opened yet for this essay.
        prev_sent_id = None

        for i, line in enumerate(file):
            fields = line.rstrip().split('\t')
            if len(fields) != 4:
                # Report and skip; do not reuse values from the previous row.
                print(file_id, i, fields)
                continue
            sent_id, _token_id, token, label = fields

            # A change of sentence id closes the sentence collected so far.
            if prev_sent_id is not None and sent_id != prev_sent_id:
                essay_labeled_sentences.append(_label_sentence(tokens, token_labels))
                tokens, token_labels = [], []
            prev_sent_id = sent_id

            if token not in layout_tokens:
                tokens.append(token)
                token_labels.append(label)

        # Close the sentence that was still open when the essay ended.
        essay_labeled_sentences.append(_label_sentence(tokens, token_labels))
        labeled_sentences.append(essay_labeled_sentences)

    return labeled_sentences


def essay_sent_list_to_merged_sent_list(file_sent_list, output_file, get_next_sent=False):
    """Flatten per-essay labelled sentences into TSV rows and write them out.

    Args:
        file_sent_list: List of essays, each a list of ``(sentence, label)``
            tuples.
        output_file: Path of the file to write. Its parent directory is
            created when it does not exist yet.
        get_next_sent: When False, each row is ``sentence<TAB>label``. When
            True, each row is ``sentence<TAB>next_sentence<TAB>label``, where
            ``next_sentence`` is the following sentence of the same essay and
            the last sentence of every essay is paired with the literal
            ``END_OF_ESSAY``.

    Returns:
        The list of row strings (each ending in a newline) in the order they
        were written.

    Notes:
        An essay without sentences contributes no rows in either mode.
    """
    labeled_sent = []

    for essay in file_sent_list:
        if not get_next_sent:
            labeled_sent.extend('{}\t{}\n'.format(sent, label) for sent, label in essay)
            continue

        # Nothing to pair up; also avoids indexing the last item of an empty list.
        if not essay:
            continue

        # Sentence i is paired with sentence i+1; the last one gets the marker.
        following = [next_sent for next_sent, _ in essay[1:]] + ['END_OF_ESSAY']
        for (sent, label), next_sent in zip(essay, following):
            labeled_sent.append('{}\t{}\t{}\n'.format(sent, next_sent, label))

    # dirname is '' for a bare file name, which makedirs would reject.
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(output_file, 'w') as writer:
        writer.writelines(labeled_sent)

    return labeled_sent
        

In [114]:
# Load every per-essay TSV, in sorted file-name order. The first line of each
# file is dropped (presumably a header row -- confirm against the data).
train_files = []
for file in sorted(glob.glob("../data/SG2017_tok/train/*.tsv")):
    with open(file) as handle:
        essay = handle.readlines()
    train_files.append(essay[1:])

test_files = []
for file in sorted(glob.glob("../data/SG2017_tok/test/*.tsv")):
    with open(file) as handle:
        essay = handle.readlines()
    test_files.append(essay[1:])

# Convert tokens to sentences once per split; both output formats below are
# written from the same sentence lists.
train_essay_sent = essay_list_token_to_essay_list_sent(train_files)
test_essay_sent = essay_list_token_to_essay_list_sent(test_files)

train_sent = essay_sent_list_to_merged_sent_list(train_essay_sent,
                                                '../data/SG2017_tok/sentences/train.tsv')
test_sent = essay_sent_list_to_merged_sent_list(test_essay_sent,
                                               '../data/SG2017_tok/sentences/test.tsv')

train_sent_next = essay_sent_list_to_merged_sent_list(train_essay_sent,
                                                      '../data/SG2017_tok/sentences_next/train.tsv',
                                                      get_next_sent=True)
test_sent_next = essay_sent_list_to_merged_sent_list(test_essay_sent,
                                                     '../data/SG2017_tok/sentences_next/test.tsv',
                                                     get_next_sent=True)

In [115]:
# Load every per-essay TSV, in sorted file-name order. The first line of each
# file is dropped (presumably a header row -- confirm against the data).
train_files = []
for file in sorted(glob.glob("../data/SG2017_claim/train/*.tsv")):
    with open(file) as handle:
        essay = handle.readlines()
    train_files.append(essay[1:])

test_files = []
for file in sorted(glob.glob("../data/SG2017_claim/test/*.tsv")):
    with open(file) as handle:
        essay = handle.readlines()
    test_files.append(essay[1:])

# Convert tokens to sentences once per split; both output formats below are
# written from the same sentence lists.
train_essay_sent = essay_list_token_to_essay_list_sent(train_files)
test_essay_sent = essay_list_token_to_essay_list_sent(test_files)

train_sent = essay_sent_list_to_merged_sent_list(train_essay_sent,
                                                '../data/SG2017_claim/sentences/train.tsv')
test_sent = essay_sent_list_to_merged_sent_list(test_essay_sent,
                                               '../data/SG2017_claim/sentences/test.tsv')

train_sent_next = essay_sent_list_to_merged_sent_list(train_essay_sent,
                                                      '../data/SG2017_claim/sentences_next/train.tsv',
                                                      get_next_sent=True)
test_sent_next = essay_sent_list_to_merged_sent_list(test_essay_sent,
                                                     '../data/SG2017_claim/sentences_next/test.tsv',
                                                     get_next_sent=True)