In [1]:
import glob
import pandas as pd
import numpy as np
import json
import pickle
import string
import copy
from collections import defaultdict
from collections import Counter

import spacy
import networkx as nx
# NOTE(review): machine-specific absolute path to a local spaCy model directory (passed to
# spacy.load below); it will not resolve on another machine and must be adjusted there.
model_dir = '/Users/talhindi/miniconda3/lib/python3.7/site-packages/en_core_web_sm/en_core_web_sm-2.1.0'

## Reading Data

In [2]:
# Semicolon-separated split table; its SET column is compared against "TRAIN" in later cells
# and is zipped with the essays, so its row order is assumed to match the sorted essay files.
train_test_split = pd.read_csv('../data/SG2017/train-test-split.csv', sep=';')

In [3]:
# Essay texts: one list of raw lines per essay, in sorted filename order.
# Files are read inside `with` blocks so every handle is closed deterministically.
essays_txt_prg_list = []
for file in sorted(glob.glob("../data/SG2017/*.txt")):
    with open(file) as handle:
        essays_txt_prg_list.append(handle.readlines())

# The same essays as single strings (lines concatenated unchanged, newlines kept).
essay_txt_str = [''.join(essay_lines) for essay_lines in essays_txt_prg_list]

# Annotation files: one list of raw lines per essay, in sorted filename order.
essays_ann = []
for file in sorted(glob.glob("../data/SG2017/*.ann")):
    with open(file) as handle:
        essays_ann.append(handle.readlines())

In [4]:
# For every essay, turn the annotation lines whose first character is 'T' into
# (label, start, end, text) tuples and keep them ordered by start offset.
essays_segments = []

for ann_lines in essays_ann:
    parsed = []

    for ann_line in ann_lines:
        if ann_line[0] != 'T':
            continue
        # Tab-separated fields: annotation id, "label start end", covered text.
        _, span_spec, covered_text = ann_line.rstrip().split('\t')
        label, start, end = span_spec.split()
        parsed.append((label, int(start), int(end), covered_text))

    essays_segments.append(sorted(parsed, key=lambda seg: seg[1]))

## Labels

In [9]:
def get_labels(essay_spacy, segments):
    '''Label every token of a parsed essay as 'Arg-B', 'Arg-I' or 'O'.

    Parameters
    ----------
    essay_spacy : sized iterable of tokens exposing ``.idx`` (character offset),
        ``.text`` and ``len(token)`` (e.g. a spaCy Doc).
    segments : list of ``(label, start, end, text)`` tuples with character offsets.

    Returns
    -------
    (tokens, labels) : two lists of equal length (one entry per token).
        * 'Arg-B' - the token starts exactly at a segment's start offset;
        * 'Arg-I' - the token starts after a segment's start and ends at or
          before that segment's end;
        * 'O'     - everything else.

    Raises
    ------
    AssertionError
        If an 'Arg-B' token's text is not contained in the text of the (first)
        segment starting at its offset, or if the label count differs from the
        number of tokens.
    '''
    doc_len = len(essay_spacy)

    # Start offset -> text of the first segment starting there (same segment the
    # original list.index() lookup selected), giving O(1) membership tests.
    first_text_by_start = {}
    for _, start, _, text in segments:
        first_text_by_start.setdefault(start, text)

    labels = []
    tokens = []

    for token in essay_spacy:
        if token.idx in first_text_by_start:
            label = 'Arg-B'
            assert token.text in first_text_by_start[token.idx]
        elif any(start < token.idx and token.idx + len(token) <= end
                 for _, start, end, _ in segments):
            # any() stops at the first enclosing segment, so a token covered by
            # several overlapping segments is labeled exactly once.
            label = 'Arg-I'
        else:
            label = 'O'

        labels.append(label)
        tokens.append(token.text)

    assert len(labels) == doc_len
    return tokens, labels

## Spacy

In [6]:
# Load the spaCy pipeline once and parse every essay string with it.
nlp = spacy.load(model_dir)

essay_spacy = [nlp(essay_text) for essay_text in essay_txt_str]

In [7]:
# Count how often each label occurs, separately for the TRAIN essays and all other essays.
# (original note: without new lines)
token_labels = []
train_BIO = defaultdict(int)
test_BIO = defaultdict(int)

for doc, segments, group in zip(essay_spacy, essays_segments, train_test_split.SET):
    tokens, labels = get_labels(doc, segments)

    # Pick the counter for this essay's split once, then tally its labels.
    split_counts = train_BIO if group == "TRAIN" else test_BIO
    for label in labels:
        split_counts[label] += 1

train_BIO,test_BIO

(defaultdict(int, {0.0: 39617, 1.0: 4823, 2.0: 75312}),
 defaultdict(int, {0.0: 9801, 1.0: 1266, 2.0: 18748}))

## Probability Feature

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [72]:
def train_vectorizer(essay_spacy, essays_segments, train_test_split, labeling_function, B_labels='Arg-B'):
    '''Fit the n-gram vocabulary behind the "probability" feature.

    The feature is meant to capture how likely the current token t_i is to be
    the beginning of an argument component ("Arg-B") given its preceding tokens
    (up to 3 previous tokens), estimated from the training data only. This
    function collects, for every training token labeled ``B_labels``, the
    space-joined text of its (up to) three preceding tokens and fits a 1- to
    3-gram ``CountVectorizer`` on those contexts.

    Parameters
    ----------
    essay_spacy : iterable of parsed essays, passed to ``labeling_function``.
    essays_segments : iterable of per-essay segment lists, aligned with ``essay_spacy``.
    train_test_split : iterable of split names aligned with ``essay_spacy``;
        only essays whose entry equals 'TRAIN' are used.
    labeling_function : callable ``(essay, segments) -> (tokens, labels)``.
    B_labels : label value marking the beginning of an argument component.

    Returns
    -------
    The fitted ``CountVectorizer``.
    '''
    argB_train_segments = []
    for essay, segments, group in zip(essay_spacy, essays_segments, train_test_split):
        if group != 'TRAIN':
            # Only training essays contribute, so skip labeling the rest.
            continue

        tokens, labels = labeling_function(essay, segments)

        for i, (t, l) in enumerate(zip(tokens, labels)):
            if l == B_labels:
                # Truncate the window at the essay start. Plain tokens[i-3] would
                # wrap around to the END of the essay for i < 3 (negative index),
                # unlike the contexts built in get_probability_features.
                argB_train_segments.append(' '.join(tokens[max(0, i - 3):i]))

    vect = CountVectorizer(ngram_range=(1,3))
    vect.fit(argB_train_segments)

    return vect
        

def get_probability_features(doc, vectorizer):
    '''Compute one feature dict per token of ``doc``.

    For each token, the texts of its (up to) three preceding tokens are joined
    with spaces (the empty string for the first token) and passed through
    ``vectorizer.transform``. The feature value is the number of non-zero
    entries in the resulting row divided by the number of columns, i.e. the
    share of the vectorizer's features that occur in the preceding context.

    Returns a list of ``{'probability_feature': value}`` dicts, in token order.
    '''
    features = []
    for position, _token in enumerate(doc):
        # Window of at most three preceding tokens, truncated at the document start.
        window_start = max(0, position - 3)
        prev_context = ' '.join(doc[j].text for j in range(window_start, position))

        row = vectorizer.transform([prev_context])[0]
        matched = row.count_nonzero()
        features.append({'probability_feature': matched / row.shape[1]})

    return features

In [73]:
vectorizer = train_vectorizer(essay_spacy, essays_segments, train_test_split.SET, get_labels)

# Write one JSON object per token ({"y": label, "x": features, "id": running token id})
# to the train or test file, depending on the essay's split.
# Both files are opened once in 'w' mode (truncating previous content) and closed by the
# `with` block; previously two handles were opened for truncation and never closed, and the
# files were re-opened in append mode for every essay.
train_path = '../features/SG2017_train/probability.jsonlines'
test_path = '../features/SG2017_test/probability.jsonlines'

token_id = 0
with open(train_path, 'w') as train_file, open(test_path, 'w') as test_file:
    for doc, segments, group in zip(essay_spacy, essays_segments, train_test_split.SET):

        features = get_probability_features(doc, vectorizer)
        tokens, labels = get_labels(doc, segments)

        out_file = train_file if group == "TRAIN" else test_file
        for f, l in zip(features, labels):
            # Serialize the whole record with json.dumps so string labels such as
            # 'Arg-B' are quoted; formatting them in by hand produced invalid JSON.
            out_file.write(json.dumps({"y": l, "x": f, "id": token_id}) + '\n')
            token_id += 1

# Archive

In [51]:
# Collect the preceding-token contexts of every token:
#   argB_train_segments / argB_test_segments - trigram context of Arg-B tokens, by split
#   other_test_segments                      - trigram context of non-Arg-B tokens in TEST essays
#   all_segments                             - [unigram, bigram, trigram] contexts of every token
# (An alternative kept from the original as a note: store the [unigram, bigram, trigram]
#  lists, instead of the trigram string, in the two test lists.)
argB_train_segments, argB_test_segments, other_test_segments, all_segments = [], [], [], []

for essay, segments, group in zip(essay_spacy, essays_segments, train_test_split.SET):
    tokens, labels = get_labels(essay, segments)

    for i, (t, l) in enumerate(zip(tokens, labels)):
        # Space-joined texts of the one, two and three tokens before position i.
        prev_unigram = tokens[i-1]
        prev_bigram = ' '.join([tokens[i-2], tokens[i-1]])
        prev_trigram = ' '.join([tokens[i-3], tokens[i-2], tokens[i-1]])

        if l == 'Arg-B':
            target = argB_train_segments if group == 'TRAIN' else argB_test_segments
            target.append(prev_trigram)
        elif group == 'TEST':
            other_test_segments.append(prev_trigram)

        all_segments.append([prev_unigram, prev_bigram, prev_trigram])


In [55]:
# Fit a 1- to 3-gram CountVectorizer on the training Arg-B contexts; the resulting
# document-term matrix is the value displayed by this cell.
vect = CountVectorizer(ngram_range=(1,3))
vect.fit_transform(argB_train_segments)

<4823x4698 sparse matrix of type '<class 'numpy.int64'>'
	with 13942 stored elements in Compressed Sparse Row format>

In [42]:
# Distribution of the number of vocabulary n-grams found in each training Arg-B context.
# `train_grams` was not defined anywhere in the notebook (it only existed in an earlier
# kernel session), so it is rebuilt here from the fitted vectorizer.
train_grams = vect.transform(argB_train_segments)
vec_counts = [row.count_nonzero() for row in train_grams]

Counter(vec_counts)

Counter({6: 619, 3: 3006, 1: 1195, 5: 3})

In [48]:
# For every Arg-B test context, sum the non-zero entries over all rows that
# vect.transform returns for it, then tabulate how often each total occurs.
# NOTE(review): prev_tokens is handed to vect.transform unwrapped, so each element of
# argB_test_segments presumably has to be a list of strings (the [unigram, bigram, trigram]
# variant) rather than a single joined string — confirm before re-running this cell.
argB_test_vec_counts = []
for prev_tokens in argB_test_segments:
    grams = vect.transform(prev_tokens)
    counts = 0
    for gram in grams:
        counts += gram.count_nonzero()
    
    argB_test_vec_counts.append(counts)

Counter(argB_test_vec_counts).most_common()

[(4, 357),
 (2, 286),
 (3, 242),
 (1, 128),
 (10, 71),
 (7, 56),
 (0, 48),
 (8, 28),
 (5, 22),
 (6, 21),
 (9, 7)]

In [50]:
# For every non-Arg-B test context, sum the non-zero entries over all rows that
# vect.transform returns for it, then tabulate how often each total occurs.
# NOTE(review): prev_tokens is handed to vect.transform unwrapped, so each element of
# other_test_segments presumably has to be a list of strings (the [unigram, bigram, trigram]
# variant) rather than a single joined string — confirm before re-running this cell.
other_test_vec_counts = []
for prev_tokens in other_test_segments:
    grams = vect.transform(prev_tokens)
    counts = 0
    for gram in grams:
        counts += gram.count_nonzero()
    
    other_test_vec_counts.append(counts)

Counter(other_test_vec_counts).most_common()

[(6, 6269),
 (4, 5265),
 (3, 5250),
 (5, 3633),
 (7, 2122),
 (2, 1824),
 (1, 1813),
 (8, 1233),
 (0, 758),
 (9, 288),
 (10, 94)]

In [53]:
# Number of vocabulary n-grams matched by each Arg-B test context (one joined string per
# context, wrapped in a list so it is treated as a single document).
argB_test_vec_counts = [
    vect.transform([context])[0].count_nonzero()
    for context in argB_test_segments
]

Counter(argB_test_vec_counts).most_common()

[(1, 448), (3, 374), (2, 284), (6, 71), (0, 48), (4, 34), (5, 7)]

In [54]:
# Number of vocabulary n-grams matched by each non-Arg-B test context (one joined string per
# context, wrapped in a list so it is treated as a single document).
other_test_vec_counts = [
    vect.transform([context])[0].count_nonzero()
    for context in other_test_segments
]

Counter(other_test_vec_counts).most_common()

[(2, 11425), (3, 7964), (1, 5585), (4, 2435), (0, 758), (5, 288), (6, 94)]

In [47]:
# Total number of matched vocabulary n-grams per token, summed over the token's
# [unigram, bigram, trigram] preceding contexts stored in all_segments.
# The computation had been commented out, which left `all_vec_counts` undefined on a
# fresh kernel; it is restored so the cell runs on its own.
all_vec_counts = []
for prev_tokens in all_segments:
    grams = vect.transform(prev_tokens)
    counts = 0
    for gram in grams:
        counts += gram.count_nonzero()

    all_vec_counts.append(counts)

Counter(all_vec_counts).most_common()

[(6, 32687),
 (4, 29488),
 (3, 25094),
 (5, 18123),
 (7, 11738),
 (2, 9744),
 (1, 8889),
 (8, 7688),
 (0, 3358),
 (9, 1550),
 (10, 1208)]

In [None]:
# Build the inputs for the n-gram language-model experiment from the TRAIN essays:
#   argB_segments  - [t-3, t-2, t-1, 'Arg-B'] for every Arg-B token
#   other_segments - tokens[i-3:i] for every other token
#   vocab          - set of all TRAIN token texts
# Every Arg-B token (of any split) is also printed together with its three preceding
# tokens and their labels.
argB_segments, other_segments, vocab = [], [], []
for essay, segments, group in zip(essay_spacy, essays_segments, train_test_split.SET):
    tokens, labels = get_labels(essay, segments)
    in_train = group == 'TRAIN'

    for i, (t, l) in enumerate(zip(tokens, labels)):
        if l != 'Arg-B':
            # Arg-I and O tokens only contribute when they come from a TRAIN essay.
            if in_train:
                other_segments.append(tokens[i-3:i])
                vocab.append(t)
            continue

        prev_tokens = [tokens[i-3], tokens[i-2], tokens[i-1]]
        prev_labels = [labels[i-3], labels[i-2], labels[i-1]]
        print('Previous Tokens --> {}:{}  {}:{}  {}:{}'.format(
            prev_labels[0], repr(prev_tokens[0]),
            prev_labels[1], repr(prev_tokens[1]),
            prev_labels[2], repr(prev_tokens[2])))
        print(group,'{}: {}'.format(l, t))
        print()

        if in_train:
            argB_segments.append(prev_tokens + ['Arg-B'])
            vocab.append(t)

vocab = set(vocab)

In [152]:
from nltk.util import everygrams
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE

# Order of the language model: each training sequence is three preceding tokens plus the
# literal 'Arg-B' marker, i.e. four items.
n = 4
train_data, padded_sents = padded_everygram_pipeline(n, argB_segments)

model = MLE(n) # Train a maximum likelihood estimation language model of order n (4 here, not 3).
model.fit(train_data, padded_sents)

In [159]:
# Score the 'Arg-B' marker given the three preceding tokens for every token, and file the
# score by split (TRAIN vs. everything else) and by whether the token really is Arg-B.
argb_train_scores, other_train_scores = [], []
argb_test_scores, other_test_scores = [], []

for essay, segments, group in zip(essay_spacy, essays_segments, train_test_split.SET):
    tokens, labels = get_labels(essay, segments)
    for i, (t, l) in enumerate(zip(tokens, labels)):
        score = model.score('Arg-B', (tokens[i-3], tokens[i-2], tokens[i-1]))

        if group == 'TRAIN':
            bucket = argb_train_scores if l == 'Arg-B' else other_train_scores
        else:
            bucket = argb_test_scores if l == 'Arg-B' else other_test_scores
        bucket.append(score)

In [143]:
# (number of non-Arg-B training tokens with a positive score, total number of such tokens)
sum(1 for score in other_train_scores if score > 0), len(other_train_scores)

(2617, 114929)

In [144]:
# (number of Arg-B test tokens with a positive score, total number of such tokens)
sum(1 for score in argb_test_scores if score > 0), len(argb_test_scores)

(620, 1266)

In [145]:
# (number of non-Arg-B test tokens with a positive score, total number of such tokens)
sum(1 for score in other_test_scores if score > 0), len(other_test_scores)

(604, 28549)

In [158]:
# NOTE(review): mle_train_segments is only assigned in commented-out code in this notebook,
# so this cell appears to rely on state from an earlier kernel session — confirm whether it
# should be kept or its input restored.
ngrams = everygrams(mle_train_segments, max_len=3)

model = MLE(n) # MLE language model of order n (n is set to 4 in an earlier cell, not 3).
model.fit(ngrams, vocab)

In [None]:
# Inspect what padded_everygram_pipeline produces for the Arg-B training sequences:
# print the everygrams of each sequence, then display the flattened padded token stream.
train_data, padded_sents = padded_everygram_pipeline(4, argB_segments)

for sentence_ngrams in train_data:
    print(list(sentence_ngrams), end='\n\n')
print('#############')
list(padded_sents)