In [88]:
import glob
import pandas as pd
import json
import pickle
import string
import copy

import os
os.environ["CORENLP_HOME"] = '/Users/talhindi/Downloads/stanford-corenlp-4.0.0'
from collections import defaultdict
from collections import Counter

import stanza
from stanza.server import CoreNLPClient


In [242]:
# import CoreNLP_pb2
import corenlp

## Reading Data

In [2]:
train_test_split = pd.read_csv('../data/SG2017/train-test-split.csv', sep=';')

In [3]:
essays_txt_prg_list = []
for file in sorted(glob.glob("../data/SG2017/*.txt")):
    essay = open(file).readlines()
    essays_txt_prg_list.append(essay)

essay_txt_str = []
for essay in essays_txt_prg_list:
    essay_txt_str.append(''.join(essay))
    
essays_ann = []
for file in sorted(glob.glob("../data/SG2017/*.ann")):
    essay = open(file).readlines()
    essays_ann.append(essay)

In [4]:
essays_segments = []

for essay in essays_ann:    
    segments = []
    
    for line in essay:
        if line[0] == 'T':
            _, label_s_e, text = line.rstrip().split('\t')
            label, start, end = label_s_e.split()
            segments.append((label, int(start), int(end), text))
            
    segments.sort(key = lambda element : element[1])
    essays_segments.append(segments)

# Tokenization

In [None]:
client = CoreNLPClient(annotators=['tokenize', 'ssplit'], timeout=30000, memory='16G')
client.start()

In [6]:
essay_txt_str_tok, essays_txt_prg_list_tok = [], []

for essay_doc, essay_prgs in zip (essay_txt_str, essays_txt_prg_list):
    essay_txt_str_tok.append(client.annotate(essay_doc))

    prg_list_tok = []
    for prg in essay_prgs:
        prg_list_tok.append(client.annotate(prg))
    
    essays_txt_prg_list_tok.append(prg_list_tok)

In [7]:
client.stop()

In [None]:
# can't pickle!
pickle.dump(essay_txt_str_tok, open('../pkl/SG2017/essay_txt_str_tok.p', 'wb'))
pickle.dump(essays_txt_prg_list_tok, open('../pkl/SG2017/essays_txt_prg_list_tok.p', 'wb'))

## Labels

In [49]:
def get_labels(essay_doc_tok, segments):
    '''O = 0, Arg-B = 1, Arg-I = 2'''
    
    doc_len = sum([len(sent.token) for sent in essay_doc_tok.sentence])
    
    labels = []
    tokens = []
    arg_seg_starts = [start for arg_type, start, end, text in segments]
    
    for sent in essay_doc_tok.sentence:
        for token in sent.token:
            arg_I_token = False

            if token.beginChar in arg_seg_starts:
#                 labels.append('B')
                labels.append(1.0)
                tokens.append(token.word)
                assert token.word in segments[arg_seg_starts.index(token.beginChar)][-1]
            else:
                for _, start, end, _ in segments:
                    if token.beginChar > start and token.endChar <= end:
#                         labels.append('I')
                        labels.append(2.0)
                        tokens.append(token.word)
                        arg_I_token = True
                if not arg_I_token:
#                     labels.append('O')
                    labels.append(0.0)
                    tokens.append(token.word)

    assert len(labels) == doc_len
    return tokens, labels
    

In [None]:
# counting labels from each type
token_labels = []
train_BIO = defaultdict(int)
test_BIO = defaultdict(int)

for doc_tok, segments, group in zip(essay_txt_str_tok, essays_segments, train_test_split.SET):
    tokens, labels = get_labels(doc_tok, segments)
    
    if group == "TRAIN":
        for label in  labels:
            train_BIO[label] += 1
    else:
        for label in  labels:
            test_BIO[label] += 1
    
train_BIO,test_BIO
# defaultdict(int, {0.0: 38039, 1.0: 4822, 2.0: 75216}),
# defaultdict(int, {0.0: 9400, 1.0: 1266, 2.0: 18678})

## Structural Features

In [64]:
'''token position: 
        Token present in introduction or conclusion;  ----> ignore for now
        token is first or last token in sentence; 
        relative and absolute token position in document, paragraph and sentence'''

def get_positions(essay_tok):
    
    doc_len, prg_lengths, sent_lengths = get_lengths(essay_tok)
    
    positions = []
    doc_pos, sent_id = 0, 0
    
    for prg_id, prg in enumerate(essay_tok):
        
        prg_pos = 0
        for sent in prg.sentence:
            
            sent_pos = 0
            for i, token in enumerate(sent.token):
                
                if i == 0: 
                    positions.append({'doc_abs_pos': doc_pos, 'prg_abs_pos': prg_pos, 'sent_abs_pos': sent_pos,
                                'doc_rel_pos': round(doc_pos/doc_len,4), 
                                'prg_rel_pos': round(prg_pos/prg_lengths[prg_id],4),
                                'sent_rel_pos': round(sent_pos/sent_lengths[sent_id],4), 'is_first_in_sent': 1.0})
                elif i == len(sent.token)-1: 
                    positions.append({'doc_abs_pos': doc_pos, 'prg_abs_pos': prg_pos, 'sent_abs_pos': sent_pos,
                                'doc_rel_pos': round(doc_pos/doc_len,4), 
                                'prg_rel_pos': round(prg_pos/prg_lengths[prg_id],4),
                                'sent_rel_pos': round(sent_pos/sent_lengths[sent_id],4), 'is_last_in_sent': 1.0})
                else:
                    positions.append({'doc_abs_pos': doc_pos, 'prg_abs_pos': prg_pos, 'sent_abs_pos': sent_pos,
                                'doc_rel_pos': round(doc_pos/doc_len,4), 
                                'prg_rel_pos': round(prg_pos/prg_lengths[prg_id],4),
                                'sent_rel_pos': round(sent_pos/sent_lengths[sent_id],4)})
                
                doc_pos += 1; prg_pos += 1; sent_pos += 1;
            
            sent_id += 1
    
    return positions



def get_lengths(essay_tok):
    '''Returns essay length, and length of each paragraph and sentence in the essay'''
    doc_len, prg_lengths, sent_lengths = 0, [], []

    for prg in essay_tok:

        prg_len = 0
        for sent in prg.sentence:

            sent_len = 0
            for token in sent.token:
                doc_len += 1; prg_len += 1; sent_len += 1
            sent_lengths.append(sent_len)

        prg_lengths.append(prg_len)

    assert doc_len == sum(prg_lengths) == sum(sent_lengths)
    
    return doc_len, prg_lengths, sent_lengths

In [102]:
'''punctuation:
        Token precedes or follows any punctuation, full stop, comma and semicolon;
        token is any punctuation or full stop'''      

def get_punc_features(essay_doc_tok):
    
    token_features = {}
    set_reset_features(token_features)
    
    tokens, features = [], []
    for sent_id, sent in enumerate(essay_doc_tok.sentence):
        for token_id, token in enumerate(sent.token):
            tokens.append(token.word)
    
    
    for i, token in enumerate(tokens):
        if token in string.punctuation:
                token_features['punc'] = True
                if token == ".": token_features['fullstop'] = True
                    
        if i == 0:
            next_punc_features(tokens[i+1], token_features)
        elif i == len(tokens)-1:
            prev_punc_features(tokens[i-1], token_features)
        else:
            prev_punc_features(tokens[i-1], token_features)
            next_punc_features(tokens[i+1], token_features)

        # adding features of this token to the list of features
        features.append(copy.deepcopy(token_features))

        # resetting features to process the next token
        set_reset_features(token_features)
        
    return features

    
def set_reset_features(token_features):
    token_features['punc'], token_features['fullstop'] = False, False
    token_features['punc_prev'], token_features['fullstop_prev'] = False, False
    token_features['comma_prev'], token_features['semicolon_prev'] = False, False
    token_features['punc_next'], token_features['fullstop_next'] = False, False
    token_features['comma_next'], token_features['semicolon_next'] = False, False

def prev_punc_features(prev_token, token_features):    
    if prev_token in string.punctuation:
        token_features['punc_prev'] = True
        if prev_token == ".":
            token_features['fullstop_prev'] = True
        if prev_token == ",":
            token_features['comma_prev'] = True
        if prev_token == ";":
            token_features['semicolon_prev'] = True

def next_punc_features(next_token, token_features):
    if next_token in string.punctuation:
        token_features['punc_next'] = True
        if next_token == ".":
            token_features['fullstop_next'] = True
        if next_token == ",":
            token_features['comma_next'] = True
        if next_token == ";":
            token_features['semicolon_next'] = True

def punc_features_to_json(this_token_features):
    json_features = {}
    
    for key in sorted(this_token_features.keys()):
        value = this_token_features[key]
        if value:
            json_features[key] = 1.0
        
    return json.dumps(json_features)

In [112]:
'''position of covering sentence
        Absolute and relative position of the token’s covering sentence in the document and paragraph'''

def tok_sent_pos(essay_tok):
    
    prg_lengths = [len(prg.sentence) for prg in essay_tok]
    doc_len = sum(prg_lengths)
    
    doc_pos, positions = 0, []
    for prg_id, prg in enumerate(essay_tok):
        
        prg_pos = 0
        for sent in prg.sentence:
            for i, token in enumerate(sent.token):
                positions.append({'sent_doc_abs_pos': doc_pos, 'sent_prg_abs_pos': prg_pos,
                                'sent_doc_rel_pos': round(doc_pos/doc_len,4), 
                                'sent_prg_rel_pos': round(prg_pos/prg_lengths[prg_id],4)})
            
            doc_pos += 1; prg_pos += 1;
    
    return positions

### Feature Extraction

In [66]:
# position features

token_id = 0
open('../features/SG2017_train/token_position.jsonlines', 'w')
open('../features/SG2017_test/token_position.jsonlines', 'w')

for doc_tok, prg_list_tok, segments, group in zip(essay_txt_str_tok, essays_txt_prg_list_tok, essays_segments, train_test_split.SET):
    
    features = get_positions(prg_list_tok)
    tokens, labels = get_labels(doc_tok, segments)
    
    if group == "TRAIN":
        with open('../features/SG2017_train/token_position.jsonlines', 'a') as file:
            for f, l in zip(features, labels):
                file.write('{{"y": {}, "x": {}, "id": {}}}\n'.format(l, json.dumps(f), token_id))
                token_id +=1
    else:
        with open('../features/SG2017_test/token_position.jsonlines', 'a') as file:
            for f, l in zip(features, labels):
                file.write('{{"y": {}, "x": {}, "id": {}}}\n'.format(l, json.dumps(f), token_id))
                token_id +=1

In [104]:
# punc features

token_id = 0
open('../features/SG2017_train/token_punc.jsonlines', 'w')
open('../features/SG2017_test/token_punc.jsonlines', 'w')

for doc_tok, segments, group in zip(essay_txt_str_tok, essays_segments, train_test_split.SET):
    
    features = get_punc_features(doc_tok)
    tokens, labels = get_labels(doc_tok, segments)
    
    if group == "TRAIN":
        with open('../features/SG2017_train/token_punc.jsonlines', 'a') as file:
            for f, l in zip(features, labels):
                file.write('{{"y": {}, "x": {}, "id": {}}}\n'.format(l, punc_features_to_json(f), token_id))
                token_id +=1
                
    else:
        with open('../features/SG2017_test/token_punc.jsonlines', 'a') as file:
            for f, l in zip(features, labels):
                file.write('{{"y": {}, "x": {}, "id": {}}}\n'.format(l, punc_features_to_json(f), token_id))
                token_id +=1

In [114]:
# position of covering sentence

token_id = 0
open('../features/SG2017_train/token_position_sent.jsonlines', 'w')
open('../features/SG2017_test/token_position_sent.jsonlines', 'w')

for doc_tok, prg_list_tok, segments, group in zip(essay_txt_str_tok, essays_txt_prg_list_tok, essays_segments, train_test_split.SET):
    
    features = tok_sent_pos(prg_list_tok)
    tokens, labels = get_labels(doc_tok, segments)
    
    if group == "TRAIN":
        with open('../features/SG2017_train/token_position_sent.jsonlines', 'a') as file:
            for f, l in zip(features, labels):
                file.write('{{"y": {}, "x": {}, "id": {}}}\n'.format(l, json.dumps(f), token_id))
                token_id +=1
    else:
        with open('../features/SG2017_test/token_position_sent.jsonlines', 'a') as file:
            for f, l in zip(features, labels):
                file.write('{{"y": {}, "x": {}, "id": {}}}\n'.format(l, json.dumps(f), token_id))
                token_id +=1

# Parsing and Tagging

In [225]:
client = CoreNLPClient(annotators=['pos', 'parse'], timeout=30000, memory='16G')
client.start()

Starting server with command: java -Xmx16G -cp /Users/talhindi/Downloads/stanford-corenlp-4.0.0/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 30000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-efef7f818a764f3d.props -preload pos,parse


In [None]:
essay_txt_str_pos_parse, essays_txt_prg_list_pos_parse = [], []

for essay_doc, essay_prgs in zip (essay_txt_str, essays_txt_prg_list):
    essay_txt_str_pos_parse.append(client.annotate(essay_doc))

    prg_list_pos_parse = []
    for prg in essay_prgs:
        prg_list_pos_parse.append(client.annotate(prg))
    
    essays_txt_prg_list_pos_parse.append(prg_list_pos_parse)

In [226]:
doc = client.annotate(essay_txt_str[0])

prg_list_pos_parse = []
for prg in essays_txt_prg_list[0]:
    prg_list_pos_parse.append(client.annotate(prg))

In [228]:
client.stop()

In [227]:
constituency_parse = doc.sentence[0].parseTree
child = constituency_parse.child[0].child[0]

In [254]:
print(type(constituency_parse.child[0]))
print('********')
print(len(constituency_parse.child[0].child))
for child in constituency_parse.child[0].child:
    print(type(child))
    print(len(child.child))
    print('--')
    for grandchild in child.child:
        print(type(grandchild))
        print(len(grandchild.child))
        print('---')
        for grandGrandChild in grandchild.child:
            print(type(grandGrandChild))
        
    print('****')

<class 'CoreNLP_pb2.ParseTree'>
********
4
<class 'CoreNLP_pb2.ParseTree'>
1
--
<class 'CoreNLP_pb2.ParseTree'>
0
---
****
<class 'CoreNLP_pb2.ParseTree'>
1
--
<class 'CoreNLP_pb2.ParseTree'>
1
---
<class 'CoreNLP_pb2.ParseTree'>
****
<class 'CoreNLP_pb2.ParseTree'>
2
--
<class 'CoreNLP_pb2.ParseTree'>
1
---
<class 'CoreNLP_pb2.ParseTree'>
<class 'CoreNLP_pb2.ParseTree'>
2
---
<class 'CoreNLP_pb2.ParseTree'>
<class 'CoreNLP_pb2.ParseTree'>
****
<class 'CoreNLP_pb2.ParseTree'>
1
--
<class 'CoreNLP_pb2.ParseTree'>
0
---
****


In [None]:
def show_Tree(sentence):
    

In [204]:
client = CoreNLPClient(annotators=['tokenize', 'mwt', 'pos', 'lemma', 'depparse'], timeout=30000, memory='16G', port=9001)
client.start()

doc = client.annotate(essay_txt_str[0])
sentence = doc.sentence[0]
client.stop()

In [224]:
for token in sentence.token:
    print(token.word, end=' ')
print()    
for node in sentence.basicDependencies.node:
    print(node.index,  end=' ')

for edge in sentence.basicDependencies.edge:
    print(edge.source, edge.target)

Should students be taught to compete or to cooperate ? 
1 2 3 4 5 6 7 8 9 10 4 1
4 2
4 3
4 6
4 10
6 5
6 9
9 7
9 8


## Syntactic Features

In [181]:
def traverse(t):
    try:
#         t.value()
        print('(', t.value(), end=" ")
    except AttributeError:
        print()
#         print(t, end=" ")
#     else:
        # Now we know that t.node is defined
        for child in t:
            traverse(child)
        print(')', end=" ")

In [135]:
def get_pos(essay_doc_pos):
    '''Part-of-speech: The token’s part-of-speech'''
    pos_features = []
    for sent in essay_doc_pos.sentence:
        for token in sent.token:
            pos_features.append({'pos_{}'.format(token.pos): 1.0})
    
    return pos_features
            

def get_LCA():
    '''Lowest common ancestor (LCA):
        Normalized length of the path to the LCA with the *following* and *preceding* token in the parse tree'''
    pass

def get_LCA_type():
    '''LCA types: The two constituent types of the LCA of the current token and its preceding and following token'''
    pass

### Feature Extraction

In [138]:
# pos tags

token_id = 0
open('../features/SG2017_train/token_pos.jsonlines', 'w')
open('../features/SG2017_test/token_pos.jsonlines', 'w')

for doc_tok, segments, group in zip(essay_txt_str_tok, essays_segments, train_test_split.SET):
    
    features = get_pos(doc_tok)
    tokens, labels = get_labels(doc_tok, segments)
    
    if group == "TRAIN":
        with open('../features/SG2017_train/token_pos.jsonlines', 'a') as file:
            for f, l in zip(features, labels):
                file.write('{{"y": {}, "x": {}, "id": {}}}\n'.format(l, json.dumps(f), token_id))
                token_id +=1
    else:
        with open('../features/SG2017_test/token_pos.jsonlines', 'a') as file:
            for f, l in zip(features, labels):
                file.write('{{"y": {}, "x": {}, "id": {}}}\n'.format(l, json.dumps(f), token_id))
                token_id +=1

## LexSyn and Probability Features

In [None]:
'''LexSyn 1:
        We use lexical head projection rules (Collins 2003) implemented in the Stanford tool suite
        to lexicalize the constituent parse tree. 
        For each token t, we extract its uppermost node n in the parse tree 
        with the lexical head t and define a lexico- syntactic feature as 
        the combination of t and the constituent type of n.'''

'''LexSyn 2:
        We also consider the child node of n in the path to t and its right sibling, 
        and combine their lexical heads and constituent types as described by Soricut and Marcu (2003).'''

In [None]:
'''probability-feature:
        is the conditional probability of the current token t_i 
        being the beginning of an argument component (“Arg-B”) given its preceding tokens.
        using MLE on the training data
'''

# Archive

In [None]:
'''dump of old get_punc function'''

    #checking if the token is a word or punctuations
    if token.word in string.punctuation:
        token_features['punc'] = True
        if token.word == ".": token_features['fullstop'] = True

    # no prev to the first token in the essay
    if sent_id == 0 and token_id == 0:
        prev_token = ''
        next_token = sent.token[1].word
        next_punc_features(next_token, token_features)

    # no next to the last token in the essay
    elif sent_id == len(essay_doc_tok.sentence)-1 and token_id == len(sent.token)-1:
        prev_token = sent.token[-2].word
        prev_punc_features(prev_token, token_features)
        next_token = ''

    # token is neither first nor last in the essay
    else:

        # token is neither first not last in the sentence
        if token_id > 0 and token_id < len(sent.token)-1:
            prev_token = sent.token[token_id-1].word
            next_token = sent.token[token_id+1].word

        # token is first in the sentence
        elif token_id == 0:
            prev_token = essay_doc_tok.sentence[sent_id-1].token[-1].word
            next_token = sent.token[token_id+1].word

        # token is last in the sentence
        elif token_id == len(sent.token)-1:
            prev_token = sent.token[token_id-1].word
            next_token = essay_doc_tok.sentence[sent_id+1].token[0].word

        # this should never get executed 
        else:
            print('something is wrong with token_id {} in sent_id {}'.format(token_id, sent_id))

        prev_punc_features(prev_token, token_features)
        next_punc_features(next_token, token_features)

In [115]:
with CoreNLPClient(annotators=['tokenize','ssplit','pos','lemma','ner', 'parse', 'depparse','coref'], timeout=30000, memory='16G') as client:
    for prg in essays_txt_prg_list[0]:
        # submit the request to the server
        ann = client.annotate(prg.rstrip())

        # get the first sentence
        sentence = ann.sentence[0]

        # get the constituency parse of the first sentence
        print('---')
        print('constituency parse of first sentence')
        constituency_parse = sentence.parseTree
        print(constituency_parse)

        # get the first subtree of the constituency parse
        print('---')
        print('first subtree of constituency parse')
        print(constituency_parse.child[0])

        # get the value of the first subtree
        print('---')
        print('value of first subtree of constituency parse')
        print(constituency_parse.child[0].value)

        # get the dependency parse of the first sentence
        print('---')
        print('dependency parse of first sentence')
        dependency_parse = sentence.basicDependencies
        print(dependency_parse)

        # get the first token of the first sentence
        print('---')
        print('first token of first sentence')
        token = sentence.token[0]
        print(token)

        # get the part-of-speech tag
        print('---')
        print('part of speech tag of token')
        token.pos
        print(token.pos)

#         # get the named entity tag
#         print('---')
#         print('named entity tag of token')
#         print(token.ner)

#         # get an entity mention from the first sentence
#         print('---')
#         print('first entity mention in sentence')
#         print(sentence.mentions[0])

#         # access the coref chain
#         print('---')
#         print('coref chains for the example')
#         print(ann.corefChain)

Starting server with command: java -Xmx16G -cp /Users/talhindi/Downloads/stanford-corenlp-4.0.0/* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 30000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-0e1b40fd00ca471e.props -preload tokenize,ssplit,pos,lemma,ner,parse,depparse,coref
---
constituency parse of first sentence
child {
  child {
    child {
      value: "Should"
    }
    value: "MD"
  }
  child {
    child {
      child {
        value: "students"
      }
      value: "NNS"
    }
    value: "NP"
  }
  child {
    child {
      child {
        value: "be"
      }
      value: "VB"
    }
    child {
      child {
        child {
          value: "taught"
        }
        value: "VBN"
      }
      child {
        child {
          child {
            child {
              child {
                value: "to"
              }
              value: "TO"
            }
            child {
              child {
                child 

IndexError: list index (0) out of range