In [1]:
import glob
import pandas as pd
import numpy as np
import json
import pickle
import string
import copy
from collections import defaultdict
from collections import Counter

import spacy
import networkx as nx
# Filesystem location of the spaCy English model; it is passed to spacy.load() further down.
# NOTE(review): this is an absolute, machine-specific path — the notebook will not run
# elsewhere unless it is edited or made configurable.
model_dir = '/Users/talhindi/miniconda3/lib/python3.7/site-packages/en_core_web_sm/en_core_web_sm-2.1.0'

## Reading Data

In [2]:
# Load the semicolon-separated train/test assignment table. Its SET column is compared
# against "TRAIN" later in the notebook to route each essay.
# NOTE(review): rows are zipped positionally with the sorted essay files — presumably the
# CSV lists essays in that same order; verify.
train_test_split = pd.read_csv('../data/SG2017/train-test-split.csv', sep=';')

In [3]:
# Read every essay text file as a list of lines. Files are visited in sorted order so
# that the i-th text lines up with the i-th annotation file read below.
essays_txt_prg_list = []
for file in sorted(glob.glob("../data/SG2017/*.txt")):
    # The context manager closes each handle right away instead of leaking it.
    with open(file) as handle:
        essay = handle.readlines()
    essays_txt_prg_list.append(essay)

# Whole-essay strings, rebuilt by concatenating the raw lines (newlines are kept, so
# character offsets in the string match offsets in the file).
essay_txt_str = [''.join(essay_lines) for essay_lines in essays_txt_prg_list]

# Read the matching annotation (.ann) files, again as lists of lines.
essays_ann = []
for file in sorted(glob.glob("../data/SG2017/*.ann")):
    with open(file) as handle:
        essay = handle.readlines()
    essays_ann.append(essay)

In [6]:
def _text_bound_segments(ann_lines):
    '''Turn the "T" lines of one annotation file into (label, start, end, text) tuples
    ordered by start offset. Lines whose first character is not "T" are ignored.'''
    parsed = []
    for raw in ann_lines:
        if raw[0] != 'T':
            continue
        # A "T" line has exactly three tab-separated fields: id, "label start end", text.
        _, span_spec, covered_text = raw.rstrip().split('\t')
        kind, begin, finish = span_spec.split()
        parsed.append((kind, int(begin), int(finish), covered_text))
    parsed.sort(key=lambda item: item[1])
    return parsed


# One sorted segment list per essay, in the same order as essays_ann.
essays_segments = [_text_bound_segments(ann_lines) for ann_lines in essays_ann]

## Labels

In [31]:
def get_labels(essay_spacy, segments):
    '''Assign a numeric BIO-style label to every token of a parsed essay.

    Encoding: O = 0.0, Arg-B = 1.0, Arg-I = 2.0.

    Args:
        essay_spacy: sized iterable of tokens; each token must expose `.idx`
            (character offset of its first character), `.text`, and support `len()`.
        segments: list of (label, start, end, text) tuples with character offsets.

    Returns:
        (tokens, labels): two parallel lists with one entry per token — the token
        texts and their float labels.

    Raises:
        AssertionError: if a token starting exactly at a segment start is not
            contained in that segment's text, or if the number of labels does not
            match the number of tokens.
    '''
    doc_len = len(essay_spacy)

    # Text of the FIRST segment beginning at each start offset (setdefault keeps the
    # first one, which is what list.index() would have returned). A dict makes the
    # per-token membership test O(1) instead of a list scan.
    first_text_at = {}
    for _, start, _, text in segments:
        first_text_at.setdefault(start, text)

    tokens = []
    labels = []
    for token in essay_spacy:
        if token.idx in first_text_at:
            label = 1.0  # Arg-B
            assert token.text in first_text_at[token.idx]
        elif any(start < token.idx and token.idx + len(token) <= end
                 for _, start, end, _ in segments):
            # any() stops at the first enclosing segment, so a token lying inside
            # several overlapping segments still receives exactly one label.
            label = 2.0  # Arg-I
        else:
            label = 0.0  # O
        labels.append(label)
        tokens.append(token.text)

    assert len(labels) == doc_len
    return tokens, labels

## Spacy

In [8]:
# Load the spaCy pipeline from model_dir and run it over every essay string,
# keeping the parsed documents in the same order as essay_txt_str.
nlp = spacy.load(model_dir)

essay_spacy = [nlp(essay_text) for essay_text in essay_txt_str]

In [7]:
# Tally how many tokens carry each label, separately for essays whose split is
# "TRAIN" and for all other essays.
# NOTE(review): the earlier note here said "without new lines", but no filtering of
# newline tokens is visible in this cell — confirm what was intended.
token_labels = []
train_BIO = defaultdict(int)
test_BIO = defaultdict(int)

for doc, segments, group in zip(essay_spacy, essays_segments, train_test_split.SET):
    tokens, labels = get_labels(doc, segments)

    # Pick the counter for this essay's split once, then update it per token.
    split_counts = train_BIO if group == "TRAIN" else test_BIO
    for label in labels:
        split_counts[label] += 1

train_BIO, test_BIO

(defaultdict(int, {'O': 39617, 'Arg-B': 4823, 'Arg-I': 75312}),
 defaultdict(int, {'O': 9801, 'Arg-B': 1266, 'Arg-I': 18748}))

## LexSyn Features

In [48]:
'''LexSyn 1:
        We use lexical head projection rules (Collins 2003) implemented in the Stanford tool suite
        to lexicalize the constituent parse tree. 
        For each token t, we extract its uppermost node n in the parse tree 
        with the lexical head t and define a lexico-syntactic feature as 
        the combination of t and the constituent type of n.'''

def _governing_token(token):
    '''Return the head of `token`, or None once the sentence ROOT has been reached.'''
    return token.head if token.dep_ != 'ROOT' else None


def get_lex_dep_token_context(doc, hops=1):
    '''A modification of SG2017 LexSyn features.

    For every token we record the dependency relation governing the token itself,
    its previous token and its next token (within the same sentence). With
    hops > 1 we additionally climb the dependency tree: at hop j the features
    describe the j-th ancestor of the token / previous token / next token. Climbing
    stops for a chain as soon as its ROOT has been emitted.

    Args:
        doc: parsed document exposing `.sents`; each sentence is an indexable,
            sized sequence of tokens with `.dep_` and `.head`.
        hops: how many levels of the dependency tree to describe (default 1).

    Returns:
        A list with one dict per token (sentence by sentence, in order), mapping
        feature name -> 1.0.

    Note:
        Each chain now advances to its own head and the prev/next features are
        read from the current chain position. Output for hops=1 is unchanged;
        for hops >= 2 the prev/next hop features now describe actual ancestors
        rather than repeating the neighbouring token.
    '''
    features = []
    for sent in doc.sents:
        last_index = len(sent) - 1
        for i, token in enumerate(sent):
            token_features = {}
            # Cursors walking up the tree; None means "nothing (more) to describe".
            this_token = token
            prev_token = sent[i-1] if i > 0 else None
            next_token = sent[i+1] if i < last_index else None

            for j in range(hops):
                if this_token is not None:
                    token_features['dep_{}_{}'.format(j, this_token.dep_)] = 1.0
                    token_features['token_dep_{}_{}_{}'.format(j, this_token.dep_, this_token)] = 1.0
                    this_token = _governing_token(this_token)

                if prev_token is not None:
                    get_lex_dep_token_prev(prev_token, token_features, j)
                    prev_token = _governing_token(prev_token)

                if next_token is not None:
                    get_lex_dep_token_next(next_token, token_features, j)
                    next_token = _governing_token(next_token)

            # A fresh dict is built per token, so it can be stored directly.
            features.append(token_features)
    return features

def get_lex_dep_token_prev(prev_token, token_features, j):
    '''Add the hop-`j` features of the previous-token chain to `token_features` (in place):
    the bare dependency relation and the relation combined with the token text.'''
    token_features['prev_dep_{}_{}'.format(j, prev_token.dep_)] = 1.0
    token_features['prev_token_dep_{}_{}_{}'.format(j,prev_token.dep_, prev_token)] = 1.0

def get_lex_dep_token_next(next_token, token_features, j):
    '''Add the hop-`j` features of the next-token chain to `token_features` (in place):
    the bare dependency relation and the relation combined with the token text.'''
    token_features['next_dep_{}_{}'.format(j, next_token.dep_)] = 1.0
    token_features['next_token_dep_{}_{}_{}'.format(j, next_token.dep_, next_token)] = 1.0
            

'''LexSyn 2:
        We also consider the child node of n in the path to t and its right sibling, 
        and combine their lexical heads and constituent types as described by Soricut and Marcu (2003).'''
def get_sibling_features(doc):
    '''Placeholder for the LexSyn 2 sibling features: not implemented yet, so it
    ignores `doc` and returns None.'''
    return None


### Feature Extraction

In [49]:
# lexSyn features, 1 hop
# Writes one JSON line per token ({"y": label, "x": features, "id": running token id})
# to the train or test file depending on the essay's split.

token_id = 0
train_path = '../features/SG2017_train/LexSyn_1hop.jsonlines'
test_path = '../features/SG2017_test/LexSyn_1hop.jsonlines'

# Both outputs are opened once in 'w' mode (truncating any previous run) and are
# flushed and closed by the context manager, instead of leaking handles and
# reopening a file in append mode for every essay.
with open(train_path, 'w') as train_file, open(test_path, 'w') as test_file:
    for doc, segments, group in zip(essay_spacy, essays_segments, train_test_split.SET):
        features = get_lex_dep_token_context(doc)
        tokens, labels = get_labels(doc, segments)
        # zip() below would silently drop entries on a length mismatch and pair
        # features with the wrong labels, so fail loudly instead.
        assert len(features) == len(labels)

        out_file = train_file if group == "TRAIN" else test_file
        for f, l in zip(features, labels):
            out_file.write('{{"y": {}, "x": {}, "id": {}}}\n'.format(l, json.dumps(f), token_id))
            token_id += 1

In [50]:
# lexSyn features, 2 hops
# Writes one JSON line per token ({"y": label, "x": features, "id": running token id})
# to the train or test file depending on the essay's split.

token_id = 0
train_path = '../features/SG2017_train/LexSyn_2hops.jsonlines'
test_path = '../features/SG2017_test/LexSyn_2hops.jsonlines'

# Both outputs are opened once in 'w' mode (truncating any previous run) and are
# flushed and closed by the context manager, instead of leaking handles and
# reopening a file in append mode for every essay.
with open(train_path, 'w') as train_file, open(test_path, 'w') as test_file:
    for doc, segments, group in zip(essay_spacy, essays_segments, train_test_split.SET):
        features = get_lex_dep_token_context(doc, 2)
        tokens, labels = get_labels(doc, segments)
        # zip() below would silently drop entries on a length mismatch and pair
        # features with the wrong labels, so fail loudly instead.
        assert len(features) == len(labels)

        out_file = train_file if group == "TRAIN" else test_file
        for f, l in zip(features, labels):
            out_file.write('{{"y": {}, "x": {}, "id": {}}}\n'.format(l, json.dumps(f), token_id))
            token_id += 1

# Archive

In [56]:
# Collect the dependency relation of every token labelled 1.0 (Arg-B) and 2.0 (Arg-I)
# in the TRAIN essays. Tokens labelled 0.0 are skipped; note that `other_dep` holds
# the 2.0-labelled tokens.
argB_dep, other_dep = [], []
dep_sink_by_label = {1.0: argB_dep, 2.0: other_dep}

for i, (doc, segments, group) in enumerate(zip(essay_spacy, essays_segments, train_test_split.SET)):
    if group != "TRAIN":
        continue
    tokens, labels = get_labels(doc, segments)
    assert len(doc) == len(labels)
    for token, label in zip(doc, labels):
        sink = dep_sink_by_label.get(label)
        if sink is not None:
            sink.append(token.dep_)

In [58]:
# Frequency of each dependency relation among the collected Arg-B tokens, most frequent first.
Counter(argB_dep).most_common()

[('nsubj', 1821),
 ('det', 840),
 ('amod', 426),
 ('prep', 367),
 ('advmod', 259),
 ('csubj', 233),
 ('mark', 196),
 ('compound', 128),
 ('expl', 123),
 ('nsubjpass', 115),
 ('poss', 66),
 ('advcl', 46),
 ('ROOT', 30),
 ('cc', 26),
 ('aux', 22),
 ('neg', 18),
 ('pobj', 18),
 ('npadvmod', 17),
 ('preconj', 14),
 ('subtok', 7),
 ('predet', 7),
 ('nmod', 6),
 ('nummod', 5),
 ('csubjpass', 4),
 ('pcomp', 4),
 ('appos', 4),
 ('dobj', 3),
 ('punct', 3),
 ('intj', 3),
 ('conj', 3),
 ('dep', 2),
 ('attr', 2),
 ('auxpass', 2),
 ('xcomp', 1),
 ('acl', 1),
 ('ccomp', 1)]

In [57]:
# Frequency of each dependency relation among the tokens collected in other_dep
# (those labelled 2.0, i.e. Arg-I), most frequent first.
Counter(other_dep).most_common()

[('punct', 11743),
 ('prep', 11451),
 ('pobj', 11094),
 ('det', 7891),
 ('amod', 7726),
 ('nsubj', 7230),
 ('dobj', 6585),
 ('aux', 6263),
 ('advmod', 6102),
 ('ROOT', 5609),
 ('conj', 4105),
 ('cc', 3741),
 ('poss', 2405),
 ('mark', 2175),
 ('compound', 2080),
 ('ccomp', 2024),
 ('advcl', 2008),
 ('acomp', 1667),
 ('xcomp', 1586),
 ('', 1549),
 ('relcl', 1352),
 ('attr', 1222),
 ('auxpass', 1108),
 ('pcomp', 1040),
 ('neg', 849),
 ('nsubjpass', 814),
 ('acl', 729),
 ('case', 397),
 ('npadvmod', 316),
 ('prt', 302),
 ('nummod', 268),
 ('agent', 215),
 ('dative', 209),
 ('expl', 197),
 ('appos', 187),
 ('nmod', 136),
 ('csubj', 124),
 ('preconj', 110),
 ('subtok', 86),
 ('predet', 69),
 ('oprd', 68),
 ('dep', 39),
 ('quantmod', 29),
 ('parataxis', 11),
 ('intj', 9),
 ('csubjpass', 9)]

In [28]:
# For each token of the first essay, print its text/tag, its dependency relation,
# and the text/tag/relation of its head.
arc_template = "{0}/{1} <--{2}-- {3}/{4} <--{5}--"
for sent in essay_spacy[0].sents:
    for token in sent:
        governor = token.head
        print(arc_template.format(token.text, token.tag_, token.dep_,
                                  governor.text, governor.tag_, governor.dep_))

Should/MD <--aux-- taught/VBN <--ROOT--
students/NNS <--nsubjpass-- taught/VBN <--ROOT--
be/VB <--auxpass-- taught/VBN <--ROOT--
taught/VBN <--ROOT-- taught/VBN <--ROOT--
to/TO <--aux-- compete/VB <--xcomp--
compete/VB <--xcomp-- taught/VBN <--ROOT--
or/CC <--cc-- compete/VB <--xcomp--
to/TO <--aux-- cooperate/VB <--conj--
cooperate/VB <--conj-- compete/VB <--xcomp--
?/. <--punct-- taught/VBN <--ROOT--


/_SP <---- ?/. <--punct--
It/PRP <--nsubjpass-- said/VBN <--ROOT--
is/VBZ <--auxpass-- said/VBN <--ROOT--
always/RB <--advmod-- said/VBN <--ROOT--
said/VBN <--ROOT-- said/VBN <--ROOT--
that/IN <--mark-- promote/VB <--ccomp--
competition/NN <--nsubj-- promote/VB <--ccomp--
can/MD <--aux-- promote/VB <--ccomp--
effectively/RB <--advmod-- promote/VB <--ccomp--
promote/VB <--ccomp-- said/VBN <--ROOT--
the/DT <--det-- development/NN <--dobj--
development/NN <--dobj-- promote/VB <--ccomp--
of/IN <--prep-- development/NN <--dobj--
economy/NN <--pobj-- of/IN <--prep--
./. <--punct-- said/VBN <

In [None]:
# one hop only
'''LexSyn 1:
        We use lexical head projection rules (Collins 2003) implemented in the Stanford tool suite
        to lexicalize the constituent parse tree. 
        For each token t, we extract its uppermost node n in the parse tree 
        with the lexical head t and define a lexico-syntactic feature as 
        the combination of t and the constituent type of n.'''

# NOTE(review): this archived cell re-defines get_lex_dep_token_context,
# get_lex_dep_token_prev and get_lex_dep_token_next, which are also defined earlier in
# this notebook (there the helpers take an extra hop index). Running this cell replaces
# those definitions.
def get_lex_dep_token_context(doc, hops=1):
    '''One-hop variant of the modified SG2017 LexSyn features.

    For every token, records the dependency relation governing the token and the
    relations governing its previous and next tokens within the same sentence.

    Args:
        doc: parsed document exposing `.sents`; each sentence is an indexable,
            sized sequence of tokens with `.dep_`.
        hops: accepted for signature compatibility; this variant does not use it.

    Returns:
        A list with one dict per token (sentence by sentence, in order), mapping
        feature name -> 1.0.
    '''
    features = []
    for sent in doc.sents:
        last_index = len(sent) - 1
        for i, token in enumerate(sent):
            token_features = {}
            token_features['dep_{}'.format(token.dep_)] = 1.0
            # NOTE(review): despite the name, this key contains only the relation,
            # not the token text — confirm whether that was intended.
            token_features['token_dep_{}'.format(token.dep_)] = 1.0
            # Independent guards: a single-token sentence has neither neighbour, so
            # nothing is indexed outside the sentence.
            if i > 0:
                get_lex_dep_token_prev(sent[i-1], token_features)
            if i < last_index:
                get_lex_dep_token_next(sent[i+1], token_features)

            # A fresh dict is built per token, so it can be stored directly.
            features.append(token_features)
    return features

def get_lex_dep_token_prev(prev_token, token_features):
    '''Add the previous token's dependency-relation features to `token_features` (in place).'''
    token_features['prev_dep_{}'.format(prev_token.dep_)] = 1.0
    token_features['prev_token_dep_{}'.format(prev_token.dep_)] = 1.0

def get_lex_dep_token_next(next_token, token_features):
    '''Add the next token's dependency-relation features to `token_features` (in place).'''
    token_features['next_dep_{}'.format(next_token.dep_)] = 1.0
    token_features['next_token_dep_{}'.format(next_token.dep_)] = 1.0

In [None]:
def get_sdp_path(doc, subj, obj, lca_matrix):
    '''Return the token path from `doc[subj]` to `doc[obj]` through their lowest
    common ancestor.

    Args:
        doc: indexable sequence of tokens exposing `.head` and `.i`.
        subj: index of the starting token.
        obj: index of the ending token.
        lca_matrix: lookup such that `lca_matrix[subj, obj]` is the index of the
            lowest common ancestor; -1 is treated as "no common ancestor".

    Returns:
        List of tokens: `doc[subj]` up to the ancestor, then down to `doc[obj]`
        (the ancestor appears once). When the ancestor is -1 only `doc[subj]`
        is returned.
    '''
    lca = lca_matrix[subj, obj]

    def climb_to_ancestor(start_index):
        # Tokens from doc[start_index] upward, ending at the common ancestor.
        node = doc[start_index]
        trail = [node]
        if lca == -1 or lca == start_index:
            return trail
        while node.head.i != lca:
            node = node.head
            trail.append(node)
        trail.append(node.head)
        return trail

    upward = climb_to_ancestor(subj)
    downward = climb_to_ancestor(obj)[::-1]
    # The first element of the reversed obj trail is dropped before joining
    # (that is the shared ancestor when one exists).
    return upward + downward[1:]
  

# Demo: parse one sentence and print the token path between two of its tokens.
nlp = spacy.load(model_dir)
doc = nlp(u'Convulsions that occur after DTaP are caused by a fever, and fever may cause headache.')

head = 0  # index of the head entity token
tail = 9  # index of the tail entity token

lca_matrix = doc.get_lca_matrix()
sdp = get_sdp_path(doc, head, tail, lca_matrix)
print(sdp)