In [1]:
import glob
import pandas as pd
import numpy as np
import json
import pickle
import string
import copy
from collections import defaultdict
from collections import Counter

import spacy
import networkx as nx
import os

# Location of the spaCy model to load. The original machine-specific path is
# kept as the default so existing runs behave the same; set SPACY_MODEL_DIR to
# point at a different installation without editing the notebook.
model_dir = os.environ.get(
    'SPACY_MODEL_DIR',
    '/Users/talhindi/miniconda3/lib/python3.7/site-packages/en_core_web_sm/en_core_web_sm-2.1.0',
)

## Reading Data

In [2]:
# Semicolon-separated split file. Its `SET` column is zipped with the essays
# further down and compared against "TRAIN" to route each essay to the train
# or test output.
train_test_split = pd.read_csv('../data/SG2017/train-test-split.csv', sep=';')

In [3]:
# Essay texts: one list of raw lines (trailing newlines kept) per .txt file,
# in sorted filename order.
essays_txt_prg_list = []
for file in sorted(glob.glob("../data/SG2017/*.txt")):
    # `with` closes the handle deterministically instead of leaking it
    with open(file) as essay_file:
        essays_txt_prg_list.append(essay_file.readlines())

# Each essay as one string; lines are joined unchanged so character offsets
# into the original file stay valid.
essay_txt_str = [''.join(essay) for essay in essays_txt_prg_list]

# Annotation files: one list of raw lines per .ann file, in sorted filename
# order. Later cells zip these with the essays positionally.
essays_ann = []
for file in sorted(glob.glob("../data/SG2017/*.ann")):
    with open(file) as ann_file:
        essays_ann.append(ann_file.readlines())

In [4]:
# For every annotation file, collect (label, start, end, text) tuples from the
# lines whose first character is 'T', ordered by start offset.
essays_segments = []

for essay in essays_ann:
    # each kept line has three tab-separated fields: id, "label start end", text
    annotation_rows = [line.rstrip().split('\t') for line in essay if line[0] == 'T']

    segments = []
    for _, label_s_e, text in annotation_rows:
        label, start, end = label_s_e.split()
        segments.append((label, int(start), int(end), text))

    segments.sort(key=lambda segment: segment[1])
    essays_segments.append(segments)

## Labels

In [5]:
def get_labels_spacy(essay_spacy, segments):
    '''Label every token of a parsed essay as O = 0.0, Arg-B = 1.0 or Arg-I = 2.0.

    Parameters
    ----------
    essay_spacy : sized iterable of tokens
        Each token must expose `.idx` (character offset), `.text` and `len()`.
    segments : list of (label, start, end, text) tuples
        Argument segments with character offsets into the essay.

    Returns
    -------
    (tokens, labels) : two lists of equal length, one entry per token.

    A token whose offset equals a segment start is Arg-B. A token that starts
    strictly after a segment start and ends at or before that segment's end is
    Arg-I. Everything else is O. Each token receives exactly one label, even
    when segments overlap.

    Raises
    ------
    AssertionError
        If a token sitting on a segment start is not contained in that
        segment's text.
    '''
    doc_len = len(essay_spacy)

    labels = []
    tokens = []

    # start offset -> text of the first segment beginning there
    # (setdefault keeps the first segment on duplicate starts)
    segment_text_by_start = {}
    for _, start, _, text in segments:
        segment_text_by_start.setdefault(start, text)

    for token in essay_spacy:
        if token.idx in segment_text_by_start:
            # labels.append('B')
            assert token.text in segment_text_by_start[token.idx]
            label = 1.0
        elif any(start < token.idx and token.idx + len(token) <= end
                 for _, start, end, _ in segments):
            # labels.append('I')
            # any() stops at the first enclosing segment, so a token inside
            # several overlapping segments is labelled only once
            label = 2.0
        else:
            # labels.append('O')
            label = 0.0

        labels.append(label)
        tokens.append(token.text)

    assert len(labels) == doc_len
    return tokens, labels

In [23]:
# with new lines
# NOTE(review): `train_BIO` / `test_BIO` are only assigned in the
# "counting labels" cell further down, so this cell fails on a fresh kernel
# unless that cell has been run first.
train_BIO,test_BIO

(defaultdict(int, {0.0: 39617, 1.0: 4823, 2.0: 75312}),
 defaultdict(int, {0.0: 9801, 1.0: 1266, 2.0: 18748}))

## Spacy

In [6]:
# Load the spaCy pipeline once and parse every essay with it.
nlp = spacy.load(model_dir)

# one parsed document per essay, in the same order as essay_txt_str
essay_spacy = [nlp(essay_text) for essay_text in essay_txt_str]

In [7]:
# counting labels from each type
# without new lines
token_labels = []
train_BIO = defaultdict(int)
test_BIO = defaultdict(int)

for doc, segments, group in zip(essay_spacy, essays_segments, train_test_split.SET):
    tokens, labels = get_labels_spacy(doc, segments)

    # every essay not marked "TRAIN" is counted towards the test split
    split_counts = train_BIO if group == "TRAIN" else test_BIO
    for label in labels:
        split_counts[label] += 1

train_BIO,test_BIO

(defaultdict(int, {0.0: 39617, 1.0: 4823, 2.0: 75312}),
 defaultdict(int, {0.0: 9801, 1.0: 1266, 2.0: 18748}))

## Syntactic Features

In [9]:
def get_pos(essay_spacy):
    '''Part-of-speech feature: one {'pos_<TAG>': 1.0} dict per token, where
    <TAG> is the token's `pos_` attribute.'''
    return [{'pos_{}'.format(token.pos_): 1.0} for token in essay_spacy]
            
    
    
def get_lca_features_sent(sent, get_average, get_types=False):
    '''Lowest common ancestor (LCA) features for every token of one sentence.

    For each token, the LCA with its *preceding* and with its *following*
    token is looked up in `sent.get_lca_matrix()`, and the length of the
    dependency-tree path from the token to that LCA is measured.
    NOTE(review): the path length is the raw edge count; no normalization is
    applied in this function.

    Parameters
    ----------
    sent : sentence span
        Tokens must expose `.text`, `.i`, `.children` and `.dep_`.
        The first-token branch indexes `lca_matrix[0, 1]`, so the sentence is
        expected to have at least two tokens (the callers in this file only
        pass sentences with `len(sent) > 1`).
    get_average : bool
        If true, each path length is the mean of the token's and the
        neighbour's distance to their LCA, and the keys carry an `_avg`
        suffix. Otherwise only the token's own distance is used.
    get_types : bool
        Selects which of the two per-token lists is returned.

    Returns
    -------
    list of dict, one per token.
        get_types false: {'lca_prev_path[_avg]': ..., 'lca_next_path[_avg]': ...},
        with -1 where there is no previous/next token.
        get_types true: {'lca_prev_<dep>': 1.0, 'lca_next_<dep>': 1.0}, where
        <dep> is the `dep_` of the LCA token; a side is omitted when there is
        no previous/next token.
    '''
    # Undirected graph over head-child arcs; node names are "<text>-<doc index>"
    # so repeated words stay distinct.
    edges = []
    for token in sent:
        for child in token.children:
            edges.append(('{0}-{1}'.format(token.text, token.i),'{0}-{1}'.format(child.text, child.i)))

    graph = nx.Graph(edges)
    # Entries are used below as positions within `sent`; -1 is treated as "no LCA".
    lca_matrix = sent.get_lca_matrix()
#     print(graph, edges, lca_matrix)

    lca_prev_next_path_sent, lca_types_sent = [], []
    for token_id, token in enumerate(sent):
        # First token has no predecessor, last token has no successor: mark with -1.
        if token_id == 0:
            token_prev_lca = -1
            token_next_lca = lca_matrix[token_id, token_id+1]
        elif token_id == len(sent)-1:
            token_prev_lca = lca_matrix[token_id, token_id-1]
            token_next_lca = -1
        else:
            token_prev_lca = lca_matrix[token_id, token_id-1]
            token_next_lca = lca_matrix[token_id, token_id+1]

        # adding index to tokens to retrieve node in graph. node-name = token-index
        source_token = '{0}-{1}'.format(token.text, token.i)
        lca_types_token = {}

        # token, previous_token shortest path to lca
        if token_prev_lca != -1:
            source_prev_token = '{0}-{1}'.format(sent[token_id-1].text, sent[token_id-1].i)
            target_token_prev_lca = '{0}-{1}'.format(sent[token_prev_lca].text, sent[token_prev_lca].i)
            # The LCA "type" is its dependency label; the POS-based variant is kept disabled below.
#             lca_types_token['lca_prev_{}'.format(sent[token_prev_lca].pos_)] = 1.0
            lca_types_token['lca_prev_{}'.format(sent[token_prev_lca].dep_)] = 1.0

            lca_prev_path_token = nx.shortest_path_length(graph, source=source_token, target=target_token_prev_lca)
            lca_prev_path_prev = nx.shortest_path_length(graph, source=source_prev_token, target=target_token_prev_lca)

            if get_average:
                lca_prev_path = np.mean((lca_prev_path_token, lca_prev_path_prev))
            else:
                lca_prev_path = lca_prev_path_token
        else:
            lca_prev_path = -1

        # token, next_token shortest path to lca
        if token_next_lca != -1:
            source_next_token = '{0}-{1}'.format(sent[token_id+1].text, sent[token_id+1].i)
            target_token_next_lca = '{0}-{1}'.format(sent[token_next_lca].text, sent[token_next_lca].i)
#             lca_types_token['lca_next_{}'.format(sent[token_next_lca].pos_)] = 1.0
            lca_types_token['lca_next_{}'.format(sent[token_next_lca].dep_)] = 1.0

            lca_next_path_token = nx.shortest_path_length(graph, source=source_token, target=target_token_next_lca)
            lca_next_path_next = nx.shortest_path_length(graph, source=source_next_token, target=target_token_next_lca)

            if get_average:
                lca_next_path = np.mean((lca_next_path_token, lca_next_path_next))
            else:
                lca_next_path = lca_next_path_token
        else:
            lca_next_path = -1

        # adding LCA features of this token
        # (key names differ between the averaged and non-averaged variants)
        if get_average:
            lca_prev_next_path_sent.append({'lca_prev_path_avg': lca_prev_path, 'lca_next_path_avg': lca_next_path})
        else:
            lca_prev_next_path_sent.append({'lca_prev_path': lca_prev_path, 'lca_next_path': lca_next_path})
        lca_types_sent.append(lca_types_token)

    # returning LCA features for all tokens in the sentence
    # (both lists are always built; get_types only selects which one is returned)
    if not get_types:
        return lca_prev_next_path_sent
    else:
        return lca_types_sent
    

def get_lca_features_doc(doc, get_average=True):
    '''LCA path-length features for every token of a document.

    Parameters
    ----------
    doc : parsed document exposing `.sents`
    get_average : bool
        Forwarded to `get_lca_features_sent`; also selects the key names
        ('lca_prev_path_avg' / 'lca_next_path_avg' when true,
        'lca_prev_path' / 'lca_next_path' otherwise).

    Returns
    -------
    list of dict, one per token, in sentence order.

    Sentences with more than one token are delegated to
    `get_lca_features_sent`. A single-token sentence has no neighbours, so it
    gets a placeholder with both path lengths set to 0.

    Raises
    ------
    AssertionError
        If a sentence is empty.
    '''
    # The placeholder must use the same keys as get_lca_features_sent emits
    # for this mode, otherwise one feature file mixes two key sets.
    if get_average:
        prev_key, next_key = 'lca_prev_path_avg', 'lca_next_path_avg'
    else:
        prev_key, next_key = 'lca_prev_path', 'lca_next_path'

    token_lca = []

    for sent in doc.sents:
        if len(sent) > 1:
            sent_lca = get_lca_features_sent(sent, get_average, False)
        else:
            assert len(sent) == 1
            sent_lca = [{prev_key: 0, next_key: 0}]

        token_lca.extend(sent_lca)

    return token_lca


def get_lca_types_doc(doc):
    '''LCA types for every token of a document.

    Returns one dict per token, in sentence order. Sentences with more than
    one token are handled by `get_lca_features_sent(sent, False, True)`; a
    single-token sentence contributes one empty dict. An empty sentence
    raises AssertionError.
    '''
    per_token_types = []

    for sent in doc.sents:
        if len(sent) <= 1:
            assert len(sent) == 1
            per_token_types.append({})
            continue

        per_token_types.extend(get_lca_features_sent(sent, False, True))

    return per_token_types

### Feature Extraction

In [80]:
# pos tags
# Writes one JSON line per token ({"y": label, "x": features, "id": running id})
# to the train or test file, depending on the essay's split.

token_id = 0
train_path = '../features/SG2017_train/token_pos_spacy.jsonlines'
test_path = '../features/SG2017_test/token_pos_spacy.jsonlines'

# Open both outputs once in write mode (truncating any previous run). The
# `with` block closes them, instead of leaking the truncating handles and
# reopening a file in append mode for every essay.
with open(train_path, 'w') as train_file, open(test_path, 'w') as test_file:
    for doc, segments, group in zip(essay_spacy, essays_segments, train_test_split.SET):

        features = get_pos(doc)
        tokens, labels = get_labels_spacy(doc, segments)

        # every essay not marked "TRAIN" goes to the test file
        out_file = train_file if group == "TRAIN" else test_file
        for f, l in zip(features, labels):
            out_file.write('{{"y": {}, "x": {}, "id": {}}}\n'.format(l, json.dumps(f), token_id))
            token_id += 1

In [88]:
# LCA
# Writes the non-averaged LCA path features, one JSON line per token
# ({"y": label, "x": features, "id": running id}).

token_id = 0
train_path = '../features/SG2017_train/token_LCA_spacy.jsonlines'
test_path = '../features/SG2017_test/token_LCA_spacy.jsonlines'

# Open both outputs once in write mode (truncating any previous run). The
# `with` block closes them, instead of leaking the truncating handles and
# reopening a file in append mode for every essay.
with open(train_path, 'w') as train_file, open(test_path, 'w') as test_file:
    for doc, segments, group in zip(essay_spacy, essays_segments, train_test_split.SET):

        features = get_lca_features_doc(doc, False)
        tokens, labels = get_labels_spacy(doc, segments)

        # every essay not marked "TRAIN" goes to the test file
        out_file = train_file if group == "TRAIN" else test_file
        for f, l in zip(features, labels):
            out_file.write('{{"y": {}, "x": {}, "id": {}}}\n'.format(l, json.dumps(f), token_id))
            token_id += 1

In [None]:
# LCA avg
# Writes the averaged LCA path features, one JSON line per token
# ({"y": label, "x": features, "id": running id}).

token_id = 0
train_path = '../features/SG2017_train/token_LCA_avg_spacy.jsonlines'
test_path = '../features/SG2017_test/token_LCA_avg_spacy.jsonlines'

# The same *_avg_* paths are used for truncating and for writing. Previously
# the avg files were truncated but the rows were appended to the non-averaged
# token_LCA_spacy files, leaving the avg files empty and corrupting the others.
with open(train_path, 'w') as train_file, open(test_path, 'w') as test_file:
    for doc, segments, group in zip(essay_spacy, essays_segments, train_test_split.SET):

        features = get_lca_features_doc(doc)
        tokens, labels = get_labels_spacy(doc, segments)

        # every essay not marked "TRAIN" goes to the test file
        out_file = train_file if group == "TRAIN" else test_file
        for f, l in zip(features, labels):
            out_file.write('{{"y": {}, "x": {}, "id": {}}}\n'.format(l, json.dumps(f), token_id))
            token_id += 1

In [10]:
# LCA type
# Writes the LCA type features, one JSON line per token
# ({"y": label, "x": features, "id": running id}).

token_id = 0
train_path = '../features/SG2017_train/token_LCA_type_spacy.jsonlines'
test_path = '../features/SG2017_test/token_LCA_type_spacy.jsonlines'

# Open both outputs once in write mode (truncating any previous run). The
# `with` block closes them, instead of leaking the truncating handles and
# reopening a file in append mode for every essay.
with open(train_path, 'w') as train_file, open(test_path, 'w') as test_file:
    for doc, segments, group in zip(essay_spacy, essays_segments, train_test_split.SET):

        features = get_lca_types_doc(doc)
        tokens, labels = get_labels_spacy(doc, segments)

        # every essay not marked "TRAIN" goes to the test file
        out_file = train_file if group == "TRAIN" else test_file
        for f, l in zip(features, labels):
            out_file.write('{{"y": {}, "x": {}, "id": {}}}\n'.format(l, json.dumps(f), token_id))
            token_id += 1

In [None]:
# LCA type -- old == pos
# NOTE(review): get_lca_types_doc currently builds its keys from the LCA
# token's `dep_` (the `pos_` variant is commented out in
# get_lca_features_sent), so running this cell as-is writes dependency-label
# features into the *_LCA_pos_* files. Re-enable the pos variant first if
# POS-based types are wanted.

token_id = 0
train_path = '../features/SG2017_train/token_LCA_pos_spacy.jsonlines'
test_path = '../features/SG2017_test/token_LCA_pos_spacy.jsonlines'

# Open both outputs once in write mode (truncating any previous run). The
# `with` block closes them, instead of leaking the truncating handles and
# reopening a file in append mode for every essay.
with open(train_path, 'w') as train_file, open(test_path, 'w') as test_file:
    for doc, segments, group in zip(essay_spacy, essays_segments, train_test_split.SET):

        features = get_lca_types_doc(doc)
        tokens, labels = get_labels_spacy(doc, segments)

        # every essay not marked "TRAIN" goes to the test file
        out_file = train_file if group == "TRAIN" else test_file
        for f, l in zip(features, labels):
            out_file.write('{{"y": {}, "x": {}, "id": {}}}\n'.format(l, json.dumps(f), token_id))
            token_id += 1

# Archive

In [82]:
# Turn spaCy's dependency tree into an undirected networkx graph whose nodes
# are lower-cased token strings. `doc` is not created in this cell; it must
# already exist in the kernel.
edges = [
    ('{0}'.format(token.lower_), '{0}'.format(child.lower_))
    for token in doc
    for child in token.children
]

graph = nx.Graph(edges)

# Shortest dependency path between two words: its length, then the nodes on it
entity1 = 'Convulsions'.lower()
entity2 = 'fever'

print(nx.shortest_path_length(graph, source=entity1, target=entity2))
print(nx.shortest_path(graph, source=entity1, target=entity2))

3
['convulsions', 'caused', 'by', 'fever']


In [13]:
def get_sdp_path(doc, subj, obj, lca_matrix):
    '''Token path from doc[subj] up to the lowest common ancestor and down to doc[obj].

    `lca_matrix[subj, obj]` gives the index of the common ancestor; -1 is
    treated as "no common ancestor". Each endpoint is followed through
    `.head` until the ancestor is reached. The result is the subject-side
    chain followed by the object-side chain reversed, without repeating the
    ancestor.
    '''
    lca = lca_matrix[subj, obj]

    def climb_to_lca(start_index):
        # chain of tokens from doc[start_index] up to (and including) the LCA
        node = doc[start_index]
        chain = [node]
        if lca == -1 or lca == start_index:
            return chain
        while node.head.i != lca:
            node = node.head
            chain.append(node)
        chain.append(node.head)
        return chain

    subj_path = climb_to_lca(subj)
    obj_path = climb_to_lca(obj)

    # drop the first element of the reversed object chain so the shared
    # ancestor is not listed twice
    return subj_path + list(reversed(obj_path))[1:]

In [11]:
# Parse one example sentence and list each arc as (head text, token text, label)
nlp = spacy.load(model_dir)
doc = nlp(u'Convulsions that occur after DTaP are caused by a fever.')

for token in doc:
    arc = (token.head.text, token.text, token.dep_)
    print(arc)

('caused', 'Convulsions', 'nsubjpass')
('occur', 'that', 'nsubj')
('Convulsions', 'occur', 'relcl')
('occur', 'after', 'prep')
('caused', 'DTaP', 'nsubjpass')
('caused', 'are', 'auxpass')
('caused', 'caused', 'ROOT')
('caused', 'by', 'agent')
('fever', 'a', 'det')
('by', 'fever', 'pobj')
('caused', '.', 'punct')


In [12]:
# Display the LCA matrix of whatever `doc` currently holds in the kernel
# (`doc` is not created in this cell).
doc.get_lca_matrix()

array([[ 0,  0,  0,  0,  6,  6,  6,  6,  6,  6,  6],
       [ 0,  1,  2,  2,  6,  6,  6,  6,  6,  6,  6],
       [ 0,  2,  2,  2,  6,  6,  6,  6,  6,  6,  6],
       [ 0,  2,  2,  3,  6,  6,  6,  6,  6,  6,  6],
       [ 6,  6,  6,  6,  4,  6,  6,  6,  6,  6,  6],
       [ 6,  6,  6,  6,  6,  5,  6,  6,  6,  6,  6],
       [ 6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6],
       [ 6,  6,  6,  6,  6,  6,  6,  7,  7,  7,  6],
       [ 6,  6,  6,  6,  6,  6,  6,  7,  8,  9,  6],
       [ 6,  6,  6,  6,  6,  6,  6,  7,  9,  9,  6],
       [ 6,  6,  6,  6,  6,  6,  6,  6,  6,  6, 10]], dtype=int32)

In [12]:
# Parse a sample sentence and print every token with its tag, its dependency
# label and its head
nlp = spacy.load('en')
doc = nlp('Wall Street Journal just published an interesting piece on crypto currencies')

arc_template = "{0}/{1} <--{2}-- {3}/{4}"
for token in doc:
    head = token.head
    print(arc_template.format(token.text, token.tag_, token.dep_, head.text, head.tag_))

Wall/NNP <--compound-- Street/NNP
Street/NNP <--compound-- Journal/NNP
Journal/NNP <--nsubj-- published/VBD
just/RB <--advmod-- published/VBD
published/VBD <--ROOT-- published/VBD
an/DT <--det-- piece/NN
interesting/JJ <--amod-- piece/NN
piece/NN <--dobj-- published/VBD
on/IN <--prep-- piece/NN
crypto/NN <--amod-- currencies/NNS
currencies/NNS <--pobj-- on/IN


In [21]:
# Reload the pipeline via the 'en' name and parse only the first essay.
# This rebinds the kernel-wide `nlp` and `doc` names.
nlp = spacy.load('en')
doc = nlp(essay_txt_str[0])

In [46]:
# Materialise the sentences so they can be indexed. This line was commented
# out, so the cell raised NameError on a fresh kernel.
sents = list(doc.sents)
# Only the last expression of a cell is displayed; the first line is evaluated
# but its value is not shown.
sents[2], sents[2][0], sents[2][0].head
sents[2], sents[2][9], sents[2][9].head

(In order to survive in the competition, companies continue to improve their products and service, and as a result, the whole society prospers.,
 continue,
 continue)

In [26]:
# Print every token's arc, sentence by sentence, with a blank line after each
# sentence
for sent in doc.sents:
    for token in sent:
        head = token.head
        arc = "{0}/{1} <--{2}-- {3}/{4}".format(token.text, token.tag_, token.dep_, head.text, head.tag_)
        print(arc)
    print()

Should/MD <--aux-- taught/VBN
students/NNS <--nsubjpass-- taught/VBN
be/VB <--auxpass-- taught/VBN
taught/VBN <--ROOT-- taught/VBN
to/TO <--aux-- compete/VB
compete/VB <--xcomp-- taught/VBN
or/CC <--cc-- compete/VB
to/TO <--aux-- cooperate/VB
cooperate/VB <--conj-- compete/VB
?/. <--punct-- taught/VBN


/_SP <---- ?/.

It/PRP <--nsubjpass-- said/VBN
is/VBZ <--auxpass-- said/VBN
always/RB <--advmod-- said/VBN
said/VBN <--ROOT-- said/VBN
that/IN <--mark-- promote/VB
competition/NN <--nsubj-- promote/VB
can/MD <--aux-- promote/VB
effectively/RB <--advmod-- promote/VB
promote/VB <--ccomp-- said/VBN
the/DT <--det-- development/NN
development/NN <--dobj-- promote/VB
of/IN <--prep-- development/NN
economy/NN <--pobj-- of/IN
./. <--punct-- said/VBN

In/IN <--prep-- continue/VBP
order/NN <--pobj-- In/IN
to/TO <--aux-- survive/VB
survive/VB <--acl-- order/NN
in/IN <--prep-- survive/VB
the/DT <--det-- competition/NN
competition/NN <--pobj-- in/IN
,/, <--punct-- continue/VBP
companies/NNS <--nsubj

In [4]:
#!/usr/bin/env python
# coding: utf8
"""This example shows how to navigate the parse tree including subtrees
attached to a word.

Based on issue #252:
"In the documents and tutorials the main thing I haven't found is
examples on how to break sentences down into small sub thoughts/chunks. The
noun_chunks is handy, but having examples on using the token.head to find small
(near-complete) sentence chunks would be neat. Lets take the example sentence:
"displaCy uses CSS and JavaScript to show you how computers understand language"

This sentence has two main parts (XCOMP & CCOMP) according to the breakdown:
[displaCy] uses CSS and Javascript [to + show]
show you how computers understand [language]

I'm assuming that we can use the token.head to build these groups."

Compatible with: spaCy v2.0.0+
Last tested with: v2.1.0
"""
from __future__ import unicode_literals, print_function

import plac
import spacy


# @plac.annotations(model=("Model to load", "positional", None, str))
def parse(model="en_core_web_sm"):
    """Load `model`, parse one example sentence and print the subtree of every
    token whose dependency label is "xcomp" or "ccomp": first as joined token
    text, then as a span together with the span's root."""
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)

    sentence = (
        "displaCy uses CSS and JavaScript to show you how computers "
        "understand language"
    )
    doc = nlp(sentence)

    # Heads of the clausal complements; both passes below walk the same tokens.
    clause_heads = [word for word in doc if word.dep_ in ("xcomp", "ccomp")]

    # The easiest way is to take the head of the wanted subtree and use the
    # `.subtree`, `.children`, `.lefts` and `.rights` iterators; `.subtree`
    # is the most direct one.
    for head in clause_heads:
        print("".join(w.text_with_ws for w in head.subtree))

    # `.subtree` is a generator over tokens. When a `Span` is wanted instead
    # (for its vector, merging, etc.), slice the doc between the head's
    # `.left_edge` and `.right_edge`.
    for head in clause_heads:
        first_index = head.left_edge.i
        last_index = head.right_edge.i
        subtree_span = doc[first_index : last_index + 1]
        print(subtree_span.text, "|", subtree_span.root.text)

    # Another option: select a head, walk along its children to pick start and
    # end tokens, and compute a span from their `.left_edge` / `.right_edge`.

    # Expected output:
    # to show you how computers understand language
    # how computers understand language
    # to show you how computers understand language | show
    # how computers understand language | understand
