# 1. RST parser playground

This RST parser trains classifiers on RST trees from the RST treebank. Here you can provide your own custom sentences to see what trees are generated. If you wish to see how it is implemented, scroll down until you reach the "RST implementation" part.

The major part of the implementation is located in the "rst.py" file. It contains the definition of the RSTTree class with algorithms for processing lisp expressions and maintaining cumulative data. The file also contains the function "create_tree", which creates a new RSTTree given a list of sentences and trained neural models for prediction.

In [1]:
from rst import RSTTree

### Load POS and relation dictionaries

In [2]:
import pickle

# Relation label <-> class index mappings, stored together as one 2-tuple.
with open('data/parser/relation_dicts.pickle', 'rb') as handle:
    relation2idx, idx2relation = pickle.load(handle)

# POS tag -> index mapping; get_sentence_vector uses it for its one-hot lookup.
with open('data/parser/pos2idx.pickle', 'rb') as handle:
    pos2idx = pickle.load(handle)

### Load word embeddings

In [3]:
import gensim
import logging
# Only CRITICAL records are emitted from here on (presumably to keep gensim's
# log messages out of the notebook output).
logging.basicConfig(level=logging.CRITICAL)

In [4]:
# Dimensionality of the word vectors; the same value is passed as size= when
# the Word2Vec model is trained in section 2.2.
EMBED_SIZE = 100
# Reload the Word2Vec model that section 2.2 saves to this path.
embed_model = gensim.models.Word2Vec.load("data/parser/word2vec")

In [5]:
import numpy as np

def get_sentence_embedding(sentence, embed_model):
    """Average the word vectors of the in-vocabulary tokens of a sentence.

    Parameters
    ----------
    sentence : iterable of str
        Tokens of the sentence. Tokens that are not in the model's
        vocabulary are ignored.
    embed_model : Word2Vec-like object
        Must expose ``wv`` supporting ``wv[word]`` lookup and membership
        tests on ``wv.vocab``.

    Returns
    -------
    numpy.ndarray or None
        ``float64`` mean of the matched word vectors, or ``None`` when no
        token of the sentence is in the vocabulary.
    """
    # Use ``wv`` for the lookup as well as for the membership test: indexing
    # the model object directly is deprecated in gensim.
    keyed_vectors = embed_model.wv
    embeddings = [keyed_vectors[word] for word in sentence
                  if word in keyed_vectors.vocab]
    if not embeddings:
        return None
    # Accumulate in float64 whatever dtype the stored vectors have. The
    # accumulator takes its width from the vectors themselves rather than
    # from a global constant, so models of any size work.
    word_sum = np.zeros(embeddings[0].shape, dtype='float64')
    for vector in embeddings:
        word_sum += vector
    return word_sum / len(embeddings)

In [6]:
import spacy
from spacy.attrs import POS
# spaCy English pipeline; get_sentence_vector uses it for POS tags and
# dependency heads.
nlp = spacy.load("en")

def get_sentence_vector(sentence, embed_model):
    """Build the feature vector of one sentence given as a sequence of tokens.

    The result is the concatenation of the token count, a one-hot encoding
    of the part-of-speech tag of the sentence's syntactic root, and the
    averaged word embedding. Returns ``None`` when
    ``get_sentence_embedding`` yields no embedding for the sentence.
    """
    averaged = get_sentence_embedding(sentence, embed_model)
    if averaged is None:
        return None

    parsed = nlp(" ".join(sentence))
    # A root token is its own head; if there are several, the first is used.
    roots = [tok for tok in parsed if tok.head == tok]
    root_pos_index = pos2idx[roots[0].pos_]

    # The one-hot width is ``POS``, the attribute id imported from
    # ``spacy.attrs``, not the number of entries in ``pos2idx``.
    one_hot = (np.arange(POS) == root_pos_index).astype(np.float64)
    return np.r_[len(sentence), one_hot, averaged]

In [7]:
def get_vector(lhs, rhs, embed_model):
    """Return the concatenated sentence vectors of two tree nodes.

    ``lhs`` and ``rhs`` must expose a ``text`` attribute. ``None`` is
    returned when either node has no text, or when ``get_sentence_vector``
    cannot produce a vector for one of them. The right-hand side is only
    vectorised once the left-hand side has succeeded.
    """
    if lhs.text is None or rhs.text is None:
        return None

    halves = []
    for node in (lhs, rhs):
        node_vector = get_sentence_vector(node.text, embed_model)
        if node_vector is None:
            return None
        halves.append(node_vector)

    left_half, right_half = halves
    return np.r_[left_half, right_half]

### Load train and test sets

In [8]:
# Connection data as written in section 2.4: each pickle holds an (X, Y) tuple.
with open('data/parser/connection_train_set.pickle', 'rb') as handle:
    (conn_train_X, conn_train_Y) = pickle.load(handle)

with open('data/parser/connection_test_set.pickle', 'rb') as handle:
    (conn_test_X, conn_test_Y) = pickle.load(handle)

In [9]:
# Relation data as written in section 2.5: each pickle holds an (X, Y) tuple.
with open('data/parser/relation_train_set.pickle', 'rb') as handle:
    (rel_train_X, rel_train_Y) = pickle.load(handle)

with open('data/parser/relation_test_set.pickle', 'rb') as handle:
    (rel_test_X, rel_test_Y) = pickle.load(handle)

In [10]:
# Nuclearity data as written in section 2.6: each pickle holds an (X, Y) tuple.
with open('data/parser/nuclearity_train_set.pickle', 'rb') as handle:
    (nuc_train_X, nuc_train_Y) = pickle.load(handle)

with open('data/parser/nuclearity_test_set.pickle', 'rb') as handle:
    (nuc_test_X, nuc_test_Y) = pickle.load(handle)

### Load NN models

In [11]:
from keras.models import load_model

Using TensorFlow backend.


In [12]:
# Restore the three Keras classifiers that the training cells of section 2
# save to these paths.
connection_model = load_model("data/parser/connection_model.h5")
relation_model = load_model("data/parser/relation_model.h5")
nuclearity_model = load_model("data/parser/nuclearity_model.h5")

### Playground

In [13]:
from rst import create_tree

In [14]:
# Input text units, in document order.
sent1 = "Spencer J. Volk, president and chief operating officer of this consumer and industrial products company, was elected a director."
sent2 = "Mr. Volk, 55 years old, succeeds Duncan Dwight,"
sent3 = "who retired in September."
sentence_list = [sent1, sent2, sent3]
# The lambda binds embed_model, so create_tree only has to supply (lhs, rhs)
# to obtain a pair vector.
result = create_tree(
    sentence_list,
    lambda lhs, rhs: get_vector(lhs, rhs, embed_model),
    idx2relation,
    connection_model,
    relation_model,
    nuclearity_model)
# Print the predicted tree as a lisp-style expression.
print(result.output_lisp())

  after removing the cwd from sys.path.


( root (span 0 2) 
  ( nucleus (leaf 0) (rel2par span) (text spencer j . volk , president and chief operating officer of this consumer and industrial products company , was elected a director .) )
  ( satellite (span 1 2) (rel2par elaboration-additional)   
    ( nucleus (leaf 1) (rel2par span) (text mr . volk , 55 years old , succeeds duncan dwight ,) )  
    ( satellite (leaf 2) (rel2par elaboration-additional-e) (text who retired in september .) )  
)
)


# 2. RST implementation

In [1]:
from rst import RSTTree

## 2.1 Read the corpus into the list of RSTTree structures

We need to get a list of trees to navigate the sentences and to later generate them from NN predictions

In [2]:
import os

class CorpusReader:
    """Lazily load RST trees from the TRAINING and TEST parts of a corpus.

    ``rst_root`` is the directory that contains the ``TRAINING`` and
    ``TEST`` sub-directories.
    """

    def __init__(self, rst_root):
        self.rst_root = rst_root

    def load_test_trees(self):
        """Return a generator over the trees stored under ``TEST``."""
        return self._load_trees("TEST")

    def load_train_trees(self):
        """Return a generator over the trees stored under ``TRAINING``."""
        return self._load_trees("TRAINING")

    def _load_trees(self, dirsuffix):
        """Yield one tree per ``*lisp.name`` file found one level below
        ``rst_root/dirsuffix``.

        Each file is parsed with ``RSTTree.from_sexp`` and then has
        ``construct_text()`` called on it. A file whose parsing raises
        ``AssertionError`` is reported on stdout and skipped.
        """
        root_with_suffix = os.path.join(self.rst_root, dirsuffix)
        for dirname in os.listdir(root_with_suffix):
            dirname = os.path.join(root_with_suffix, dirname)
            if not os.path.isdir(dirname):
                continue
            for filename in os.listdir(dirname):
                filename = os.path.join(dirname, filename)
                if not (os.path.isfile(filename) and filename.endswith("lisp.name")):
                    continue
                # Read and close the file before parsing and yielding, so no
                # handle stays open while the consumer works on the tree.
                with open(filename, encoding="utf-8") as file:
                    contents = file.read()
                try:
                    tree = RSTTree.from_sexp(contents, filename)
                    tree.construct_text()
                except AssertionError:
                    print("Error in ", filename)
                    continue
                yield tree

In [3]:
# The corpus location is machine-specific: take it from the RSTDT_ROOT
# environment variable when set, otherwise fall back to the original path.
corpus_reader = CorpusReader(
    os.environ.get("RSTDT_ROOT", "/Users/vpraid/Downloads/RSTDT/data/RSTtrees-WSJ-main-1.0"))

## 2.2 Use gensim to transform sentences into word vectors

### Create EDU reader

In [4]:
class EduReader:
    """Re-iterable stream of every EDU found in the training trees.

    Each call to ``iter()`` asks the wrapped reader for its training trees
    again, so the object can be traversed more than once.
    """

    def __init__(self, reader):
        self.reader = reader

    def __iter__(self):
        training_trees = self.reader.load_train_trees()
        for training_tree in training_trees:
            yield from training_tree.all_edus()

# Iterable over the training EDUs; every new iteration asks corpus_reader for
# the training trees again.
edu_reader = EduReader(corpus_reader)

### Load gensim and create a model

In [5]:
import gensim
import logging
# Only CRITICAL records are emitted from here on (presumably to keep gensim's
# training log out of the notebook output).
logging.basicConfig(level=logging.CRITICAL)

In [6]:
# Dimensionality of the word vectors.
EMBED_SIZE = 100
# Train Word2Vec on the EDU stream; words seen fewer than 2 times are dropped
# (min_count=2).
embed_model = gensim.models.Word2Vec(edu_reader, size=EMBED_SIZE, min_count=2, window=5, iter=100)
# Persist the model; the playground in section 1 loads it from this path.
embed_model.save("data/parser/word2vec")

In [7]:
def check_similar(model):
    """Print the embedding matrix shape and neighbours of a few probe words.

    For each of the words ``money``, ``bank`` and ``company``, the four
    most similar vocabulary entries are printed with their similarity.

    Parameters
    ----------
    model : Word2Vec-like object
        Must expose ``wv.syn0`` (the embedding matrix) and
        ``wv.most_similar(word)`` returning ``(word, similarity)`` pairs.
    """
    keyed_vectors = model.wv
    print('Result embedding shape:', keyed_vectors.syn0.shape)
    print('Checking similar words:')
    for word in ['money', 'bank', 'company']:
        # Query through ``wv``: calling ``most_similar`` on the model object
        # itself is deprecated in gensim.
        neighbours = keyed_vectors.most_similar(word)[:4]
        most_similar = ', '.join('%s (%.2f)' % (similar, dist) for similar, dist in neighbours)
        print('  %s -> %s' % (word, most_similar))

# Sanity check: print the embedding shape and a few nearest-neighbour lists.
check_similar(embed_model)

Result embedding shape: (6938, 100)
Checking similar words:
  money -> so (0.43), manville (0.37), liability (0.37), potential (0.37)
  bank -> montreal (0.50), assurance (0.46), imperial (0.44), california (0.38)
  company -> transaction (0.40), group (0.35), acquisition (0.33), saatchi (0.33)


  
  import sys


In [8]:
def get_sentence_embedding(sentence, embed_model):
    """Average the word vectors of the in-vocabulary tokens of a sentence.

    Parameters
    ----------
    sentence : iterable of str
        Tokens of the sentence. Tokens that are not in the model's
        vocabulary are ignored.
    embed_model : Word2Vec-like object
        Must expose ``wv`` supporting ``wv[word]`` lookup and membership
        tests on ``wv.vocab``.

    Returns
    -------
    numpy.ndarray or None
        ``float64`` mean of the matched word vectors, or ``None`` when no
        token of the sentence is in the vocabulary.
    """
    # Use ``wv`` for the lookup as well as for the membership test: indexing
    # the model object directly is deprecated in gensim.
    keyed_vectors = embed_model.wv
    embeddings = [keyed_vectors[word] for word in sentence
                  if word in keyed_vectors.vocab]
    if not embeddings:
        return None
    # Accumulate in float64 whatever dtype the stored vectors have. The
    # accumulator takes its width from the vectors themselves rather than
    # from a global constant, so models of any size work.
    word_sum = np.zeros(embeddings[0].shape, dtype='float64')
    for vector in embeddings:
        word_sum += vector
    return word_sum / len(embeddings)

## 2.3 Get POS tags from spaCy

In [9]:
import spacy
from spacy.attrs import POS

# spaCy English pipeline; it supplies the POS tags collected below and the
# dependency heads used by get_sentence_vector.
nlp = spacy.load("en")

In [10]:
# POS tag -> index, assigned in first-seen order by fill_pos_tags().
pos2idx = {}


def fill_pos_tags():
    """Register every POS tag spaCy assigns to a token of the training EDUs.

    Each EDU from ``edu_reader`` is joined with spaces and run through
    ``nlp``. A tag not yet in the module-level ``pos2idx`` receives the next
    free index (the current size of the dictionary).
    """
    for edu in edu_reader:
        for token in nlp(" ".join(edu)):
            # setdefault only stores the new index when the tag is unseen.
            pos2idx.setdefault(token.pos_, len(pos2idx))

In [12]:
import pickle

# Populate pos2idx from the training EDUs, then persist it; the playground in
# section 1 loads it from this path.
fill_pos_tags()
with open('data/parser/pos2idx.pickle', 'wb') as handle:
    pickle.dump(pos2idx, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [43]:
def get_sentence_vector(sentence, embed_model):
    """Build the feature vector of one sentence given as a sequence of tokens.

    The result is the concatenation of the token count, a one-hot encoding
    of the part-of-speech tag of the sentence's syntactic root, and the
    averaged word embedding. Returns ``None`` when
    ``get_sentence_embedding`` yields no embedding for the sentence.
    """
    averaged = get_sentence_embedding(sentence, embed_model)
    if averaged is None:
        return None

    parsed = nlp(" ".join(sentence))
    # A root token is its own head; if there are several, the first is used.
    roots = [tok for tok in parsed if tok.head == tok]
    root_pos_index = pos2idx[roots[0].pos_]

    # The one-hot width is ``POS``, the attribute id imported from
    # ``spacy.attrs``, not the number of entries in ``pos2idx``.
    one_hot = (np.arange(POS) == root_pos_index).astype(np.float64)
    return np.r_[len(sentence), one_hot, averaged]

## 2.4 Train connection classifier

In [14]:
import tqdm
from tqdm import tqdm_notebook, tnrange

import pickle
import numpy as np

# Disable tqdm's monitor thread. ``monitor_interval`` is an attribute of the
# ``tqdm.tqdm`` class, and ``tqdm`` here is the module (``import tqdm``), so
# the module-level assignment alone has no effect. It is kept only for
# backward compatibility.
tqdm.monitor_interval = 0
tqdm.tqdm.monitor_interval = 0

In [21]:
def get_sentence_vector(sentence, embed_model):
    """Build the feature vector of one sentence given as a sequence of tokens.

    The result is the concatenation of the token count, a one-hot encoding
    of the part-of-speech tag of the sentence's syntactic root, and the
    averaged word embedding. Returns ``None`` when
    ``get_sentence_embedding`` yields no embedding for the sentence.
    """
    averaged = get_sentence_embedding(sentence, embed_model)
    if averaged is None:
        return None

    parsed = nlp(" ".join(sentence))
    # A root token is its own head; if there are several, the first is used.
    roots = [tok for tok in parsed if tok.head == tok]
    root_pos_index = pos2idx[roots[0].pos_]

    # The one-hot width is ``POS``, the attribute id imported from
    # ``spacy.attrs``, not the number of entries in ``pos2idx``.
    one_hot = (np.arange(POS) == root_pos_index).astype(np.float64)
    return np.r_[len(sentence), one_hot, averaged]

In [16]:
def are_connected(lhs, rhs):
    """Tell whether two tree nodes are directly connected under one parent.

    Nodes with different parents are never connected. Children of a
    mononuclear parent are always connected. Under any other parent both
    nodes are asserted to be nuclei, and they are connected only when their
    ``index`` values differ by exactly one.
    """
    shared_parent = lhs.parent
    if shared_parent != rhs.parent:
        return False
    if shared_parent.nuclearity == RSTTree.MONONUCLEAR:
        return True

    assert lhs.type == 'nucleus' and rhs.type == 'nucleus'
    distance = np.abs(lhs.index - rhs.index)
    return distance == 1

In [17]:
from itertools import product
from random import shuffle

def shuffled(x):
    """Return a shuffled slice-copy of ``x``; ``x`` itself is not modified."""
    duplicate = x[:]
    shuffle(duplicate)
    return duplicate

def get_connection_set(trees, max_negative_candidates=31):
    """Build the data set for the connection classifier.

    Positive examples (label 1) are the nucleus/satellite pair of every
    mononuclear subtree and every pair of adjacent nuclei of any other
    subtree. Negative examples (label 0) are drawn per tree from the
    cartesian product of two independently shuffled copies of its subtrees,
    skipping identical and connected pairs.

    Parameters
    ----------
    trees : iterable
        Trees exposing ``all_trees()``.
    max_negative_candidates : int, optional
        Number of leading product positions per tree that may contribute a
        negative example. The default of 31 reproduces the previously
        hard-coded ``i > 30`` cut-off.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Matrix with one pair vector per row, and the matching 0/1 labels.

    Raises
    ------
    ValueError
        If no pair vector could be built at all.
    """
    pairs = []
    labels = []
    for tree in tqdm_notebook(trees):

        subtrees = tree.all_trees()
        for subtree in subtrees:
            if subtree.nuclearity == RSTTree.MONONUCLEAR:
                candidates = [(subtree.nuclei[0], subtree.satellite)]
            else:
                candidates = zip(subtree.nuclei, subtree.nuclei[1:])
            for lhs, rhs in candidates:
                pair = get_vector(lhs, rhs, embed_model)
                if pair is None:
                    continue
                pairs.append(pair)
                labels.append(1)

        # NOTE(review): ``i`` counts every product position, including the
        # skipped ones, and ``product`` varies its second argument fastest.
        # With at least ``max_negative_candidates`` subtrees, all negatives of
        # a tree therefore share the same left element. Kept as is so that
        # the generated data set does not change.
        for i, (left, right) in enumerate(product(shuffled(subtrees), shuffled(subtrees))):
            if left == right or are_connected(left, right):
                continue
            if i >= max_negative_candidates:
                break
            pair = get_vector(left, right, embed_model)
            if pair is None:
                continue
            pairs.append(pair)
            labels.append(0)

    if not pairs:
        # Fail with a clear message instead of an IndexError on pairs[0].
        raise ValueError("no usable sentence pairs were found in the given trees")
    shape = len(pairs), pairs[0].shape[0]
    return np.concatenate(pairs).reshape(shape), np.array(labels)

### Prepare and save training connection data
Do not run this cell unless you are ready to wait 15 minutes for it to finish.

In [18]:
# Build the connection data from the training trees and persist the (X, Y)
# tuple; the playground in section 1 loads it from this path.
conn_train_X, conn_train_Y = get_connection_set(corpus_reader.load_train_trees())
with open('data/parser/connection_train_set.pickle', 'wb') as handle:
    pickle.dump((conn_train_X, conn_train_Y), handle, protocol=pickle.HIGHEST_PROTOCOL)

  





### Prepare and save test connection data

In [19]:
# Same procedure as for the training data, applied to the TEST trees.
conn_test_X, conn_test_Y = get_connection_set(corpus_reader.load_test_trees())
with open('data/parser/connection_test_set.pickle', 'wb') as handle:
    pickle.dump((conn_test_X, conn_test_Y), handle, protocol=pickle.HIGHEST_PROTOCOL)

  





### Build and train the network

In [20]:
import keras
from keras import regularizers
from keras.models import Sequential
from keras.layers import Dense, Dropout

Using TensorFlow backend.


In [21]:
# Binary classifier over pair vectors: three ReLU layers of decreasing width,
# each followed by 50% dropout, then a single sigmoid output unit.
connection_model = Sequential([
    Dense(256, input_dim=conn_train_X.shape[1], activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
connection_model.compile(loss='binary_crossentropy', optimizer="adam", metrics=['accuracy'])

In [22]:
# Train silently (verbose=0) for 15 epochs, then save the model; the
# playground in section 1 loads it from this path.
connection_model.fit(conn_train_X, conn_train_Y, batch_size=128, verbose=0, epochs=15)
connection_model.save("data/parser/connection_model.h5")

### Evaluate the network

In [23]:
# Score on the training data: [loss, accuracy], accuracy being the metric
# requested at compile time.
score = connection_model.evaluate(conn_train_X, conn_train_Y, verbose=0)
print(score)

[0.31997504003903682, 0.8492356352239685]


In [24]:
# Score on the held-out test data: [loss, accuracy].
score = connection_model.evaluate(conn_test_X, conn_test_Y, verbose=0)
print(score)

[0.64221432453101401, 0.76533742331288346]


## 2.5 Train relation classifier

In [25]:
from keras.utils import to_categorical

def get_relation_set(trees, populate_relations=False, relation2idx=None):
    """Build the data set for the relation classifier.

    Every subtree with a relation contributes its nucleus/satellite pair
    (mononuclear) or each pair of adjacent nuclei (otherwise), labelled with
    the class index of the relation.

    Parameters
    ----------
    trees : iterable
        Trees exposing ``all_trees()``.
    populate_relations : bool, optional
        When true, relations missing from ``relation2idx`` are added with
        the next free index. When false, subtrees with such relations are
        skipped.
    relation2idx : dict, optional
        Relation label -> class index. It is extended in place when
        ``populate_relations`` is true. A fresh dictionary is created when
        omitted.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray, dict)
        Matrix with one pair vector per row, one-hot labels with
        ``len(relation2idx)`` columns, and the label dictionary.

    Raises
    ------
    ValueError
        If no pair vector could be built at all.
    """
    # A ``dict()`` default would be created once and shared by every call,
    # leaking labels from one call into the next.
    if relation2idx is None:
        relation2idx = {}

    pairs = []
    labels = []
    for tree in tqdm_notebook(trees):

        subtrees = tree.all_trees()
        for subtree in subtrees:

            if subtree.relation is None:
                continue

            if subtree.relation not in relation2idx:
                if not populate_relations:
                    continue
                relation2idx[subtree.relation] = len(relation2idx)

            if subtree.nuclearity == RSTTree.MONONUCLEAR:
                candidates = [(subtree.nuclei[0], subtree.satellite)]
            else:
                candidates = zip(subtree.nuclei, subtree.nuclei[1:])

            for lhs, rhs in candidates:
                pair = get_vector(lhs, rhs, embed_model)
                if pair is None:
                    continue
                pairs.append(pair)
                labels.append(relation2idx[subtree.relation])

    if not pairs:
        # Fail with a clear message instead of an IndexError on pairs[0].
        raise ValueError("no usable sentence pairs were found in the given trees")
    shape = len(pairs), pairs[0].shape[0]
    return np.concatenate(pairs).reshape(shape), to_categorical(labels, num_classes=len(relation2idx)), relation2idx

### Prepare training relation data
Do not run this cell unless you are ready to wait 10 minutes for it to finish.

In [26]:
# populate_relations=True: the label dictionary is built from the relations
# found in the training trees.
rel_train_X, rel_train_Y, relation2idx = get_relation_set(corpus_reader.load_train_trees(), True)
# Inverse mapping: class index -> relation label.
idx2relation = { index : relation for relation, index in relation2idx.items() }
with open('data/parser/relation_train_set.pickle', 'wb') as handle:
    pickle.dump((rel_train_X, rel_train_Y), handle, protocol=pickle.HIGHEST_PROTOCOL)

  





### Prepare test relation data

In [27]:
# Reuse the training label dictionary unchanged; test subtrees whose relation
# is not in it are skipped by get_relation_set.
rel_test_X, rel_test_Y, _ = get_relation_set(
    corpus_reader.load_test_trees(),
    populate_relations=False,
    relation2idx=relation2idx)
with open('data/parser/relation_test_set.pickle', 'wb') as handle:
    pickle.dump((rel_test_X, rel_test_Y), handle, protocol=pickle.HIGHEST_PROTOCOL)

  





### Save relation dictionaries

In [28]:
# Both directions are stored as one tuple; the playground in section 1
# unpacks them in this order.
with open('data/parser/relation_dicts.pickle', 'wb') as handle:
    pickle.dump((relation2idx, idx2relation), handle, protocol=pickle.HIGHEST_PROTOCOL)

### Build and train the network

In [29]:
# Multi-class classifier over pair vectors: three ReLU layers of decreasing
# width, each followed by 50% dropout, then a softmax with one unit per
# column of rel_train_Y.
relation_model = Sequential([
    Dense(256, input_dim=rel_train_X.shape[1], activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(rel_train_Y.shape[1], activation='softmax'),
])
relation_model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])

In [30]:
# Train silently (verbose=0) for 50 epochs, then save the model; the
# playground in section 1 loads it from this path.
relation_model.fit(rel_train_X, rel_train_Y, batch_size=128, verbose=0, epochs=50)
relation_model.save("data/parser/relation_model.h5")

### Evaluate the network

In [31]:
# Score on the training data: [loss, accuracy], accuracy being the metric
# requested at compile time.
score = relation_model.evaluate(rel_train_X, rel_train_Y, verbose=0)
print(score)

[1.7708597283738221, 0.5371059627846263]


In [32]:
# Score on the held-out test data: [loss, accuracy].
score = relation_model.evaluate(rel_test_X, rel_test_Y, verbose=0)
print(score)

[2.490546958448034, 0.41297366769174465]


## 2.6 Train nuclearity classifier

In [33]:
# Two-element target vectors used as nuclearity labels.
NUCLEUS_L = [1, 0]
NUCLEUS_R = [0, 1]
NUCLEUS_B = [1, 1]


def get_nuclearity_set(trees):
    """Build the data set for the nuclearity classifier.

    A mononuclear subtree contributes its (nucleus, satellite) pair,
    labelled ``NUCLEUS_L`` when the nucleus span ends at or before the start
    of the satellite span and ``NUCLEUS_R`` otherwise. Any other subtree
    contributes each pair of adjacent nuclei, labelled ``NUCLEUS_B``. Pairs
    for which ``get_vector`` returns ``None`` are left out.

    Returns a matrix with one pair vector per row and the array of labels.
    """
    pairs, labels = [], []
    for tree in tqdm_notebook(trees):
        for subtree in tree.all_trees():
            if subtree.nuclearity == RSTTree.MONONUCLEAR:
                nucleus, satellite = subtree.nuclei[0], subtree.satellite
                pair = get_vector(nucleus, satellite, embed_model)
                if pair is None:
                    continue
                pairs.append(pair)
                if nucleus.span[1] <= satellite.span[0]:
                    labels.append(NUCLEUS_L)
                else:
                    labels.append(NUCLEUS_R)
                continue

            neighbours = zip(subtree.nuclei, subtree.nuclei[1:])
            for lhs, rhs in neighbours:
                pair = get_vector(lhs, rhs, embed_model)
                if pair is not None:
                    pairs.append(pair)
                    labels.append(NUCLEUS_B)

    row_count, row_width = len(pairs), pairs[0].shape[0]
    features = np.concatenate(pairs).reshape((row_count, row_width))
    return features, np.array(labels)

### Prepare and save nuclearity training data
Do not run this cell unless you are ready to wait 10 minutes for it to finish.

In [34]:
# Build the nuclearity data from the training trees and persist the (X, Y)
# tuple; the playground in section 1 loads it from this path.
nuc_train_X, nuc_train_Y = get_nuclearity_set(corpus_reader.load_train_trees())
with open('data/parser/nuclearity_train_set.pickle', 'wb') as handle:
    pickle.dump((nuc_train_X, nuc_train_Y), handle, protocol=pickle.HIGHEST_PROTOCOL)

  





### Prepare and save nuclearity test data

In [35]:
# Same procedure as for the training data, applied to the TEST trees.
nuc_test_X, nuc_test_Y = get_nuclearity_set(corpus_reader.load_test_trees())
with open('data/parser/nuclearity_test_set.pickle', 'wb') as handle:
    pickle.dump((nuc_test_X, nuc_test_Y), handle, protocol=pickle.HIGHEST_PROTOCOL)

  





### Build and train the network

In [36]:
# Classifier over pair vectors: three ReLU layers of decreasing width, each
# followed by 50% dropout, then a softmax with one unit per column of
# nuc_train_Y.
# NOTE(review): the targets built by get_nuclearity_set include [1, 1], which
# a softmax output trained with categorical cross-entropy cannot reproduce
# exactly. Confirm this is intended before changing it, since the saved model
# is handed to create_tree.
nuclearity_model = Sequential([
    Dense(256, input_dim=nuc_train_X.shape[1], activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(nuc_train_Y.shape[1], activation='softmax'),
])
nuclearity_model.compile(loss='categorical_crossentropy', optimizer="adam", metrics=['accuracy'])

In [37]:
# Train silently (verbose=0) for 10 epochs, then save the model; the
# playground in section 1 loads it from this path.
nuclearity_model.fit(nuc_train_X, nuc_train_Y, batch_size=128, verbose=0, epochs=10)
nuclearity_model.save("data/parser/nuclearity_model.h5")

### Evaluate the network

In [38]:
# Score on the training data: [loss, accuracy], accuracy being the metric
# requested at compile time.
score = nuclearity_model.evaluate(nuc_train_X, nuc_train_Y, verbose=0)
print(score)

[0.58974824515622326, 0.92464868971208547]


In [39]:
# Score on the held-out test data: [loss, accuracy].
score = nuclearity_model.evaluate(nuc_test_X, nuc_test_Y, verbose=0)
print(score)

[0.63903165138684781, 0.90192307661741211]
