In [11]:
import spacy
from spacy.matcher import Matcher
from zipfile import ZipFile
from pathlib import Path
from tqdm import autonotebook as tqdm
import os
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from spacy.tokens import Doc
from spacy.matcher import Matcher
import re
from collections import Counter
from collections.abc import Iterable
from spacy.training import biluo_tags_to_spans
from itertools import combinations
import iobes


In [7]:
# Shared spaCy pipeline used by every cell below.
nlp = spacy.load("en_core_web_sm")
data_dir = Path("./data/teaching-dataset")

# One relation looks like "((s1,e1),(s2,e2))"; a line may hold any number of them.
_RELATION_RE = re.compile(r"\(\((\d+),(\d+)\),\((\d+),(\d+)\)\)")


def _read_zip_member(archive_path, member):
    """Return the UTF-8 decoded content of ``member`` stored in the zip at ``archive_path``.

    The archive is opened with a context manager so that it is always closed,
    even when reading or decoding fails.
    """
    with ZipFile(archive_path) as archive:
        return archive.read(member).decode("utf-8")


def _parse_relations(line):
    """Return every ``((s1, e1), (s2, e2))`` integer pair found in ``line``.

    Lines without a match yield an empty list.
    """
    return [
        (
            (int(match.group(1)), int(match.group(2))),
            (int(match.group(3)), int(match.group(4))),
        )
        for match in _RELATION_RE.finditer(line)
    ]


# Sentences are separated by a blank line, with one token per line inside a sentence.
sentences = [
    sentence.split("\n")
    for sentence in _read_zip_member(
        data_dir / "relation_extraction_text_train.zip", "input.txt"
    ).split("\n\n")
]

# One line of references per sentence.
labels = [
    _parse_relations(line)
    for line in _read_zip_member(
        data_dir / "relation_extraction_references_train.zip", "references.txt"
    ).split("\n")
]

assert len(sentences) == len(labels), (
    f"{len(sentences)} sentences but {len(labels)} label lines"
)

# Sanity check: run the pipeline on the first sentence, keeping its given tokenisation.
doc = nlp(Doc(nlp.vocab, words=sentences[0]))
doc

The longest serving spacecraft goes into retirement . 

In [8]:
# Hold out 20% of the labelled sentences; the fixed seed keeps the split reproducible.
(train_sentences, test_sentences, train_labels, test_labels) = train_test_split(
    sentences,
    labels,
    test_size=0.2,
    random_state=42,
)
# Size of the training split, then size of the held-out split.
print(len(train_sentences), len(test_sentences), sep="\n")

748
188


In [45]:
class is_causal_predictor:
    """Cue-word detector that decides whether a sentence is causal.

    Words from causal sentences are ranked by
    ``count_in_causal / (count_in_non_causal + 1)``; the lemmata of the
    top-ranked words become the causal cues. A sentence is predicted causal
    when at least one of its tokens has a cue as its lemma.

    Attributes:
        sentences, labels: the data passed to the constructor.
        n: number of cues requested (chosen automatically when ``n`` is None).
        words / nonCausalWords: candidate words collected from causal /
            non-causal sentences by ``init_words``.
        causal_cues: list of cue lemmata currently used for prediction.
        f1: F1 score of the selected ``n`` (only set by
            ``init_causal_cues_best_n``).
    """

    def __init__(self, sentences, labels, n=None):
        """Collect cue statistics and select the cues.

        Args:
            sentences: sentences as strings or lists of tokens.
            labels: one list of relations per sentence; ``[]`` marks a
                non-causal sentence.
            n: number of cues to keep. When None, ``n`` is tuned on an
                internal hold-out split.
        """
        self.sentences = sentences
        self.labels = labels
        self.n = n

        if n is not None:
            self.init_words(sentences, labels)
            self.causal_cues = self.get_causal_cues(self.n)
        else:
            self.init_causal_cues_best_n(sentences, labels)

    @staticmethod
    def _as_text(sentence):
        """Return ``sentence`` as a string, joining token lists with spaces."""
        if isinstance(sentence, list):
            return ' '.join(sentence)
        return sentence

    @staticmethod
    def _f1_score(predictions, gold):
        """Return the F1 score of ``predictions`` against ``gold``.

        Both arguments are sequences of truthy/falsy values. Returns 0.0
        instead of dividing by zero when precision, recall or their sum is
        undefined.
        """
        tp = fp = fn = 0
        for predicted, actual in zip(predictions, gold):
            if predicted and actual:
                tp += 1
            elif predicted:
                fp += 1
            elif actual:
                fn += 1
        if tp == 0:
            # precision * recall is 0 (or undefined); report the worst score.
            return 0.0
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        return 2 * precision * recall / (precision + recall)

    def init_words(self, sentences, labels):
        """Fill ``self.words`` and ``self.nonCausalWords`` from labelled sentences.

        Stop words, punctuation, nouns and adjectives are ignored; every other
        token text is added to the list matching the sentence's class.
        """
        self.words = []
        self.nonCausalWords = []

        for label, sentence in zip(labels, sentences):
            doc = nlp(self._as_text(sentence))
            candidates = [
                token.text
                for token in doc
                if not token.is_stop
                and not token.is_punct
                and token.pos_ not in ("NOUN", "ADJ")
            ]
            # A non-empty relation list marks the sentence as causal.
            bucket = self.words if label != [] else self.nonCausalWords
            bucket.extend(candidates)

    def get_causal_cues(self, n):
        """Return the lemmata of the ``n`` best-ranked causal words.

        Args:
            n: maximum number of distinct lemmata to return. ``None`` means
                no limit; values <= 0 yield an empty list.

        Returns:
            List of lemma strings, best first. Shorter than ``n`` when fewer
            distinct lemmata exist, and empty when no words were collected.
        """
        non_causal_freq = Counter(self.nonCausalWords)
        scores = Counter(self.words)
        for word in scores:
            # +1 avoids division by zero for words never seen in non-causal text.
            scores[word] = scores[word] / (non_causal_freq[word] + 1)

        lemmata = []
        for word, _ in scores.most_common():
            if n is not None and len(lemmata) >= n:
                break
            lemma = " ".join(token.lemma_ for token in nlp(word))
            if lemma not in lemmata:
                lemmata.append(lemma)
        return lemmata

    def _cue_matcher(self):
        """Return a Matcher for the current ``causal_cues``.

        The matcher is rebuilt only when the cue list has changed.
        """
        cue_key = tuple(self.causal_cues)
        if getattr(self, "_matcher_key", None) != cue_key:
            matcher = Matcher(nlp.vocab)
            # NOTE(review): each cue is matched as a single-token lemma, so a
            # cue containing a space can presumably never match - confirm.
            matcher.add("CAUSAL", [[{"LEMMA": cue}] for cue in cue_key])
            self._matcher = matcher
            self._matcher_key = cue_key
        return self._matcher

    def predict_causality(self, sentence):
        """Return True when ``sentence`` contains at least one causal cue.

        Args:
            sentence: a string or a list of tokens.
        """
        if not self.causal_cues:
            # Without cues nothing can match.
            return False
        doc = nlp(self._as_text(sentence))
        return bool(self._cue_matcher()(doc))

    def init_causal_cues_best_n(self, sentences, labels, step_size=5):
        """Tune ``n`` on a hold-out split and initialise the cues with it.

        ``n`` grows in steps of ``step_size`` for as long as the hold-out F1
        does not decrease; the last ``n`` before the first decrease is kept.
        The search also stops once every available lemma is already in use,
        because larger values of ``n`` can no longer change the cues.

        Sets ``self.f1``, ``self.n`` and ``self.causal_cues``.

        Raises:
            ValueError: if ``step_size`` is smaller than 1.
        """
        if step_size < 1:
            raise ValueError("step_size must be at least 1")

        train_sentences, test_sentences, train_labels, test_labels = train_test_split(
            sentences, labels, test_size=0.2, random_state=42
        )
        self.init_words(train_sentences, train_labels)

        gold = [1 if label != [] else 0 for label in test_labels]

        best_f1 = 0
        best_n = 0
        n = 0
        while True:
            n += step_size
            self.causal_cues = self.get_causal_cues(n)
            predictions = [self.predict_causality(sentence) for sentence in test_sentences]
            f1 = self._f1_score(predictions, gold)

            if f1 < best_f1:
                # First decrease: keep the previous n.
                break
            best_f1, best_n = f1, n
            if len(self.causal_cues) < n:
                # All lemmata are in use; a larger n yields identical cues.
                break

        self.f1 = best_f1
        self.n = best_n
        self.causal_cues = self.get_causal_cues(self.n)


In [47]:
class relation_classificator:
    """Rule-based extractor of directed relations between event spans.

    Events are the entity spans of a parsed sentence. Each prediction is a
    pair ``((start, end), (start, end))`` of span boundaries, taken from
    ``Span.start`` / ``Span.end`` (``end`` is exclusive).
    """

    def __init__(self):
        pass

    @staticmethod
    def _bounds(event):
        """Return the ``(start, end)`` token boundaries of ``event``."""
        return (event.start, event.end)

    @staticmethod
    def _next_text(doc, token):
        """Return the text of the token following ``token``, or None at the end of ``doc``."""
        following = token.i + 1
        return doc[following].text if following < len(doc) else None

    def parse_sentence(self, sentence, spans):
        """Parse ``sentence`` and install the events described by ``spans``.

        Args:
            sentence: iterable of token strings.
            spans: iterable of BIO tags, one per token; converted to BILOU
                with ``iobes.bio_to_bilou`` before being turned into spans.

        Returns:
            The processed Doc whose ``ents`` are the tagged events.
        """
        # zip() stops at the shorter input, which keeps words and tags aligned.
        pairs = list(zip(sentence, spans))
        words = [word for word, _ in pairs]
        tags = [tag for _, tag in pairs]
        doc = nlp(Doc(nlp.vocab, words=words))
        doc.ents = biluo_tags_to_spans(doc, iobes.bio_to_bilou(tags))
        return doc

    def token_in_between_events(self, token, outer1, outer2):
        """Return True when ``token`` lies between the two events, in either order.

        The position may coincide with the start of the later event.
        """
        return (outer2.end <= token.i <= outer1.start) or \
               (outer1.end <= token.i <= outer2.start)

    def inside_event(self, event, token):
        """Return True when ``token`` is one of the tokens of ``event``.

        ``event.end`` is exclusive, so the token directly after the event is
        not part of it.
        """
        return event.start <= token.i < event.end

    def events_inside_subtree(self, verb, event1, event2):
        """Return True when the subtree of ``verb`` touches both events."""
        subtree = list(verb.subtree)
        return any(self.inside_event(event1, sub) for sub in subtree) and \
               any(self.inside_event(event2, sub) for sub in subtree)

    def backwards(self, verb, doc):
        """Decide whether ``verb`` expresses its relation in reverse order.

        The relation counts as reversed when the verb has an ``agent`` child,
        a child with the text "from", or is directly followed by an auxiliary.

        Returns:
            ``(backwards, keyword)``: the flag and the token that triggered
            it (None when the relation is not reversed).
        """
        keyword = None
        backwards = False
        for child in verb.children:
            if child.dep_ == "agent" or child.text == "from":
                backwards = True
                keyword = child
        # A verb at the very end of the sentence has no following token.
        if verb.i + 1 < len(doc):
            next_word = doc[verb.i + 1]
            if next_word.dep_ == "aux":
                backwards = True
                keyword = next_word
        return backwards, keyword

    def get_next_event(self, doc, position):
        """Return the event closest to ``position`` that lies strictly outside it.

        Distance is measured from the event start; on ties the later event in
        ``doc.ents`` wins. Returns None when no event qualifies.
        """
        next_event = None
        lowest_distance = float('inf')
        for event in doc.ents:
            outside = position > event.end or position < event.start
            distance = abs(event.start - position)
            if outside and distance <= lowest_distance:
                next_event = event
                lowest_distance = distance
        return next_event

    def _cue_relations(self, predictions, doc, token, reverse):
        """Append the relations triggered by the cue ``token`` to ``predictions``.

        Every event pair enclosing the cue is added (swapped when ``reverse``
        is True). If ``predictions`` is still empty afterwards, the nearest
        event to the cue and the nearest event to that one are paired instead;
        nothing is added when either of them is missing.
        """
        for event1, event2 in combinations(doc.ents, 2):
            if self.token_in_between_events(token, event1, event2):
                first, second = (event2, event1) if reverse else (event1, event2)
                predictions.append((self._bounds(first), self._bounds(second)))
        if len(predictions) == 0:
            effect = self.get_next_event(doc, token.i)
            cause = self.get_next_event(doc, effect.end) if effect is not None else None
            if cause is not None:
                predictions.append((self._bounds(cause), self._bounds(effect)))

    def handle_cause_of(self, predictions, doc):
        """Add relations signalled by "cause of" / "cause for".

        ``predictions`` is extended in place and returned.
        """
        for token in doc:
            if token.text == "cause" and self._next_text(doc, token) in ("of", "for"):
                self._cue_relations(predictions, doc, token, reverse=False)
        return predictions

    def handle_because(self, predictions, doc):
        """Add relations signalled by "because", "due" or "common with".

        Event pairs enclosing the cue are added in reversed order.
        ``predictions`` is extended in place and returned.
        """
        for token in doc:
            is_cue = token.text in ("because", "due") or \
                (token.text == "common" and self._next_text(doc, token) == "with")
            if is_cue:
                self._cue_relations(predictions, doc, token, reverse=True)
        return predictions

    def _verb_relations(self, predictions, event_pairs, directed_verbs, require_subtree):
        """Append relations for event pairs that are separated by a verb.

        Args:
            predictions: list extended in place.
            event_pairs: candidate ``(event1, event2)`` pairs.
            directed_verbs: ``(verb, is_backwards, keyword)`` triples.
            require_subtree: when True, both events must also be touched by
                the verb's subtree.
        """
        for event1, event2 in event_pairs:
            for verb, is_backwards, keyword in directed_verbs:
                if require_subtree and not self.events_inside_subtree(verb, event1, event2):
                    continue
                # For reversed verbs the triggering keyword must separate the events.
                anchor = keyword if is_backwards else verb
                if not self.token_in_between_events(anchor, event1, event2):
                    continue
                if is_backwards:
                    predictions.append((self._bounds(event2), self._bounds(event1)))
                else:
                    predictions.append((self._bounds(event1), self._bounds(event2)))

    def predict(self, sentence, spans):
        """Predict the set of directed relations between the events of a sentence.

        Args:
            sentence: iterable of token strings.
            spans: iterable of BIO tags marking the events.

        Returns:
            Set of ``((start, end), (start, end))`` pairs. When no rule
            fires, every pair of events is returned in document order.
        """
        doc = self.parse_sentence(sentence, spans)
        events = doc.ents
        predictions = []

        predictions = self.handle_cause_of(predictions, doc)
        predictions = self.handle_because(predictions, doc)

        # Verbs that are not part of an event. Built as a new list: removing
        # items from a list while iterating over it skips elements.
        verbs = [
            token
            for token in doc
            if token.pos_ == "VERB"
            and not any(event.start <= token.i < event.end for event in events)
        ]
        # Direction depends only on the verb, so determine it once per verb.
        directed_verbs = [(verb,) + tuple(self.backwards(verb, doc)) for verb in verbs]
        event_pairs = list(combinations(events, 2))

        self._verb_relations(predictions, event_pairs, directed_verbs, require_subtree=True)
        # NOTE(review): the original guarded this second pass with a
        # commented-out "if len(predictions) == 0:". It stays unconditional
        # here, so its results are a superset of the pass above.
        self._verb_relations(predictions, event_pairs, directed_verbs, require_subtree=False)

        if len(predictions) == 0:
            for event1, event2 in event_pairs:
                predictions.append((self._bounds(event1), self._bounds(event2)))

        return set(predictions)

In [30]:
class pipeline:
    """End-to-end relation extraction: causality check, span extraction, relation classification.

    Attributes:
        train_sentences, train_labels: the training data passed to the constructor.
        is_causal_predictor: cue-based causality detector fitted on the training data.
        span_extractor: component that produces the event tags for a sentence,
            or None when none has been configured yet.
        relation_classificator: rule-based relation classifier.
    """

    def __init__(self, train_sentences, train_labels, span_extractor=None):
        """Fit the causality detector and set up the remaining stages.

        Args:
            train_sentences: training sentences.
            train_labels: one list of relations per training sentence.
            span_extractor: optional object with a ``predict(sentence)``
                method; it can also be assigned to ``self.span_extractor``
                later.
        """
        self.train_sentences = train_sentences
        self.train_labels = train_labels

        self.is_causal_predictor = is_causal_predictor(train_sentences, train_labels)
        self.span_extractor = span_extractor
        self.relation_classificator = relation_classificator()

    def predict(self, sentence):
        """Return the predicted relations for ``sentence``.

        Returns:
            ``[]`` when the sentence is not predicted to be causal, otherwise
            the result of ``relation_classificator.predict``.

        Raises:
            RuntimeError: if the sentence is causal but no span extractor
                has been configured.
        """
        if not self.is_causal_predictor.predict_causality(sentence):
            return []

        if self.span_extractor is None:
            raise RuntimeError(
                "pipeline.span_extractor is not set; assign a span extractor "
                "before predicting causal sentences"
            )
        # NOTE(review): the extractor is assumed to take the sentence and
        # return one tag per token, as relation_classificator.predict
        # expects - confirm once an extractor exists.
        spans = self.span_extractor.predict(sentence)
        return self.relation_classificator.predict(sentence, spans)

In [46]:
# Fit the pipeline on the training split; constructing it selects the causal cues.
pipe = pipeline(train_sentences=train_sentences, train_labels=train_labels)
# Show the selected cue lemmata as the cell output.
pipe.is_causal_predictor.causal_cues

['associate',
 'lead',
 'cause',
 'include',
 'increase',
 'agep',
 'hht',
 'result',
 'induce',
 'commonly',
 'relate',
 'occur',
 'trigger',
 'Haiti',
 '1',
 'uroporphyrinogen',
 'approximately',
 'develop',
 'lobe',
 'relatively',
 'particularly',
 '10',
 'underlie',
 'call',
 'AVMs']