In [74]:
import spacy
from spacy.matcher import Matcher
from zipfile import ZipFile
from pathlib import Path
from tqdm import autonotebook as tqdm
import os
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from spacy.tokens import Doc
import re
from collections import Counter
from collections.abc import Iterable
from spacy.training import biluo_tags_to_spans
from itertools import combinations
import iobes

from zipfile import ZipFile
from pathlib import Path
from seqeval import scheme
from tqdm import autonotebook as tqdm

import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

import sklearn_crfsuite

nlp = spacy.load("en_core_web_sm")


In [75]:
nlp = spacy.load("en_core_web_sm")
data_dir = Path("./data/teaching-dataset")
with (data_dir / "relation_extraction_text_train.zip").open("rb") as file:
    zip_file = ZipFile(file)
    with zip_file.open("input.txt") as f:
        sentences = [
            sentence.split("\n") for sentence in f.read().decode("utf-8").split("\n\n")
        ]
with (data_dir / "relation_extraction_references_train.zip").open("rb") as file:
    zip_file = ZipFile(file)
    with zip_file.open("references.txt") as f:
        labels = []
        for line in f.read().decode("utf-8").split("\n"):
            relations = []
            for relation in re.finditer(r"\(\((\d+),(\d+)\),\((\d+),(\d+)\)\)", line):
                relation = (
                    (int(relation.group(1)), int(relation.group(2))),
                    (int(relation.group(3)), int(relation.group(4))),
                )
                relations.append(relation)
            labels.append(relations)
assert len(sentences) == len(labels)
doc = nlp(Doc(nlp.vocab, words=sentences[0]))
doc

The longest serving spacecraft goes into retirement . 

In [76]:
train_sentences, test_sentences, train_labels, test_labels =  train_test_split(sentences, labels, test_size=0.2, random_state=42)
print(len(train_sentences))
print(len(test_sentences))

748
188


In [77]:
class is_causal_predictor:
    def __init__(self, sentences, labels, n=None):
        self.sentences = sentences
        self.labels = labels
        self.n = n
        
        if n != None:
            self.init_words(sentences, labels)
            self.causal_cues = self.get_causal_cues(self.n)
        else:
            self.init_causal_cues_best_n(sentences, labels)


    def init_words(self, sentences, labels):
        self.words = []
        self.nonCausalWords = []

        for label, sentence in zip(labels, sentences):
            if type(sentence) == list:
                sentence = ' '.join(sentence)
            if label != []: #if sentence is causal
                doc = nlp(sentence)
                wordsHelp = [token.text for token in doc if not token.is_stop and not token.is_punct and token.pos_ != "NOUN" and token.pos_ != "ADJ"] 
                self.words.extend(wordsHelp) #append all words to a list if they are NOT nouns & NOT punctuation & NOT adjectives

            else: #if sentence is NOT causal
                doc = nlp(sentence)
                wordsHelp = [token.text for token in doc if not token.is_stop and not token.is_punct and token.pos_ != "NOUN" and token.pos_ != "ADJ"] 
                self.nonCausalWords.extend(wordsHelp)  
    

    # returns n best causal cues
    def get_causal_cues(self, n):
        def flatten(lis): #pretty ugly solution but we have to flatten the list since every new sentence adds "[]" which Counter can't deal with
            for item in lis:
                if isinstance(item, Iterable) and not isinstance(item, str):
                    for x in flatten(item):
                        yield x
                else:        
                    yield item
        
        def get_n_lemmata(causal_freq, n):
            # sort words
            sorted_words = np.array(causal_freq.most_common(len(causal_freq)))[:,0]
            converted_return = []
            for word in sorted_words:
                # lemmatize
                doc = nlp(str(word))
                word = " ".join([token.lemma_ for token in doc])
                if not word in converted_return:
                    converted_return.append(word)
                # break if n lemmata found
                if len(converted_return)==n:
                    break
            return converted_return

        causal_freq = Counter(self.words)
        nonCausal_freq = Counter(self.nonCausalWords)
        for word in causal_freq:
            causal_freq[word] = causal_freq[word]/(nonCausal_freq[word]+1)
        return get_n_lemmata(causal_freq, n)


    def predict_causality(self, sentence):
        if type(sentence) == list: # convert to str
            sentence = ' '.join(sentence)
        matcher = Matcher(nlp.vocab)
        pattern = [[{"LEMMA": cue}] for cue in self.causal_cues]
        matcher.add("CAUSAL", pattern)
        doc = nlp(sentence)
        matches = matcher(doc)
        return bool(matches)


    # Find best value for n given a testset and initialize causal cues according to best n
    def init_causal_cues_best_n(self, sentences, labels, step_size=5):
        train_sentences, test_sentences, train_labels, test_labels =  train_test_split(sentences, labels, test_size=0.2, random_state=42)
        self.init_words(train_sentences, train_labels)

        new_labels = []
        for label in test_labels:
            if label != []:
                new_labels.append(1)
            else:
                new_labels.append(0)

        old_f1 = 0
        f1 = 0
        n = 0
        while f1 >= old_f1:
            n = n+step_size
            predictions = []

            # predict
            self.causal_cues = self.get_causal_cues(n)
            for sentence, l in zip(test_sentences, test_labels):
                p = self.predict_causality(sentence)
                predictions.append(p)

            # evaluate
            old_f1 = f1
            tp = sum([int(p) == 1 and int(l) == 1 for p, l in zip(predictions, new_labels)])
            fp = sum([int(p) == 1 and int(l) == 0 for p, l in zip(predictions, new_labels)])
            fn = sum([int(p) == 0 and int(l) == 1 for p, l in zip(predictions, new_labels)])
            precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            f1 = 2 * precision * recall / (precision + recall)
            x = 0

        self.f1 = old_f1
        self.n = n - step_size
        self.causal_cues = self.get_causal_cues(self.n)


In [78]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, d, t) for w, p, d, t in zip(s['Word'].values.tolist(), 
                                                           s['POS'].values.tolist(), 
                                                           s['DEP'].values.tolist(),
                                                           s['Tag'].values.tolist())]
        self.grouped = self.data.groupby('Sentence #').apply(agg_func)
        self.sentences = [s for s in self.grouped]
        
    def get_next(self):
        try: 
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent += 1
            return s 
        except:
            return None

In [79]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    deptag = sent[i][2]
    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
        'deptag': deptag,
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        deptag1 = sent[i-1][2]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
            '-1:deptag': deptag1,
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        # print(i)
        deptag1 = sent[i+1][2]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
            '+1:deptag': deptag1,
        })
    else:
        features['EOS'] = True
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]
def sent2labels(sent):
    return [label for token, postag, deptag, label in sent]
def sent2tokens(sent):
    return [token for token, postag, deptag, label in sent]

def flatten(l):
    output = []
    for sublist in l:
        for item in sublist:
            output.append(item)

        output.append('')
    return output

In [80]:
# training ML-model with data from exercise 2
def train_step_2():
    data_dir = Path("./data/teaching-dataset")
    with (data_dir / "span_extraction_text_train.zip").open("rb") as file:
        zip_file = ZipFile(file)
        with zip_file.open("input.txt") as f:
            sentences = [
                sentence.split("\n") for sentence in f.read().decode("utf-8").split("\n\n")
            ]
    with (data_dir / "span_extraction_references_train.zip").open("rb") as file:
        zip_file = ZipFile(file)
        with zip_file.open("references.txt") as f:
            labels = [
                sentence.split("\n") for sentence in f.read().decode("utf-8").split("\n\n")
            ]

    sentence_number=[]
    words=[]
    tags=[]
    dependencies=[]
    events=[]
    for i in range(len(sentences)):
        doc = Doc(nlp.vocab, words=sentences[i])
        doc = nlp(doc)
        iterator = zip(doc, labels[i])
        for token, label in iterator:
            sentence_number.append(f'Sentence: {i}')
            words.append(str(token))
            tags.append(token.pos_)
            dependencies.append(token.dep_)
            events.append(label)

    train_data = {'Sentence #': sentence_number,'Word': words, 'POS': tags, 'DEP': dependencies, 'Tag': events}
    df = pd.DataFrame(data=train_data)
    # print(df)

    df.isnull().sum()

    df = df.fillna(method='ffill')
    df['Sentence #'].nunique(), df.Word.nunique(), df.Tag.nunique();

    df.groupby('Tag').size().reset_index(name='counts');

    X = df.drop('Tag', axis=1)
    v = DictVectorizer(sparse=False)
    X = v.fit_transform(X.to_dict('records'))
    y = df.Tag.values
    classes = np.unique(y)
    classes = classes.tolist()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)
    X_train.shape, y_train.shape;

    getter = SentenceGetter(df)
    sentences = getter.sentences

    X = [sent2features(s) for s in sentences]
    y = [sent2labels(s) for s in sentences]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=0)

    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
    )
    crf.fit(X_train, y_train);

    y_pred = crf.predict(X_test)
    # print(metrics.flat_classification_report(y_pred=y_pred, y_true=y_test, labels = new_classes))
    # sollte eigentlich funktionieren tuts aber nicht
    flat_y_true = flatten(y_test)
    flat_y_pred = flatten(y_pred)
    print(classification_report(flat_y_true, flat_y_pred))
    return(crf)

In [81]:
train_step_2()



              precision    recall  f1-score   support

                   1.00      1.00      1.00         5
     B-EVENT       0.73      0.50      0.59        16
     I-EVENT       0.59      0.59      0.59        32
           O       0.80      0.84      0.82        83

    accuracy                           0.75       136
   macro avg       0.78      0.73      0.75       136
weighted avg       0.75      0.75      0.75       136



AttributeError: 'CRF' object has no attribute 'keep_tempfiles'

AttributeError: 'CRF' object has no attribute 'keep_tempfiles'

AttributeError: 'CRF' object has no attribute 'keep_tempfiles'

In [None]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, d) for w, p, d in zip(s['Word'].values.tolist(), 
                                                           s['POS'].values.tolist(), 
                                                           s['DEP'].values.tolist())]
        self.grouped = self.data.groupby('Sentence #').apply(agg_func)
        

        # print(self.grouped.head(20))
        self.sentences = [s for s in self.grouped]
        
        
    def get_next(self):
        try: 
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent += 1
            return s 
        except:
            return None

class span_extractor:
    def __init__(self):
        pass

    def create_dataframe(self, test_sentences):
        sentence_number=[]
        words=[]
        tags=[]
        dependencies=[]
        events=[]
        for i in range(len(test_sentences)):
            doc = Doc(nlp.vocab, words=test_sentences[i])
            doc = nlp(doc)
            for token in doc:
                sentence_number.append(f'Sentence: {i}')
                words.append(str(token))
                tags.append(token.pos_)
                dependencies.append(token.dep_)

        train_data = {'Sentence #': sentence_number,'Word': words, 'POS': tags, 'DEP': dependencies}
        df = pd.DataFrame(data=train_data)
        # print(df)

    def getSentences(self, df):
        getter = SentenceGetter(df)
        sentences = getter.sentences
        # for sentence in sentences:
        #     print(sentence)

    def predict(self, sentences, crf):
        X = [sent2features(s) for s in sentences]
        y_pred = crf.predict(X)
        # print(y_pred)
        flat_y_pred = flatten(y_pred)

        return flat_y_pred
    


    

In [None]:
class relation_classificator:
    def __init__(self):
        pass

    def parse_sentence(self, sentence, spans):
        words = []
        tags = []
        for word, tag in zip(sentence, spans):
            words.append(word)
            tags.append(tag)
        doc = Doc(nlp.vocab, words=words)
        doc = nlp(doc)
        tags = iobes.bio_to_bilou(tags)
        doc.ents = biluo_tags_to_spans(doc, tags)
        return doc

    def token_in_between_events(self, token, outer1, outer2):
        return  token.i <= outer1.start and token.i >= outer2.end or \
                token.i <= outer2.start and token.i >= outer1.end

    def inside_event(self, event, token):
        return event.start <= token.i and event.end >= token.i

    def events_inside_subtree(self, verb, event1, event2):
        a = False
        b = False
        for sub in verb.subtree:
            if self.inside_event(event1, sub):
                a = True
        for sub in verb.subtree:
            if self.inside_event(event2, sub):
                b = True
        return a and b

    def backwards(self, verb, doc):
        keyword = None
        backwards = False
        for child in verb.children:
            if child.dep_ in ["agent"]:
                backwards = True
                keyword = child
            if child.text in ["from"]:
                backwards = True
                keyword = child
        next_word = doc[verb.i+1]
        if next_word.dep_ in ["aux"]:
            backwards = True
            keyword = next_word
        return backwards, keyword

    def get_next_event(self, doc, position):
        lowest_distance = float('inf')
        for event in doc.ents:
            if abs(event.start - position) <= lowest_distance and position > event.end or position < event.start:
                next_event = event
                lowest_distance = abs(event.start - position)
                return next_event

    def handle_cause_of(self, predictions, doc):
        for token in doc:
            if token.text == "cause" and doc[token.i+1].text in ["of", "for"]:
                for event1, event2 in combinations(doc.ents, 2):
                    if self.token_in_between_events(token, event1, event2):
                        predictions.append(((event1.start, event1.end), (event2.start, event2.end)))
                if len(predictions) == 0:
                    effect = self.get_next_event(doc, token.i)
                    cause = self.get_next_event(doc, effect.end)
                    predictions.append(((cause.start, cause.end), (effect.start, effect.end)))
        return predictions

    def handle_because(self, predictions, doc):
        for token in doc:
            if token.text == "because" or token.text == "due" or token.text == "common" and doc[token.i+1].text == "with":
                for event1, event2 in combinations(doc.ents, 2):
                    if self.token_in_between_events(token, event1, event2):
                        predictions.append(((event2.start, event2.end), (event1.start, event1.end)))
                if len(predictions) == 0:
                    effect = self.get_next_event(doc, token.i)
                    cause = self.get_next_event(doc, effect.end)
                    predictions.append(((cause.start, cause.end), (effect.start, effect.end)))
        return predictions


    def predict(self, sentence, spans):
        doc = self.parse_sentence(sentence, spans)
        events = doc.ents
        predictions = []

        predictions = self.handle_cause_of(predictions, doc)
        predictions = self.handle_because(predictions, doc)

        # find verbs
        verb_pattern = [[{'POS': 'VERB'}]]
        verb_matcher = Matcher(nlp.vocab)
        verb_matcher.add("verbs", verb_pattern)
        matches = verb_matcher(doc)
        verbs = [(doc[start:end]) for _, start, end in matches]
        verbs = [doc[verb.start] for verb in verbs] # get tokens instead of spans

        # remove verbs inside events
        for verb in verbs:
            for event in events:
                if verb.i >= event.start and verb.i < event.end:
                    verbs.remove(verb)

        for event1, event2 in combinations(doc.ents, 2): # events are actually in order
            for verb in verbs:
                # find out if backwards relation is given
                is_backwards, keyword = self.backwards(verb, doc)
                # predict if both events inside the verb's subtree and the verb is in between events
                if is_backwards:
                    if self.events_inside_subtree(verb, event1, event2) and self.token_in_between_events(keyword, event1, event2):
                        predictions.append(((event2.start, event2.end), (event1.start, event1.end)))
                else:
                    if self.events_inside_subtree(verb, event1, event2) and self.token_in_between_events(verb, event1, event2):
                        predictions.append(((event1.start, event1.end), (event2.start, event2.end)))
        
        # if there is no prediction yet, use a less strict rule
        if len(predictions) == 0:
            for event1, event2 in combinations(doc.ents, 2):
                for verb in verbs:
                    # find out if backwards relation is given
                    is_backwards, keyword = self.backwards(verb, doc)
                    # predict if the verb is in between events - less strict
                    if is_backwards: 
                        if self.token_in_between_events(keyword, event1, event2):
                            predictions.append(((event2.start, event2.end), (event1.start, event1.end)))
                    else:
                        if self.token_in_between_events(verb, event1, event2):
                            predictions.append(((event1.start, event1.end), (event2.start, event2.end)))

        if len(predictions) == 0:
            for event1, event2 in combinations(doc.ents, 2):
                predictions.append(((event1.start, event1.end), (event2.start, event2.end)))

        predictions = set(predictions)

        return predictions

In [None]:
class pipeline:
    def __init__(self, train_sentences, train_labels) :
        self.train_sentences = train_sentences
        self.train_labels = train_labels
        
        self.is_causal_predictor = is_causal_predictor(train_sentences,train_labels, n=20)
        self.span_extractor = span_extractor()
        self.relation_classificator = relation_classificator()

    def predict(self, sentence):
        is_causal = self.is_causal_predictor.predict_causality(sentence)
        if is_causal:
            spans = self.span_extractor.predict(sentence, train_step_2())
            prediction = self.relation_classificator.predict(sentence, spans)
        else:
            prediction = []
        return prediction

In [None]:
pipe = pipeline(train_sentences, train_labels)
pipe.is_causal_predictor.causal_cues

['associate',
 'cause',
 'lead',
 'include',
 'increase',
 'AGEP',
 'result',
 'induce',
 'relate',
 'Haiti',
 'hht',
 'commonly',
 'occur',
 'silica',
 'approximately',
 'uroporphyrinogen',
 'develop',
 'particularly',
 'enamel',
 'call']

In [None]:
pipe.predict(sentences[11])

              precision    recall  f1-score   support

                   1.00      1.00      1.00         5
     B-EVENT       0.73      0.50      0.59        16
     I-EVENT       0.59      0.59      0.59        32
           O       0.80      0.84      0.82        83

    accuracy                           0.75       136
   macro avg       0.78      0.73      0.75       136
weighted avg       0.75      0.75      0.75       136



IndexError: string index out of range