In [1]:
import spacy
from spacy.tokens import Doc
from spacy.matcher import Matcher
from spacy import displacy
from zipfile import ZipFile
from pathlib import Path
from seqeval.metrics import classification_report
from seqeval import scheme
from tqdm import autonotebook as tqdm
import os
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from collections.abc import Iterable


In [2]:
nlp = spacy.load("en_core_web_sm")

In [None]:
# REGULAR DATA IMPORT
data_dir = Path("./data/teaching-dataset")
with (data_dir / "span_extraction_text_train.zip").open("rb") as file:
    zip_file = ZipFile(file)
    with zip_file.open("input.txt") as f:
        sentences = [
            sentence.split("\n") for sentence in f.read().decode("utf-8").split("\n\n")
        ]
with (data_dir / "span_extraction_references_train.zip").open("rb") as file:
    zip_file = ZipFile(file)
    with zip_file.open("references.txt") as f:
        labels = [
            sentence.split("\n") for sentence in f.read().decode("utf-8").split("\n\n")
        ]

In [99]:
# THIS IS MY METHOD OF DATA IMPORTING (USING VS-CODE CAUSES ISSUES OTHERWISE)
parent = Path(os.getcwd()).parent
data_dir = Path(os.path.join(parent,"data/span-detection"))

with (data_dir / "span_extraction_text_train.zip").open("rb") as file:
    zip_file = ZipFile(file)
    with zip_file.open("input.txt") as f:
        sentences = [
            sentence.split("\n") for sentence in f.read().decode("utf-8").split("\n\n")
        ]
with (data_dir / "span_extraction_references_train.zip").open("rb") as file:
    zip_file = ZipFile(file)
    with zip_file.open("references.txt") as f:
        labels = [
            sentence.split("\n") for sentence in f.read().decode("utf-8").split("\n\n")
        ]

In [4]:
train_sentences, test_sentences, train_labels, test_labels =  train_test_split(sentences, labels, test_size=0.2, random_state=42)

In [5]:
class is_causal_predictor:
    def __init__(self, sentences, labels, n=None):
        self.sentences = sentences
        self.labels = labels
        self.n = n
        self.test_sentences = test_sentences
        self.test_labels = test_labels
        
        self.init_words()
        if n != None:
            self.causal_cues = self.get_causal_cues(self.n)


    def init_words(self):
        self.words = []
        self.nonCausalWords = []

        for label, sentence in zip(self.labels, self.sentences):
            if type(sentence) == list:
                sentence = ' '.join(sentence)
            if "B-EVENT" in label: #if sentence is causal
                doc = nlp(sentence)
                wordsHelp = [token.text for token in doc if not token.is_stop and not token.is_punct and token.pos_ != "NOUN" and token.pos_ != "ADJ"] 
                self.words.extend(wordsHelp) #append all words to a list if they are NOT nouns & NOT punctuation & NOT adjectives

            else: #if sentence is NOT causal
                doc = nlp(sentence)
                wordsHelp = [token.text for token in doc if not token.is_stop and not token.is_punct and token.pos_ != "NOUN" and token.pos_ != "ADJ"] 
                self.nonCausalWords.extend(wordsHelp)  
    
    # returns n best causal cues
    def get_causal_cues(self, n):
        def flatten(lis): #pretty ugly solution but we have to flatten the list since every new sentence adds "[]" which Counter can't deal with
            for item in lis:
                if isinstance(item, Iterable) and not isinstance(item, str):
                    for x in flatten(item):
                        yield x
                else:        
                    yield item
        
        def get_n_lemmata(causal_freq, n):
            # sort words
            sorted_words = np.array(causal_freq.most_common(len(causal_freq)))[:,0]
            converted_return = []
            for word in sorted_words:
                # lemmatize
                doc = nlp(str(word))
                word = " ".join([token.lemma_ for token in doc])
                if not word in converted_return:
                    converted_return.append(word)
                # break if n lemmata found
                if len(converted_return)==n:
                    break
            return converted_return

        causal_freq = Counter(self.words)
        nonCausal_freq = Counter(self.nonCausalWords)
        for word in causal_freq:
            causal_freq[word] = causal_freq[word]/(nonCausal_freq[word]+1)
        return get_n_lemmata(causal_freq, n)


    def predict_causality(self, sentence):
        if type(sentence) == list: # convert to str
            sentence = ' '.join(sentence)
        matcher = Matcher(nlp.vocab)
        pattern = [[{"LEMMA": cue}] for cue in self.causal_cues]
        matcher.add("CAUSAL", pattern)
        doc = nlp(sentence)
        matches = matcher(doc)
        return bool(matches)


    # Find best value for n given a testset and initialize causal cues according to best n
    def init_causal_cues_best_n(self, test_sentences, test_labels, step_size=20):
        new_labels = []
        for label in test_labels:
            if "B-EVENT" in label:
                new_labels.append(1)
            else:
                new_labels.append(0)

        old_f1 = 0
        f1 = 0
        n = 0
        while f1 >= old_f1:
            n = n+step_size
            predictions = []

            # predict
            self.causal_cues = self.get_causal_cues(n)
            for sentence, l in zip(test_sentences, test_labels):
                p = self.predict_causality(sentence)
                predictions.append(p)

            # evaluate
            old_f1 = f1
            tp = sum([int(p) == 1 and int(l) == 1 for p, l in zip(predictions, new_labels)])
            fp = sum([int(p) == 1 and int(l) == 0 for p, l in zip(predictions, new_labels)])
            fn = sum([int(p) == 0 and int(l) == 1 for p, l in zip(predictions, new_labels)])
            precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            f1 = 2 * precision * recall / (precision + recall)

        self.f1 = old_f1
        self.n = n-20
        self.causal_cues = self.get_causal_cues(self.n)


In [6]:
is_causal_predictor = is_causal_predictor(train_sentences,train_labels, 120)
print(is_causal_predictor.causal_cues)

['cause', 'result', 'lead', 'include', 'occur', 'trigger', 'associate', 'Haiti', 'generate', 'commonly', 'agep', 'induce', 'swell', 'particularly', 'eat', 'generally', 'develop', 'fear', 'hht', 'HLA', 'increase', 'frequently', 'etc', 'speak', 'IA', 'experience', 'approximately', 'helicobacter', 'contribute', 'PCT', 'underlie', 'suggest', 'emit', 'potentially', 'worry', 'bleed', 'suffer', 'AVMs', 'face', '60', '70', 'allow', 'stem', 'involve', 'reduce', 'address', 'radiate', 'H5N1', 'weaver', 'relatively', 'happen', 'c', 'receive', 'delay', 'ACVRL1', 'and/or', 'Europe', 'follow', 'Chernobyl', 'Hiroshima', 'MG', 'reveal', 'AIDS', 'highly', 'form', 'lobe', 'threaten', 'rarely', 'Iraq', 'UROD', 'uroporphyrinogen', 'Nevada', 'Aware', 'commit', 'predominantly', 'lu', 'far', 'last', 'eg', 'state', 'properly', 'continue', 'contain', 'instigate', 'scream', 'Florida', 'worsen', 'show', 'contaminate', 'typhoon', 'Manila', 'thirty', 'Salmonella', 'additionally', 'cite', 'thorotrast', 'West', 'MADH

TODO: 
- Aufzählungen
- Präpositionen

In [50]:
causal_prepositions = ["with", "to"]

In [117]:
def predict_3(sentence):

    def compound_handler(token):
        for child in token.children:
            if child.dep_ in  ["amod","compound","det","poss"]:
                predictions[child.i] = "B-EVENT"
                compound_handler(child)
            if child.dep_ in  ["prep"]:
                predictions[child.i] = "B-EVENT"
                for prep_child in child.children:
                    object_handler(prep_child)

    def conj_handler(token):
        if token.dep_ in ["conj"]:
            predictions[token.i] = "B-EVENT"
            compound_handler(token)
        if token.dep_ in ["cc"]:
            compound_handler(token)
        if doc[token.i - 1].text == "-":
            x = doc[token.i - 2].dep_
            if doc[token.i - 2].dep_ == "amod":
                y = token.i - 1
                predictions[token.i - 1] = "B-EVENT"
                predictions[token.i - 2] = "B-EVENT"
        # find whole conjunction
        for child in token.children:
            conj_handler(child)

    def object_handler(token):
        if token.dep_ in ["dobj", "pobj", "nsubj", "nsubjpass"] and token.pos_ not in ["PRON"]:
            predictions[token.i] = "B-EVENT"
            compound_handler(token)

            for child in token.children:
                if child.pos_ in ["NOUN", "PROPN"]:
                    predictions[child.i] = "B-EVENT"
                conj_handler(token)

    def cleanup():
        for i, token in enumerate(predictions):
            if token == "B-EVENT" or token == "I-EVENT":
                try:
                    if predictions[i+1] == "B-EVENT":
                        predictions[i+1] = "I-EVENT"
                except:
                    pass

    doc = Doc(nlp.vocab, words=sentence)
    doc = nlp(doc)
    predictions = ["O"] * len(sentence)
    
    for token in doc:
        if token.dep_ in ["ROOT", "advcl", "relcl"]:
            for child in token.children:
                # root-child is object?
                object_handler(child)
                # root-child is prep
                if child.dep_ in ["prep", "agent"]:
                    for prep_child in child.children:
                        # prep-child is object?
                        object_handler(prep_child)

        if token.dep_ == "acl":
            for child in token.children:
                object_handler(child)
                if child.dep_ == "agent":
                    for agent_child in child.children:
                        object_handler(agent_child)
                
            for parent in token.ancestors:
                object_handler(parent)

    cleanup()
    return predictions

idx = 0
sentence = sentences[idx]
doc = Doc(nlp.vocab, words=sentence)
doc = nlp(doc)
iterator = zip(doc, predict_3(sentences[idx]), labels[idx])
for token, pred_3, label in iterator:
    print(f"{token.text :<16} {pred_3 :<8} {label :<8} {token.pos_ :<8} {token.dep_:<8} {token.tag_:<8}")

displacy.render(doc, style="dep")

Herbal           B-EVENT  B-EVENT  ADJ      amod     JJ      
medications      I-EVENT  I-EVENT  NOUN     nsubjpass NNS     
,                O        O        PUNCT    punct    ,       
spider           B-EVENT  B-EVENT  NOUN     compound NN      
bites            I-EVENT  I-EVENT  NOUN     conj     NNS     
,                O        O        PUNCT    punct    ,       
iopamidol        B-EVENT  B-EVENT  PROPN    conj     NNP     
(                O        O        PUNCT    punct    -LRB-   
used             O        O        VERB     acl      VBN     
for              O        O        ADP      prep     IN      
radiocontrast    O        O        NOUN     pobj     NN      
)                O        O        PUNCT    punct    -RRB-   
,                O        O        PUNCT    punct    ,       
lacquers         B-EVENT  B-EVENT  NOUN     conj     NNS     
,                O        O        PUNCT    punct    ,       
mercury          B-EVENT  B-EVENT  NOUN     conj     NN      
,      

In [92]:
spacy.explain("relcl")

'relative clause modifier'

In [112]:
def predict_events(sentence, is_causal_predictor):
    causal_sentence = is_causal_predictor.predict_causality(sentence)
    if causal_sentence:
        return predict_3(sentence)
    else:
        return ["O"] * len(sentence)

In [113]:
predictions_1 = [predict_events(sentence, is_causal_predictor) for sentence in tqdm.tqdm(test_sentences)]
print(classification_report(test_labels, predictions_1, scheme=scheme.IOB2, mode="strict"))

  0%|          | 0/714 [00:00<?, ?it/s]

              precision    recall  f1-score   support

       EVENT       0.10      0.18      0.13       769

   micro avg       0.10      0.18      0.13       769
   macro avg       0.10      0.18      0.13       769
weighted avg       0.10      0.18      0.13       769



In [104]:
predictions_1 = [predict_2(sentence) for sentence in tqdm.tqdm(test_sentences)]
print(classification_report(test_labels, predictions_1, scheme=scheme.IOB2, mode="strict"))

  0%|          | 0/714 [00:00<?, ?it/s]

              precision    recall  f1-score   support

       EVENT       0.12      0.65      0.21       769

   micro avg       0.12      0.65      0.21       769
   macro avg       0.12      0.65      0.21       769
weighted avg       0.12      0.65      0.21       769



In [40]:
nlp = spacy.load("en_core_web_sm")
for label in nlp.get_pipe("parser").labels:
    print(label, " -- ", spacy.explain(label))

ROOT  --  root
acl  --  clausal modifier of noun (adjectival clause)
acomp  --  adjectival complement
advcl  --  adverbial clause modifier
advmod  --  adverbial modifier
agent  --  agent
amod  --  adjectival modifier
appos  --  appositional modifier
attr  --  attribute
aux  --  auxiliary
auxpass  --  auxiliary (passive)
case  --  case marking
cc  --  coordinating conjunction
ccomp  --  clausal complement
compound  --  compound
conj  --  conjunct
csubj  --  clausal subject
csubjpass  --  clausal subject (passive)
dative  --  dative
dep  --  unclassified dependent
det  --  determiner
dobj  --  direct object
expl  --  expletive
intj  --  interjection
mark  --  marker
meta  --  meta modifier
neg  --  negation modifier
nmod  --  modifier of nominal
npadvmod  --  noun phrase as adverbial modifier
nsubj  --  nominal subject
nsubjpass  --  nominal subject (passive)
nummod  --  numeric modifier
oprd  --  object predicate
parataxis  --  parataxis
pcomp  --  complement of preposition
pobj  --  ob



In [20]:
def print_prediction(sentence, prediction, labels):
    for token, pred, label in zip(sentence, prediction, labels):
        print(f"{token :<16} {pred :<8} {label}")

idx = 11
print_prediction(sentences[idx], predictions_1[idx], labels[idx])

The              B-EVENT  O
winemaker        I-EVENT  O
carefully        O        O
chose            O        O
grapes           B-EVENT  O
from             O        O
different        B-EVENT  O
lots             I-EVENT  O
in               O        O
the              B-EVENT  O
vineyards        I-EVENT  O
and              O        O
blended          O        O
them             B-EVENT  O
into             O        O
this             B-EVENT  O
wonderful        I-EVENT  O
Pinot            I-EVENT  O
.                O        O
