In [1]:
import spacy
from spacy.tokens import Doc, SpanGroup
from spacy.matcher import Matcher
from zipfile import ZipFile
from pathlib import Path
from tqdm import autonotebook as tqdm
from spacy.training import biluo_tags_to_spans
import iobes
import re
from itertools import combinations
from spacy import displacy
import numpy as np

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
data_dir = Path("./data/teaching-dataset")
with (data_dir / "relation_classification_text_train.zip").open("rb") as file:
    zip_file = ZipFile(file)
    with zip_file.open("input.txt") as f:
        sentences = [
            sentence.split("\n") for sentence in f.read().decode("utf-8").split("\n\n")
        ]
with (data_dir / "relation_classification_references_train.zip").open("rb") as file:
    zip_file = ZipFile(file)
    with zip_file.open("references.txt") as f:
        labels = []
        for line in f.read().decode("utf-8").split("\n"):
            relations = []
            for relation in re.finditer(r"\(\((\d+),(\d+)\),\((\d+),(\d+)\)\)", line):
                relation = (
                    (int(relation.group(1)), int(relation.group(2))),
                    (int(relation.group(3)), int(relation.group(4))),
                )
                relations.append(relation)
            labels.append(relations)

In [4]:
def parse_sentence(sentence):
    words = []
    tags = []
    for item in sentence:
        word, tag = item.split(" ")
        words.append(word)
        tags.append(tag)
    doc = Doc(nlp.vocab, words=words)
    doc = nlp(doc)
    tags = iobes.bio_to_bilou(tags)
    doc.ents = biluo_tags_to_spans(doc, tags)
    return doc

In [6]:
for word in sentences[0]:
    print(word)

According O
two O
different O
studies O
it O
seems O
plausible O
that O
the O
Pohang B-EVENT
earthquake I-EVENT
was O
induced O
by O
EGS B-EVENT
operations I-EVENT
. O


In [7]:
labels[0]

[((14, 16), (9, 11))]

In [8]:
doc = parse_sentence(sentences[0])
doc.ents

(Pohang earthquake, EGS operations)

In [9]:
def extract_active_passive(doc):
    # https://stackoverflow.com/questions/74528441/detect-passive-or-active-sentence-from-text
    passive_rules = [
        [
            {"DEP": "nsubjpass"},
            {"DEP": "aux", "OP": "*"},
            {"DEP": "auxpass"},
            {"TAG": "VBN"},
        ],
        [
            {"DEP": "nsubjpass"},
            {"DEP": "aux", "OP": "*"},
            {"DEP": "auxpass"},
            {"TAG": "VBZ"},
        ],
        [
            {"DEP": "nsubjpass"},
            {"DEP": "aux", "OP": "*"},
            {"DEP": "auxpass"},
            {"TAG": "RB"},
            {"TAG": "VBN"},
        ],
    ]
    # Create pattern to match active voice use
    active_rules = [
        [{"DEP": "nsubj"}, {"TAG": "VBD", "DEP": "ROOT"}],
        [{"DEP": "nsubj"}, {"TAG": "VBP"}, {"TAG": "VBG", "OP": "!"}],
        [{"DEP": "nsubj"}, {"DEP": "aux", "OP": "*"}, {"TAG": "VB"}],
        [{"DEP": "nsubj"}, {"DEP": "aux", "OP": "*"}, {"TAG": "VBG"}],
        [{"DEP": "nsubj"}, {"TAG": "RB", "OP": "*"}, {"TAG": "VBG"}],
        [{"DEP": "nsubj"}, {"TAG": "RB", "OP": "*"}, {"TAG": "VBZ"}],
        [{"DEP": "nsubj"}, {"TAG": "RB", "OP": "+"}, {"TAG": "VBD"}],
    ]
    matcher = Matcher(nlp.vocab)
    matcher.add("Passive", passive_rules)
    matcher.add("Active", active_rules)
    matches = matcher(doc)
    matches = [
        (nlp.vocab.strings[match_id], doc[start:end])
        for match_id, start, end in matches
    ]
    return matches


def predict(sentence):
    doc = parse_sentence(sentence)
    matches = extract_active_passive(doc)
    predictions = []
    for ent_1, ent_2 in combinations(doc.ents, 2):
        for match_type, match_span in matches:
            if SpanGroup(doc, spans=[ent_1, ent_2, match_span]).has_overlap:
                match_active = match_type == "Active"
                if match_active:
                    predictions.append(
                        ((ent_1.start, ent_1.end), (ent_2.start, ent_2.end))
                    )
                    break
                else:
                    predictions.append(
                        ((ent_2.start, ent_2.end), (ent_1.start, ent_1.end))
                    )
                    break
    return predictions

In [10]:
doc = parse_sentence(sentences[0])
matches = extract_active_passive(doc)
print(doc)
for match_type, match_span in matches:
    print("\t{}: {}".format(match_type, match_span.text))

According two different studies it seems plausible that the Pohang earthquake was induced by EGS operations . 
	Active: it seems
	Passive: earthquake was induced


In [9]:
def in_between(token, outer1, outer2):
    return  token.i <= outer1.start and token.i >= outer2.end or \
            token.i <= outer2.start and token.i >= outer1.end


def predict_2(sentence):
    doc = parse_sentence(sentence)
    events = doc.ents
    predictions = []

    # find verbs
    verb_pattern = [[{'POS': 'VERB'},]]
    verb_matcher = Matcher(nlp.vocab)
    verb_matcher.add("verbs", verb_pattern)
    matches = verb_matcher(doc)
    verbs = [(doc[start:end]) for _, start, end in matches]
    verbs = [doc[verb.start] for verb in verbs] # get tokens instead of spans

    # remove verbs inside events
    for verb in verbs:
        for ent in events:
            if verb.i >= ent.start and verb.i <= ent.end:
                verbs.remove(verb)
    # remove verbs inside brackets
    

    for ent_1, ent_2 in combinations(doc.ents, 2):
        # entities are actually in order
        for verb in verbs:
            if in_between(verb, ent_1, ent_2):
                backwards = False
                for child in verb.children:
                    if child.dep_ in ["agent"]:
                        backwards = True
                if backwards:
                    predictions.append(((ent_2.start, ent_2.end), (ent_1.start, ent_1.end)))
                else:
                    predictions.append(((ent_1.start, ent_1.end), (ent_2.start, ent_2.end)))

    return predictions
    


for idx in [0,1,2,3,4,5,6,7,9]:
    doc = parse_sentence(sentences[idx])
    print(doc)
    pred = predict_2(sentences[idx])

    print("Ground truth:")
    for cause, effect in labels[idx]:
        print("\t{} -> {}".format(doc[cause[0]:cause[1]], doc[effect[0]:effect[1]]))
    print("Predictions:")
    for cause, effect in pred:
        print("\t{} -> {}".format(doc[cause[0]:cause[1]], doc[effect[0]:effect[1]]))
    print("\n")

# displacy.render(doc, style="dep")


According two different studies it seems plausible that the Pohang earthquake was induced by EGS operations . 
Ground truth:
	EGS operations -> Pohang earthquake
Predictions:
	EGS operations -> Pohang earthquake


Signs and symptoms include : Dyspnea ( shortness of breath ) exacerbated by exertion Cough , often persistent and sometimes severe Fatigue Tachypnea ( rapid breathing ) which is often labored , Loss of appetite and weight loss Chest pain Fever Gradual darkening of skin ( blue skin ) Gradual dark shallow rifts in nails eventually leading to cracks as protein fibers within nail beds are destroyed . 
Ground truth:
	Gradual dark shallow rifts in nails -> cracks
Predictions:
	Gradual dark shallow rifts in nails -> cracks


Pneumatic drilling in mines and less commonly , mining using explosives , would raise fine - ultra fine crystalline silica dust ( rock dust ) . 
Ground truth:
	Pneumatic drilling in mines -> fine - ultra fine crystalline silica dust
	mining using explosives -> f

In [None]:
spacy.explain("ADP")

'adposition'

In [38]:
predictions = []
for sentence in tqdm.tqdm(sentences):
    predictions.append(predict_2(sentence))

  0%|          | 0/468 [00:00<?, ?it/s]

In [39]:
def evaluate(predictions, references, micro_avg=True):
    tp = []
    fp = []
    fn = []
    for prediction, reference in zip(predictions, references):
        tp.append(len(set(prediction) & set(reference)))
        fp.append(len(set(prediction) - set(reference)))
        fn.append(len(set(reference) - set(prediction)))
    if micro_avg:
        tp = [sum(tp)]
        fp = [sum(fp)]
        fn = [sum(fn)]
    precision = [0 if tp[i] == 0 else tp[i] / (tp[i] + fp[i]) for i in range(len(tp))]
    recall = [0 if tp[i] == 0 else tp[i] / (tp[i] + fn[i]) for i in range(len(tp))]
    f1 = [
        0
        if precision[i] * recall[i] == 0
        else 2 * precision[i] * recall[i] / (precision[i] + recall[i])
        for i in range(len(tp))
    ]
    precision = sum(precision) / len(precision)
    recall = sum(recall) / len(recall)
    f1 = sum(f1) / len(f1)
    return precision, recall, f1


micro_precision, micro_recall, micro_f1 = evaluate(predictions, labels, True)
macro_precision, macro_recall, macro_f1 = evaluate(predictions, labels, False)

print("Micro Precision: {:.2f}".format(micro_precision))
print("Micro Recall: {:.2f}".format(micro_recall))
print("Micro F1: {:.2f}".format(micro_f1))
print("Macro Precision: {:.2f}".format(macro_precision))
print("Macro Recall: {:.2f}".format(macro_recall))
print("Macro F1: {:.2f}".format(macro_f1))


Micro Precision: 0.52
Micro Recall: 0.63
Micro F1: 0.57
Macro Precision: 0.56
Macro Recall: 0.63
Macro F1: 0.58
