In [None]:
import spacy
from spacy.tokens import Doc
from spacy.matcher import Matcher
from zipfile import ZipFile
from pathlib import Path
from seqeval.metrics import classification_report
from seqeval import scheme
from tqdm import autonotebook as tqdm


In [None]:
# Name of the spaCy pipeline package shared by every cell below.
SPACY_MODEL = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL)

In [None]:
def read_zipped_sequences(archive_path, member):
    """Read a blank-line separated sequence file stored inside a zip archive.

    Args:
        archive_path: Path of the zip archive on disk.
        member: Name of the text file inside the archive.

    Returns:
        A list with one entry per blank-line separated block of the file;
        each entry is the list of that block's lines.
    """
    # The context manager closes the archive (and its underlying file handle)
    # even if reading or decoding fails.
    with ZipFile(archive_path) as archive:
        text = archive.read(member).decode("utf-8")
    return [block.split("\n") for block in text.split("\n\n")]


data_dir = Path("./data/teaching-dataset")
# Tokens and gold labels live in two separate archives with the same layout.
sentences = read_zipped_sequences(
    data_dir / "span_extraction_text_train.zip", "input.txt"
)
labels = read_zipped_sequences(
    data_dir / "span_extraction_references_train.zip", "references.txt"
)

In [None]:
# Show the first training sentence, one token per row, next to its gold label.
first_tokens, first_labels = sentences[0], labels[0]
for token, label in zip(first_tokens, first_labels):
    print(f"{token :<16} {label}")

In [None]:
# Wrap the already-tokenized first sentence in a Doc, then run the pipeline on it.
doc = nlp(Doc(nlp.vocab, words=sentences[0]))

In [None]:
# Print each token with its part-of-speech tag (`pos_`) and dependency label (`dep_`).
for tok in doc:
    print(tok, tok.pos_, tok.dep_)

In [None]:
# Print the noun chunks spaCy finds in the example sentence.
for noun_chunk in doc.noun_chunks:
    print(noun_chunk)

In [None]:
def predict_1(sentence):
    """Tag every spaCy noun chunk of a pre-tokenized sentence as an EVENT span.

    Args:
        sentence: Sequence of token strings; it is passed as `words` to `Doc`,
            so the given tokenization is used as-is.

    Returns:
        A list with one tag per token: "B-EVENT" on the first token of each
        noun chunk, "I-EVENT" on the chunk's remaining tokens, "O" elsewhere.
    """
    parsed = nlp(Doc(nlp.vocab, words=sentence))
    tags = ["O" for _ in sentence]
    for chunk in parsed.noun_chunks:
        first, stop = chunk.start, chunk.end
        tags[first] = "B-EVENT"
        # Fill the rest of the chunk in one go; the slice and the replacement
        # have the same length, so the tag list keeps its size.
        tags[first + 1 : stop] = ["I-EVENT"] * (stop - first - 1)
    return tags

def predict_2(sentence):
    """Tag runs of nouns, proper nouns and adjectives as EVENT spans.

    Args:
        sentence: Sequence of token strings; it is passed as `words` to `Doc`,
            so the given tokenization is used as-is.

    Returns:
        A list with one tag per token: "B-EVENT" on the first token of each
        selected span, "I-EVENT" on the span's remaining tokens, "O" elsewhere.
    """
    # One or more consecutive tokens whose POS is NOUN, PROPN or ADJ.
    event_pattern = [
        {"POS": {"IN": ["NOUN", "PROPN", "ADJ"]}, "OP": "+"},
    ]
    event_matcher = Matcher(nlp.vocab)
    event_matcher.add("EVENT", [event_pattern])

    parsed = nlp(Doc(nlp.vocab, words=sentence))
    candidates = [
        parsed[begin:stop] for _match_id, begin, stop in event_matcher(parsed)
    ]

    tags = ["O" for _ in sentence]
    # The "+" operator can produce overlapping candidates; filter_spans reduces
    # them to a non-overlapping set (see the spaCy docs for its selection rule).
    for span in spacy.util.filter_spans(candidates):
        first, stop = span.start, span.end
        tags[first] = "B-EVENT"
        tags[first + 1 : stop] = ["I-EVENT"] * (stop - first - 1)
    return tags

In [None]:
# Compare both baselines with the gold labels on a single sentence.
# Printed columns: token, gold label, predict_1 tag, predict_2 tag, POS tag.
idx = 1
sentence = sentences[idx]
doc = nlp(Doc(nlp.vocab, words=sentence))
tags_1 = predict_1(sentences[idx])
tags_2 = predict_2(sentences[idx])
for token, pred_1, pred_2, label in zip(doc, tags_1, tags_2, labels[idx]):
    print(f"{token.text :<16} {label :<8} {pred_1 :<8} {pred_2 :<8} {token.pos_ :<8}")

In [None]:
# Run the noun-chunk baseline over every training sentence (with a progress bar)
# and print the seqeval report (IOB2 scheme, strict mode).
predictions_1 = list(map(predict_1, tqdm.tqdm(sentences)))
report_1 = classification_report(
    labels, predictions_1, scheme=scheme.IOB2, mode="strict"
)
print(report_1)

In [None]:
# Run the POS-pattern baseline over every training sentence (with a progress bar)
# and print the seqeval report (IOB2 scheme, strict mode).
predictions_2 = list(map(predict_2, tqdm.tqdm(sentences)))
report_2 = classification_report(
    labels, predictions_2, scheme=scheme.IOB2, mode="strict"
)
print(report_2)

In [None]:
def print_prediction(sentence, prediction, labels):
    """Print one row per token: the token, its predicted tag and its gold label.

    Args:
        sentence: Sequence of token strings.
        prediction: Sequence of predicted tags aligned with `sentence`.
        labels: Sequence of gold tags aligned with `sentence`.

    The three sequences are zipped, so output stops at the shortest one.
    """
    for row in zip(sentence, prediction, labels):
        token, pred, label = row
        line = f"{token :<16} {pred :<8} {label}"
        print(line)

# Show the noun-chunk baseline's tags next to the gold labels for one sentence.
idx = 1
print_prediction(
    sentence=sentences[idx],
    prediction=predictions_1[idx],
    labels=labels[idx],
)

In [None]:
# Serialize the predictions in the same layout as the reference file:
# one tag per line, sentences separated by a blank line.
serialized_predictions = "\n\n".join("\n".join(tags) for tags in predictions_2)
Path("predictions.txt").write_text(serialized_predictions);

In [None]:
# Evaluation as it is done on TIRA.
# `evaluate` is a package you have to install first: `pip install evaluate`
import evaluate

# Save the training predictions to predictions.txt, then run this cell.
# If everything works here, it should work on TIRA as well.
# On TIRA, of course, use the test dataset instead!

# Read the file back exactly as it was written: blank lines separate
# sentences, single newlines separate the tags within a sentence.
raw_predictions = Path("predictions.txt").read_text()
loaded_predictions = [block.split("\n") for block in raw_predictions.split("\n\n")]

evaluator = evaluate.load("fschlatt/ner_eval")
evaluator.compute(predictions=loaded_predictions, references=labels)