In [None]:
import spacy
from spacy.tokens import Doc, SpanGroup
from spacy.matcher import Matcher
from zipfile import ZipFile
from pathlib import Path
from tqdm import autonotebook as tqdm
from spacy.training import biluo_tags_to_spans
import iobes
import re
from itertools import combinations
import openai
import os
import time

# Read the OpenAI API key from the environment instead of hard-coding it.
# `os.environ.get` returns None when OPENAI_API_KEY is unset, so a missing key
# is not reported at this point.
openai.api_key = os.environ.get('OPENAI_API_KEY')



In [None]:
# Load the spaCy pipeline once; `nlp` (and its vocab) is reused by parse_sentence.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Load the training split: "word tag" lines from the text archive and the gold
# relations from the references archive.
data_dir = Path("../data/teaching-dataset")

# ZipFile opens the archive from the path itself and, used as a context manager,
# closes it again when the block ends.
with ZipFile(data_dir / "relation_classification_text_train.zip") as zip_file:
    with zip_file.open("input.txt") as f:
        train_text = f.read().decode("utf-8")

# Sentences are separated by blank lines; each sentence is a list of lines.
# Trailing newlines are removed first so that a final line terminator does not
# leave an empty "line" (or an empty sentence) that cannot be split into a
# word and a tag later on.
sentences = [block.split("\n") for block in train_text.rstrip("\n").split("\n\n")]

# A relation is written as ((start,end),(start,end)) with integer offsets.
relation_pattern = re.compile(r"\(\((\d+),(\d+)\),\((\d+),(\d+)\)\)")

with ZipFile(data_dir / "relation_classification_references_train.zip") as zip_file:
    with zip_file.open("references.txt") as f:
        # Keep plain split("\n"): an empty line is a valid row meaning
        # "no relations", so empty rows must not be dropped.
        reference_lines = f.read().decode("utf-8").split("\n")

# One list of relations per reference line.
labels = []
for line in reference_lines:
    relations = []
    for match in relation_pattern.finditer(line):
        first = (int(match.group(1)), int(match.group(2)))
        second = (int(match.group(3)), int(match.group(4)))
        relations.append((first, second))
    labels.append(relations)

print(sentences)

In [None]:
def parse_sentence(sentence):
    """Build a spaCy ``Doc`` from one pre-tokenised, BIO-tagged sentence.

    Args:
        sentence: Iterable of ``"word tag"`` strings. Every item must split on
            a single space into exactly two parts, otherwise unpacking raises
            ``ValueError``.

    Returns:
        The ``Doc`` after it has been passed through ``nlp``, with ``doc.ents``
        replaced by the spans derived from the supplied tags.
    """
    words, tags = [], []
    for word, tag in (item.split(" ") for item in sentence):
        words.append(word)
        tags.append(tag)

    # Hand the words to Doc directly so the given tokenisation is kept, then
    # let the pipeline annotate that Doc.
    doc = nlp(Doc(nlp.vocab, words=words))

    # The tags are converted from BIO to BILOU before being turned into spans;
    # assigning doc.ents overwrites whatever entities the pipeline had set.
    doc.ents = biluo_tags_to_spans(doc, iobes.bio_to_bilou(tags))
    return doc


# Print every training sentence, one list of "word tag" strings per line.
for sentence in sentences:
    print(sentence)

In [None]:
# def print_tags_and_tokens_for_sentences(sentences):
#     for sentence in sentences:
#         doc = parse_sentence(sentence)
#         for token in doc:
#             print(f"{token.text}, {token.ents}, POS-Tag: {token.pos_}, Dep-Tag: {token.dep_}")
#         print("\n")
# print_tags_and_tokens_for_sentences(sentences)


In [None]:
def gen_spans(sentences):
    """Collect the token spans of all events in every sentence.

    Args:
        sentences: List of sentences, each a list of ``"word tag"`` strings
            that split on a single space into exactly two parts.

    Returns:
        One list per sentence holding ``(start, end)`` tuples in order of
        appearance. ``start`` is the index of the ``B-EVENT`` token and ``end``
        is the index just past the event's last token (exclusive), i.e. the
        index of the first ``O`` tag after the event when there is one.
    """
    spans_per_sentence = []
    for sentence in sentences:
        tags = []
        for item in sentence:
            _, tag = item.split(" ")
            tags.append(tag)

        spans = []
        for start, tag in enumerate(tags):
            if tag != "B-EVENT":
                continue
            end = start + 1
            # Walk over the continuation tags. The event stops at an 'O' tag,
            # at the beginning of the next entity, or at the end of the
            # sentence. The last two cases used to lose or merge spans, which
            # shifted every later span index in that sentence.
            while end < len(tags) and tags[end] != "O" and not tags[end].startswith("B-"):
                end += 1
            spans.append((start, end))
        spans_per_sentence.append(spans)
    return spans_per_sentence
    
# Event spans of the training sentences: one list of (start, end) tuples per sentence.
spans_per_sentence = gen_spans(sentences)
print(spans_per_sentence)



In [None]:
# Show the relations parsed from the first line of the training references.
labels[0]

In [None]:
def predict(sentences, spans_per_sentence):
    """Ask the chat model for the causal direction of every event pair.

    For each sentence, every unordered pair of entities in ``doc.ents`` is sent
    to the model, which is asked to reply with "1" (first causes second),
    "2" (second causes first) or "0" (no causal relation).

    Args:
        sentences: List of sentences, each a list of ``"word tag"`` strings as
            accepted by ``parse_sentence``.
        spans_per_sentence: One list of ``(start, end)`` spans per sentence, as
            produced by ``gen_spans``; entry ``i`` is paired with ``doc.ents[i]``.

    Returns:
        One list per sentence containing ``(cause_span, effect_span)`` tuples.
        The list is flat so that ``set(...)`` and tuple unpacking in the
        ``evaluate`` functions work; previously every entity pair contributed
        its own (possibly empty) list, which those functions cannot process.
    """
    # NOTE(review): the prompt contains only the two event strings, not the
    # sentence they come from - confirm that this is intended.
    predictions = []
    for sentence_index, sentence in enumerate(sentences):
        doc = parse_sentence(sentence)

        # Spans are matched to entities by position. If the counts differ, the
        # positions cannot be trusted, so fall back to the entities' own token
        # offsets (spaCy uses an exclusive end, like gen_spans does).
        spans = spans_per_sentence[sentence_index]
        if len(spans) != len(doc.ents):
            spans = [(ent.start, ent.end) for ent in doc.ents]

        sentence_predictions = []
        # combinations() yields the same (i, j) pairs with i < j, in the same
        # order, as two nested index loops.
        for (span1, ent1), (span2, ent2) in combinations(zip(spans, doc.ents), 2):
            prompt = f'Answer with "1" if "{ent1}" is the cause for "{ent2}". Answer with "2" if "{ent2}" is the cause for "{ent1}". Answer with "0" if there are no causal relations between the two. Answer only with that one word.'
            print(prompt)
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "system", "content": prompt}],
                temperature=0,
                max_tokens=200,
            )

            message = response.choices[0].message.content
            print(doc)
            print(message)
            # Pause between requests.
            time.sleep(2)

            # Accept answers such as ' 1', '1.' or '"1"' as well as a bare '1'.
            answer = message.strip().strip('"\'.')
            if answer == '1':
                sentence_predictions.append((span1, span2))
            elif answer == '2':
                sentence_predictions.append((span2, span1))
            elif answer == '0':
                print("No Causal Prediction")
            else:
                print("Wrong output")

            print(sentence_index, '/', len(sentences))

        predictions.append(sentence_predictions)

    return predictions


predictions = predict(sentences, spans_per_sentence)


In [None]:
# Dump the raw training-set predictions for inspection.
print(predictions)

In [None]:
def evaluate(predictions, references, micro_avg=True):
    """Exact-match precision, recall and F1 over per-sentence relation lists.

    Note that a later cell defines another ``evaluate`` whose first two
    parameters are in the opposite order (references, predictions).

    Args:
        predictions: One iterable of hashable relations per sentence.
        references: Gold relations in the same layout; sentences are paired
            with ``zip``, so surplus entries on either side are ignored.
        micro_avg: If true, counts are summed over all sentences before the
            scores are computed; otherwise the per-sentence scores are averaged.

    Returns:
        Tuple ``(precision, recall, f1)``.

    Raises:
        ZeroDivisionError: With ``micro_avg=False`` and no sentence pairs.
    """
    tp, fp, fn = [], [], []
    for predicted, gold in zip(predictions, references):
        predicted, gold = set(predicted), set(gold)
        tp.append(len(predicted & gold))
        fp.append(len(predicted - gold))
        fn.append(len(gold - predicted))

    if micro_avg:
        # Collapse everything into a single pseudo-sentence.
        tp, fp, fn = [sum(tp)], [sum(fp)], [sum(fn)]

    precision, recall, f1 = [], [], []
    for hits, spurious, missed in zip(tp, fp, fn):
        p = hits / (hits + spurious) if hits != 0 else 0
        r = hits / (hits + missed) if hits != 0 else 0
        # With no hits both scores are zero, so F1 is defined as zero too.
        f = 2 * p * r / (p + r) if p * r != 0 else 0
        precision.append(p)
        recall.append(r)
        f1.append(f)

    return (
        sum(precision) / len(precision),
        sum(recall) / len(recall),
        sum(f1) / len(f1),
    )


# Score the training predictions twice: pooled over all sentences (micro) and
# averaged per sentence (macro).
micro_precision, micro_recall, micro_f1 = evaluate(predictions, labels, True)
macro_precision, macro_recall, macro_f1 = evaluate(predictions, labels, False)

# Each entry pairs an output template with the value to fill in.
metric_report = [
    ("Micro Precision: {:.2f}", micro_precision),
    ("Micro Recall: {:.2f}", micro_recall),
    ("Micro F1: {:.2f}", micro_f1),
    ("Macro Precision: {:.2f}", macro_precision),
    ("Macro Recall: {:.2f}", macro_recall),
    ("Macro F1: {:.2f}", macro_f1),
]
for template, value in metric_report:
    print(template.format(value))


In [None]:
def overlap(ref_event, pred_event):
    """Return whether two ``(start, end)`` events share at least one position.

    Both ends are compared inclusively: events that merely touch, such as
    ``(0, 2)`` and ``(2, 5)``, count as overlapping.
    """
    latest_start = max(ref_event[0], pred_event[0])
    earliest_end = min(ref_event[1], pred_event[1])
    return latest_start <= earliest_end


def evaluate_pair(reference, prediction):
    """Score one predicted relation against one reference relation.

    Args:
        reference: ``(cause, effect)`` pair of gold events.
        prediction: ``(cause, effect)`` pair of predicted events.

    Returns:
        ``1`` if cause and effect are both identical, ``0.5`` if cause and
        effect both overlap their counterparts, ``0`` otherwise.
    """
    (ref_cause, ref_effect), (pred_cause, pred_effect) = reference, prediction

    exact = ref_cause == pred_cause and ref_effect == pred_effect
    if exact:
        return 1

    partial = overlap(ref_cause, pred_cause) and overlap(ref_effect, pred_effect)
    return 0.5 if partial else 0

def precision(tp, fp):
    """Return ``tp / (tp + fp)``, or ``0`` when there are no true positives."""
    return tp / (tp + fp) if tp else 0

def recall(tp, fn):
    """Return ``tp / (tp + fn)``, or ``0`` when there are no true positives."""
    return tp / (tp + fn) if tp else 0

def f1(tp, fp, fn):
    """Return ``2*tp / (2*tp + fp + fn)``, or ``0`` when there are no true positives."""
    return 2 * tp / (2 * tp + fp + fn) if tp else 0

def evaluate(references, predictions):
    """Score predictions against references, giving half credit for overlaps.

    This definition replaces the earlier ``evaluate(predictions, references,
    micro_avg)``; note the swapped order of the first two parameters.

    Within a sentence each prediction is matched to the first remaining
    reference for which ``evaluate_pair`` returns a non-zero score. That score
    (1 or 0.5) is added to the true positives and the reference is consumed.
    Predictions without a match count as false positives; references left over
    count as false negatives.

    Args:
        references: One iterable of hashable gold relations per sentence.
        predictions: One iterable of predicted relations per sentence;
            sentences are paired with ``zip``.

    Returns:
        ``{"macro": {...}, "micro": {...}}`` where each inner dict has the keys
        ``"precision"``, ``"recall"`` and ``"f1"``.

    Raises:
        ZeroDivisionError: If there are no sentence pairs to average over.
    """
    tps, fps, fns = [], [], []
    for gold_relations, predicted_relations in zip(references, predictions):
        unmatched = set(gold_relations)
        tp = fp = 0
        for predicted in predicted_relations:
            match = None
            for candidate in unmatched:
                score = evaluate_pair(candidate, predicted)
                if score:
                    match = (candidate, score)
                    break
            if match is None:
                fp += 1
            else:
                matched_reference, matched_score = match
                tp += matched_score
                # A reference can be credited to one prediction only.
                unmatched.remove(matched_reference)
        tps.append(tp)
        fps.append(fp)
        fns.append(len(unmatched))

    sentence_count = len(tps)
    macro = {
        "precision": sum(precision(tp, fp) for tp, fp in zip(tps, fps)) / sentence_count,
        "recall": sum(recall(tp, fn) for tp, fn in zip(tps, fns)) / sentence_count,
        "f1": sum(f1(tp, fp, fn) for tp, fp, fn in zip(tps, fps, fns)) / sentence_count,
    }

    total_tp, total_fp, total_fn = sum(tps), sum(fps), sum(fns)
    micro = {
        "precision": precision(total_tp, total_fp),
        "recall": recall(total_tp, total_fn),
        "f1": f1(total_tp, total_fp, total_fn),
    }
    return {"macro": macro, "micro": micro}

In [None]:
# Score the training predictions with the `evaluate` defined in the preceding
# cell, which takes (references, predictions) in that order.
evaluate(labels, predictions)

# evaluate(labels, [((14, 16), (9, 11))])

In [None]:
# Load the test split. The training cell reads from "../data/teaching-dataset"
# while this cell points at "./data/teaching-dataset". Use whichever of the two
# exists, preferring this cell's own location, so both cells can run from the
# same working directory.
candidate_dirs = [Path("./data/teaching-dataset"), Path("../data/teaching-dataset")]
data_dir = next((path for path in candidate_dirs if path.exists()), candidate_dirs[0])

# ZipFile opens the archive from the path itself and closes it when the block ends.
with ZipFile(data_dir / "relation_classification_text_test.zip") as zip_file:
    with zip_file.open("input.txt") as f:
        test_text = f.read().decode("utf-8")

# Sentences are separated by blank lines. Trailing newlines are removed first so
# a final line terminator does not produce an empty "line" that cannot be split
# into a word and a tag.
test_sentences = [block.split("\n") for block in test_text.rstrip("\n").split("\n\n")]

test_spans_per_sentence = gen_spans(test_sentences)
# test_predictions = predict(test_sentences, test_spans_per_sentence)

# test_predictions = []
# for sentence in tqdm.tqdm(test_sentences):
#     doc = nlp(Doc(nlp.vocab, words=sentence))
#     test_predictions.append(predict(doc))


In [None]:
# test_predictions = []
# for sentence_index in range(len(test_sentences)):
#     prediction = []
#     doc = parse_sentence(test_sentences[sentence_index])
#     doc.ents
#     prompt = f'Answer with "1" if "{doc.ents[0]}" is the cause for "{doc.ents[1]}". Answer with "2" if "{doc.ents[1]}" is the cause for "{doc.ents[0]}". Answer with "0" if there are no causal relations between the two. Answer only with that one word.'
#     print(prompt)
#     response = openai.ChatCompletion.create(
#         model="gpt-3.5-turbo",
#         messages=[{"role": "system", "content": prompt},],
#         temperature= 0,
#         max_tokens=200,
#         )
        
#     message = response.choices[0].message.content
#     print(doc)
#     print(message)
#     time.sleep(5)


#     if '1' == message:
#         prediction.append((test_spans_per_sentence[sentence_index][0], test_spans_per_sentence[sentence_index][1]))
#     elif '2' == message:
#         prediction.append((test_spans_per_sentence[sentence_index][1], test_spans_per_sentence[sentence_index][0]))
#     elif '0' == message:
#         print("No Causal Prediction")
#     else:
#         print("Wrong output")

#     test_predictions.append(prediction)
#     print(sentence_index+1, '/', len(test_sentences))

In [None]:
# Predict causal relations for every pair of events in each test sentence.
#
# Layout of the result: test_predictions[sentence][pair] is a list holding zero
# or one (cause_span, effect_span) tuple. The per-pair lists are kept on purpose
# because the cell that writes predictions.txt flattens exactly this nesting.
test_predictions = []
for sentence_index, sentence in enumerate(test_sentences):
    doc = parse_sentence(sentence)

    # Spans are matched to entities by position. If the counts differ, the
    # positions cannot be trusted, so fall back to the entities' own token
    # offsets (spaCy uses an exclusive end, like gen_spans does).
    spans = test_spans_per_sentence[sentence_index]
    if len(spans) != len(doc.ents):
        spans = [(ent.start, ent.end) for ent in doc.ents]

    sentence_predictions = []
    # combinations() yields the same (i, j) pairs with i < j, in the same order,
    # as two nested index loops.
    for (span1, ent1), (span2, ent2) in combinations(zip(spans, doc.ents), 2):
        prediction = []
        prompt = f'Answer with "1" if "{ent1}" is the cause for "{ent2}". Answer with "2" if "{ent2}" is the cause for "{ent1}". Answer with "0" if there are no causal relations between the two. Answer only with that one word.'
        print(prompt)
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "system", "content": prompt}],
            temperature=0,
            max_tokens=200,
        )

        message = response.choices[0].message.content
        print(doc)
        print(message)
        # Pause between requests.
        time.sleep(5)

        # Accept answers such as ' 1', '1.' or '"1"' as well as a bare '1'.
        answer = message.strip().strip('"\'.')
        if answer == '1':
            prediction.append((span1, span2))
        elif answer == '2':
            prediction.append((span2, span1))
        elif answer == '0':
            print("No Causal Prediction")
        else:
            print("Wrong output")

        sentence_predictions.append(prediction)
        print(sentence_index, '/', len(test_sentences))

    test_predictions.append(sentence_predictions)

In [None]:
with open("predictions.txt", "w") as f:
    # One output line per test sentence: the relations of all entity pairs,
    # comma-separated. Each sentence entry is a list of per-pair lists, so the
    # inner comprehension flattens one level.
    rows = []
    for sentence_prediction in test_predictions:
        relations = [
            str(relation)
            for pair_result in sentence_prediction
            for relation in pair_result
        ]
        rows.append(",".join(relations))

    # str() on tuples inserts spaces; remove them together with any list brackets.
    serialized = "\n".join(rows)
    for unwanted in (" ", "[", "]"):
        serialized = serialized.replace(unwanted, "")
    f.write(serialized)


In [None]:
# Dump the raw test-set predictions for inspection.
print(test_predictions)

# multiple predictions per sentence
  "Macro Precision": "0.4721191223742245",

  "Macro Recall": "0.5880952380952381",

  "Macro F1": "0.5085862665868351",

  "Micro Precision": "0.3944223107569721",

  "Micro Recall": "0.6149068322981367",
  
  "Micro F1": "0.48058252427184467"


# only one prediction per sentence
  "Macro Precision": "0.5612244897959183",

  "Macro Recall": "0.4340136054421769",

  "Macro F1": "0.4687074829931972",

  "Micro Precision": "0.6111111111111112",

  "Micro Recall": "0.3416149068322981",

  "Micro F1": "0.43824701195219123"