In [1]:
import spacy
from spacy.tokens import Doc, SpanGroup
from spacy.matcher import Matcher
from zipfile import ZipFile
from pathlib import Path
from tqdm import autonotebook as tqdm
from spacy.training import biluo_tags_to_spans
import iobes
import re
from itertools import combinations
import openai
import os
import time

# Take the OpenAI API key from the OPENAI_API_KEY environment variable
# (None when the variable is not set) so it is never written into the notebook.
openai.api_key = os.getenv('OPENAI_API_KEY')



  from tqdm import autonotebook as tqdm


In [2]:
# Load the spaCy pipeline "en_core_web_sm"; parse_sentence() uses its vocab to
# build pre-tokenized Docs and runs the pipeline over them.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Directory holding the zipped dataset files, relative to the working directory.
data_dir = Path("./data/teaching-dataset")

# Training sentences: blocks separated by a blank line, one "<word> <tag>" item
# per line. ZipFile is used as a context manager so the archive is closed when
# the block ends, and the text is stripped first so that a trailing newline
# cannot create an empty item (which would break the later
# `word, tag = item.split(" ")` unpacking).
with ZipFile(data_dir / "relation_classification_text_train.zip") as zip_file:
    with zip_file.open("input.txt") as f:
        sentences = [
            sentence.split("\n")
            for sentence in f.read().decode("utf-8").strip().split("\n\n")
        ]

# Reference relations: one line per sentence, each relation written as
# "((cause_start,cause_end),(effect_start,effect_end))". A line without any
# match yields an empty list. splitlines() keeps empty lines inside the file
# but does not add a spurious extra entry for a final newline.
relation_pattern = re.compile(r"\(\((\d+),(\d+)\),\((\d+),(\d+)\)\)")
with ZipFile(data_dir / "relation_classification_references_train.zip") as zip_file:
    with zip_file.open("references.txt") as f:
        labels = []
        for line in f.read().decode("utf-8").splitlines():
            relations = [
                (
                    (int(match.group(1)), int(match.group(2))),
                    (int(match.group(3)), int(match.group(4))),
                )
                for match in relation_pattern.finditer(line)
            ]
            labels.append(relations)

# Print a short summary instead of dumping the whole dataset into the output.
print(f"Loaded {len(sentences)} sentences and {len(labels)} label lines")



In [4]:
def parse_sentence(sentence):
    """Turn one tagged sentence into a spaCy Doc whose ents are the tagged spans.

    Args:
        sentence: iterable of "<word> <tag>" strings (word and tag separated by
            exactly one space).

    Returns:
        A Doc built from the words, processed by the module-level `nlp`
        pipeline, with `doc.ents` set from the tags after converting them with
        `iobes.bio_to_bilou`.

    Raises:
        ValueError: if an item does not split into exactly two parts on " ".
    """
    pairs = [item.split(" ") for item in sentence]
    words = [word for word, _ in pairs]
    tags = [tag for _, tag in pairs]

    # Build the Doc from the given tokens so token indices match the input order.
    annotated = nlp(Doc(nlp.vocab, words=words))
    annotated.ents = biluo_tags_to_spans(annotated, iobes.bio_to_bilou(tags))
    return annotated


# Print every raw training sentence (a list of "<word> <tag>" strings) for inspection.
for sentence in sentences:
    print(sentence)

['According O', 'two O', 'different O', 'studies O', 'it O', 'seems O', 'plausible O', 'that O', 'the O', 'Pohang B-EVENT', 'earthquake I-EVENT', 'was O', 'induced O', 'by O', 'EGS B-EVENT', 'operations I-EVENT', '. O']
['Signs O', 'and O', 'symptoms O', 'include O', ': O', 'Dyspnea O', '( O', 'shortness O', 'of O', 'breath O', ') O', 'exacerbated O', 'by O', 'exertion O', 'Cough O', ', O', 'often O', 'persistent O', 'and O', 'sometimes O', 'severe O', 'Fatigue O', 'Tachypnea O', '( O', 'rapid O', 'breathing O', ') O', 'which O', 'is O', 'often O', 'labored O', ', O', 'Loss O', 'of O', 'appetite O', 'and O', 'weight O', 'loss O', 'Chest O', 'pain O', 'Fever O', 'Gradual O', 'darkening O', 'of O', 'skin O', '( O', 'blue O', 'skin O', ') O', 'Gradual B-EVENT', 'dark I-EVENT', 'shallow I-EVENT', 'rifts I-EVENT', 'in I-EVENT', 'nails I-EVENT', 'eventually O', 'leading O', 'to O', 'cracks B-EVENT', 'as O', 'protein O', 'fibers O', 'within O', 'nail O', 'beds O', 'are O', 'destroyed O', '. O

In [5]:
# def print_tags_and_tokens_for_sentences(sentences):
#     for sentence in sentences:
#         doc = parse_sentence(sentence)
#         for token in doc:
#             print(f"{token.text}, {token.ents}, POS-Tag: {token.pos_}, Dep-Tag: {token.dep_}")
#         print("\n")
# print_tags_and_tokens_for_sentences(sentences)


In [6]:
def gen_spans(sentences):
    """Collect the token spans of all EVENT entities in each sentence.

    Args:
        sentences: list of sentences, each a list of "<word> <tag>" strings.

    Returns:
        One list per sentence containing a (start, end) tuple for every
        'B-EVENT' tag, in order of appearance. `start` is the index of the
        'B-EVENT' token and `end` is exclusive: the index of the first
        following token that is tagged 'O' or starts a new entity ('B-...'),
        or the sentence length when the entity runs to the end.

    Raises:
        ValueError: if an item does not split into exactly two parts on ' '.
    """
    spans_per_sentence = []
    for sentence in sentences:
        tags = []
        for item in sentence:
            _word, tag = item.split(' ')
            tags.append(tag)

        spans = []
        for start, tag in enumerate(tags):
            if tag != 'B-EVENT':
                continue
            end = start + 1
            # Stop at 'O', at the start of the next entity, or at the end of the
            # sentence, so entities that close the sentence are not dropped and
            # adjacent entities are not merged.
            while end < len(tags) and tags[end] != 'O' and not tags[end].startswith('B-'):
                end += 1
            spans.append((start, end))
        spans_per_sentence.append(spans)
    return spans_per_sentence
    
# Entity spans for every training sentence as (start, end) token indices;
# gen_spans() produces them with an exclusive end index.
spans_per_sentence = gen_spans(sentences)
print(spans_per_sentence)



[[(9, 11), (14, 16)], [(49, 55), (58, 59)], [(0, 4), (8, 11), (14, 21)], [(0, 2), (5, 6), (11, 13), (14, 18), (24, 26), (29, 31), (32, 33), (35, 38)], [(32, 39), (43, 46)], [(0, 2), (9, 15)], [(0, 1), (33, 37)], [(0, 2), (5, 8)], [(7, 9), (20, 24), (25, 28), (36, 38)], [(0, 2), (4, 7), (14, 17)], [(3, 8), (12, 22), (24, 32)], [(1, 5), (8, 12)], [(0, 7), (10, 11), (12, 13), (14, 17), (18, 20), (22, 26)], [(0, 1), (4, 7), (11, 13), (14, 15), (16, 17), (18, 19)], [(0, 1), (16, 19)], [(1, 4), (6, 14)], [(1, 6), (12, 13)], [(5, 6), (7, 8)], [(0, 2), (5, 7), (10, 11), (17, 18), (20, 21)], [(26, 28), (30, 33), (34, 35), (36, 37), (39, 40), (42, 44)], [(0, 2), (13, 17), (19, 26), (38, 41), (57, 64), (65, 69), (70, 81), (82, 88), (94, 97)], [(1, 3), (8, 11), (22, 24)], [(5, 7), (9, 22), (23, 25), (28, 31)], [(1, 2), (4, 7)], [(11, 14), (16, 19)], [(0, 2), (5, 7), (9, 11)], [(1, 9), (19, 20)], [(0, 3), (6, 9), (32, 33)], [(1, 2), (7, 9), (11, 13), (14, 16), (17, 19), (20, 24), (28, 29), (30, 33)

In [7]:
# Show the reference relations parsed for the first training sentence.
labels[0]

[((14, 16), (9, 11))]

In [15]:
# For every training sentence, ask the chat model which of the first two tagged
# entities causes the other and store the answer as a (cause_span, effect_span)
# pair (an empty list means "no relation predicted").
# NOTE(review): only doc.ents[0] and doc.ents[1] are compared, so a sentence with
# more than two entities can yield at most one predicted relation.
MAX_ATTEMPTS = 5  # tries per request before a RateLimitError is re-raised
RETRY_DELAY = 10  # seconds; doubled after every rate-limited attempt

predictions = []
for sentence_index in range(len(sentences)):
    prediction = []
    doc = parse_sentence(sentences[sentence_index])
    spans = spans_per_sentence[sentence_index]

    if len(doc.ents) < 2 or len(spans) < 2:
        # The prompt needs two entities; record an empty prediction instead of
        # failing with an IndexError.
        print("Fewer than two entities, skipping")
    else:
        prompt = f'Answer with "1" if "{doc.ents[0]}" is the cause for "{doc.ents[1]}". Answer with "2" if "{doc.ents[1]}" is the cause for "{doc.ents[0]}". Answer with "0" if there are no causal relations between the two. Answer only with that one word.'
        print(prompt)

        # Retry with exponential backoff when the API reports a rate limit /
        # overload, so one transient error does not abort the whole run.
        for attempt in range(MAX_ATTEMPTS):
            try:
                response = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    messages=[{"role": "system", "content": prompt},],
                    temperature=0,
                    max_tokens=200,
                )
                break
            except openai.error.RateLimitError:
                if attempt == MAX_ATTEMPTS - 1:
                    raise
                time.sleep(RETRY_DELAY * 2 ** attempt)

        message = response.choices[0].message.content
        print(doc)
        print(message)
        time.sleep(10)  # fixed pause between consecutive requests

        # Tolerate surrounding whitespace, quotes or a trailing period in the reply.
        answer = message.strip().strip('"\'.')
        if answer == '1':
            prediction.append((spans[0], spans[1]))
        elif answer == '2':
            prediction.append((spans[1], spans[0]))
        elif answer == '0':
            print("No Causal Prediction")
        else:
            print("Wrong output")

    predictions.append(prediction)
    print(sentence_index + 1, '/', len(sentences))


Answer with "1" if "Pohang earthquake" is the cause for "EGS operations". Answer with "2" if "EGS operations" is the cause for "Pohang earthquake". Answer with "0" if there are no causal relations between the two. Answer only with that one word.
According two different studies it seems plausible that the Pohang earthquake was induced by EGS operations . 
1
Answer with "1" if "Gradual dark shallow rifts in nails" is the cause for "cracks". Answer with "2" if "cracks" is the cause for "Gradual dark shallow rifts in nails". Answer with "0" if there are no causal relations between the two. Answer only with that one word.
Signs and symptoms include : Dyspnea ( shortness of breath ) exacerbated by exertion Cough , often persistent and sometimes severe Fatigue Tachypnea ( rapid breathing ) which is often labored , Loss of appetite and weight loss Chest pain Fever Gradual darkening of skin ( blue skin ) Gradual dark shallow rifts in nails eventually leading to cracks as protein fibers within n

RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID e6b017afd8b7bd6c7f3d81705eca7af9 in your message.)

In [16]:
# Show the predicted (cause_span, effect_span) pairs, one list per processed sentence.
print(predictions)

[[((9, 11), (14, 16))], [((49, 55), (58, 59))], [((0, 4), (8, 11))], [((0, 2), (5, 6))], [((32, 39), (43, 46))], [((0, 2), (9, 15))], [((33, 37), (0, 1))], [((0, 2), (5, 8))], [((20, 24), (7, 9))], [((0, 2), (4, 7))], [((3, 8), (12, 22))], [((1, 5), (8, 12))], [((0, 7), (10, 11))], [((0, 1), (4, 7))], [((0, 1), (16, 19))], [((1, 4), (6, 14))], [((1, 6), (12, 13))], [((5, 6), (7, 8))], [((0, 2), (5, 7))], [], [((0, 2), (13, 17))], [((1, 3), (8, 11))], [((5, 7), (9, 22))], [((1, 2), (4, 7))], [((11, 14), (16, 19))], [((0, 2), (5, 7))], [((1, 9), (19, 20))], [((0, 3), (6, 9))], [], [((4, 6), (9, 10))], [((0, 2), (8, 11))], [((0, 3), (5, 15))], [((0, 1), (3, 6))], [((4, 8), (15, 17))], [((1, 2), (4, 6))], [((0, 2), (13, 14))], [((7, 9), (16, 17))]]


In [17]:
def evaluate(predictions, references, micro_avg=True):
    """Exact-match precision, recall and F1 for predicted relations.

    Predictions and references are paired per sentence with zip(), so any
    surplus entries in the longer list are ignored. Within a sentence both
    sides are compared as sets, so duplicates count once.

    NOTE: a later cell defines another `evaluate(references, predictions)` with
    the argument order reversed; once that cell has run it replaces this one.

    Args:
        predictions: list (one entry per sentence) of lists of hashable relations.
        references: list (one entry per sentence) of lists of hashable relations.
        micro_avg: if True, pool the counts of all sentences before computing
            the scores; if False, compute the scores per sentence and average.

    Returns:
        Tuple (precision, recall, f1). A score is 0 whenever there is no true
        positive. With no sentence pairs at all the result is (0, 0, 0).
    """
    tp, fp, fn = [], [], []
    for prediction, reference in zip(predictions, references):
        predicted, expected = set(prediction), set(reference)
        tp.append(len(predicted & expected))
        fp.append(len(predicted - expected))
        fn.append(len(expected - predicted))

    if micro_avg:
        tp, fp, fn = [sum(tp)], [sum(fp)], [sum(fn)]

    if not tp:
        # Macro average over zero sentences: avoid dividing by len([]) == 0.
        return 0, 0, 0

    precision = [0 if hit == 0 else hit / (hit + wrong) for hit, wrong in zip(tp, fp)]
    recall = [0 if hit == 0 else hit / (hit + missed) for hit, missed in zip(tp, fn)]
    f1 = [
        0 if p * r == 0 else 2 * p * r / (p + r)
        for p, r in zip(precision, recall)
    ]
    return (
        sum(precision) / len(precision),
        sum(recall) / len(recall),
        sum(f1) / len(f1),
    )


# Score the training predictions against the gold labels, once with pooled
# (micro) counts and once averaged per sentence (macro).
micro_precision, micro_recall, micro_f1 = evaluate(predictions, labels, True)
macro_precision, macro_recall, macro_f1 = evaluate(predictions, labels, False)

# Report every score rounded to two decimals.
print(f"Micro Precision: {micro_precision:.2f}")
print(f"Micro Recall: {micro_recall:.2f}")
print(f"Micro F1: {micro_f1:.2f}")
print(f"Macro Precision: {macro_precision:.2f}")
print(f"Macro Recall: {macro_recall:.2f}")
print(f"Macro F1: {macro_f1:.2f}")


Micro Precision: 0.74
Micro Recall: 0.26
Micro F1: 0.39
Macro Precision: 0.70
Macro Recall: 0.49
Macro F1: 0.54


In [18]:
def overlap(ref_event, pred_event):
    """Return True when two (start, end) events overlap or touch.

    The later of the two starts is compared with the earlier of the two ends
    using `<=`.

    NOTE(review): because the comparison is inclusive, events that merely touch,
    e.g. (0, 2) and (2, 4), count as overlapping. gen_spans() in this notebook
    produces exclusive end indices, so this may be more lenient than intended;
    confirm against the task's official scoring before changing it.
    """
    latest_start = max(ref_event[0], pred_event[0])
    earliest_end = min(ref_event[1], pred_event[1])
    return latest_start <= earliest_end


def evaluate_pair(reference, prediction):
    """Score one predicted (cause, effect) relation against one reference.

    Returns:
        1 if cause and effect are both identical to the reference,
        0.5 if both the causes and the effects overlap (see `overlap`),
        0 otherwise.
    """
    ref_cause, ref_effect = reference
    pred_cause, pred_effect = prediction

    exact = ref_cause == pred_cause and ref_effect == pred_effect
    if exact:
        return 1

    partial = overlap(ref_cause, pred_cause) and overlap(ref_effect, pred_effect)
    return 0.5 if partial else 0

def precision(tp, fp):
    """Return tp / (tp + fp), or 0 when there are no true positives."""
    return tp / (tp + fp) if tp else 0

def recall(tp, fn):
    """Return tp / (tp + fn), or 0 when there are no true positives."""
    return tp / (tp + fn) if tp else 0

def f1(tp, fp, fn):
    """Return the F1 score 2*tp / (2*tp + fp + fn), or 0 when there are no true positives."""
    return 2 * tp / (2 * tp + fp + fn) if tp else 0

def evaluate(references, predictions):
    """Macro and micro precision/recall/F1 with partial credit for overlapping spans.

    NOTE: this definition replaces the earlier
    `evaluate(predictions, references, micro_avg)` of the same name and takes
    its two lists in the opposite order.

    References and predictions are paired per sentence with zip(), so surplus
    entries in the longer list are ignored. Each prediction is matched against
    the first still-unmatched reference for which `evaluate_pair` returns a
    non-zero score; that score (1 or 0.5) is added to the true positives and
    the reference is consumed. Predictions without a match count as one false
    positive, unmatched references as one false negative each.

    Args:
        references: list (one entry per sentence) of lists of
            ((cause_start, cause_end), (effect_start, effect_end)) tuples.
        predictions: list in the same format.

    Returns:
        {"macro": {...}, "micro": {...}}, each inner dict holding "precision",
        "recall" and "f1". All scores are 0 when there are no sentence pairs.
    """
    tps, fps, fns = [], [], []
    for reference, prediction in zip(references, predictions):
        tp, fp, fn = 0, 0, 0
        remaining_references = set(reference)
        for pred in prediction:
            for ref in remaining_references:
                score = evaluate_pair(ref, pred)
                if score:
                    tp += score
                    # Safe although we are iterating the set: we break right away.
                    remaining_references.remove(ref)
                    break
            else:
                # No reference matched this prediction at all.
                fp += 1
        fn += len(remaining_references)
        tps.append(tp)
        fps.append(fp)
        fns.append(fn)

    if not tps:
        # Nothing to average: avoid dividing by len(tps) == 0.
        return {
            "macro": {"precision": 0, "recall": 0, "f1": 0},
            "micro": {"precision": 0, "recall": 0, "f1": 0},
        }

    macro_prec = sum([precision(tp, fp) for tp, fp in zip(tps, fps)]) / len(tps)
    macro_rec = sum([recall(tp, fn) for tp, fn in zip(tps, fns)]) / len(tps)
    macro_f1 = sum([f1(tp, fp, fn) for tp, fp, fn in zip(tps, fps, fns)]) / len(tps)
    micro_prec = precision(sum(tps), sum(fps))
    micro_rec = recall(sum(tps), sum(fns))
    micro_f1 = f1(sum(tps), sum(fps), sum(fns))
    return {
        "macro": {"precision": macro_prec, "recall": macro_rec, "f1": macro_f1},
        "micro": {"precision": micro_prec, "recall": micro_rec, "f1": micro_f1},
    }

In [19]:
# Score the training predictions with the partial-credit
# evaluate(references, predictions) defined in the previous cell
# (note the argument order: references first).
evaluate(labels, predictions)

# evaluate(labels, [((14, 16), (9, 11))])

{'macro': {'precision': 0.7027027027027027,
  'recall': 0.48783783783783785,
  'f1': 0.5378378378378377},
 'micro': {'precision': 0.7428571428571429,
  'recall': 0.26,
  'f1': 0.3851851851851852}}

In [13]:
# Directory holding the zipped dataset files, relative to the working directory.
data_dir = Path("./data/teaching-dataset")

# Test sentences: blocks separated by a blank line, one item per line.
# ZipFile is used as a context manager so the archive is closed when the block
# ends, and the text is stripped first so that a trailing newline cannot create
# an empty item (gen_spans() unpacks every item into exactly two parts).
with ZipFile(data_dir / "relation_classification_text_test.zip") as zip_file:
    with zip_file.open("input.txt") as f:
        test_sentences = [
            sentence.split("\n")
            for sentence in f.read().decode("utf-8").strip().split("\n\n")
        ]

# Entity spans per test sentence, as (start, end) token indices.
test_spans_per_sentence = gen_spans(test_sentences)
# test_predictions = predict(test_sentences, test_spans_per_sentence)

# test_predictions = []
# for sentence in tqdm.tqdm(test_sentences):
#     doc = nlp(Doc(nlp.vocab, words=sentence))
#     test_predictions.append(predict(doc))


In [11]:
# For every test sentence, ask the chat model which of the first two tagged
# entities causes the other and store the answer as a (cause_span, effect_span)
# pair (an empty list means "no relation predicted").
# NOTE(review): only doc.ents[0] and doc.ents[1] are compared, so a sentence with
# more than two entities can yield at most one predicted relation.
MAX_ATTEMPTS = 5  # tries per request before a RateLimitError is re-raised
RETRY_DELAY = 10  # seconds; doubled after every rate-limited attempt

test_predictions = []
for sentence_index in range(len(test_sentences)):
    prediction = []
    doc = parse_sentence(test_sentences[sentence_index])
    spans = test_spans_per_sentence[sentence_index]

    if len(doc.ents) < 2 or len(spans) < 2:
        # The prompt needs two entities; record an empty prediction instead of
        # failing with an IndexError.
        print("Fewer than two entities, skipping")
    else:
        prompt = f'Answer with "1" if "{doc.ents[0]}" is the cause for "{doc.ents[1]}". Answer with "2" if "{doc.ents[1]}" is the cause for "{doc.ents[0]}". Answer with "0" if there are no causal relations between the two. Answer only with that one word.'
        print(prompt)

        # Retry with exponential backoff when the API reports a rate limit /
        # overload, so one transient error does not abort the whole run.
        for attempt in range(MAX_ATTEMPTS):
            try:
                response = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    messages=[{"role": "system", "content": prompt},],
                    temperature=0,
                    max_tokens=200,
                )
                break
            except openai.error.RateLimitError:
                if attempt == MAX_ATTEMPTS - 1:
                    raise
                time.sleep(RETRY_DELAY * 2 ** attempt)

        message = response.choices[0].message.content
        print(doc)
        print(message)
        time.sleep(5)  # fixed pause between consecutive requests

        # Tolerate surrounding whitespace, quotes or a trailing period in the reply.
        answer = message.strip().strip('"\'.')
        if answer == '1':
            prediction.append((spans[0], spans[1]))
        elif answer == '2':
            prediction.append((spans[1], spans[0]))
        elif answer == '0':
            print("No Causal Prediction")
        else:
            print("Wrong output")

    test_predictions.append(prediction)
    print(sentence_index + 1, '/', len(test_sentences))

Answer with "1" if "fallout" is the cause for "localized burns". Answer with "2" if "localized burns" is the cause for "fallout". Answer with "0" if there are no causal relations between the two. Answer only with that one word.
After the Trinity test , the fallout caused localized burns on the backs of cattle in the area downwind . 
0
No Causal Prediction
Answer with "1" if "Arteriovenous fistula" is the cause for "chronic venous insufficiency". Answer with "2" if "chronic venous insufficiency" is the cause for "Arteriovenous fistula". Answer with "0" if there are no causal relations between the two. Answer only with that one word.
Arteriovenous fistula ( an abnormal connection or passageway between an artery and a vein ) may cause chronic venous insufficiency even with working vein valves . 
0
No Causal Prediction
Answer with "1" if "decreased protein intake" is the cause for "mobility limitations". Answer with "2" if "mobility limitations" is the cause for "decreased protein intake".

RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 2bdffdab14121743ee08342536420393 in your message.)

In [12]:
# Write one line per test sentence: its predicted relations joined by commas,
# with every space removed from the tuple representations.
with open("predictions.txt", "w") as f:
    serialized = "\n".join(
        ",".join(map(str, prediction)) for prediction in test_predictions
    )
    f.write(serialized.replace(" ", ""))