In [1]:
#!pip install iobes
#!pip install seqeval
#!pip install sklearn_crfsuite 
#!conda install spacy
#!pip uninstall numpy
#!pip install numpy

import spacy
from spacy.tokens import Doc, SpanGroup
from spacy.matcher import Matcher
from zipfile import ZipFile
from pathlib import Path
#from tqdm import autonotebook as tqdm
from tqdm import tqdm
from spacy.training import biluo_tags_to_spans
import iobes
import re
from itertools import combinations

import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from seqeval import scheme

import sklearn_crfsuite
from collections import Counter


In [2]:
#!python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

In [3]:
data_dir = Path("../data/teaching-dataset")
with (data_dir / "relation_classification_text_train.zip").open("rb") as file:
    zip_file = ZipFile(file)
    with zip_file.open("input.txt") as f:
        sentences = [
            sentence.split("\n") for sentence in f.read().decode("utf-8").split("\n\n")
        ]
with (data_dir / "relation_classification_references_train.zip").open("rb") as file:
    zip_file = ZipFile(file)
    with zip_file.open("references.txt") as f:
        labels = []
        for line in f.read().decode("utf-8").split("\n"):
            relations = []
            for relation in re.finditer(r"\(\((\d+),(\d+)\),\((\d+),(\d+)\)\)", line):
                relation = (
                    (int(relation.group(1)), int(relation.group(2))),
                    (int(relation.group(3)), int(relation.group(4))),
                )
                relations.append(relation)
            labels.append(relations)

In [4]:
def parse_sentence(sentence):
    words = []
    tags = []
    for item in sentence:
        word, tag = item.split(" ")
        words.append(word)
        tags.append(tag)
    doc = Doc(nlp.vocab, words=words)
    doc = nlp(doc)
    tags = iobes.bio_to_bilou(tags)
    doc.ents = biluo_tags_to_spans(doc, tags)
    return doc

In [5]:
# get list of all sentences with multiple relationships
helperlist = []
for i in range(len(sentences)):
    if len(labels[i]) > 1:
        helperlist.append(int(i))
print(helperlist)

[2, 3, 8, 9, 10, 12, 13, 18, 19, 20, 21, 22, 25, 27, 28, 30, 33, 34, 35, 36, 40, 43, 45, 47, 50, 51, 61, 64, 65, 67, 70, 72, 74, 77, 78, 79, 81, 82, 85, 86, 90, 91, 93, 95, 97, 99, 100, 102, 103, 104, 105, 106, 107, 108, 111, 112, 113, 114, 115, 118, 121, 123, 128, 130, 136, 138, 140, 145, 146, 149, 151, 155, 157, 159, 160, 170, 173, 174, 177, 179, 180, 182, 183, 185, 193, 194, 195, 197, 200, 202, 205, 209, 210, 211, 212, 220, 227, 228, 229, 230, 232, 233, 236, 238, 239, 240, 241, 246, 249, 252, 262, 264, 266, 268, 271, 275, 277, 279, 280, 284, 288, 294, 296, 301, 302, 303, 305, 307, 309, 312, 314, 316, 317, 320, 322, 323, 324, 325, 335, 338, 340, 343, 344, 346, 347, 349, 357, 359, 362, 369, 372, 373, 374, 382, 385, 388, 389, 390, 391, 392, 393, 394, 396, 397, 400, 401, 403, 407, 409, 410, 411, 413, 415, 422, 427, 433, 434, 437, 441, 442, 446, 448, 449, 450, 451, 452, 453, 454, 455, 456, 458, 460, 464, 466, 467]


In [6]:
# for each sentence, get list of event spans 
helperarray = np.empty([10000,2])
counter2 = 0
for i in range(len(sentences)):
    #print(labels[i])
    for entries in labels[i]:
        #print(entries)
        for numbers in entries:
            #print(numbers)
            for singlets in numbers:
                #print(i, singlets)
                helperarray[counter2] = [i, singlets]
                counter2 += 1
print(helperarray)

[[0.00000000e+000 1.40000000e+001]
 [0.00000000e+000 1.60000000e+001]
 [0.00000000e+000 9.00000000e+000]
 ...
 [7.67694262e+170 1.07692185e+026]
 [2.52022643e+180 1.07692187e+026]
 [2.43207542e-152 2.31412756e-152]]


In [7]:
# get list of event positions for each sentence
linelist = []
splitSentenceList = []
newDict = {}
labelDict = {}
indexcounter = 0

# items = sentences with multiple relations
for items in helperlist:
    # helperarray contains events per sentence 
    for lines in helperarray:
        # linelist is list of event positions per sentence
        if lines[0] == items:
            linelist.append(lines[1])
    # pop 4 positions each time
    for i in range(int(len(linelist)/4)):
        splitSentenceList.append(linelist.pop(0))
        splitSentenceList.append(linelist.pop(0))
        splitSentenceList.append(linelist.pop(0))
        splitSentenceList.append(linelist.pop(0))
        
        original_sentence = sentences[items]
        #print(original_sentence)
        modified_sentences = [None] * len(sentences[items])
        
        # iterate over 2 intervals, positions 1,2 and positions 3,4
        for j in range(len(sentences[items])):
            
            #get labels for new sentences 
            if int(splitSentenceList[0]) < int(splitSentenceList[2]):
                labelDict[indexcounter] = '0'
            else: 
                labelDict[indexcounter] = '1'
                
            #delete event label if outside of intervals
            if j not in range(int(splitSentenceList[0]), int(splitSentenceList[1])) and j not in range(int(splitSentenceList[2]), int(splitSentenceList[3])):
                w, t = sentences[items][j].split(' ')
                #print(w)
                t = 'O'
                modified_sentences[j] = ' '.join((w,t))
            else:
                modified_sentences[j] = sentences[items][j]

        
        sentences[items] = original_sentence
    
        newDict[indexcounter] = modified_sentences
        #break
        indexcounter += 1
        
    
    
    
    splitSentenceList = []
    linelist = []
#print(linelist)
print(labelDict)
print(newDict)

{0: '0', 1: '0', 2: '0', 3: '0', 4: '0', 5: '0', 6: '0', 7: '0', 8: '0', 9: '0', 10: '0', 11: '0', 12: '0', 13: '0', 14: '0', 15: '0', 16: '0', 17: '0', 18: '0', 19: '1', 20: '1', 21: '0', 22: '0', 23: '0', 24: '0', 25: '0', 26: '0', 27: '0', 28: '0', 29: '0', 30: '0', 31: '0', 32: '0', 33: '0', 34: '0', 35: '0', 36: '0', 37: '0', 38: '0', 39: '1', 40: '1', 41: '1', 42: '1', 43: '1', 44: '1', 45: '1', 46: '1', 47: '0', 48: '0', 49: '0', 50: '0', 51: '0', 52: '0', 53: '0', 54: '0', 55: '0', 56: '1', 57: '1', 58: '1', 59: '1', 60: '1', 61: '1', 62: '1', 63: '1', 64: '0', 65: '0', 66: '0', 67: '0', 68: '0', 69: '0', 70: '0', 71: '0', 72: '0', 73: '0', 74: '0', 75: '0', 76: '0', 77: '0', 78: '0', 79: '0', 80: '0', 81: '0', 82: '0', 83: '1', 84: '1', 85: '1', 86: '1', 87: '0', 88: '0', 89: '0', 90: '0', 91: '0', 92: '1', 93: '1', 94: '0', 95: '0', 96: '0', 97: '1', 98: '1', 99: '1', 100: '1', 101: '1', 102: '1', 103: '1', 104: '0', 105: '0', 106: '0', 107: '0', 108: '0', 109: '0', 110: '0',

In [8]:
# pop 4 instances from linelist
#splitSentenceList = []
#splitSentenceList.append(linelist.pop(0))
#splitSentenceList.append(linelist.pop(0))
#s#plitSentenceList.append(linelist.pop(0))
#splitSentenceList.append(linelist.pop(0))
#print(splitSentenceList)
#print(linelist)

In [9]:
#print(sentences[2])
#print(len(sentences[2]))
#original_sentence = sentences[2]
#modified_sentences = [None] * len(sentences[2])
#for i in range(len(sentences[2])):
#    if i not in range(int(splitSentenceList[0]), int(splitSentenceList[1])) and i not in range(int(splitSentenceList[2]), int(splitSentenceList[3])):
#        w, t = sentences[2][i].split(' ')
#        t = 'O'
#        modified_sentences[i] = ' '.join((w,t))
#    else:
#        modified_sentences[i] = sentences[2][i]

#print(sentences[2])
#modified_sentences.append(sentences[2])
#print(modified_sentences)
#sentences[2] = original_sentence

In [10]:
#newDict = {'a': 'test'}
#newDict[0] = original_sentence
#print(newDict[0])

In [11]:
print(original_sentence)
print(modified_sentences[0])

['High B-EVENT', 'infection I-EVENT', 'rates I-EVENT', 'for I-EVENT', 'diseases I-EVENT', 'such O', 'as O', 'respiratory B-EVENT', 'infections I-EVENT', ', O', 'malaria B-EVENT', ', O', 'tuberculosis B-EVENT', ', O', 'as O', 'well O', 'as O', 'sexually B-EVENT', 'transmitted I-EVENT', 'diseases I-EVENT', 'like O', 'HIV B-EVENT', '/ I-EVENT', 'AIDS I-EVENT', 'are O', 'the O', 'leading O', 'causes O', 'of O', 'death B-EVENT', 'for O', 'Haitians O', '. O']
High O


In [12]:
#for words in newDict:
#    print(words)

In [13]:
#new df with 1 line per sentence 
sentence_number=[]
wordlist = []
taglist = []
POSlist = []
DEPlist = []
customclass = []
iCounter = len(sentences)

biggerlist = []
biggertaglist = []
biggerPOSlist = []
biggerDEPlist = []
for i in range(len(sentences)):
    if labels[i][0][0][0] < labels[0][0][1][0]:
        customclass.append('0')
    elif labels[i][0][0][0] >= labels[0][0][1][0]:
        customclass.append('1')
for i in range(len(sentences)):
    sentence_number.append(str(i))
    
    for word in sentences[i]:
        wordlist.append(str(word.split()[0]))
        taglist.append(str(word.split()[1]))

    doc = parse_sentence(sentences[i])
    for token in doc:
        POSlist.append(token.pos_)
        DEPlist.append(token.dep_)
            

    biggerlist.append(wordlist)
    biggertaglist.append(taglist)
    biggerPOSlist.append(POSlist)
    biggerDEPlist.append(DEPlist)
    
    wordlist = []
    taglist = []
    POSlist = []
    DEPlist = []
    
    
#------------
for entries in labelDict:
    customclass.append(labelDict[entries])
    sentence_number.append(iCounter)
    iCounter += 1
    #print(iCounter)
    
    for word in newDict[entries]:
    #for word in newDict:
        wordlist.append(str(word.split()[0]))
        taglist.append(str(word.split()[1]))
    
    doc = parse_sentence(newDict[int(entries)])
    #print(doc)
    for token in doc:
        POSlist.append(token.pos_)
        DEPlist.append(token.dep_)
            

    biggerlist.append(wordlist)
    biggertaglist.append(taglist)
    biggerPOSlist.append(POSlist)
    biggerDEPlist.append(DEPlist)
    
    wordlist = []
    taglist = []
    POSlist = []
    DEPlist = []
     
    
train_data = {'Sentence #': sentence_number, 'Wordlist': biggerlist, 'Taglist': biggertaglist, 'POS': biggerPOSlist, 'DEP': biggerDEPlist, 'customclass': customclass,}
df = pd.DataFrame(data=train_data)
print(df)
print(customclass)

     Sentence #                                           Wordlist  \
0             0  [According, two, different, studies, it, seems...   
1             1  [Signs, and, symptoms, include, :, Dyspnea, (,...   
2             2  [Pneumatic, drilling, in, mines, and, less, co...   
3             3  [Interleukin-1, inhibitors, ,, such, as, canak...   
4             4  [Two, of, the, reviews, postulate, that, exerc...   
...         ...                                                ...   
1074       1074  [High, infection, rates, for, diseases, such, ...   
1075       1075  [High, infection, rates, for, diseases, such, ...   
1076       1076  [High, infection, rates, for, diseases, such, ...   
1077       1077  [High, infection, rates, for, diseases, such, ...   
1078       1078  [High, infection, rates, for, diseases, such, ...   

                                                Taglist  \
0     [O, O, O, O, O, O, O, O, O, B-EVENT, I-EVENT, ...   
1     [O, O, O, O, O, O, O, O, O, O, O, O

In [14]:
df.isnull().sum()

df = df.fillna(method='ffill')

df.groupby('customclass').size().reset_index(name='counts')
print(df.groupby('customclass').size().reset_index(name='counts'))

#X = df.drop('Tag', axis=1)
X = df.drop('customclass', axis=1)
#print(X)

v = DictVectorizer(sparse=False)

X = v.fit_transform(X.to_dict('records'))

for i in range(10):
    print(X[i])
    
#y = df.Tag.values
y = df.customclass.values
classes = np.unique(y)
classes = classes.tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=3)
X_train.shape, y_train.shape

  customclass  counts
0           0     726
1           1     353
[1. 0. 0. ... 0. 0. 0.]
[2. 0. 0. ... 0. 0. 0.]
[1. 0. 0. ... 0. 0. 0.]
[1. 0. 0. ... 0. 0. 0.]
[1. 1. 1. ... 0. 0. 0.]
[1. 1. 0. ... 0. 0. 0.]
[1. 1. 0. ... 0. 0. 0.]
[1. 0. 0. ... 0. 0. 0.]
[1. 0. 0. ... 0. 0. 0.]
[1. 0. 0. ... 0. 0. 0.]


((863, 4016), (863,))

In [15]:
per = Perceptron(verbose=10, n_jobs=-1, max_iter=5)
per.partial_fit(X_train, y_train, classes)
#per.fit(X_train, y_train, classes, coef_init = )

-- Epoch 1
Norm: 679.26, NNZs: 2460, Bias: -39.000000, T: 863, Avg. loss: 81705.526072
Total training time: 0.00 seconds.


Perceptron(max_iter=5, n_jobs=-1, verbose=10)

In [16]:
#per.partial_fit(X_train, y_train, classes)

In [17]:
new_classes = classes.copy()
print(new_classes)

['0', '1']


In [18]:
print(classification_report(y_pred=per.predict(X_test), y_true=y_test))

              precision    recall  f1-score   support

           0       0.65      0.99      0.78       136
           1       0.80      0.10      0.18        80

    accuracy                           0.66       216
   macro avg       0.73      0.54      0.48       216
weighted avg       0.71      0.66      0.56       216



In [19]:
import sys
np.set_printoptions(threshold=sys.maxsize)
print(per.predict(X_test))
print(y_test)

['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '1' '0' '1' '0' '0' '1' '0'
 '0' '0' '0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0']
['1' '0' '0' '0' '1' '0' '0' '1' '1' '0' '0' '0' '1' '0' '1' '1' '0' '0'
 '1' '0' '0' '0' '0' '0' '1' '0' '0' '0' '0' '1' '

In [20]:
def extract_active_passive(doc):
    # https://stackoverflow.com/questions/74528441/detect-passive-or-active-sentence-from-text
    passive_rules = [
        [
            {"DEP": "nsubjpass"},
            {"DEP": "aux", "OP": "*"},
            {"DEP": "auxpass"},
            {"TAG": "VBN"},
        ],
        [
            {"DEP": "nsubjpass"},
            {"DEP": "aux", "OP": "*"},
            {"DEP": "auxpass"},
            {"TAG": "VBZ"},
        ],
        [
            {"DEP": "nsubjpass"},
            {"DEP": "aux", "OP": "*"},
            {"DEP": "auxpass"},
            {"TAG": "RB"},
            {"TAG": "VBN"},
        ],
    ]
    # Create pattern to match active voice use
    active_rules = [
        [{"DEP": "nsubj"}, {"TAG": "VBD", "DEP": "ROOT"}],
        [{"DEP": "nsubj"}, {"TAG": "VBP"}, {"TAG": "VBG", "OP": "!"}],
        [{"DEP": "nsubj"}, {"DEP": "aux", "OP": "*"}, {"TAG": "VB"}],
        [{"DEP": "nsubj"}, {"DEP": "aux", "OP": "*"}, {"TAG": "VBG"}],
        [{"DEP": "nsubj"}, {"TAG": "RB", "OP": "*"}, {"TAG": "VBG"}],
        [{"DEP": "nsubj"}, {"TAG": "RB", "OP": "*"}, {"TAG": "VBZ"}],
        [{"DEP": "nsubj"}, {"TAG": "RB", "OP": "+"}, {"TAG": "VBD"}],
    ]
    matcher = Matcher(nlp.vocab)
    matcher.add("Passive", passive_rules)
    matcher.add("Active", active_rules)
    matches = matcher(doc)
    matches = [
        (nlp.vocab.strings[match_id], doc[start:end])
        for match_id, start, end in matches
    ]
    return matches


def predict(sentence):
    doc = parse_sentence(sentence)
    matches = extract_active_passive(doc)
    predictions = []
    for ent_1, ent_2 in combinations(doc.ents, 2):
        for match_type, match_span in matches:
            if SpanGroup(doc, spans=[ent_1, ent_2, match_span]).has_overlap:
                match_active = match_type == "Active"
                if match_active:
                    predictions.append(
                        ((ent_1.start, ent_1.end), (ent_2.start, ent_2.end))
                    )
                    break
                else:
                    predictions.append(
                        ((ent_2.start, ent_2.end), (ent_1.start, ent_1.end))
                    )
                    break
    return predictions

In [21]:
doc = parse_sentence(sentences[0])
matches = extract_active_passive(doc)
print(doc)
for match_type, match_span in matches:
    print("\t{}: {}".format(match_type, match_span.text))

According two different studies it seems plausible that the Pohang earthquake was induced by EGS operations . 
	Active: it seems
	Passive: earthquake was induced


In [22]:
idx = 5
doc = parse_sentence(sentences[idx])
pred = predict(sentences[idx])
print(doc)
print("Ground truth:")
for cause, effect in labels[idx]:
    print("\t{} -> {}".format(doc[cause[0]:cause[1]], doc[effect[0]:effect[1]]))
print("Predictions:")
for cause, effect in pred:
    print("\t{} -> {}".format(doc[cause[0]:cause[1]], doc[effect[0]:effect[1]]))


Serum sickness can be developed as a result of exposure to antibodies derived from animals . 
Ground truth:
	Serum sickness -> exposure to antibodies derived from animals
Predictions:
	exposure to antibodies derived from animals -> Serum sickness


In [23]:
predictions = []
#for sentence in tqdm.tqdm(sentences):
for sentence in tqdm(sentences):
    predictions.append(predict(sentence))

100%|███████████████████████████████████████████████████████████████████████████████| 468/468 [00:03<00:00, 145.44it/s]


In [24]:
def evaluate(predictions, references, micro_avg=True):
    tp = []
    fp = []
    fn = []
    for prediction, reference in zip(predictions, references):
        tp.append(len(set(prediction) & set(reference)))
        fp.append(len(set(prediction) - set(reference)))
        fn.append(len(set(reference) - set(prediction)))
    if micro_avg:
        tp = [sum(tp)]
        fp = [sum(fp)]
        fn = [sum(fn)]
    precision = [0 if tp[i] == 0 else tp[i] / (tp[i] + fp[i]) for i in range(len(tp))]
    recall = [0 if tp[i] == 0 else tp[i] / (tp[i] + fn[i]) for i in range(len(tp))]
    f1 = [
        0
        if precision[i] * recall[i] == 0
        else 2 * precision[i] * recall[i] / (precision[i] + recall[i])
        for i in range(len(tp))
    ]
    precision = sum(precision) / len(precision)
    recall = sum(recall) / len(recall)
    f1 = sum(f1) / len(f1)
    return precision, recall, f1


micro_precision, micro_recall, micro_f1 = evaluate(predictions, labels, True)
macro_precision, macro_recall, macro_f1 = evaluate(predictions, labels, False)

print("Micro Precision: {:.2f}".format(micro_precision))
print("Micro Recall: {:.2f}".format(micro_recall))
print("Micro F1: {:.2f}".format(micro_f1))
print("Macro Precision: {:.2f}".format(macro_precision))
print("Macro Recall: {:.2f}".format(macro_recall))
print("Macro F1: {:.2f}".format(macro_f1))


Micro Precision: 0.47
Micro Recall: 0.22
Micro F1: 0.30
Macro Precision: 0.26
Macro Recall: 0.27
Macro F1: 0.26
