In [1]:
#!pip install iobes
#!pip install seqeval
#!pip install sklearn_crfsuite 
#!conda install spacy
#!pip uninstall numpy
#!pip install numpy

import spacy
from spacy.tokens import Doc, SpanGroup
from spacy.matcher import Matcher
from zipfile import ZipFile
from pathlib import Path
#from tqdm import autonotebook as tqdm
from tqdm import tqdm
from spacy.training import biluo_tags_to_spans
import iobes
import re
from itertools import combinations

import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from seqeval import scheme

import sklearn_crfsuite
from collections import Counter


In [2]:
#!python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

In [3]:
data_dir = Path("./data/teaching-dataset")
with (data_dir / "relation_classification_text_train.zip").open("rb") as file:
    zip_file = ZipFile(file)
    with zip_file.open("input.txt") as f:
        sentences = [
            sentence.split("\n") for sentence in f.read().decode("utf-8").split("\n\n")
        ]
with (data_dir / "relation_classification_references_train.zip").open("rb") as file:
    zip_file = ZipFile(file)
    with zip_file.open("references.txt") as f:
        labels = []
        for line in f.read().decode("utf-8").split("\n"):
            relations = []
            for relation in re.finditer(r"\(\((\d+),(\d+)\),\((\d+),(\d+)\)\)", line):
                relation = (
                    (int(relation.group(1)), int(relation.group(2))),
                    (int(relation.group(3)), int(relation.group(4))),
                )
                relations.append(relation)
            labels.append(relations)

In [4]:
def parse_sentence(sentence):
    words = []
    tags = []
    for item in sentence:
        word, tag = item.split(" ")
        words.append(word)
        tags.append(tag)
    doc = Doc(nlp.vocab, words=words)
    doc = nlp(doc)
    tags = iobes.bio_to_bilou(tags)
    doc.ents = biluo_tags_to_spans(doc, tags)
    return doc

In [5]:
wordlist = []
biggerlist = []
for word in sentences[0]:
    #print(word)
    wordlist.append(str(word.split()[0]))
print(wordlist)
biggerlist.append(wordlist)
biggerlist.append('test')
print(biggerlist)

['According', 'two', 'different', 'studies', 'it', 'seems', 'plausible', 'that', 'the', 'Pohang', 'earthquake', 'was', 'induced', 'by', 'EGS', 'operations', '.']
[['According', 'two', 'different', 'studies', 'it', 'seems', 'plausible', 'that', 'the', 'Pohang', 'earthquake', 'was', 'induced', 'by', 'EGS', 'operations', '.'], 'test']


In [6]:
wordlist = []
taglist = []
biggertaglist = []
biggerlist = []
for i in range(len(sentences)):
    for word in sentences[i]:
        wordlist.append(str(word.split()[0]))
        taglist.append(str(word.split()[1]))

        
    biggerlist.append(wordlist)
    biggertaglist.append(taglist)
    
    wordlist = []
    taglist = []
#print(biggerlist)        

In [7]:
#for word in sentences[0]:
#    print(word.split())
#    print(word.split()[0])
#    print(word.split()[1])

In [8]:
labels[0]

[((14, 16), (9, 11))]

In [9]:
len(labels[467])
len(sentences[0])
#helper = np.zeros((len(sentences[0]), len(sentences[0])))
helper = np.zeros(len(sentences[0]))
print(helper)

for entry in labels[0][0]:
    print(entry)
    for numbers in entry:
        print(numbers)
        helper[numbers-1] = 1
print(helper)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
(14, 16)
14
16
(9, 11)
9
11
[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 1. 0.]


In [10]:
print(labels[1][0][0][0])
print(labels[1][0][1][0])

49
58


In [11]:
doc = parse_sentence(sentences[0])
doc.ents

(Pohang earthquake, EGS operations)

In [12]:
#new df with 1 line per sentence 
sentence_number=[]
wordlist = []
taglist = []
POSlist = []
DEPlist = []
customclass = []

biggerlist = []
biggertaglist = []
biggerPOSlist = []
biggerDEPlist = []
for i in range(len(sentences)):
    if labels[i][0][0][0] < labels[0][0][1][0]:
        customclass.append('0')
    elif labels[i][0][0][0] >= labels[0][0][1][0]:
        customclass.append('1')
for i in range(len(sentences)):
    sentence_number.append(str(i))
    
    for word in sentences[i]:
        wordlist.append(str(word.split()[0]))
        taglist.append(str(word.split()[1]))

    doc = parse_sentence(sentences[i])
    for token in doc:
        POSlist.append(token.pos_)
        DEPlist.append(token.dep_)
            

    biggerlist.append(wordlist)
    biggertaglist.append(taglist)
    biggerPOSlist.append(POSlist)
    biggerDEPlist.append(DEPlist)
    
    wordlist = []
    taglist = []
    POSlist = []
    DEPlist = []

train_data = {'Sentence #': sentence_number, 'Wordlist': biggerlist, 'Taglist': biggertaglist, 'POS': biggerPOSlist, 'DEP': biggerDEPlist, 'customclass': customclass,}
#train_data = {'Sentence #': sentence_number, 'Wordlist': str(biggerlist), 'Taglist': str(biggertaglist), 'POS': str(biggerPOSlist), 'DEP': str(biggerDEPlist), 'customclass': customclass,}
df = pd.DataFrame(data=train_data)
print(df)
#df.to_dict()
#print(df)
#df.to_dict('records')
#print(df)
#df.to_dict('split')
#print(df)

    Sentence #                                           Wordlist  \
0            0  [According, two, different, studies, it, seems...   
1            1  [Signs, and, symptoms, include, :, Dyspnea, (,...   
2            2  [Pneumatic, drilling, in, mines, and, less, co...   
3            3  [Interleukin-1, inhibitors, ,, such, as, canak...   
4            4  [Two, of, the, reviews, postulate, that, exerc...   
..         ...                                                ...   
463        463  [Dental, fluorosis, (, also, termed, mottled, ...   
464        464  [The, detergent, causes, the, vessel, to, coll...   
465        465  [Large, segmental, hemangiomas, of, the, head,...   
466        466  [Following, a, strain, or, partial, rupture, o...   
467        467  [High, infection, rates, for, diseases, such, ...   

                                               Taglist  \
0    [O, O, O, O, O, O, O, O, O, B-EVENT, I-EVENT, ...   
1    [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ... 

In [13]:
df.isnull().sum()

df = df.fillna(method='ffill')
#df['Sentence #'].nunique(), df.Wordlist.nunique(), df.Taglist.nunique(), df.customclass.nunique()
#print(df['Sentence #'].nunique())
#print(df.Wordlist.nunique())
#print(df.Taglist.nunique())
#print(df.CustomClass.nunique())

#df.groupby('Tag').size().reset_index(name='counts')
#print(df.groupby('Tag').size().reset_index(name='counts'))

df.groupby('customclass').size().reset_index(name='counts')
print(df.groupby('customclass').size().reset_index(name='counts'))

#X = df.drop('Tag', axis=1)
X = df.drop('customclass', axis=1)
#print(X)

v = DictVectorizer(sparse=False)

X = v.fit_transform(X.to_dict('records'))

for i in range(10):
    print(X[i])
    
#y = df.Tag.values
y = df.customclass.values
classes = np.unique(y)
classes = classes.tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)
X_train.shape, y_train.shape

  customclass  counts
0           0     314
1           1     154
[1. 0. 0. ... 0. 0. 0.]
[2. 0. 0. ... 0. 0. 0.]
[1. 0. 0. ... 0. 0. 0.]
[1. 0. 0. ... 0. 0. 0.]
[1. 1. 1. ... 0. 0. 0.]
[1. 1. 0. ... 0. 0. 0.]
[1. 1. 0. ... 0. 0. 0.]
[1. 0. 0. ... 0. 0. 0.]
[1. 0. 0. ... 0. 0. 0.]
[1. 0. 0. ... 0. 0. 0.]


((374, 4015), (374,))

In [14]:
per = Perceptron(verbose=10, n_jobs=-1, max_iter=5)
per.partial_fit(X_train, y_train, classes)
#per.fit(X_train, y_train, classes, coef_init = )

-- Epoch 1
Norm: 116.55, NNZs: 1881, Bias: -22.000000, T: 374, Avg. loss: 223.491979
Total training time: 0.00 seconds.


In [15]:
#per.partial_fit(X_train, y_train, classes)

In [16]:
new_classes = classes.copy()
print(new_classes)

['0', '1']


In [17]:
print(classification_report(y_pred=per.predict(X_test), y_true=y_test))

              precision    recall  f1-score   support

           0       0.75      0.66      0.70        64
           1       0.42      0.53      0.47        30

    accuracy                           0.62        94
   macro avg       0.59      0.59      0.59        94
weighted avg       0.65      0.62      0.63        94



In [18]:
import sys
np.set_printoptions(threshold=sys.maxsize)
print(per.predict(X_test))
print(y_test)

['0' '1' '1' '0' '1' '1' '1' '0' '1' '0' '0' '1' '0' '1' '0' '1' '1' '0'
 '0' '0' '0' '0' '0' '0' '0' '1' '0' '0' '0' '0' '0' '0' '1' '1' '1' '0'
 '1' '1' '0' '1' '0' '1' '0' '1' '0' '1' '0' '0' '0' '1' '0' '0' '0' '0'
 '1' '0' '1' '1' '0' '1' '0' '0' '0' '0' '1' '0' '1' '1' '0' '0' '1' '0'
 '1' '1' '1' '0' '0' '0' '0' '1' '1' '1' '0' '0' '1' '0' '1' '0' '0' '0'
 '0' '0' '0' '1']
['0' '0' '1' '0' '1' '0' '0' '1' '1' '0' '0' '1' '1' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '1' '0' '0' '0' '0' '1' '0' '0' '1' '0' '0' '1' '1'
 '0' '0' '0' '0' '0' '0' '0' '1' '1' '1' '1' '0' '0' '1' '0' '0' '1' '0'
 '0' '0' '1' '0' '0' '1' '0' '0' '0' '0' '0' '1' '1' '0' '1' '0' '1' '0'
 '1' '0' '0' '0' '0' '0' '1' '1' '0' '1' '0' '1' '1' '1' '0' '0' '0' '0'
 '0' '0' '0' '0']


In [19]:
def extract_active_passive(doc):
    # https://stackoverflow.com/questions/74528441/detect-passive-or-active-sentence-from-text
    passive_rules = [
        [
            {"DEP": "nsubjpass"},
            {"DEP": "aux", "OP": "*"},
            {"DEP": "auxpass"},
            {"TAG": "VBN"},
        ],
        [
            {"DEP": "nsubjpass"},
            {"DEP": "aux", "OP": "*"},
            {"DEP": "auxpass"},
            {"TAG": "VBZ"},
        ],
        [
            {"DEP": "nsubjpass"},
            {"DEP": "aux", "OP": "*"},
            {"DEP": "auxpass"},
            {"TAG": "RB"},
            {"TAG": "VBN"},
        ],
    ]
    # Create pattern to match active voice use
    active_rules = [
        [{"DEP": "nsubj"}, {"TAG": "VBD", "DEP": "ROOT"}],
        [{"DEP": "nsubj"}, {"TAG": "VBP"}, {"TAG": "VBG", "OP": "!"}],
        [{"DEP": "nsubj"}, {"DEP": "aux", "OP": "*"}, {"TAG": "VB"}],
        [{"DEP": "nsubj"}, {"DEP": "aux", "OP": "*"}, {"TAG": "VBG"}],
        [{"DEP": "nsubj"}, {"TAG": "RB", "OP": "*"}, {"TAG": "VBG"}],
        [{"DEP": "nsubj"}, {"TAG": "RB", "OP": "*"}, {"TAG": "VBZ"}],
        [{"DEP": "nsubj"}, {"TAG": "RB", "OP": "+"}, {"TAG": "VBD"}],
    ]
    matcher = Matcher(nlp.vocab)
    matcher.add("Passive", passive_rules)
    matcher.add("Active", active_rules)
    matches = matcher(doc)
    matches = [
        (nlp.vocab.strings[match_id], doc[start:end])
        for match_id, start, end in matches
    ]
    return matches


def predict(sentence):
    doc = parse_sentence(sentence)
    matches = extract_active_passive(doc)
    predictions = []
    for ent_1, ent_2 in combinations(doc.ents, 2):
        for match_type, match_span in matches:
            if SpanGroup(doc, spans=[ent_1, ent_2, match_span]).has_overlap:
                match_active = match_type == "Active"
                if match_active:
                    predictions.append(
                        ((ent_1.start, ent_1.end), (ent_2.start, ent_2.end))
                    )
                    break
                else:
                    predictions.append(
                        ((ent_2.start, ent_2.end), (ent_1.start, ent_1.end))
                    )
                    break
    return predictions

In [20]:
doc = parse_sentence(sentences[0])
matches = extract_active_passive(doc)
print(doc)
for match_type, match_span in matches:
    print("\t{}: {}".format(match_type, match_span.text))

According two different studies it seems plausible that the Pohang earthquake was induced by EGS operations . 
	Active: it seems
	Passive: earthquake was induced


In [21]:
idx = 5
doc = parse_sentence(sentences[idx])
pred = predict(sentences[idx])
print(doc)
print("Ground truth:")
for cause, effect in labels[idx]:
    print("\t{} -> {}".format(doc[cause[0]:cause[1]], doc[effect[0]:effect[1]]))
print("Predictions:")
for cause, effect in pred:
    print("\t{} -> {}".format(doc[cause[0]:cause[1]], doc[effect[0]:effect[1]]))


Serum sickness can be developed as a result of exposure to antibodies derived from animals . 
Ground truth:
	Serum sickness -> exposure to antibodies derived from animals
Predictions:
	exposure to antibodies derived from animals -> Serum sickness


In [22]:
predictions = []
#for sentence in tqdm.tqdm(sentences):
for sentence in tqdm(sentences):
    predictions.append(predict(sentence))

100%|███████████████████████████████████████████████████████████████████████████████| 468/468 [00:03<00:00, 136.46it/s]


In [23]:
def evaluate(predictions, references, micro_avg=True):
    tp = []
    fp = []
    fn = []
    for prediction, reference in zip(predictions, references):
        tp.append(len(set(prediction) & set(reference)))
        fp.append(len(set(prediction) - set(reference)))
        fn.append(len(set(reference) - set(prediction)))
    if micro_avg:
        tp = [sum(tp)]
        fp = [sum(fp)]
        fn = [sum(fn)]
    precision = [0 if tp[i] == 0 else tp[i] / (tp[i] + fp[i]) for i in range(len(tp))]
    recall = [0 if tp[i] == 0 else tp[i] / (tp[i] + fn[i]) for i in range(len(tp))]
    f1 = [
        0
        if precision[i] * recall[i] == 0
        else 2 * precision[i] * recall[i] / (precision[i] + recall[i])
        for i in range(len(tp))
    ]
    precision = sum(precision) / len(precision)
    recall = sum(recall) / len(recall)
    f1 = sum(f1) / len(f1)
    return precision, recall, f1


micro_precision, micro_recall, micro_f1 = evaluate(predictions, labels, True)
macro_precision, macro_recall, macro_f1 = evaluate(predictions, labels, False)

print("Micro Precision: {:.2f}".format(micro_precision))
print("Micro Recall: {:.2f}".format(micro_recall))
print("Micro F1: {:.2f}".format(micro_f1))
print("Macro Precision: {:.2f}".format(macro_precision))
print("Macro Recall: {:.2f}".format(macro_recall))
print("Macro F1: {:.2f}".format(macro_f1))


Micro Precision: 0.47
Micro Recall: 0.22
Micro F1: 0.30
Macro Precision: 0.26
Macro Recall: 0.27
Macro F1: 0.26
