In [1]:
#!pip install iobes
#!pip install seqeval
#!pip install sklearn_crfsuite

import spacy
from spacy.tokens import Doc, SpanGroup
from spacy.matcher import Matcher
from zipfile import ZipFile
from pathlib import Path
from tqdm import autonotebook as tqdm
from spacy.training import biluo_tags_to_spans
import iobes
import re
from itertools import combinations

import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from seqeval import scheme

import sklearn_crfsuite
from collections import Counter

In [2]:
#!python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

In [3]:
data_dir = Path("./data/teaching-dataset")
with (data_dir / "relation_classification_text_train.zip").open("rb") as file:
    zip_file = ZipFile(file)
    with zip_file.open("input.txt") as f:
        sentences = [
            sentence.split("\n") for sentence in f.read().decode("utf-8").split("\n\n")
        ]
with (data_dir / "relation_classification_references_train.zip").open("rb") as file:
    zip_file = ZipFile(file)
    with zip_file.open("references.txt") as f:
        labels = []
        for line in f.read().decode("utf-8").split("\n"):
            relations = []
            for relation in re.finditer(r"\(\((\d+),(\d+)\),\((\d+),(\d+)\)\)", line):
                relation = (
                    (int(relation.group(1)), int(relation.group(2))),
                    (int(relation.group(3)), int(relation.group(4))),
                )
                relations.append(relation)
            labels.append(relations)

In [4]:
def parse_sentence(sentence):
    words = []
    tags = []
    for item in sentence:
        word, tag = item.split(" ")
        words.append(word)
        tags.append(tag)
    doc = Doc(nlp.vocab, words=words)
    doc = nlp(doc)
    tags = iobes.bio_to_bilou(tags)
    doc.ents = biluo_tags_to_spans(doc, tags)
    return doc

In [5]:
for word in sentences[0]:
    print(word)

According O
two O
different O
studies O
it O
seems O
plausible O
that O
the O
Pohang B-EVENT
earthquake I-EVENT
was O
induced O
by O
EGS B-EVENT
operations I-EVENT
. O


In [6]:
#for word in sentences[0]:
#    print(word.split())
#    print(word.split()[0])
#    print(word.split()[1])

In [7]:
labels[0]

[((14, 16), (9, 11))]

In [8]:
print(labels[1][0][0][0])
print(labels[1][0][1][0])

49
58


In [9]:
doc = parse_sentence(sentences[0])
doc.ents

(Pohang earthquake, EGS operations)

In [10]:
sentence_number=[]
words=[]
tags=[]
dependencies=[]
events=[]
ents=[]
labels1=[]
ID=[]
customclass=[]
for i in range(len(sentences)):
    doc = parse_sentence(sentences[i])
    for token in doc:
        sentence_number.append(f'Sentence: {i}')
        #words.append(str(token).split()[0])
        words.append(str(token))
        tags.append(token.pos_)
        dependencies.append(token.dep_)
        ents.append(str(doc.ents))
        labels1.append(str(labels[i]))
        ID.append(str(token.i))
        if labels[i][0][0][0] < labels[0][0][1][0]:
            customclass.append('0 -> 1')
        elif labels[i][0][0][0] >= labels[0][0][1][0]:
            customclass.append('1 -> 0')
        
    for word in sentences[i]:
        events.append(word.split()[1])


#train_data = {'Sentence #': sentence_number, 'ID in Sentence': ID, 'Word': words, 'POS': tags, 'DEP': dependencies, 'Tag': events, 'Labels': labels1, 'Ents': ents, 'CustomClass': customclass}
train_data = {'Sentence #': sentence_number, 'ID in Sentence': ID, 'Word': words, 'POS': tags, 'DEP': dependencies, 'Tag': events, 'Ents': ents, 'CustomClass': customclass}
df = pd.DataFrame(data=train_data)
print(df)

          Sentence # ID in Sentence       Word    POS     DEP      Tag  \
0        Sentence: 0              0  According   VERB    prep        O   
1        Sentence: 0              1        two    NUM  nummod        O   
2        Sentence: 0              2  different    ADJ    amod        O   
3        Sentence: 0              3    studies   NOUN    pobj        O   
4        Sentence: 0              4         it   PRON   nsubj        O   
...              ...            ...        ...    ...     ...      ...   
13425  Sentence: 467             28         of    ADP    prep        O   
13426  Sentence: 467             29      death   NOUN    pobj  B-EVENT   
13427  Sentence: 467             30        for    ADP    prep        O   
13428  Sentence: 467             31   Haitians  PROPN    pobj        O   
13429  Sentence: 467             32          .  PUNCT   punct        O   

                                                    Ents CustomClass  
0                    (Pohang earthquake,

In [11]:
print(df.CustomClass.iloc[:100])

0     1 -> 0
1     1 -> 0
2     1 -> 0
3     1 -> 0
4     1 -> 0
       ...  
95    0 -> 1
96    0 -> 1
97    0 -> 1
98    0 -> 1
99    0 -> 1
Name: CustomClass, Length: 100, dtype: object


In [12]:
df.isnull().sum()

df = df.fillna(method='ffill')
df['Sentence #'].nunique(), df.Word.nunique(), df.Tag.nunique()

#df.groupby('Tag').size().reset_index(name='counts')
df.groupby('CustomClass').size().reset_index(name='counts')

#X = df.drop('Tag', axis=1)
X = df.drop('CustomClass', axis=1)
v = DictVectorizer(sparse=False)
X = v.fit_transform(X.to_dict('records'))
#y = df.Tag.values
y = df.CustomClass.values
classes = np.unique(y)
classes = classes.tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)
X_train.shape, y_train.shape

((10744, 4590), (10744,))

In [13]:
per = Perceptron(verbose=10, n_jobs=-1, max_iter=5)
per.partial_fit(X_train, y_train, classes)

-- Epoch 1
Norm: 69.46, NNZs: 1390, Bias: -1.000000, T: 10744, Avg. loss: 0.114576
Total training time: 0.04 seconds.


Perceptron(max_iter=5, n_jobs=-1, verbose=10)

In [14]:
new_classes = classes.copy()
print(new_classes)

['0 -> 1', '1 -> 0']


In [15]:
print(classification_report(y_pred=per.predict(X_test), y_true=y_test))

              precision    recall  f1-score   support

      0 -> 1       1.00      1.00      1.00      1627
      1 -> 0       1.00      0.99      1.00      1059

    accuracy                           1.00      2686
   macro avg       1.00      1.00      1.00      2686
weighted avg       1.00      1.00      1.00      2686



In [16]:
def extract_active_passive(doc):
    # https://stackoverflow.com/questions/74528441/detect-passive-or-active-sentence-from-text
    passive_rules = [
        [
            {"DEP": "nsubjpass"},
            {"DEP": "aux", "OP": "*"},
            {"DEP": "auxpass"},
            {"TAG": "VBN"},
        ],
        [
            {"DEP": "nsubjpass"},
            {"DEP": "aux", "OP": "*"},
            {"DEP": "auxpass"},
            {"TAG": "VBZ"},
        ],
        [
            {"DEP": "nsubjpass"},
            {"DEP": "aux", "OP": "*"},
            {"DEP": "auxpass"},
            {"TAG": "RB"},
            {"TAG": "VBN"},
        ],
    ]
    # Create pattern to match active voice use
    active_rules = [
        [{"DEP": "nsubj"}, {"TAG": "VBD", "DEP": "ROOT"}],
        [{"DEP": "nsubj"}, {"TAG": "VBP"}, {"TAG": "VBG", "OP": "!"}],
        [{"DEP": "nsubj"}, {"DEP": "aux", "OP": "*"}, {"TAG": "VB"}],
        [{"DEP": "nsubj"}, {"DEP": "aux", "OP": "*"}, {"TAG": "VBG"}],
        [{"DEP": "nsubj"}, {"TAG": "RB", "OP": "*"}, {"TAG": "VBG"}],
        [{"DEP": "nsubj"}, {"TAG": "RB", "OP": "*"}, {"TAG": "VBZ"}],
        [{"DEP": "nsubj"}, {"TAG": "RB", "OP": "+"}, {"TAG": "VBD"}],
    ]
    matcher = Matcher(nlp.vocab)
    matcher.add("Passive", passive_rules)
    matcher.add("Active", active_rules)
    matches = matcher(doc)
    matches = [
        (nlp.vocab.strings[match_id], doc[start:end])
        for match_id, start, end in matches
    ]
    return matches


def predict(sentence):
    doc = parse_sentence(sentence)
    matches = extract_active_passive(doc)
    predictions = []
    for ent_1, ent_2 in combinations(doc.ents, 2):
        for match_type, match_span in matches:
            if SpanGroup(doc, spans=[ent_1, ent_2, match_span]).has_overlap:
                match_active = match_type == "Active"
                if match_active:
                    predictions.append(
                        ((ent_1.start, ent_1.end), (ent_2.start, ent_2.end))
                    )
                    break
                else:
                    predictions.append(
                        ((ent_2.start, ent_2.end), (ent_1.start, ent_1.end))
                    )
                    break
    return predictions

In [17]:
doc = parse_sentence(sentences[0])
matches = extract_active_passive(doc)
print(doc)
for match_type, match_span in matches:
    print("\t{}: {}".format(match_type, match_span.text))

According two different studies it seems plausible that the Pohang earthquake was induced by EGS operations . 
	Active: it seems
	Passive: earthquake was induced


In [18]:
idx = 5
doc = parse_sentence(sentences[idx])
pred = predict(sentences[idx])
print(doc)
print("Ground truth:")
for cause, effect in labels[idx]:
    print("\t{} -> {}".format(doc[cause[0]:cause[1]], doc[effect[0]:effect[1]]))
print("Predictions:")
for cause, effect in pred:
    print("\t{} -> {}".format(doc[cause[0]:cause[1]], doc[effect[0]:effect[1]]))


Serum sickness can be developed as a result of exposure to antibodies derived from animals . 
Ground truth:
	Serum sickness -> exposure to antibodies derived from animals
Predictions:
	exposure to antibodies derived from animals -> Serum sickness


In [19]:
predictions = []
for sentence in tqdm.tqdm(sentences):
    predictions.append(predict(sentence))

  0%|          | 0/468 [00:00<?, ?it/s]

In [20]:
def evaluate(predictions, references, micro_avg=True):
    tp = []
    fp = []
    fn = []
    for prediction, reference in zip(predictions, references):
        tp.append(len(set(prediction) & set(reference)))
        fp.append(len(set(prediction) - set(reference)))
        fn.append(len(set(reference) - set(prediction)))
    if micro_avg:
        tp = [sum(tp)]
        fp = [sum(fp)]
        fn = [sum(fn)]
    precision = [0 if tp[i] == 0 else tp[i] / (tp[i] + fp[i]) for i in range(len(tp))]
    recall = [0 if tp[i] == 0 else tp[i] / (tp[i] + fn[i]) for i in range(len(tp))]
    f1 = [
        0
        if precision[i] * recall[i] == 0
        else 2 * precision[i] * recall[i] / (precision[i] + recall[i])
        for i in range(len(tp))
    ]
    precision = sum(precision) / len(precision)
    recall = sum(recall) / len(recall)
    f1 = sum(f1) / len(f1)
    return precision, recall, f1


micro_precision, micro_recall, micro_f1 = evaluate(predictions, labels, True)
macro_precision, macro_recall, macro_f1 = evaluate(predictions, labels, False)

print("Micro Precision: {:.2f}".format(micro_precision))
print("Micro Recall: {:.2f}".format(micro_recall))
print("Micro F1: {:.2f}".format(micro_f1))
print("Macro Precision: {:.2f}".format(macro_precision))
print("Macro Recall: {:.2f}".format(macro_recall))
print("Macro F1: {:.2f}".format(macro_f1))


Micro Precision: 0.47
Micro Recall: 0.22
Micro F1: 0.30
Macro Precision: 0.26
Macro Recall: 0.27
Macro F1: 0.26
