In [1]:
import spacy
from spacy.tokens import Doc
from spacy.matcher import Matcher
from zipfile import ZipFile
from pathlib import Path
from seqeval import scheme
from tqdm import autonotebook as tqdm

import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

import sklearn_crfsuite
from collections import Counter

  from tqdm import autonotebook as tqdm


In [2]:
# Load the spaCy pipeline named "en_core_web_sm"; later cells read its
# `pos_` and `dep_` token annotations.
nlp = spacy.load("en_core_web_sm")

In [3]:
data_dir = Path("./data/teaching-dataset")


def read_zipped_sequences(archive_path, member):
    """Read a blank-line separated sequence file stored inside a zip archive.

    Parameters
    ----------
    archive_path : path-like
        Location of the ``.zip`` archive on disk.
    member : str
        Name of the text file inside the archive.

    Returns
    -------
    list[list[str]]
        One inner list per block of the file (blocks are separated by a blank
        line); each inner list holds the lines of that block.
    """
    # Both the archive and the member are opened as context managers so the
    # handles are released even if reading or decoding fails.
    with ZipFile(archive_path) as archive:
        with archive.open(member) as f:
            text = f.read().decode("utf-8")
    return [block.split("\n") for block in text.split("\n\n")]


# Tokenised sentences and their per-token labels; the two lists are consumed
# index-by-index by the cells that follow.
sentences = read_zipped_sequences(
    data_dir / "span_extraction_text_train.zip", "input.txt"
)
labels = read_zipped_sequences(
    data_dir / "span_extraction_references_train.zip", "references.txt"
)

In [4]:
# Preview: run the pipeline on the first sentence and print every token next
# to its gold label, part-of-speech tag and dependency relation.
idx = 0
sentence = sentences[idx]
doc = nlp(Doc(nlp.vocab, words=sentence))
for token, label in zip(doc, labels[idx]):
    print(f"{token.text :<16} {label :<8} {token.pos_ :<8} {token.dep_ :<8}")

# Flatten the whole corpus into one row per token. Each Doc is built from the
# dataset's own word list, so tokens are paired with labels by position.
sentence_number, words, tags, events = [], [], [], []
for i, sentence_words in enumerate(sentences):
    doc = nlp(Doc(nlp.vocab, words=sentence_words))
    for token, label in zip(doc, labels[i]):
        sentence_number.append(f'Sentence: {i}')
        words.append(str(token))
        tags.append(token.pos_)
        events.append(label)

train_data = {
    'Sentence #': sentence_number,
    'Word': words,
    'POS': tags,
    'Tag': events,
}
df = pd.DataFrame(train_data)

According        O        VERB     prep    
two              O        NUM      nummod  
different        O        ADJ      amod    
studies          O        NOUN     pobj    
it               O        PRON     nsubj   
seems            O        VERB     ROOT    
plausible        O        ADJ      oprd    
that             O        SCONJ    mark    
the              O        DET      det     
Pohang           B-EVENT  PROPN    compound
earthquake       I-EVENT  NOUN     nsubjpass
was              O        AUX      auxpass 
induced          O        VERB     ccomp   
by               O        ADP      agent   
EGS              B-EVENT  PROPN    compound
operations       I-EVENT  NOUN     pobj    
.                O        PUNCT    punct   


In [5]:
# Quick sanity checks. Only the last expression of a cell is rendered, so the
# intermediate expressions below are evaluated but not displayed.
df.isnull().sum()

# `DataFrame.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
df = df.ffill()
df['Sentence #'].nunique(), df.Word.nunique(), df.Tag.nunique()

df.groupby('Tag').size().reset_index(name='counts')

# One-hot encode every non-label column of each token row.
# NOTE(review): 'Sentence #' is still part of the feature dict here, so each
# sentence id becomes its own feature column - confirm this is intended.
X = df.drop('Tag', axis=1)
v = DictVectorizer(sparse=False)
X = v.fit_transform(X.to_dict('records'))
y = df.Tag.values
# Sorted list of distinct labels; required by the `partial_fit` calls below.
classes = np.unique(y).tolist()
# Token-level 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train.shape, y_train.shape

((10744, 3969), (10744,))

In [6]:
# `random_state` pins the sample shuffling so the reported scores can be
# reproduced on a fresh kernel.
per = Perceptron(verbose=10, n_jobs=-1, max_iter=5, random_state=0)
# NOTE: `partial_fit` runs a single pass over the data (the log shows only
# "Epoch 1" per class), so `max_iter` has no effect on this call.
per.partial_fit(X_train, y_train, classes)

-- Epoch 1-- Epoch 1

-- Epoch 1
Norm: 48.66, NNZs: 1509, Bias: -2.000000, T: 10744, Avg. loss: 0.233991
Total training time: 0.07 seconds.
Norm: 60.45, NNZs: 1826, Bias: -2.000000, T: 10744, Avg. loss: 0.447040
Total training time: 0.07 seconds.
Norm: 65.02, NNZs: 1985, Bias: 2.000000, T: 10744, Avg. loss: 0.458675
Total training time: 0.07 seconds.


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


In [7]:
# Independent copy of the label list; it is handed to the classification
# reports through their `labels=` argument.
new_classes = list(classes)
print(new_classes)

['B-EVENT', 'I-EVENT', 'O']


In [8]:
# Per-class precision/recall/F1 of the perceptron on the held-out tokens.
# `labels=new_classes` is passed explicitly so this report is built the same
# way as the SGD, naive Bayes and passive-aggressive reports.
print(classification_report(y_true=y_test, y_pred=per.predict(X_test), labels=new_classes))

              precision    recall  f1-score   support

     B-EVENT       0.30      0.56      0.39       270
     I-EVENT       0.51      0.32      0.39       590
           O       0.84      0.83      0.83      1826

    accuracy                           0.69      2686
   macro avg       0.55      0.57      0.54      2686
weighted avg       0.71      0.69      0.69      2686



In [9]:
# Linear model trained with stochastic gradient descent. `random_state` fixes
# the shuffling of the training samples so the scores are reproducible.
sgd = SGDClassifier(random_state=0)
# A single `partial_fit` call performs one pass over the training data.
sgd.partial_fit(X_train, y_train, classes)

In [10]:
# Per-class precision/recall/F1 of the SGD classifier on the held-out tokens.
print(
    classification_report(
        y_true=y_test,
        y_pred=sgd.predict(X_test),
        labels=new_classes,
    )
)

              precision    recall  f1-score   support

     B-EVENT       0.52      0.14      0.22       270
     I-EVENT       0.52      0.38      0.44       590
           O       0.78      0.93      0.85      1826

    accuracy                           0.73      2686
   macro avg       0.61      0.48      0.50      2686
weighted avg       0.70      0.73      0.70      2686



In [11]:
# Multinomial naive Bayes over the one-hot token features, with a small
# smoothing value (alpha).
nb = MultinomialNB(alpha=0.01)
nb.partial_fit(X_train, y_train, classes=classes)

In [12]:
# Per-class precision/recall/F1 of naive Bayes on the held-out tokens.
print(
    classification_report(
        y_true=y_test,
        y_pred=nb.predict(X_test),
        labels=new_classes,
    )
)

              precision    recall  f1-score   support

     B-EVENT       0.32      0.52      0.40       270
     I-EVENT       0.48      0.41      0.44       590
           O       0.85      0.81      0.83      1826

    accuracy                           0.69      2686
   macro avg       0.55      0.58      0.56      2686
weighted avg       0.71      0.69      0.70      2686



In [13]:
# Passive-aggressive linear classifier. `random_state` fixes the shuffling of
# the training samples so the scores are reproducible.
pa = PassiveAggressiveClassifier(random_state=0)
# A single `partial_fit` call performs one pass over the training data.
pa.partial_fit(X_train, y_train, classes)

In [14]:
# Per-class precision/recall/F1 of the passive-aggressive model on the
# held-out tokens.
print(
    classification_report(
        y_true=y_test,
        y_pred=pa.predict(X_test),
        labels=new_classes,
    )
)

              precision    recall  f1-score   support

     B-EVENT       0.40      0.13      0.20       270
     I-EVENT       0.62      0.18      0.28       590
           O       0.74      0.98      0.84      1826

    accuracy                           0.72      2686
   macro avg       0.58      0.43      0.44      2686
weighted avg       0.67      0.72      0.65      2686



In [15]:
class SentenceGetter(object):
    """Group a token-level DataFrame into per-sentence (word, POS, tag) lists.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per token with the columns ``'Sentence #'``, ``'Word'``,
        ``'POS'`` and ``'Tag'``. Sentence ids are strings of the form
        ``'Sentence: <n>'``.

    Attributes
    ----------
    grouped : pandas.Series
        Maps each sentence id to its list of ``(word, pos, tag)`` tuples.
    sentences : list
        The values of ``grouped`` in groupby key order (string-sorted ids).
    n_sent : int
        Number of the sentence that the next ``get_next()`` call returns.
    """

    def __init__(self, data):
        self.data = data
        self.empty = False
        agg_func = lambda s: list(zip(s['Word'].tolist(),
                                      s['POS'].tolist(),
                                      s['Tag'].tolist()))
        # Select the payload columns explicitly so `apply` never operates on
        # the grouping column (deprecated behaviour in recent pandas).
        self.grouped = (
            self.data.groupby('Sentence #')[['Word', 'POS', 'Tag']].apply(agg_func)
        )
        self.sentences = [s for s in self.grouped]
        # Start at sentence 0 when the data is zero-based, so the first
        # sentence is not skipped; otherwise keep the historical start of 1.
        self.n_sent = 0 if 'Sentence: 0' in self.grouped.index else 1

    def get_next(self):
        """Return the next sentence in numeric order, or ``None`` when exhausted."""
        try:
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
        except KeyError:
            # Only a missing sentence id means "no more sentences"; any other
            # error is allowed to surface instead of being swallowed.
            return None
        self.n_sent += 1
        return s
# Regroup the token table by sentence. NOTE: this rebinds `sentences` from
# lists of words to lists of (word, POS, tag) tuples.
getter = SentenceGetter(data=df)
sentences = getter.sentences

In [16]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence whose items carry the word at index 0 and the POS
    tag at index 1. The result describes the token itself (lower-cased form,
    suffixes, casing/digit flags, POS tag) plus the same kind of information
    for the previous and next token. A missing neighbour is marked with
    ``'BOS'`` (no previous token) or ``'EOS'`` (no next token).
    """
    token_text, token_pos = sent[i][0], sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': token_text.lower(),
        'word[-3:]': token_text[-3:],
        'word[-2:]': token_text[-2:],
        'word.isupper()': token_text.isupper(),
        'word.istitle()': token_text.istitle(),
        'word.isdigit()': token_text.isdigit(),
        'postag': token_pos,
        'postag[:2]': token_pos[:2],
    }

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        features['BOS'] = True
    else:
        prev_text, prev_pos = sent[i - 1][0], sent[i - 1][1]
        features['-1:word.lower()'] = prev_text.lower()
        features['-1:word.istitle()'] = prev_text.istitle()
        features['-1:word.isupper()'] = prev_text.isupper()
        features['-1:postag'] = prev_pos
        features['-1:postag[:2]'] = prev_pos[:2]

    # Right context, or the end-of-sentence marker.
    last_index = len(sent) - 1
    if i >= last_index:
        features['EOS'] = True
    else:
        next_text, next_pos = sent[i + 1][0], sent[i + 1][1]
        features['+1:word.lower()'] = next_text.lower()
        features['+1:word.istitle()'] = next_text.istitle()
        features['+1:word.isupper()'] = next_text.isupper()
        features['+1:postag'] = next_pos
        features['+1:postag[:2]'] = next_pos[:2]

    return features


def sent2features(sent):
    """Return one feature dict per token of ``sent`` (see ``word2features``)."""
    return [word2features(sent, position) for position, _ in enumerate(sent)]


def sent2labels(sent):
    """Return the label of every ``(token, postag, label)`` triple in ``sent``."""
    return [tag for _word, _pos, tag in sent]


def sent2tokens(sent):
    """Return the token of every ``(token, postag, label)`` triple in ``sent``."""
    return [word for word, _pos, _tag in sent]

In [17]:
# Sequence-level data for the CRF: one list of feature dicts and one list of
# labels per sentence. NOTE: this rebinds X / y and the train/test names used
# by the token-level models above; the split is now made per sentence.
X = list(map(sent2features, sentences))
y = list(map(sent2labels, sentences))
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2
)

In [18]:
# Linear-chain CRF trained with L-BFGS. Per the sklearn-crfsuite docs, c1 and
# c2 are the L1 and L2 regularisation coefficients.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1, c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
# Trailing semicolon suppresses the estimator repr in the cell output.
crf.fit(X_train, y_train);

In [19]:
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

# Token-level evaluation of the CRF on the held-out sentences.
# The original attempt was
#   print(metrics.flat_classification_report(y_pred=y_pred, y_true=y_test, labels = new_classes))
# which, per the author's note, should work but does not. The per-sentence
# label lists are therefore flattened by hand and scored with scikit-learn's
# classification_report.
y_pred = crf.predict(X_test)
flat_y_true = flatten(y_test)
flat_y_pred = flatten(y_pred)
print(classification_report(y_true=flat_y_true, y_pred=flat_y_pred))

              precision    recall  f1-score   support

     B-EVENT       0.62      0.57      0.59       267
     I-EVENT       0.60      0.57      0.58       540
           O       0.84      0.86      0.85      1848

    accuracy                           0.77      2655
   macro avg       0.69      0.67      0.68      2655
weighted avg       0.77      0.77      0.77      2655

