In [66]:
import spacy
from spacy.tokens import Doc
from spacy.matcher import Matcher
from zipfile import ZipFile
from pathlib import Path
from seqeval import scheme
from tqdm import autonotebook as tqdm

import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

import sklearn_crfsuite
from collections import Counter

In [67]:
nlp = spacy.load("en_core_web_sm")

In [68]:
data_dir = Path("./data/teaching-dataset")
with (data_dir / "span_extraction_text_train.zip").open("rb") as file:
    zip_file = ZipFile(file)
    with zip_file.open("input.txt") as f:
        sentences = [
            sentence.split("\n") for sentence in f.read().decode("utf-8").split("\n\n")
        ]
with (data_dir / "span_extraction_references_train.zip").open("rb") as file:
    zip_file = ZipFile(file)
    with zip_file.open("references.txt") as f:
        labels = [
            sentence.split("\n") for sentence in f.read().decode("utf-8").split("\n\n")
        ]

In [69]:
idx = 0
sentence = sentences[idx]
doc = Doc(nlp.vocab, words=sentence)
doc = nlp(doc)
iterator = zip(doc, labels[idx])
for token, label in iterator:
    print(f"{token.text :<16} {label :<8} {token.pos_ :<8} {token.dep_ :<8}")

sentence_number=[]
words=[]
tags=[]
dependencies=[]
events=[]
for i in range(len(sentences)):
    doc = Doc(nlp.vocab, words=sentences[i])
    doc = nlp(doc)
    iterator = zip(doc, labels[i])
    for token, label in iterator:
        sentence_number.append(f'Sentence: {i}')
        words.append(str(token))
        tags.append(token.pos_)
        dependencies.append(token.dep_)
        events.append(label)

train_data = {'Sentence #': sentence_number,'Word': words, 'POS': tags, 'DEP': dependencies, 'Tag': events}
df = pd.DataFrame(data=train_data)
print(df)

According        O        VERB     prep    
two              O        NUM      nummod  
different        O        ADJ      amod    
studies          O        NOUN     pobj    
it               O        PRON     nsubj   
seems            O        VERB     ROOT    
plausible        O        ADJ      oprd    
that             O        SCONJ    mark    
the              O        DET      det     
Pohang           B-EVENT  PROPN    compound
earthquake       I-EVENT  NOUN     nsubjpass
was              O        AUX      auxpass 
induced          O        VERB     ccomp   
by               O        ADP      agent   
EGS              B-EVENT  PROPN    compound
operations       I-EVENT  NOUN     pobj    
.                O        PUNCT    punct   
          Sentence #       Word    POS     DEP      Tag
0        Sentence: 0  According   VERB    prep        O
1        Sentence: 0        two    NUM  nummod        O
2        Sentence: 0  different    ADJ    amod        O
3        Sentence: 0    stu

In [70]:
df.isnull().sum()

df = df.fillna(method='ffill')
df['Sentence #'].nunique(), df.Word.nunique(), df.Tag.nunique()

df.groupby('Tag').size().reset_index(name='counts')

X = df.drop('Tag', axis=1)
v = DictVectorizer(sparse=False)
X = v.fit_transform(X.to_dict('records'))
y = df.Tag.values
classes = np.unique(y)
classes = classes.tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)
X_train.shape, y_train.shape

((10744, 4012), (10744,))

In [71]:
per = Perceptron(verbose=10, n_jobs=-1, max_iter=5)
per.partial_fit(X_train, y_train, classes)

-- Epoch 1
-- Epoch 1
-- Epoch 1
Norm: 49.92, NNZs: 1467, Bias: -2.000000, T: 10744, Avg. loss: 0.290581
Total training time: 0.05 seconds.
Norm: 64.36, NNZs: 1841, Bias: -2.000000, T: 10744, Avg. loss: 0.557241
Total training time: 0.06 seconds.
Norm: 69.91, NNZs: 2024, Bias: 3.000000, T: 10744, Avg. loss: 0.573250
Total training time: 0.06 seconds.


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.1s finished


In [72]:
new_classes = classes.copy()
print(new_classes)

['B-EVENT', 'I-EVENT', 'O']


In [73]:
print(classification_report(y_pred=per.predict(X_test), y_true=y_test))

              precision    recall  f1-score   support

     B-EVENT       0.32      0.62      0.42       270
     I-EVENT       0.52      0.39      0.45       590
           O       0.86      0.81      0.83      1826

    accuracy                           0.70      2686
   macro avg       0.57      0.61      0.57      2686
weighted avg       0.73      0.70      0.71      2686



In [74]:
sgd = SGDClassifier()
sgd.partial_fit(X_train, y_train, classes)

In [75]:
print(classification_report(y_pred=sgd.predict(X_test), y_true=y_test, labels=new_classes))

              precision    recall  f1-score   support

     B-EVENT       0.49      0.16      0.24       270
     I-EVENT       0.48      0.57      0.52       590
           O       0.84      0.87      0.85      1826

    accuracy                           0.73      2686
   macro avg       0.60      0.53      0.54      2686
weighted avg       0.72      0.73      0.72      2686



In [76]:
nb = MultinomialNB(alpha=0.01)
nb.partial_fit(X_train, y_train, classes)

In [77]:
print(classification_report(y_pred=nb.predict(X_test), y_true=y_test, labels = new_classes))

              precision    recall  f1-score   support

     B-EVENT       0.33      0.61      0.43       270
     I-EVENT       0.46      0.37      0.41       590
           O       0.85      0.80      0.82      1826

    accuracy                           0.69      2686
   macro avg       0.55      0.59      0.55      2686
weighted avg       0.71      0.69      0.69      2686



In [78]:
pa =PassiveAggressiveClassifier()
pa.partial_fit(X_train, y_train, classes)

In [79]:
print(classification_report(y_pred=pa.predict(X_test), y_true=y_test, labels=new_classes))

              precision    recall  f1-score   support

     B-EVENT       0.34      0.41      0.37       270
     I-EVENT       0.48      0.42      0.45       590
           O       0.83      0.84      0.83      1826

    accuracy                           0.70      2686
   macro avg       0.55      0.55      0.55      2686
weighted avg       0.70      0.70      0.70      2686



In [80]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, d, t) for w, p, d, t in zip(s['Word'].values.tolist(), 
                                                           s['POS'].values.tolist(), 
                                                           s['DEP'].values.tolist(),
                                                           s['Tag'].values.tolist())]
        self.grouped = self.data.groupby('Sentence #').apply(agg_func)
        self.sentences = [s for s in self.grouped]
        
    def get_next(self):
        try: 
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent += 1
            return s 
        except:
            return None
getter = SentenceGetter(df)
sentences = getter.sentences

In [81]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    deptag = sent[i][2]
    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
        'deptag': deptag,
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        deptag1 = sent[i-1][2]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
            '-1:deptag': deptag1,
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        deptag1 = sent[i+1][2]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
            '+1:deptag': deptag1,
        })
    else:
        features['EOS'] = True
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]
def sent2labels(sent):
    return [label for token, postag, deptag, label in sent]
def sent2tokens(sent):
    return [token for token, postag, deptag, label in sent]

In [82]:
print(sentences)



In [83]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [84]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train);

In [85]:
def flatten(l):
    return [item for sublist in l for item in sublist]

y_pred = crf.predict(X_test)
# print(metrics.flat_classification_report(y_pred=y_pred, y_true=y_test, labels = new_classes))
# sollte eigentlich funktionieren tuts aber nicht
flat_y_true = flatten(y_test)
flat_y_pred = flatten(y_pred)
print(classification_report(flat_y_true, flat_y_pred))

              precision    recall  f1-score   support

     B-EVENT       0.63      0.57      0.60       267
     I-EVENT       0.59      0.59      0.59       540
           O       0.84      0.86      0.85      1848

    accuracy                           0.77      2655
   macro avg       0.69      0.67      0.68      2655
weighted avg       0.77      0.77      0.77      2655



In [86]:
with Path("data/teaching-dataset/span_extraction_text_test.zip").open("rb") as file:
    zip_file = ZipFile(file)
    with zip_file.open("input.txt") as f:
        test_sentences = f.read().decode("utf-8").splitlines()

In [87]:
sentence_number=[]
words=[]
tags=[]
dependencies=[]
events=[]
for i in range(len(test_sentences)):
    doc = Doc(nlp.vocab, words=test_sentences[i])
    doc = nlp(doc)
    for token in doc:
        sentence_number.append(f'Sentence: {i}')
        words.append(str(token))
        tags.append(token.pos_)
        dependencies.append(token.dep_)

train_data = {'Sentence #': sentence_number,'Word': words, 'POS': tags, 'DEP': dependencies}
df = pd.DataFrame(data=train_data)
print(df)

           Sentence # Word    POS       DEP
0         Sentence: 0    A    DET       det
1         Sentence: 0    f  PROPN  compound
2         Sentence: 0    t   NOUN  compound
3         Sentence: 0    e   NOUN  compound
4         Sentence: 0    r   NOUN      ROOT
...               ...  ...    ...       ...
13846  Sentence: 2949    G  PROPN  compound
13847  Sentence: 2949    A  PROPN  compound
13848  Sentence: 2949    B  PROPN  compound
13849  Sentence: 2949    A  PROPN      ROOT
13850  Sentence: 2950    .  PUNCT      ROOT

[13851 rows x 4 columns]


In [88]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, d) for w, p, d in zip(s['Word'].values.tolist(), 
                                                           s['POS'].values.tolist(), 
                                                           s['DEP'].values.tolist())]
        self.grouped = self.data.groupby('Sentence #').apply(agg_func)
        self.sentences = [s for s in self.grouped]
        
    def get_next(self):
        try: 
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent += 1
            return s 
        except:
            return None
getter = SentenceGetter(df)
sentences = getter.sentences

print(sentences)

[[('A', 'DET', 'det'), ('f', 'PROPN', 'compound'), ('t', 'NOUN', 'compound'), ('e', 'NOUN', 'compound'), ('r', 'NOUN', 'ROOT')], [('t', 'PROPN', 'compound'), ('h', 'NOUN', 'compound'), ('e', 'NOUN', 'ROOT')], [('o', 'NOUN', 'ROOT'), ('n', 'ADV', 'advmod')], [('s', 'X', 'ROOT'), ('e', 'NOUN', 'npadvmod'), ('a', 'PRON', 'npadvmod'), ('f', 'PROPN', 'nmod'), ('o', 'NOUN', 'npadvmod'), ('o', 'NOUN', 'npadvmod'), ('d', 'NOUN', 'attr')], [('l', 'NOUN', 'advmod'), ('i', 'PROPN', 'intj'), ('k', 'PROPN', 'compound'), ('e', 'NOUN', 'ROOT')], [('h', 'PROPN', 'compound'), ('y', 'PROPN', 'compound'), ('p', 'PROPN', 'compound'), ('e', 'PROPN', 'compound'), ('r', 'NOUN', 'compound'), ('p', 'NOUN', 'ROOT'), ('i', 'PRON', 'ROOT'), ('g', 'PROPN', 'ROOT'), ('m', 'VERB', 'punct'), ('e', 'PROPN', 'compound'), ('n', 'PROPN', 'compound'), ('t', 'PROPN', 'dep'), ('a', 'DET', 'det'), ('t', 'NOUN', 'ROOT'), ('i', 'PRON', 'nsubj'), ('o', 'NOUN', 'ROOT'), ('n', 'ADV', 'advmod')], [('(', 'PUNCT', 'ROOT')], [('a', '

In [89]:
X = [sent2features(s) for s in sentences]
y_pred = crf.predict(X)
flat_y_pred = flatten(y_pred)

In [90]:
Path("predictions.txt").write_text("\n".join(map(str, flat_y_pred)));