In [1]:
import spacy 
# !python -m spacy download de_core_news_sm
nlp = spacy.load("de_core_news_sm")
import pandas as pd
from datasets import Dataset
import json



def get_data(path):
    print("Reading file...")
    with open(path,'r', encoding = 'utf-8') as f:
        data = [json.loads(line) for line in f]
    return data
path = 'all.jsonl'
data = get_data(path)



Reading file...


In [2]:
def iob(d):
    count = 0 
    i=0
    starts = [x['start_offset'] for x in d['entities']]
    ends = [x['end_offset'] for x in d['entities']]
    starts.append(99999)
    ends.append(99999)

    sen_temp = []
    tag_temp = []

    
    text = d['text']
    if text.startswith(" "):
        text = text[1:]
        count = 1
    doc = nlp(text)
    pos = [token.tag_ for token in doc]

    for t in doc:
        sen_temp.append(t.text)
        if count == starts[i]:
            tag_temp.append('B-ORG')
            count = count+len(t.text)+1
            if count > ends[i]:
                i += 1

        elif count > starts[i] and count < ends[i]:
            if tag_temp[-1] == 'O':
                tag_temp.append('B-ORG')
            else:
                tag_temp.append('I-ORG')
            count = count+len(t.text)+1
            if count > ends[i]:
                i += 1

        else:
            tag_temp.append('O')
            count = count+len(t.text)+1
    token_nums = len(tag_temp)
    doc_id_temp = [str(d['doc_id'])+str(d['para_id'])] * token_nums
#     para_id_temp = [d['para_id']] * token_nums
    return doc_id_temp, sen_temp, pos, tag_temp

iob(data[0])

(['00', '00', '00', '00', '00', '00', '00', '00'],
 ['Maßnahmenbekanntgabe',
  'zu',
  'MA',
  '40',
  ',',
  'Prüfung',
  'der',
  'Nebenbeschäftigungen'],
 ['NN', 'APPR', 'NE', 'CARD', '$,', 'NN', 'ART', 'NN'],
 ['O', 'O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'O'])

In [3]:
data[0]['entities']

[{'id': 51631, 'label': 'ORG', 'start_offset': 24, 'end_offset': 29}]

In [4]:
def tokenized_output(data):
    print('Tokenizing...')
    doc_id_list = []
    para_id_list = []
    txt_list = []
    pos_list = []
    tag_list = []

    for k in data:
#         doc_id_list.append(d['doc_id'])
#         para_id_list.append(d['para_id']
        
        docu, txt, pos, tag = iob(k)
        for i in range(len(docu)):
            doc_id_list.append(docu[i])
#             para_id_list.append(para[i])
            txt_list.append(txt[i])
            pos_list.append(pos[i])
            tag_list.append(tag[i])
#         txt_list.append(txt)
#         tag_list.append(tag)
    
    tokenized = pd.DataFrame({'doc_id' : doc_id_list,
#                              'para_id': para_id_list,
                             'tokens' : txt_list,
                              'pos_tags' : pos_list,
                             'ner_tags' : tag_list})

    return tokenized

In [5]:
df = tokenized_output(data)
# df.head()

Tokenizing...


In [6]:
label_list = list(df['ner_tags'].unique())
label_list

['O', 'B-ORG', 'I-ORG']

In [7]:
label_encoding_dict = {'O': 0,'B-ORG':1,'I-ORG':2}

In [8]:
import pandas as pd

from sklearn.model_selection import train_test_split
import sklearn_crfsuite
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite.metrics import flat_classification_report

In [9]:
# # from transformers import AutoTokenizer
# import os
# os.environ['TRANSFORMERS_CACHE'] = './cache'
# from transformers import LukeTokenizer, LukeModel, LukeForEntityPairClassification

In [10]:
class sentence(object):
    def __init__(self, df):
        self.n_sent = 0
        self.df = df
        self.empty = False
        agg = lambda s : [(w, p, t) for w, p, t in zip(s['tokens'].values.tolist(),
                                                       s['pos_tags'].values.tolist(),
                                                       s['ner_tags'].values.tolist())]
        self.grouped = self.df.groupby("doc_id").apply(agg)
        self.sentences = [s for s in self.grouped]
        
    def get_text(self):
        id_list = list(df['doc_id'].unique())
        
        try:
            s = self.grouped[id_list[self.n_sent]]
            self.n_sent +=1
            return s
        except:
            return None

In [11]:
getter = sentence(df)
sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentences]
sentences[0]

'Maßnahmenbekanntgabe zu MA 40 , Prüfung der Nebenbeschäftigungen'

In [12]:
sent = getter.get_text()
print(sent)

[('Maßnahmenbekanntgabe', 'NN', 'O'), ('zu', 'APPR', 'O'), ('MA', 'NE', 'B-ORG'), ('40', 'CARD', 'I-ORG'), (',', '$,', 'O'), ('Prüfung', 'NN', 'O'), ('der', 'ART', 'O'), ('Nebenbeschäftigungen', 'NN', 'O')]


In [13]:
sentences = getter.sentences
sentences[0]

[('Maßnahmenbekanntgabe', 'NN', 'O'),
 ('zu', 'APPR', 'O'),
 ('MA', 'NE', 'B-ORG'),
 ('40', 'CARD', 'I-ORG'),
 (',', '$,', 'O'),
 ('Prüfung', 'NN', 'O'),
 ('der', 'ART', 'O'),
 ('Nebenbeschäftigungen', 'NN', 'O')]

In [14]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [15]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

In [16]:
print(X[0])
print(y[0])

[{'bias': 1.0, 'word.lower()': 'maßnahmenbekanntgabe', 'word[-3:]': 'abe', 'word[-2:]': 'be', 'word.isupper()': False, 'word.istitle()': True, 'word.isdigit()': False, 'postag': 'NN', 'postag[:2]': 'NN', 'BOS': True, '+1:word.lower()': 'zu', '+1:word.istitle()': False, '+1:word.isupper()': False, '+1:postag': 'APPR', '+1:postag[:2]': 'AP'}, {'bias': 1.0, 'word.lower()': 'zu', 'word[-3:]': 'zu', 'word[-2:]': 'zu', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': False, 'postag': 'APPR', 'postag[:2]': 'AP', '-1:word.lower()': 'maßnahmenbekanntgabe', '-1:word.istitle()': True, '-1:word.isupper()': False, '-1:postag': 'NN', '-1:postag[:2]': 'NN', '+1:word.lower()': 'ma', '+1:word.istitle()': False, '+1:word.isupper()': True, '+1:postag': 'NE', '+1:postag[:2]': 'NE'}, {'bias': 1.0, 'word.lower()': 'ma', 'word[-3:]': 'MA', 'word[-2:]': 'MA', 'word.isupper()': True, 'word.istitle()': False, 'word.isdigit()': False, 'postag': 'NE', 'postag[:2]': 'NE', '-1:word.lower()': 'zu'

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [18]:
crf = CRF(algorithm = 'lbfgs',
         c1 = 0.1,
         c2 = 0.1,
         max_iterations = 100,
         all_possible_transitions = False)
try:
    crf.fit(X_train, y_train)
except AttributeError:
    pass
y_pred = crf.predict(X_test)

In [37]:
!pip install git+https://github.com/MeMartijn/updated-sklearn-crfsuite.git#egg=sklearn_crfsuite

ERROR: Invalid requirement: "git+'https://github.com/MeMartijn/updated-sklearn-crfsuite.git#egg=sklearn_crfsuite'"
Hint: = is not a valid operator. Did you mean == ?


In [42]:
unique_tags = crf.classes_
unique_tags

['O', 'B-ORG', 'I-ORG']

In [46]:
# from  sklearn_crfsuite.metrics import flat_classification_report  
import itertools
from sklearn.metrics import classification_report, confusion_matrix

unique_tags = crf.classes_
y_test_flat = list(itertools.chain.from_iterable(y_test))
y_pred_flat = list(itertools.chain.from_iterable(y_pred))

print(classification_report(y_test_flat, y_pred_flat))

              precision    recall  f1-score   support

       B-ORG       0.71      0.70      0.71       644
       I-ORG       0.71      0.72      0.72       741
           O       0.99      0.99      0.99     34703

    accuracy                           0.98     36088
   macro avg       0.81      0.80      0.80     36088
weighted avg       0.98      0.98      0.98     36088



In [48]:
from seqeval.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         ORG       0.66      0.64      0.65       644

   micro avg       0.66      0.64      0.65       644
   macro avg       0.66      0.64      0.65       644
weighted avg       0.66      0.64      0.65       644

