# Import Libraries

In [1]:
# !pip install skweak
import skweak
from skweak import heuristics, gazetteers, generative, utils

import spacy
import re

import json
# !python -m spacy download de_core_news_lg



# Import Data

In [2]:
# get data


# import os

# files = os.listdir('./jsonlwmeta/2021_01')
# print("The number of files :", len(files))


# # 2. extract data
# data = []
# for i in files:
#     path = './jsonlwmeta/2021_01/{}'.format(i)
#     with open(path,'r') as f:
#         d = [json.loads(line)['text'] for line in f]
#         data.append(d)

# text = [item for sublist in data for item in sublist] #flatten list 'data'

# print("The number of instances :", len(text))
# print("Example:", text[0])


file = 'all_labelled.jsonl'
with open(file,'r',encoding = 'utf-8') as f:
    text = [json.loads(line)['text'] for line in f]

print("Example:", text[0])

Example: Maßnahmenbekanntgabe zu MA 40, Prüfung der Nebenbeschäftigungen


# Load pipeline

In [3]:
# load pipe line
pipeline = spacy.load("de_core_news_lg", disable=["ner","lemmatizer"])

In [4]:
# add special case to 
print('Before:')
print(pipeline.tokenizer.explain('Gesellschaft m.b.H.'))

from spacy.attrs import ORTH, NORM

pipeline.tokenizer.add_special_case("Gesellschaft m.b.H.", [{ORTH: "Gesellschaft m.b.H."}])
pipeline.tokenizer.add_special_case("Ges.m.b.H.", [{ORTH: "Ges.m.b.H."}])
pipeline.tokenizer.add_special_case("Gesellschaft mbH", [{ORTH: "Gesellschaft mbH"}])
pipeline.tokenizer.add_special_case("Gesellschaft mit beschränkter Haftung", [{ORTH: "Gesellschaft mit beschränkter Haftung"}])
pipeline.tokenizer.add_special_case("Betriebsgesellschaft m.b.H.", [{ORTH: "Betriebsgesellschaft m.b.H."}])

print('After:')
print(pipeline.tokenizer.explain('Gesellschaft m.b.H.'))

Before:
[('TOKEN', 'Gesellschaft'), ('TOKEN', 'm.b'), ('INFIX', '.'), ('TOKEN', 'H.')]
After:
[('SPECIAL-1', 'Gesellschaft m.b.H.')]


In [5]:
# get data
text_all = ' '.join(text[:1000])
docs = list(pipeline.pipe([text_all]))

# Set labeling functions (skweak)

In [6]:
# 0) org detector : find ORG entities detected by spacy ner pipeline

pipeline_with_ner = spacy.load("de_core_news_lg", disable=["lemmatizer"])

def from_spacy_ner(doc):
    txt = doc.text
    doc_ner = list(pipeline.pipe([txt]))
    for i in doc_ner[0].ents:
        if i.label_ == 'ORG':
            if i.text not in ['KURZFASSUNG', 'INHALTSVERZEICHNIS','ABKÜRZUNGSVERZEICHNIS', 'Einschau']:
                yield i.start, i.end, "ORG"
                
lf0 = heuristics.FunctionAnnotator("from_spacy_ner", from_spacy_ner)

In [7]:
# 1) ma_detector : find MA ORGs
def ma_detector(doc):
    ma = ["MA","ma","Magistratsabteilung","magistratsabteilung", 'Magistratsabteilungen','magistratsabteilungen']
    for tok in doc:
        if tok.text in ma:
            if re.match("\d*",tok.nbor(1).text):
                yield tok.i, (tok.i)+2, "ORG"

lf1 = heuristics.FunctionAnnotator("ma_detect", ma_detector)

In [8]:
# 2) gazetteers : list of orgs (extracted from document reports titles in https://www.stadtrechnungshof.wien.at/)

org_list = []
with open('org_list.txt', 'r', encoding = 'utf-8') as f:
    lines = f.readlines()
    org_list = [re.sub('\n','',line) for line in lines]

orgs_doc = list(pipeline.pipe(org_list))

orgs=[]
for i in orgs_doc:
    token_list = [tokens.text for tokens in i]
    temp = tuple(token_list)
    orgs.append(temp)

trie = gazetteers.Trie(orgs)

lf2 = gazetteers.GazetteerAnnotator("gov_detect", {"ORG":trie})

In [9]:
# 3) company_detector : find noun chunk containing 'GmbH'

def find_in_noun_chunks(previous_index, noun_chunk_list):
    get_true = []
    for i in noun_chunk_list:
        chunk_start = i.start
        chunk_end = i.end
        if (previous_index >= chunk_start and previous_index < chunk_end):
            get_true.append(i)
            
    if len(get_true) > 0:
        return (get_true[0].text, get_true[0].start)
    else:
        return 'None'    

def company_detector(doc):
    com_end_with= {"Gesellschaft m.b.H.","Gesellschaft mbH", "Ges.m.b.H.",
                   "Gesellschaft mit beschränkter Haftung", "Betriebsgesellschaft m.b.H.", "GmbH"}
    token_list = [tokens.text for tokens in doc]
    pos_list = [tokens.pos_ for tokens in doc]
    noun_chunk_list = [chunks for chunks in doc.noun_chunks]

    for chunk in doc.noun_chunks:
        company = []
        for i in chunk:
            if i.text in com_end_with:
                company.append(chunk.text)

                start = chunk.start
                end = chunk.end

                if chunk[0].text in ['der','die','das','des', 'wiener']:
                    yield start, end, "ORG"
                    
                else:

                    token_index = chunk.start
                    previous = token_list[token_index-1]

                    find_noun = find_in_noun_chunks(token_index-1, noun_chunk_list)

                    if find_noun != 'None':
                        start = find_noun[1]
                        company.insert(0,find_noun[0])
                        yield start, end, "ORG"

lf3 = heuristics.FunctionAnnotator("company_detect", company_detector)

In [10]:
# 4) verein_detector
def verein_detector(doc):
    for tok in doc:
        if 'Verein' in tok.text:
            if tok.nbor(1).pos_ == 'NOUN' or tok.nbor(1).pos_ == 'PROPN':
                yield tok.i, (tok.i)+2, "ORG"
lf4 = heuristics.FunctionAnnotator("verein_detect", verein_detector)


# 5) Wiener *band
def band_detector(doc):
    for tok in doc:
        if tok.text == "Wiener":
            if tok.nbor(1).text.endswith('band'):
                yield tok.i, (tok.i)+2, "ORG"
lf5 = heuristics.FunctionAnnotator("band_detect", band_detector)

# Test

In [11]:
#ref: skweak (https://github.com/NorskRegnesentral/skweak)
doc_lf = lf5(lf4(lf3(lf2(lf1(lf0(docs[0]))))))

# create and fit the HMM aggregation model
hmm = skweak.aggregation.HMM("hmm", ["ORG"])
hmm.fit_and_aggregate([doc_lf]*10)

# once fitted, we simply apply the model to aggregate all functions
doc_hmm = hmm(doc_lf)

# we can then visualise the final result (in Jupyter)
utils.display_entities(doc_hmm, "hmm")


Starting iteration 1
Finished E-step with 10 documents
Starting iteration 2


         1      -86330.8559             +nan


Finished E-step with 10 documents
Starting iteration 3


         2      -80659.2090       +5671.6469


Finished E-step with 10 documents
Starting iteration 4


         3      -80144.2373        +514.9717


Finished E-step with 10 documents


         4      -80041.0454        +103.1919


# Export as a text file (CoNLL format)

In [12]:
import pandas as pd


def to_conll(doc_hmm,export_path):
    
    #create df for temporaliy saving data
    temp = pd.DataFrame(index = [*range(len(doc_hmm))], columns= ['tokens','ner_tags'])
    
    #insert values in tokens column
    temp['tokens'] = [x.text for x in doc_hmm]
    
    #insert values in the ner_tags column
    org_index_list = [(i.start, i.end) for i in doc_hmm.spans['hmm']] #get org positions
    
    for i in org_index_list:
        
        count = 0
        
        for j in range(i[0],i[1]):
            if count == 0: #if the first token of entity, add 'B-ORG'
                temp.at[j,'ner_tags'] = 'B-ORG' 
                count += 1
            else: #else, add 'I-ORG'
                temp.at[j,'ner_tags'] = 'I-ORG'
    for i in range(len(temp)):
        if temp['tokens'][i] == ' ':
            temp.at[i,'ner_tags'] = ' '
    temp = temp.fillna('O') #fill 'O' with non-entity tokens
    
    #export as csv file
    temp.to_csv(export_path, header=None, index=None, sep='\t', mode='a')
    
    return temp

In [13]:
skweak_df = to_conll(doc_hmm,'skweak_test.txt')
skweak_df

Unnamed: 0,tokens,ner_tags
0,Maßnahmenbekanntgabe,O
1,zu,O
2,MA,B-ORG
3,40,I-ORG
4,",",O
...,...,...
46619,belegten,O
46620,Plätze,O
46621,leicht,O
46622,gesunken,O


# Compare : handlabelling vs skweak

In [14]:
def change_to_dochmm(path):
    
    # read file
    file = open(path,'r',encoding = 'utf-8')
    lines = [json.loads(line) for line in file]
    lines = lines[:1000]
    
    # get data
    text = [x['text'] for x in lines]
    entities = [x['entities'] for x in lines]
    
    # change_to_dochmm output style
    # 1) text
    fulltext = ' '.join(text) 
    
    # 2) entities
    text_len = 0
    org_index_list = []
    for i in range(len(entities)):
        for j in entities[i]:
            org_index_list.append((j['start_offset']+text_len,j['end_offset']+text_len))
        text_len = text_len + len(text[i]) + 1 
    
    return fulltext, org_index_list

In [15]:
#hand labelled
fulltext_hand_label, org_index_list_hand_label = change_to_dochmm('all_labelled.jsonl')

#skweak
fulltext_skweak = text_all
org_index_list_skweak = [(i.start_char, i.end_char) for i in doc_hmm.spans['hmm']]

In [16]:
def to_df(text, org_index_list):
    
    org_index_list.append((999999999,9999999999))
    
    token_start = 0
    list_index = 0 
    count_borg = 0

    tokens = re.split(' ', text)
    ner_tags = []
    
    for i in tokens:
        
        start_ind = org_index_list[list_index][0]
        end_ind = org_index_list[list_index][1]

        
        if token_start == start_ind:

            ner_tags.append('B-ORG')
            count_borg += 1

            token_start = token_start + len(i) + 1
            
            if token_start >= end_ind:
                list_index += 1
                
        elif token_start > start_ind and token_start < end_ind:
            
            if ner_tags[-1] == 'O':

                ner_tags.append('B-ORG')
                count_borg += 1

                token_start = token_start + len(i) + 1
                if token_start >= end_ind:
                    list_index += 1                
            else:
                ner_tags.append('I-ORG')

                token_start = token_start + len(i) + 1
                if token_start >= end_ind:
                    list_index += 1
        else:
            ner_tags.append('O')
            token_start = token_start + len(i) + 1
            if token_start >= end_ind:
                list_index += 1  

    df = pd.DataFrame({'tokens':tokens,
                      'ner_tags':ner_tags})
    print("The number of tokens :", len(df))
    print("   - B-ORG :", ner_tags.count('B-ORG'))
    print("   - I-ORG :", ner_tags.count('I-ORG'))
    print("   - O :", ner_tags.count('O'))
    return df    

In [17]:
#hand labelled
print('Hand labelled data :')
fulltext_hand_label, org_index_list_hand_label = change_to_dochmm('all_labelled.jsonl')
hl_df = to_df(fulltext_hand_label, org_index_list_hand_label)


#skweak
print('\n\nskweak data :')
fulltext_skweak = text_all
org_index_list_skweak = [(i.start_char, i.end_char) for i in doc_hmm.spans['hmm']]
weak_df = to_df(fulltext_skweak, org_index_list_skweak)

Hand labelled data :
The number of tokens : 42312
   - B-ORG : 841
   - I-ORG : 1046
   - O : 40425


skweak data :
The number of tokens : 42312
   - B-ORG : 809
   - I-ORG : 1114
   - O : 40389


In [18]:
from sklearn.metrics import classification_report
y_true = list(hl_df['ner_tags'])
y_pred = list(weak_df['ner_tags'])
print(classification_report(y_true, y_pred, labels=['O', 'B-ORG', 'I-ORG']))

              precision    recall  f1-score   support

           O       0.99      0.99      0.99     40425
       B-ORG       0.91      0.87      0.89       841
       I-ORG       0.85      0.90      0.87      1046

    accuracy                           0.99     42312
   macro avg       0.92      0.92      0.92     42312
weighted avg       0.99      0.99      0.99     42312



In [19]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true, y_pred, labels=['O', 'B-ORG', 'I-ORG'])

array([[40182,    75,   168],
       [  105,   734,     2],
       [  102,     0,   944]], dtype=int64)

In [20]:
non_detected = list(set(org_index_list_hand_label) - set(org_index_list_skweak))
non_detected_list = []
for i in non_detected:
    non_detected_list.append(fulltext_hand_label[i[0]:i[1]])
set(non_detected_list)

{' Wiener Krankenanstalten',
 '24',
 '34',
 '40',
 'Amtsträgerschaft',
 'Bildungsdirektion für Wien',
 'Einschau',
 'Fonds Soziales Wien',
 'GEMMA-Verfahren',
 'Gesundheitsverbund',
 'Gesundheitsverbund ',
 'Gesundheitsverbundes',
 'Gesundheitsverbundes ',
 'IMT Information Management Technology',
 'Institutes der Wirtschaftsprüfer in Deutschland',
 'Internationale Organisation für Normung',
 'KH Nord, Krankenhaus Nord Krankenhaus Nord - Klinik Floridsdorf',
 'Krankenanstaltenverbund',
 'Krankenanstaltenverbund ',
 'Krankenanstaltenverbundes',
 'Krankenanstaltenverbundes ',
 'Krankenhauses Nord',
 'Kuratorium für Psychosoziale Dienst in Wien',
 'Kuratorium für Psychosoziale Dienste in Wien',
 'Kuratoriums für Psychosoziale Dienste in Wien',
 'MDK Magistratsdirektion',
 'Magistratsabteilung 01',
 'Magistratsabteilung 11',
 'Magistratsabteilung 2',
 'Magistratsabteilung 24',
 'Magistratsabteilung 3',
 'Magistratsabteilung 40',
 'Magistratsabteilung 53',
 'Magistratsabteilung 70',
 'PR Pu