# Import Libraries

In [1]:
# !pip install skweak
import skweak
from skweak import heuristics, gazetteers, generative, utils

import spacy
import re

import json
# !python -m spacy download de_core_news_lg



# Import Data

In [2]:
# get data


import os

files = os.listdir('./jsonlwmeta/2021_01')
print("The number of files :", len(files))


# 2. extract data
data = []
for i in files:
    path = './jsonlwmeta/2021_01/{}'.format(i)
    with open(path,'r') as f:
        d = [json.loads(line)['text'] for line in f]
        data.append(d)

text = [item for sublist in data for item in sublist] #flatten list 'data'

print("The number of instances :", len(text))
print("Example:", text[0])

The number of files : 42
The number of instances : 3705
Example: Maßnahmenbekanntgabe zu MA 40, Prüfung der Nebenbeschäftigungen


# Load pipeline

In [8]:
# load pipe line
pipeline = spacy.load("de_core_news_lg", disable=["ner","lemmatizer"])

In [9]:
# add special case to 
print('Before:')
print(pipeline.tokenizer.explain('Gesellschaft m.b.H.'))

from spacy.attrs import ORTH, NORM

pipeline.tokenizer.add_special_case("Gesellschaft m.b.H.", [{ORTH: "Gesellschaft m.b.H."}])
pipeline.tokenizer.add_special_case("Ges.m.b.H.", [{ORTH: "Ges.m.b.H."}])
pipeline.tokenizer.add_special_case("Gesellschaft mit beschränkter Haftung", [{ORTH: "Gesellschaft mit beschränkter Haftung"}])
pipeline.tokenizer.add_special_case("Betriebsgesellschaft m.b.H.", [{ORTH: "Betriebsgesellschaft m.b.H."}])

print('After:')
print(pipeline.tokenizer.explain('Gesellschaft m.b.H.'))

Before:
[('TOKEN', 'Gesellschaft'), ('TOKEN', 'm.b'), ('INFIX', '.'), ('TOKEN', 'H.')]
After:
[('SPECIAL-1', 'Gesellschaft m.b.H.')]


In [10]:
# get data
docs = list(pipeline.pipe(text))

# Set labeling functions (skweak)

In [11]:
# 0) org detector : find ORG entities detected by spacy ner pipeline

pipeline_with_ner = spacy.load("de_core_news_lg", disable=["lemmatizer"])

def from_spacy_ner(doc):
    txt = doc.text
    doc_ner = list(pipeline.pipe([txt]))
    for i in doc_ner[0].ents:
        if i.label_ == 'ORG':
            if i.text not in ['KURZFASSUNG', 'INHALTSVERZEICHNIS','ABKÜRZUNGSVERZEICHNIS', 'Einschau']:
                yield i.start, i.end, "ORG"
                
lf0 = heuristics.FunctionAnnotator("from_spacy_ner", from_spacy_ner)

In [15]:
# 1) ma_detector : find MA ORGs

def ma_detector(doc):
    ma = ["MA","ma","Magistratsabteilung","magistratsabteilung"]
    for tok in doc:
        if tok.text in ma:
            if re.match("\d*",tok.nbor(1).text):
                yield tok.i, (tok.i)+2, "ORG"

lf1 = heuristics.FunctionAnnotator("ma_detect", ma_detector)

In [18]:
# 2) gazetteers : list of orgs

orgs_list = ["Wiener Kinder- und Jugend­anwaltschaft",
"Wiener Pflege-, Patientinnen-und Patientenanwaltschaft",
"Wiener Umweltanwaltschaft",
"Wiener Tierschutzombudsstelle",
"Sicherheitsvertrauenspersonen",
"Unabhängiger Bediensteten­schutzbeauftragter",
"Gleichbehandlungskommission",
"Gleichbehandlungsbeauftragte",
"Stadtrechnungshof Wien",
"Stadtrechnungshofes Wien",
"Stadt Wien",
"Wiener Gesundheitsverbund",
"Gesundheitsverbund",
"Wiener Krankenanstaltenverbund",
"Krankenanstaltenverbund"
"Wiener Musikverein",
"Musikverein Wien",
]

orgs_doc = list(pipeline.pipe(orgs_list))

orgs=[]
for i in orgs_doc:
    token_list = [tokens.text for tokens in i]
    temp = tuple(token_list)
    orgs.append(temp)

trie = gazetteers.Trie(orgs)

lf2 = gazetteers.GazetteerAnnotator("gov_detect", {"ORG":trie})

In [34]:
org_list = []
with open('org_list.txt', 'r', encoding = 'utf-8') as f:
    lines = f.readlines()
    org_list = [re.sub('\n','',line) for line in lines]

print(org_list)

['Wiener Kinder- und Jugend\xadanwaltschaft', 'Wiener Pflege-, Patientinnen-und Patientenanwaltschaft', 'Wiener Umweltanwaltschaft', 'Wiener Tierschutzombudsstelle', 'Sicherheitsvertrauenspersonen', 'Unabhängiger Bediensteten\xadschutzbeauftragter', 'Gleichbehandlungskommission', 'Gleichbehandlungsbeauftragte', 'Kontaktfrauen', 'Stadtrechnungshof Wien', 'Stadt Wien', 'Wiener Gesundheitsverbund', 'Wiener Krankenanstaltenverbund', '"Drachengasse 2" Theatregesellschaft m.b.H.', '"Theatre in der Josefstadt" Betriebsgesellschaft m.b.H', '"Volkstheater" Gesellschaft m.b.H.', 'Wiener Stadterneuerungsgesellschaft', 'AKTIONSRADIUS WIEN', 'AUSTRIAN FASHION ASSOCIATION', 'B&F Vienna - Bestattung und Friedhöfe GmbH', 'BFW Gebäudeerrichtungs- und Autovermietung GmbH & Co KG', 'Autovermietung GmbH & Co KG', 'BFW Gebäudeerrichtungs- und Vermietungs GmbH & Co KG', 'B&F Wien - Bestattung und Friedhöfe GmbH', 'BFW Gebäudeerrichtungs- und Vermietungs GmbH & Co KG', 'Eurocomm-PR GmbH', 'EuroVienna EU-cons

In [38]:
# 2) gazetteers : list of orgs
orgs_doc = list(pipeline.pipe(org_list))

orgs=[]
for i in orgs_doc:
    token_list = [tokens.text for tokens in i]
    temp = tuple(token_list)
    orgs.append(temp)

trie = gazetteers.Trie(orgs)

lf2 = gazetteers.GazetteerAnnotator("gov_detect", {"ORG":trie})

In [13]:
# 3) company_detector : find noun chunk containing 'GmbH'

def find_in_noun_chunks(previous_index, noun_chunk_list):
    get_true = []
    for i in noun_chunk_list:
        chunk_start = i.start
        chunk_end = i.end
        if (previous_index >= chunk_start and previous_index < chunk_end):
            get_true.append(i)
            
    if len(get_true) > 0:
        return (get_true[0].text, get_true[0].start)
    else:
        return 'None'    

def company_detector(doc):
    com_end_with= {"Gesellschaft m.b.H.", "Ges.m.b.H.","Gesellschaft mit beschränkter Haftung", "Betriebsgesellschaft m.b.H.", "GmbH"}
    token_list = [tokens.text for tokens in doc]
    pos_list = [tokens.pos_ for tokens in doc]
    noun_chunk_list = [chunks for chunks in doc.noun_chunks]

    for chunk in doc.noun_chunks:
        company = []
        for i in chunk:
            if i.text in com_end_with:
                company.append(chunk.text)

                start = chunk.start
                end = chunk.end

                if chunk[0].text in ['der','die','das','des', 'wiener']:
                    yield start, end, "ORG"
                    
                else:

                    token_index = chunk.start
                    previous = token_list[token_index-1]

                    find_noun = find_in_noun_chunks(token_index-1, noun_chunk_list)

                    if find_noun != 'None':
                        start = find_noun[1]
                        company.insert(0,find_noun[0])
                        yield start, end, "ORG"

lf3 = heuristics.FunctionAnnotator("company_detect", company_detector)

In [24]:
# 4. skweak labelling functions

# 4) verein_detector
def verein_detector(doc):
    for tok in doc:
        if 'Verein' in tok.text:
            if tok.nbor(1).pos_ == 'NOUN' or tok.nbor(1).pos_ == 'PROPN':
                yield tok.i, (tok.i)+2, "ORG"
lf4 = heuristics.FunctionAnnotator("verein_detect", verein_detector)

# 5) Wiener *band
def band_detector(doc):
    for tok in doc:
        if tok.text == "Wiener":
            if tok.nbor(1).text.endswith('band'):
                yield tok.i, (tok.i)+2, "ORG"
lf5 = heuristics.FunctionAnnotator("band_detect", band_detector)

# Test

In [16]:
text_all = ' '.join(text[0:200])
docs_all = list(pipeline.pipe([text_all]))

doc_lf = lf5(lf4(lf3(lf2(lf1(lf0(docs_all[0]))))))

# create and fit the HMM aggregation model
hmm = skweak.aggregation.HMM("hmm", ["ORG"])
hmm.fit_and_aggregate([doc_lf]*10)

# once fitted, we simply apply the model to aggregate all functions
doc_hmm = hmm(doc_lf)

# we can then visualise the final result (in Jupyter)
utils.display_entities(doc_hmm, "hmm")


Starting iteration 1
Finished E-step with 10 documents
Starting iteration 2


         1      -12758.7877             +nan


Finished E-step with 10 documents
Starting iteration 3


         2      -11578.7841       +1180.0036


Finished E-step with 10 documents
Starting iteration 4


         3      -11533.9564         +44.8277


Finished E-step with 10 documents


         4      -11532.4313          +1.5251


In [19]:
text_all = ' '.join(text[500:800])
docs_all = list(pipeline.pipe([text_all]))
doc_lf = lf5(lf4(lf3(lf2(lf1(lf0(docs_all[0]))))))

# create and fit the HMM aggregation model
hmm = skweak.aggregation.HMM("hmm", ["ORG"])
hmm.fit_and_aggregate([doc_lf]*10)

# once fitted, we simply apply the model to aggregate all functions
doc_hmm = hmm(doc_lf)

# we can then visualise the final result (in Jupyter)
utils.display_entities(doc_hmm, "hmm")

Starting iteration 1
Finished E-step with 10 documents
Starting iteration 2


         1      -22466.7475             +nan


Finished E-step with 10 documents
Starting iteration 3


         2      -20778.8588       +1687.8887


Finished E-step with 10 documents
Starting iteration 4


         3      -20716.6684         +62.1904


Finished E-step with 10 documents


         4      -20711.3726          +5.2958


In [39]:
path = './jsonlwmeta/2021_05/StRH-VIII-10-20.docx.jsonl'
data = []
with open(path,'r') as f:
    d = [json.loads(line)['text'] for line in f]
    data.append(d)

text = [item for sublist in data for item in sublist]
text_all = ' '.join(text)
docs_all = list(pipeline.pipe([text_all]))

doc_lf = lf5(lf4(lf3(lf2(lf1(lf0(docs_all[0]))))))

# create and fit the HMM aggregation model
hmm = skweak.aggregation.HMM("hmm", ["ORG"])
hmm.fit_and_aggregate([doc_lf]*10)

# once fitted, we simply apply the model to aggregate all functions
doc_hmm = hmm(doc_lf)

# we can then visualise the final result (in Jupyter)
utils.display_entities(doc_hmm, "hmm")

Starting iteration 1
Finished E-step with 10 documents
Starting iteration 2


         1      -21700.8270             +nan


Finished E-step with 10 documents
Starting iteration 3


         2      -19670.9182       +2029.9089


Finished E-step with 10 documents
Starting iteration 4


         3      -19567.9215        +102.9967


Finished E-step with 10 documents


         4      -19560.1012          +7.8203


# Export as a text file (CoNLL format)

In [22]:
import pandas as pd


def to_conll(doc_hmm,export_path):
    
    #create df for temporaliy saving data
    temp = pd.DataFrame(index = [*range(len(doc_hmm))], columns= ['tokens','ner_tags'])
    
    #insert values in tokens column
    temp['tokens'] = [x.text for x in doc_hmm]
    
    #insert values in the ner_tags column
    org_index_list = [(i.start, i.end) for i in doc_hmm.spans['hmm']] #get org positions
    
    for i in org_index_list:
        
        count = 0
        
        for j in range(i[0],i[1]):
            if count == 0: #if the first token of entity, add 'B-ORG'
                temp.at[j,'ner_tags'] = 'B-ORG' 
                count += 1
            else: #else, add 'I-ORG'
                temp.at[j,'ner_tags'] = 'I-ORG'
                
    temp = temp.fillna('O') #fill 'O' with non-entity tokens
    
    #export as csv file
    temp.to_csv(export_path, header=None, index=None, sep='\t', mode='a')
    
    return temp

In [41]:
to_conll(doc_hmm,'convert_to_txt_all.txt')

Unnamed: 0,tokens,ner_tags
0,Wien,B-ORG
1,Holding,I-ORG
2,GmbH,I-ORG
3,und,O
4,Wien,B-ORG
...,...,...
16320,Wien,I-ORG
16321,",",O
16322,im,O
16323,November,O
