# Import Libraries

In [9]:
import re
from docx import Document
import unidecode

import skweak
from skweak import heuristics, gazetteers, generative, utils

import spacy

import os
import glob
import json

# Import files

In [10]:
def extraction(path):
    document = Document(path)

    #get title 
    if len(document.core_properties.title) > 2:
        title = document.core_properties.title
    else:
        title = ''

    #get full text
    para= [title]

    count = 0
    para_count = 0
    while count < len(document.paragraphs)-1:
#             next_count = count+1
        t = document.paragraphs[count].paragraph_format.left_indent
        if t != None :
            para[para_count] = para[para_count]+" "+document.paragraphs[count].text

        else:
            para.append(document.paragraphs[count].text)
            para_count += 1
        count = count+1   
    para.append(document.paragraphs[len(document.paragraphs)-1].text)

    docText = '\n\n'.join(x for x in para)
    docText = re.sub("_x000d_", " ", docText)
    #split paragraphs
    sp = re.split("\n\n\n", docText)
    splitted = [x for x in sp if len(x)>3]
    
    return splitted

def clensing(path):
    splitted = extraction(path) 

    cleansed =[]
    for i in splitted :           
        #remove linebreaking
        temp = i.replace('\n\n',' ')
        temp = temp.replace('\n','')
        temp = re.sub('\s+',' ',temp)
        cleansed.append(temp)        

    return cleansed
    
    

#get leaf directories
def leaf_folders(file_path):
    return [dirpaths for dirpaths, dirnames, filenames in os.walk(file_path) if not dirnames]


#get files
def files(file_path):
    folder_list = leaf_folders(file_path)
    files_list = {}
    
    for i in folder_list:
#         time = os.path.basename(i)
        os.chdir(i)
        files = []
        for file in glob.glob("*.docx"):

            files.append(file)
        
        files_list.update({i:files})
        
    return files_list

In [11]:
files_list = files('C:\\Users\\sooje\\jsonl_fromdocx\\2020')
month_list = [key for key in files_list]

all_2020 = []
for month in month_list:
    for file_name in files_list[month]:
        path = month+'\\'+file_name
        temp = clensing(path)
        
        all_2020.append(temp)
        
extracted = [item for sublist in all_2020 for item in sublist]

text = ' [sep] '.join(extracted)
print(len(text))

3926994


In [57]:
print("The number of sentences :", len(extracted))

The number of sentences : 10220


In [12]:
#docs only can contain shorther than 1000000 length of characters.
def split_text_for_docs(text):
    count = 0
    text_list = []
    while count < len(text):
        temp_text = text[count:count+1000000]
        text_list.append(temp_text)
        count = count+1000000+1
    return text_list

text_list = split_text_for_docs(text)
print(len(text_list))

4


In [13]:
# convert to spacy.Docs
print('convert to spacy.Docs...')
docs = []
for i in text_list:
    temp_doc = list(pipeline.pipe([i]))
    docs.append(temp_doc)

convert to spacy.Docs...


# Load pipeline

In [14]:
# load pipe line
pipeline = spacy.load("de_core_news_lg", disable=["ner","lemmatizer"])

In [15]:
# add special case to 
print('Before:')
print(pipeline.tokenizer.explain('Gesellschaft m.b.H.'))

from spacy.attrs import ORTH, NORM

pipeline.tokenizer.add_special_case("[sep]", [{ORTH: "[sep]"}])

pipeline.tokenizer.add_special_case("Gesellschaft m.b.H.", [{ORTH: "Gesellschaft m.b.H."}])
pipeline.tokenizer.add_special_case("Ges.m.b.H.", [{ORTH: "Ges.m.b.H."}])
pipeline.tokenizer.add_special_case("Gesellschaft mbH", [{ORTH: "Gesellschaft mbH"}])
pipeline.tokenizer.add_special_case("Gesellschaft mit beschränkter Haftung", [{ORTH: "Gesellschaft mit beschränkter Haftung"}])
pipeline.tokenizer.add_special_case("Betriebsgesellschaft m.b.H.", [{ORTH: "Betriebsgesellschaft m.b.H."}])

print('After:')
print(pipeline.tokenizer.explain('Gesellschaft m.b.H.'))

Before:
[('TOKEN', 'Gesellschaft'), ('TOKEN', 'm.b'), ('INFIX', '.'), ('TOKEN', 'H.')]
After:
[('SPECIAL-1', 'Gesellschaft m.b.H.')]


# Set labeling functions (skweak)

In [16]:
# 1) ma_detector : find MA ORGs

def ma_detector(doc):
    ma = ["MA","ma","Magistratsabteilung","magistratsabteilung", 'Magistratsabteilungen','magistratsabteilungen']
    for tok in doc[:-1]:
        if tok.text in ma:
            if re.match(r"\d",tok.nbor(1).text) or re.match(r"\d\d",tok.nbor(1).text):
                yield tok.i, (tok.i)+2, "ORG"


In [17]:
# 2) gazetteers : list of orgs (extracted orgs from Tätigkeitsberichte 2013 - 2021 : https://www.stadtrechnungshof.wien.at/berichte/archiv/index.html)
def get_trie():
    org_list = []
    with open('C:\\Users\\sooje\\orgs_list.txt', 'r', encoding = 'utf-8') as f:
        lines = f.readlines()
        org_list = [re.sub('\n','',line) for line in lines]

    orgs_doc = list(pipeline.pipe(org_list))

    orgs=[]
    for i in orgs_doc:
        token_list = [tokens.text for tokens in i]
        temp = tuple(token_list)
        orgs.append(temp)

    trie = gazetteers.Trie(orgs)
    return trie

In [18]:
# 3) company_detector : find noun chunk containing 'GmbH'

def find_in_noun_chunks(previous_index, noun_chunk_list):
    get_true = []
    for i in noun_chunk_list:
        chunk_start = i.start
        chunk_end = i.end
        if (previous_index >= chunk_start and previous_index < chunk_end):
            get_true.append(i)
            
    if len(get_true) > 0:
        return get_true[0].start
    else:
        return 'None'    

def company_detector(doc):
    com_end_with= {"Gesellschaft m.b.H.","Gesellschaft mbH", "Ges.m.b.H.",
                   "Gesellschaft mit beschränkter Haftung", "Betriebsgesellschaft m.b.H.", "GmbH"}
    token_list = [tokens.text for tokens in doc]
    pos_list = [tokens.pos_ for tokens in doc]
    noun_chunk_list = [chunks for chunks in doc.noun_chunks]

    for chunk in doc.noun_chunks:

        for i in chunk:
            if i.text in com_end_with:

                start = chunk.start
                end = chunk.end

                if chunk[0].text in ['Der','Die','Das','Des', 'wiener']:
                    if chunk[0].text in ['Der','Die','Das','Des']:
                        yield start+1, end, "ORG"
                    else:
                        yield start, end, "ORG"
                    
                else:

                    token_index = chunk.start
                    previous = token_list[token_index-1]

                    find_noun = find_in_noun_chunks(token_index-1, noun_chunk_list)

                    if find_noun != 'None':
                        start = find_noun
                        yield start, end, "ORG"


In [19]:
# 4) Verein_Detector : find noun chunk starts with 'Verein'
def find_next_noun_chunks(next_index, noun_chunk_list):
    get_true = []
    for count in range(len(noun_chunk_list)):
        i = noun_chunk_list[count]
        
        chunk_start = i.start
        chunk_end = i.end
        
        if (next_index >= chunk_start and next_index < chunk_end):
            get_true.append(i.text)
            end_entity = chunk_end
   
    if len(get_true) > 0:
        return end_entity
        
    else:
        return 'None'    

def verein_detector(doc):
    pre = ["Verein", "Vereiner", "Vereinen"]
    token_list = [tokens.text for tokens in doc]
    noun_chunk_list = [chunks for chunks in doc.noun_chunks]


    for tok in doc:
        pre_org = []
        if tok.text in pre:

            start = tok.i

            next_index = (tok.i)+1
            next_noun_chunk = find_next_noun_chunks(next_index, noun_chunk_list)

            if next_noun_chunk != 'None':
                end = next_noun_chunk

                yield start, end, "ORG"              

In [20]:
lf1 = heuristics.FunctionAnnotator("ma_detect", ma_detector)
lf2 = gazetteers.GazetteerAnnotator("gov_detect", {"ORG":get_trie()})
lf3 = heuristics.FunctionAnnotator("company_detect", company_detector)
lf4 = heuristics.FunctionAnnotator("verein_detect", verein_detector)   

def sk_generate(doc):
    doc_lf = (lf4(lf3(lf2(lf1(doc)))))


    # create and fit the HMM aggregation model
    hmm = skweak.aggregation.HMM("hmm", ["ORG"])
    hmm.fit_and_aggregate([doc_lf]*5)


    # once fitted, we simply apply the model to aggregate all functions
    doc_hmm = hmm(doc_lf)


    # we can then visualise the final result (in Jupyter)
    utils.display_entities(doc_hmm, "hmm")
    
    return doc_hmm

# Aggregate LFs

In [46]:
#weak supervision labelling
print('weak supervision labelling...')
results = []
for i in docs:
    temp_result = sk_generate(i[0])
    results.append(temp_result)

weak supervision labelling...
Starting iteration 1
Finished E-step with 5 documents
Starting iteration 2


         1     -124925.0091             +nan


Finished E-step with 5 documents
Starting iteration 3


         2     -117881.8263       +7043.1828


Finished E-step with 5 documents
Starting iteration 4


         3     -117131.0939        +750.7324


Finished E-step with 5 documents


         4     -116950.1376        +180.9563


Starting iteration 1
Finished E-step with 5 documents
Starting iteration 2


         1     -144494.9537             +nan


Finished E-step with 5 documents
Starting iteration 3


         2     -135546.9390       +8948.0147


Finished E-step with 5 documents
Starting iteration 4


         3     -134744.5029        +802.4361


Finished E-step with 5 documents


         4     -134606.5716        +137.9313


Starting iteration 1
Finished E-step with 5 documents
Starting iteration 2


         1     -131843.9878             +nan


Finished E-step with 5 documents
Starting iteration 3


         2     -122998.7560       +8845.2318


Finished E-step with 5 documents
Starting iteration 4


         3     -122280.8420        +717.9140


Finished E-step with 5 documents


         4     -122203.2474         +77.5946


Starting iteration 1
Finished E-step with 5 documents
Starting iteration 2


         1     -151293.5417             +nan


Finished E-step with 5 documents
Starting iteration 3


         2     -139286.6618      +12006.8799


Finished E-step with 5 documents
Starting iteration 4


         3     -138375.0245        +911.6373


Finished E-step with 5 documents


         4     -137203.4361       +1171.5884


# Clean data

In [None]:
#text are in 'text_list'
#doc_hmm results are in 'results'

In [27]:
import pandas as pd


def to_df(doc_hmm):
    
    #create df for temporaliy saving data
    temp = pd.DataFrame(index = [*range(len(doc_hmm))], columns= ['tokens','ner_tags'])
    
    #insert values in tokens column
    temp['tokens'] = [x.text for x in doc_hmm]
    
    #insert values in the ner_tags column
    org_index_list = [(i.start, i.end) for i in doc_hmm.spans['hmm']] #get org positions
    
    for i in org_index_list:
        
        count = 0
        
        for j in range(i[0],i[1]):
            if count == 0: #if the first token of entity, add 'B-ORG'
                temp.at[j,'ner_tags'] = 'B-ORG' 
                count += 1
            else: #else, add 'I-ORG'
                temp.at[j,'ner_tags'] = 'I-ORG'
    
    temp  = temp.drop(temp[temp.tokens == ' '].index)
    temp = temp.reset_index(drop = True)
    
    for i in range(len(temp)):
        if temp['tokens'][i] == '[sep]':
            temp.at[i, 'tokens'] = ' '
            temp.at[i,'ner_tags'] = ' '
    
    temp = temp.fillna('O') #fill 'O' with non-entity tokens


    return temp

In [25]:
def get_org_list(labelled):
    org_list = []
    temp_org = ''
    start = 0
    end = 0
    
    for i in range(len(labelled)):
        if labelled['ner_tags'][i] == 'B-ORG':

            org_list.append((start, end, temp_org))

            temp_org = labelled['tokens'][i]
            start = i
            end = i+1

        elif labelled['ner_tags'][i] == 'I-ORG':
            temp_org = temp_org + ' ' + labelled['tokens'][i]
            end = i+1
            
    return org_list[1:]

def start_with_lowercase(labelled):
    
    labelled_upd = labelled.copy()
    org_list= get_org_list(labelled) 

    for i in org_list:

        start = i[0]
        end = i[1]

        tok_list = list(labelled['tokens'][start:end])

        # if start with lower case letter : not an org
        if tok_list[0].islower():

            for j in range(len(tok_list)):
                token  = tok_list[j]
                if token.islower():

                    labelled_upd.at[start,'ner_tags'] = 'O'
                    start = start + 1
                else:
                    if start < end:

                        labelled_upd.at[start,'ner_tags'] = 'B-ORG'
                        break 
                        
                    else:
                        labelled_upd.at[start,'ner_tags'] = 'O'
                        break

        # if start with number :  not an org
        elif re.match(r'\d',tok_list[0]):

            labelled_upd.at[start,'ner_tags'] = 'O'
            labelled_upd.at[start+1,'ner_tags'] = 'B-ORG'    

    return labelled_upd
 
def start_with_article(labelled):
    
    labelled_upd = labelled.copy()
    org_list= get_org_list(labelled) 

    for i in org_list:

        start = i[0]
        end = i[1]

        tok_list = list(labelled['tokens'][start:end])
      
    # if start with article : over detected token            
        if tok_list[0] in ['der','die','das','des', 'Der', 'Die', 'Das', 'Des']:

            labelled_upd.at[start,'ner_tags'] = 'O'
            labelled_upd.at[start+1,'ner_tags'] = 'B-ORG'  
        
    return labelled_upd

def detect_too_long(labelled):
    
    labelled_upd = labelled.copy()
    org_list= get_org_list(labelled) 

    for i in org_list:

        start = i[0]
        end = i[1]

        tok_list = list(labelled['tokens'][start:end])
        
        # if too long org is detected : delete from the org 
        
        if end - start > 15:
            for row in range(start,end):
                labelled_upd.at[row,'ner_tags'] = 'O'   
                
    return labelled_upd
  

def cleansing(labelled):
    labelled_upd_lower = start_with_lowercase(labelled)
    labelled_upd_article = start_with_article(labelled_upd_lower)
    labelled_upd_article_lower = start_with_lowercase(labelled_upd_article)
    labelled_upd_all = detect_too_long(labelled_upd_article_lower)  
    
    return labelled_upd_all


In [47]:
def cleansed_results(results):
    df = pd.DataFrame(columns = ['tokens','ner_tags'])
    for i in range(len(results)):
        print('cleansing result {}'.format(i))
        weak_df = to_df(results[i])
        cleansed_temp = cleansing(weak_df)
        df = pd.concat([df,cleansed_temp])
#         print(len(df))

    return df.reset_index(drop = True)


In [48]:
cleansed_results_all = cleansed_results(results)

cleansing 0 result
132751
cleansing 1 result
270850
cleansing 2 result
406895
cleansing 3 result
534167


In [56]:
print('\n\n----------------------------------------')
print("The number of tokens :", len(cleansed_results_all) - list(cleansed_results_all['ner_tags']).count(' ') )
ner_tags = list(cleansed_results_all['ner_tags'])
print("   - B-ORG :", ner_tags.count('B-ORG'))
print("   - I-ORG :", ner_tags.count('I-ORG'))
print("   - O :", ner_tags.count('O'))
print('----------------------------------------\n\n')



----------------------------------------
The number of tokens : 523989
   - B-ORG : 8635
   - I-ORG : 14249
   - O : 501105
----------------------------------------




In [59]:
#save as txt file
print('save as a file...')
cleansed_results_all.to_csv('C:\\Users\\sooje\\skweak_labelled_20_all.txt', header=None, index=None, sep='\t', mode='a')

save as a file...
