In [1]:
import import_ipynb
from extractText_docx import extract_text


import spacy
from spacy import displacy
from spacy.pipeline import EntityRuler 
import json

importing Jupyter notebook from extractText_docx.ipynb


In [2]:
#!python -m spacy download de_core_news_lg
nlp = spacy.load('de_core_news_lg')

# Hand-crafted ORG rules for Viennese municipal bodies:
#   "MA 57", "Magistratsabteilung 57", "Stadt Wien".
# The ruler is inserted before the statistical 'ner' component so that the
# spans it sets are already present when 'ner' runs.
ruler = nlp.add_pipe("entity_ruler", before='ner')

# spaCy's REGEX operator matches at ANY position inside the token text
# (search semantics), so a token-level rule must be anchored explicitly
# with ^ and $ when the whole token is meant.
patterns = [
    {
        # "MA" followed by a department number, e.g. "MA 5", "MA 57", "MA 21A".
        # Anchoring "MA" stops tokens that merely contain it ("THEMA", "MAX")
        # from triggering the rule; the number token now also accepts a
        # single digit, which the former r"\w\d+" (two characters minimum)
        # could never match.
        "label": "ORG",
        "pattern": [
            {"TEXT": {"REGEX": r"^MA$"}},
            {"TEXT": {"REGEX": r"^\d+[A-Za-z]?$"}},
        ],
    },
    {
        # Spelled-out form: "Magistratsabteilung 57".
        "label": "ORG",
        "pattern": [
            {"TEXT": {"REGEX": r"Magistratsabteilung"}},
            {"TEXT": {"REGEX": r"^\d+[A-Za-z]?$"}},
        ],
    },
    {
        # A token containing "Stadt" followed by any token that has a word
        # character, e.g. "Stadt Wien". The former r"Stadt*" was a glob-style
        # typo: as a regex it only made the final "t" optional and therefore
        # also fired on "Stadion". The second token is intentionally left
        # loose; restricting it to "Wien" would make the rule stricter.
        "label": "ORG",
        "pattern": [
            {"TEXT": {"REGEX": r"Stadt"}},
            {"TEXT": {"REGEX": r"\w"}},
        ],
    },
]

ruler.add_patterns(patterns)

# #create the doc
# doc = nlp(text)

# #extract entities
# for ent in doc.ents:
#     print (ent.text, ent.label_)

In [6]:
# JSONL record format written by this notebook: {"text": "President Obama", "label": [[10, 15, "PERSON"]]}

def ner(nlp, file, text):
    """Collect ORG entity spans for every paragraph of a document.

    Args:
        nlp: Callable that turns a string into a doc object exposing ``ents``;
            each entity must provide ``start_char``, ``end_char`` and ``label_``.
        file: Name of the source document; only used in the leading marker record.
        text: Iterable of paragraph strings.

    Returns:
        A list of ``{"text": ..., "label": ...}`` dicts. The first and last
        entries are start/end marker records whose ``label`` is an empty
        string. In between there is one record per paragraph that contains at
        least one ORG entity, with ``label`` holding
        ``[start_char, end_char, "ORG"]`` triples. Paragraphs without ORG
        entities are omitted.
    """
    records = [{"text": 'Start of document - file: {}'.format(file), "label": ''}]

    for paragraph in text:
        # Keep organisations only; every other entity type is discarded.
        org_spans = [
            [ent.start_char, ent.end_char, ent.label_]
            for ent in nlp(paragraph).ents
            if ent.label_ == "ORG"
        ]
        if not org_spans:
            continue
        records.append({"text": paragraph, "label": org_spans})

    records.append({"text": 'End of document', "label": ''})
    return records


def export_jsonl(save_path, file_name, ner_init):
    """Write records to ``<save_path>/<file_name>.jsonl``, one JSON object per line.

    Args:
        save_path: Directory to write into. It is not created here, so it
            must already exist.
        file_name: Base name of the output file; ``.jsonl`` is appended.
        ner_init: Iterable of JSON-serialisable records (e.g. the list
            returned by ``ner``).

    Raises:
        OSError: If the target file cannot be opened for writing.
    """
    import os  # function-local so the cell does not depend on a new top-level import

    # os.path.join uses the platform's separator; the previous hard-coded
    # '\\' only produced a path inside save_path on Windows.
    file = os.path.join(save_path, file_name + '.jsonl')

    # The context manager closes the file, so no explicit close() is needed.
    with open(file, 'w', encoding='utf-8') as fp:
        for record in ner_init:
            json.dump(record, fp)
            fp.write('\n')
    print('Successfully exported : ', file)

In [7]:
# Driver cell: extract the paragraphs of one .docx report, tag ORG entities
# in them and export the result as a JSONL file.
if __name__ == '__main__':
    # Input document, given relative to the working directory.
    path = 'StRH-I-3-20-MA_57.docx'
    print('Extracting text from docx...')
    # extract_text is imported from the extractText_docx notebook.
    # NOTE(review): clensing() presumably returns the cleaned list of
    # paragraph strings that ner() expects - confirm against that notebook.
    file = extract_text(path)
    extracted_paragraphs = file.clensing()
    print(file.title)
    
    print('Finding named entities...')
    # `path` is passed as the document name that ner() puts into its leading
    # "Start of document" marker record.
    ner_init = ner(nlp, path, extracted_paragraphs)
    
    print('Exporting...')
    # NOTE(review): the output directory is an absolute, machine-specific
    # Windows path, and `path` (including its ".docx" suffix) is reused as the
    # base file name, so the exported file ends in ".docx.jsonl".
    export_jsonl('C:\\Users\\sooje\\init_data',path, ner_init)

Extracting text from docx...
StRH I - 3/20 MA 57, Maßnahmenbekanntgabe zu MA 57, MA 17 und Verein PEREGRINA - Bildungs-, Beratungs- und Therapiezentrum für Immigrantinnen, Prüfung des Vereines PEREGRINA
Finding named entities...
Exporting...
Successfully exported :  C:\Users\sooje\init_data\StRH-I-3-20-MA_57.docx.jsonl
