In [1]:
import spacy 
# One-time setup: the German model must be installed before spacy.load() can find it.
# Run the following command once in the environment of this kernel:
# !python -m spacy download de_core_news_sm
nlp = spacy.load("de_core_news_sm")  # shared spaCy pipeline; the IOB helpers below call it as a global
import pandas as pd
from datasets import Dataset  # NOTE(review): not referenced in the cells visible here - confirm it is still needed
import json



In [2]:
def get_data(path):
    """Load a JSON Lines file into a list of parsed records.

    Each non-blank line of the file is parsed as one JSON document.

    Args:
        path: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    print("Reading file...")
    with open(path, 'r', encoding='utf-8') as f:
        # Blank lines (most commonly a trailing empty line at the end of the
        # file) are not JSON documents; skip them instead of crashing.
        data = [json.loads(line) for line in f if line.strip()]
    return data

def iob(data):
    """Tokenise one annotated record and assign an IOB tag to every token.

    The record's text is tokenised with the module-level spaCy pipeline
    ``nlp``. A token is tagged as part of an entity when its character span
    overlaps the entity's ``[start_offset, end_offset)`` span: the first
    overlapping token gets ``'B-ORG'``, the following ones ``'I-ORG'``; all
    other tokens get ``'O'``. Every entity is tagged as ORG, whatever other
    fields the entity dict may carry.

    Token positions come from spaCy's ``Token.idx`` (the token's character
    offset in the text) rather than from an estimate of one space per token,
    so punctuation and repeated whitespace no longer shift the tags.

    Args:
        data: Mapping with the keys ``'text'`` (str) and ``'entities'`` (an
            iterable of mappings with integer ``'start_offset'`` and
            ``'end_offset'`` character positions into ``'text'``).

    Returns:
        A ``(tokens, tags)`` tuple of two equally long lists of strings.
    """
    # Sort the spans so the result does not depend on annotation order.
    spans = sorted((ent['start_offset'], ent['end_offset'])
                   for ent in data['entities'])

    text = data['text']
    offset = 0
    if text.startswith(" "):
        # A single leading space is dropped before tokenising; remember it so
        # token positions still line up with the annotation offsets.
        text = text[1:]
        offset = 1
    doc = nlp(text)

    sen_temp = []
    tag_temp = []
    span_idx = 0
    opened = False  # True once the current span has received its B-ORG tag

    for t in doc:
        sen_temp.append(t.text)
        tok_start = offset + t.idx
        tok_end = tok_start + len(t.text)

        # Skip spans that end at or before this token so that one missed or
        # malformed span can never block the spans that follow it.
        while span_idx < len(spans) and spans[span_idx][1] <= tok_start:
            span_idx += 1
            opened = False

        if span_idx < len(spans) and spans[span_idx][0] < tok_end:
            tag_temp.append('I-ORG' if opened else 'B-ORG')
            opened = True
        else:
            tag_temp.append('O')

    return sen_temp, tag_temp

def tokenized_output(data):
    """Build a DataFrame of tokens and IOB tags, one row per input record.

    Args:
        data: Iterable of records; each must provide ``'doc_id'`` and
            ``'para_id'`` and be accepted by ``iob``.

    Returns:
        A pandas DataFrame with the columns ``doc_id``, ``para_id``,
        ``tokens`` (list of token strings) and ``ner_tags`` (list of tags).
    """
    print('Tokenizing...')

    # Collect the four output columns side by side; insertion order of the
    # keys fixes the column order of the resulting frame.
    columns = {'doc_id': [], 'para_id': [], 'tokens': [], 'ner_tags': []}

    for record in data:
        columns['doc_id'].append(record['doc_id'])
        columns['para_id'].append(record['para_id'])
        words, labels = iob(record)
        columns['tokens'].append(words)
        columns['ner_tags'].append(labels)

    return pd.DataFrame(columns)

In [3]:
def iob_with_pos(d):
    """Tokenise one annotated record and return tokens, POS tags and IOB tags.

    The record's text is tokenised with the module-level spaCy pipeline
    ``nlp``. A token is tagged as part of an entity when its character span
    overlaps the entity's ``[start_offset, end_offset)`` span: the first
    overlapping token gets ``'B-ORG'``, the following ones ``'I-ORG'``; all
    other tokens get ``'O'``. Every entity is tagged as ORG, whatever other
    fields the entity dict may carry.

    Token positions come from spaCy's ``Token.idx`` (the token's character
    offset in the text) rather than from an estimate of one space per token,
    so punctuation and repeated whitespace no longer shift the tags.

    Args:
        d: Mapping with the keys ``'text'`` (str) and ``'entities'`` (an
            iterable of mappings with integer ``'start_offset'`` and
            ``'end_offset'`` character positions into ``'text'``).

    Returns:
        A ``(tokens, pos_tags, iob_tags)`` tuple of three equally long lists
        of strings; ``pos_tags`` holds each token's ``tag_`` attribute.
    """
    # Sort the spans so the result does not depend on annotation order.
    spans = sorted((ent['start_offset'], ent['end_offset'])
                   for ent in d['entities'])

    text = d['text']
    offset = 0
    if text.startswith(" "):
        # A single leading space is dropped before tokenising; remember it so
        # token positions still line up with the annotation offsets.
        text = text[1:]
        offset = 1
    doc = nlp(text)
    pos = [token.tag_ for token in doc]

    sen_temp = []
    tag_temp = []
    span_idx = 0
    opened = False  # True once the current span has received its B-ORG tag

    for t in doc:
        sen_temp.append(t.text)
        tok_start = offset + t.idx
        tok_end = tok_start + len(t.text)

        # Skip spans that end at or before this token so that one missed or
        # malformed span can never block the spans that follow it.
        while span_idx < len(spans) and spans[span_idx][1] <= tok_start:
            span_idx += 1
            opened = False

        if span_idx < len(spans) and spans[span_idx][0] < tok_end:
            tag_temp.append('I-ORG' if opened else 'B-ORG')
            opened = True
        else:
            tag_temp.append('O')

    return sen_temp, pos, tag_temp


def to_conll_txt(tokenized, filename):
    """Write records as a tab-separated, CoNLL-style text file.

    Every record is passed to ``iob_with_pos``; each resulting token is
    written as one ``token<TAB>pos<TAB>iob_tag`` line, and an empty line is
    written after each record.

    Args:
        tokenized: Iterable of records accepted by ``iob_with_pos`` (each one
            is passed to it unchanged). NOTE(review): despite the name this
            is not the DataFrame returned by ``tokenized_output`` - iterating
            a DataFrame yields column names - so confirm what callers pass.
        filename: Output path without extension; ``.txt`` is appended.
    """
    # Fixed: the original called the non-existent str method ``formant`` and
    # therefore raised AttributeError before opening the file.
    with open("{}.txt".format(filename), "w", encoding='utf-8') as record_file:
        for record in tokenized:
            sen, pos, tag = iob_with_pos(record)
            for word, pos_tag, ner_tag in zip(sen, pos, tag):
                record_file.write(word + "\t" + pos_tag + "\t" + ner_tag + "\n")
            # Empty line separates consecutive records.
            record_file.write("\n")

In [4]:
if __name__ == '__main__':
    # Input annotations: one JSON record per line, path relative to the working directory.
    path = 'all.jsonl'
    data = get_data(path)
    # Tokenise every record and attach IOB tags; yields a DataFrame with one row per record.
    tokenized = tokenized_output(data)
    # Preview the first rows as a quick sanity check of tokens vs. tags.
    print(tokenized.head())
    

Reading file...
Tokenizing...
   doc_id  para_id                                             tokens  \
0       0        0  [Maßnahmenbekanntgabe, zu, MA, 40, ,, Prüfung,...   
1       0        1                               [INHALTSVERZEICHNIS]   
2       0        2                            [ABKÜRZUNGSVERZEICHNIS]   
3       0        3               [bzw., beziehungsweise, Nr., Nummer]   
4       0        4  [Erledigung, des, Prüfungsberichtes, Der, Stad...   

                                            ner_tags  
0                   [O, O, B-ORG, I-ORG, O, O, O, O]  
1                                                [O]  
2                                                [O]  
3                                       [O, O, O, O]  
4  [O, O, O, O, B-ORG, I-ORG, O, O, O, O, O, O, O...  


In [14]:
# def print_result(i):
#     for j in range(len(tokenized.iloc[i].tokens)):
#         print(tokenized.iloc[i].tokens[j], tokenized.iloc[i].ner_tags[j])
# print_result(300)

Die O
Magistratsabteilung B-ORG
40 I-ORG
sowie O
der O
Fonds B-ORG
Soziales I-ORG
Wien I-ORG
und O
das O
Kuratorium B-ORG
Wiener I-ORG
Pensionisten-Wohnhäuser I-ORG
( O
in O
Summe O
somit O
zwei O
Drittel O
aller O
geprüften O
Einrichtungen O
) O
verfügten O
über O
eine O
Stelle O
für O
Interne O
Revision O
, O
während O
das O
übrige O
Drittel O
eine O
solche O
nicht O
eingerichtet O
hatte O
. O
