In [7]:
import json
import spacy 

# Import training dataset 

In [9]:
def get_data(path):
    """Read a JSON Lines file and return its records as a list.

    Every non-blank line of the file is parsed as one JSON document.
    Blank or whitespace-only lines (for example stray empty lines at the
    end of an export) are skipped instead of making ``json.loads`` raise.

    Args:
        path: Path of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        list: The parsed JSON values, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # `line.strip()` is empty for blank lines, which are not valid JSON.
        return [json.loads(line) for line in f if line.strip()]

def to_train(data):
    """Convert annotation records into spaCy-style training tuples.

    Each record must provide a 'text' entry and an 'entities' list whose
    items carry 'start_offset', 'end_offset' and 'label'. One tuple is
    produced per record, shaped like:

        ("Oranges are great source of vitamin C", {"entities": [(0, 7, "Fruit")]})

    Args:
        data: Iterable of annotation records (dicts).

    Returns:
        list: ``(text, {"entities": [(start, end, label), ...]})`` tuples,
        in the same order as the input records.
    """
    return [
        (
            record['text'],
            {
                "entities": [
                    (span['start_offset'], span['end_offset'], span['label'])
                    for span in record['entities']
                ]
            },
        )
        for record in data
    ]


# JSON Lines export holding the annotated texts (relative to the working directory).
path = 'all.jsonl'
# Parse the export and convert every record to a (text, {"entities": [...]}) tuple.
data_all = to_train(get_data(path))
# Display the first converted example as a quick sanity check.
data_all[0]

('Maßnahmenbekanntgabe zu MA 40, Prüfung der Nebenbeschäftigungen',
 {'entities': [(24, 29, 'ORG')]})

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the examples. The fixed random_state makes the shuffle
# deterministic, so re-running the notebook reproduces the same split
# (and therefore the same train.spacy / test.spacy files).
train, test = train_test_split(data_all, test_size=0.3, random_state=42)

# Convert the examples to spaCy's binary .spacy format

In [27]:
from spacy.util import filter_spans
from tqdm import tqdm
from spacy.tokens import DocBin
# Blank German pipeline (no trained components); to_spacy() uses this
# module-level object to tokenize the texts.
nlp = spacy.blank("de")

def to_spacy(data, output_savepath):
    """Serialize training examples into a binary ``.spacy`` file (a DocBin).

    Each text is tokenized with the module-level ``nlp`` pipeline and its
    character-offset annotations are turned into entity spans on the Doc.

    Args:
        data: Iterable of ``(text, {"entities": [(start, end, label), ...]})``
            items, as produced by ``to_train``; ``start``/``end`` are
            character offsets into ``text``.
        output_savepath: Destination path handed to ``DocBin.to_disk``.

    Notes:
        * An entity whose offsets cannot be mapped onto a token span
          (``Doc.char_span`` returns ``None``) is skipped and reported
          together with its label, offsets and the affected text.
        * Overlapping entities are reduced with ``filter_spans`` before
          being assigned to ``doc.ents``.
    """
    doc_bin = DocBin()

    for training_example in tqdm(data):
        text = training_example[0]
        labels = training_example[1]['entities']

        doc = nlp.make_doc(text)  # tokenization only, no pipeline components
        ents = []

        for start, end, label in labels:
            # "contract" shrinks misaligned offsets inward to token boundaries.
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                # tqdm.write prints without breaking the active progress bar.
                tqdm.write(
                    f"Skipping entity {label!r} at [{start}:{end}] "
                    f"({text[start:end]!r}): no token-aligned span"
                )
            else:
                ents.append(span)

        doc.ents = filter_spans(ents)
        doc_bin.add(doc)

    doc_bin.to_disk(output_savepath)

In [29]:
# Write both splits as DocBin files; these are the paths later passed to `spacy train`.
to_spacy(train,"./train.spacy")
to_spacy(test,"./test.spacy")

 46%|████▋     | 1205/2593 [00:00<00:00, 2112.48it/s]

Skipping entity


100%|██████████| 2593/2593 [00:01<00:00, 2117.33it/s]
100%|██████████| 1112/1112 [00:00<00:00, 1646.31it/s]


# Train model

In [30]:
# Create the full training config: https://spacy.io/usage/training#config
# fill-config reads base_config.cfg and writes config.cfg with all values filled in.
# NOTE(review): base_config.cfg is not created by any visible cell; it must
# already exist in the working directory before this cell runs.
!python -m spacy init fill-config base_config.cfg config.cfg

2022-10-09 13:51:32.790555: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [31]:
# Train with the filled config; results are written to ./output
# (output/model-best is loaded further down).
# NOTE(review): test.spacy is passed as the dev set, so the held-out data is
# also used for evaluation during training; consider a separate dev split.
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./test.spacy

2022-10-09 13:51:56.301458: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
[38;5;2m✔ Created output directory: output[0m
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[2022-10-09 13:51:57,006] [INFO] Set up nlp object from config
INFO:spacy:Set up nlp object from config
[2022-10-09 13:51:57,018] [INFO] Pipeline: ['tok2vec', 'ner']
INFO:spacy:Pipeline: ['tok2vec', 'ner']
[2022-10-09 13:51:57,023] [INFO] Created vocabulary
INFO:spacy:Created vocabulary
[2022-10-09 13:51:57,024] [INFO] Finished initializing nlp object
INFO:spacy:Finished initializing nlp object
[2022-10-09 13:52:00,604] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
INFO:spacy:Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  E

# Test trained model

In [35]:
# Load the best checkpoint written by the training run into ./output.
trained_nlp = spacy.load("output/model-best")
# Sample sentence to run through the trained pipeline.
text = "Organigramm der Wiener Rettungsleitstelle Quelle: Magistratsabteilung 70, Darstellung: Stadtrechnungshof Wien."
doc = trained_nlp(text)

# Print every predicted entity next to its label.
for ent in doc.ents:
    print (ent.text, ent.label_)

Magistratsabteilung 70 ORG
Stadtrechnungshof Wien ORG
