In [3]:
# Generate a config from https://spacy.io/usage/training#quickstart
# Using [ner]
#
# Followed by the command
# python3 -m spacy init fill-config base_config.cfg config.cfg
#
# Then start training with
# python3 -m spacy train config.cfg --output ./ --paths.train ./train.spacy --paths.dev ./train.spacy

import spacy
import json

from spacy.tokens import DocBin      # for annotated data
from spacy.util import filter_spans  # for removing spans with overlaps


# Parse the annotated examples from disk.
# The context manager closes the file handle as soon as the JSON has been
# loaded, and the explicit encoding keeps decoding independent of the
# platform's default locale encoding.
# (To inspect the raw contents instead, print(file.read()) inside the block.)
with open("Corona2.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# pretty print parsed JSON
# print(json.dumps(data, indent=4))

# pretty print single entry
#print(json.dumps(data["examples"][0], indent=4))

# Container for the converted data: a fixed list of class names plus one
# {"text", "entities"} record per example.
training_data = {
    'classes': ['MEDICINE', "MEDICALCONDITION", "PATHOGEN"],
    'annotations': [],
}

# From every example keep only the text and, for each annotation, a
# (start index, end index, upper-cased tag name) tuple.
for example in data["examples"]:
    spans = [
        (annotation['start'], annotation['end'], annotation['tag_name'].upper())
        for annotation in example["annotations"]
    ]
    training_data['annotations'].append(
        {"text": example["content"], "entities": spans}
    )


# Show the first converted record as a sanity check
print(json.dumps(training_data['annotations'][0], indent=4))

# Convert the data to SpaCy's DocBin
# NOTE(review): a blank German ("de") pipeline provides the tokenizer here;
# confirm this matches the language of the texts and of config.cfg.
nlp = spacy.blank("de")

doc_bin = DocBin()
# The loop variable is deliberately not called `data`, so the parsed JSON
# bound to that name earlier in the script is not overwritten.
for record in training_data["annotations"]:
    doc = nlp.make_doc(record['text'])

    ents = []
    for start, end, label in record['entities']:
        # "contract" keeps only the tokens lying completely inside the
        # character range; char_span returns None when no span can be built.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # Report which annotation was dropped so it can be traced back
            # to the source data.
            print(f"Skipping entity {label} at characters {start}-{end}")
        else:
            ents.append(span)

    # doc.ents must not contain overlapping spans, so drop duplicates and
    # overlaps before assigning.
    doc.ents = filter_spans(ents)
    doc_bin.add(doc)

# save the docbin object
doc_bin.to_disk("train.spacy")

{
    "text": "While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]",
    "entities": [
        [
            360,
            371,
            "MEDICINE"
        