In [None]:
!python -m spacy download en_core_web_lg


In [None]:
import spacy

# Load the pretrained pipeline that was downloaded in the cell above.
nlp = spacy.load("en_core_web_lg")

# Pull the whole example document into memory as a single string.
with open("ex1.txt", "r") as source_file:
    text = source_file.read()

# Run the pipeline over the raw text to get an annotated Doc.
doc = nlp(text)

# List every entity the pipeline found together with its label.
for entity in doc.ents:
    print(entity.text, entity.label_)

In [None]:
# Visualize the entities stored on `doc` using displaCy's "ent" style.
from spacy import displacy

displacy.render(doc, style="ent")

In [6]:
# Load the annotation export (a dict with the keys 'classes' and 'annotations').

import json

# JSON text is UTF-8; decode it explicitly instead of relying on the platform's
# default encoding, so that non-ASCII text -- and the character offsets that
# index into it -- are read the same way on every machine.
with open("annotations.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [None]:
# the file has 2 top-level keys: 'classes' and 'annotations'
# show just the keys instead of echoing the entire annotation set into the output
list(data.keys())

In [10]:
# 'classes' is the list of entity labels used in the annotations:
# ['ADMIN_1', 'ADMIN_2', 'DISASTER']
data['classes']

['ADMIN_1', 'ADMIN_2', 'DISASTER']

In [31]:
# 'annotations' is a list with one two-element item per text:
#   [text, {'entities': [[start, end, label], ...]}]
# where start/end are character offsets into the text. Show the first item.
data['annotations'][0]

['On 01 April 2017 at 3:00 pm, Gatsibo district located in the Eastern Province of Rwanda experienced heavy rainfall associated with heavy storms, which resulted in destruction of houses and community farm lands in Kiramuruzi Sector Nyabisindu Cell. The affected area is located 36 kilometers from Gatsibo District, 40 kilometers from the Eastern Province office and 70 kilometers from the City of Kigali.\r',
 {'entities': [[29, 36, 'ADMIN_2'],
   [61, 77, 'ADMIN_1'],
   [131, 143, 'DISASTER'],
   [213, 223, 'ADMIN_2'],
   [296, 303, 'ADMIN_2'],
   [337, 353, 'ADMIN_1'],
   [396, 402, 'ADMIN_1']]}]

In [27]:
# Walk through the nesting of a single annotation by indexing step by step.
# NOTE: a cell only displays the value of its last expression, so of the three
# lookups below only the final one (the 'entities' dict) appears in the output.

# index into the list to get one [text, annotation] item
data['annotations'][0]

# the first element of that item is the text itself
data['annotations'][0][0]

# the second element is a dictionary with a single key, 'entities'
data['annotations'][0][1]

# going one level deeper yields the list of [start, end, label] triples:
# data['annotations'][0][1]['entities']




{'entities': [[29, 36, 'ADMIN_2'],
  [61, 77, 'ADMIN_1'],
  [131, 143, 'DISASTER'],
  [213, 223, 'ADMIN_2'],
  [296, 303, 'ADMIN_2'],
  [337, 353, 'ADMIN_1'],
  [396, 402, 'ADMIN_1']]}

In [30]:
# Set up the objects needed to build the training set.

from spacy.tokens import DocBin
# tqdm wraps an iterable and shows a progress bar while it is consumed
from tqdm import tqdm

# the training examples live under the 'annotations' key
training_data = data['annotations']

# container that will collect one Doc per training example
doc_bin = DocBin()

# blank English pipeline; note this rebinds `nlp`, replacing the pretrained
# pipeline that was loaded earlier in the notebook
nlp = spacy.blank("en")

In [None]:
# training data continued

from spacy.util import filter_spans

# Turn every annotated example into a Doc carrying its entity spans, collect
# the docs in `doc_bin`, and serialize the result to train.spacy.
# NOTE: the loop rebinds `text` and `doc`, overwriting the variables of the
# same name created in the entity-printing cell earlier in the notebook.
for example in tqdm(training_data):
    # each example is a [text, {'entities': [[start, end, label], ...]}] item
    text = example[0]
    # `example` is already one element of data['annotations'] (a list), so the
    # entity triples are reached directly through its second element
    labels = example[1]['entities']
    # make a new doc object for each example
    doc = nlp.make_doc(text)

    # collect entities
    ents = []

    # create a span object for each entity
    # span is a group of tokens (multi word token)
    for start, end, label in labels:
        # "contract" shrinks offsets that cut through a token to the tokens
        # lying completely inside the range; None means no such token exists
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)

    # drop duplicate / overlapping spans before assigning them as entities
    filtered_ents = filter_spans(ents)
    # add filtered entities to our doc object
    doc.ents = filtered_ents
    # add our doc object to the doc bin created earlier
    doc_bin.add(doc)

# write the file once, after every doc has been added
doc_bin.to_disk("train.spacy")

In [43]:
# label (third item of the [start, end, label] triple) of the first entity
# in the first annotated example
data['annotations'][0][1]['entities'][0][2]

'ADMIN_2'