In [None]:
# One-time setup: download the large English pipeline that the next cell
# loads with spacy.load("en_core_web_lg").
!python -m spacy download en_core_web_lg


In [None]:
import spacy

# Load the large pretrained English pipeline.
nlp = spacy.load("en_core_web_lg")

# Read the whole sample document into memory.
with open("ex1.txt", "r") as fh:
    text = fh.read()

# Run the pipeline over the text; recognised entities end up on doc.ents.
doc = nlp(text)

# Print every entity's surface text followed by its predicted label.
for ent in doc.ents:
    print(ent.text, ent.label_)

In [None]:
# Render the document with its recognised entities highlighted inline.
from spacy import displacy

displacy.render(doc, style="ent")

In [None]:
# using json file

import json
# Load the exported annotations. JSON text is UTF-8, so decode it explicitly:
# without `encoding`, open() uses the platform locale (e.g. cp1252 on
# Windows), which can garble or reject non-ASCII characters in the texts.
with open("annotations.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [None]:
# The loaded JSON is a dict; per the author's note it has two top-level keys,
# 'classes' and 'annotations'. This displays the whole structure.
data

In [None]:
# 'classes' holds the list of entity tags used in the annotations
# (noted by the author as ['ADMIN_1', 'ADMIN_2', 'DISASTER']).
data['classes']

In [None]:
# 'annotations' is a list with one entry per annotated text; each entry pairs
# the text with its entity annotations. Show the first entry.
data['annotations'][0]
# data.keys()

In [None]:
# # Index into the annotations to get one annotated example:
# data['annotations'][0]

# # the first element of an example is the text itself
# data['annotations'][0][0]

# # the second element is a dictionary with one key, 'entities'
# data['annotations'][0][1]

# The value of 'entities' is a list of [start, end, label] entries: the
# character offsets of each entity in the text and its tag.
data['annotations'][0][1]['entities']




In [None]:
# training data

from spacy.tokens import DocBin
# tqdm allows you to configure and display progress bar. useful for tracking ML experiments
from tqdm import tqdm

# Only the examples under the 'annotations' key are needed for training.
training_data = data['annotations']

# DocBin is the container the training Docs are collected in.
doc_bin = DocBin()

# Start from an empty English pipeline (tokenizer only).
# Note: this rebinds `nlp`, replacing the en_core_web_lg pipeline loaded earlier.
nlp = spacy.blank("en")

In [None]:
# training data continued
from tqdm import tqdm
from spacy.util import filter_spans

# Convert every annotated example into a spaCy Doc with its entity spans set,
# and collect the Docs in doc_bin.
for example in tqdm(training_data):
    # Each example is [text, {'entities': [[start, end, label], ...]}].
    text = example[0]
    labels = example[1]['entities']

    # Tokenize only (no pipeline components run) to get a Doc for this text.
    doc = nlp.make_doc(text)

    # Build one Span (a slice of one or more tokens) per annotated entity.
    ents = []
    for x in labels:
        # The annotation offsets are character positions. "contract" keeps
        # only the tokens that lie entirely inside [start, end); char_span
        # returns None when no whole token fits in that range.
        span = doc.char_span(x[0], x[1], label=x[2], alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)

    # doc.ents cannot hold overlapping spans; filter_spans drops overlaps
    # (and duplicates), preferring the longest span.
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents
    doc_bin.add(doc)

# Serialize once, after every Doc has been added. Writing inside the loop
# re-serialized the whole DocBin on each example and wrote nothing at all
# when training_data was empty.
doc_bin.to_disk("train.spacy")

In [None]:
# Expand the partial base_config.cfg into a complete training config
# (config.cfg) by filling in defaults for every setting it leaves out.
!python -m spacy init fill-config base_config.cfg config.cfg

In [None]:
# Train with config.cfg, writing the trained pipelines to the current directory.
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# evaluation scores reported during training are measured on the training
# data itself; use a held-out dev set for meaningful scores.
!python -m spacy train config.cfg --output ./ --paths.train ./train.spacy --paths.dev ./train.spacy

In [None]:
# Load the best-scoring pipeline from the training run; `spacy train` saves it
# in the "model-best" folder inside the --output directory (./ here).
nlp_ner = spacy.load("model-best")

In [None]:
# Try the trained NER pipeline on a sample paragraph.
nlp_ner = spacy.load("model-best")

# Sample input: an excerpt from a report on rainfall in Kenya.
headlines = (
    "Kenya Meteorological Department (KMD) released various forecasts which "
    "indicated continued rains beyond the peak of the season which was "
    "predicted to be in second half of April. In April, rains spread out to "
    "most parts of the country, with parts of Western Kenya receiving over "
    "300mm of rain while Rift Valley, Central and Southeastern Kenya received "
    "over 150mm. These rains were above what is normally received in April "
    "for Western, parts of Rift Valley, Central and Southeastern Kenya. At "
    "the same time, parts of the North, where drought is evolving, received "
    "rainfall that is below what is normally received in the month of April. "
    "On 27 April 2021, the KMD released the monthly forecast for May which "
    "indicated continued "
)
doc2 = nlp_ner(headlines)

# Print each predicted entity followed by its label.
for ent in doc2.ents:
    print(ent.text, ent.label_)