In [1]:
import spacy
import json
import random
from spacy.training.example import Example

In [2]:
def load_json_data(file):
    """Parse the JSON document stored at ``file`` and return the result.

    The file is opened read-only in text mode and decoded as UTF-8.
    """
    with open(file, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

In [3]:
def save_json_data(file, data):
    """Write ``data`` to ``file`` as JSON.

    The target is opened for writing in text mode with UTF-8 encoding, and
    the document is pretty-printed with a four-space indent.
    """
    with open(file, mode="w", encoding="utf-8") as out:
        json.dump(data, fp=out, indent=4)

In [4]:
def train_spacy(data, iterations):
    """Train a blank English spaCy pipeline that contains only an NER component.

    Args:
        data: Sequence of ``(text, annotations)`` pairs. ``annotations`` is a
            dict; when it has an ``"entities"`` entry, each item's third
            element is used as the entity label. The sequence passed in is
            not reordered or otherwise modified.
        iterations: Number of passes to make over ``data``.

    Returns:
        The trained pipeline object created by ``spacy.blank("en")``.
    """
    # Shuffle a private copy: shuffling ``data`` itself would reorder the
    # caller's list as a hidden side effect of training.
    train_data = list(data)
    ner_model = spacy.blank("en")

    # Bind ``ner`` on both paths so the label loop never sees an unbound name.
    if "ner" in ner_model.pipe_names:
        ner = ner_model.get_pipe("ner")
    else:
        ner = ner_model.add_pipe("ner", last=True)

    # Register every entity label found in the training data.
    for _, annotations in train_data:
        # ``or ()`` lets samples without an "entities" entry pass through
        # instead of raising TypeError when iterating over None.
        for ent in annotations.get("entities") or ():
            ner.add_label(ent[2])

    # Disable every other pipe so that only NER is trained.
    other_pipes = [pipe for pipe in ner_model.pipe_names if pipe != "ner"]
    # ``select_pipes`` / ``initialize`` are the spaCy v3 names for the
    # deprecated ``disable_pipes`` / ``begin_training`` aliases.
    with ner_model.select_pipes(disable=other_pipes):
        optimizer = ner_model.initialize()
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(train_data)
            losses = {}  # accumulated by ``update`` across this iteration
            for text, annotations in train_data:
                # Build the Example from a fresh, unannotated Doc.
                doc = ner_model.make_doc(text)
                example = Example.from_dict(doc, annotations)

                # One update step per sample (batch of one).
                ner_model.update(
                    [example],
                    drop=0.2,
                    sgd=optimizer,
                    losses=losses
                )
            print(f"loss @ {itn}",losses.get("ner"))
    return ner_model

In [5]:
# Load the annotated training samples from the JSON file.
TRAIN_DATA = load_json_data("data/hp_training_data.json")
# Train the NER model, making num_epochs passes over the training data.
num_epochs = 10
ner_model = train_spacy(TRAIN_DATA, num_epochs)
# Save the trained pipeline under "hp_ner_model" so it can be reloaded later.
ner_model.to_disk("hp_ner_model")

Starting iteration 0
loss @ 0 1173.0989255151123
Starting iteration 1
loss @ 1 243.65640006619478
Starting iteration 2
loss @ 2 139.91759563502958
Starting iteration 3
loss @ 3 126.27410132768671
Starting iteration 4
loss @ 4 127.82585205081172
Starting iteration 5
loss @ 5 108.42290607387334
Starting iteration 6
loss @ 6 81.35821965079461
Starting iteration 7
loss @ 7 62.02512983493153
Starting iteration 8
loss @ 8 65.44746564939062
Starting iteration 9
loss @ 9 82.96372085565211


In [6]:
# Reload the trained NER pipeline from the "hp_ner_model" path it was saved to.
ner_hp_model = spacy.load('hp_ner_model')

In [7]:
# Display the text (element 0) of the sample currently first in TRAIN_DATA.
TRAIN_DATA[0][0]

"At the start-of-term banquet, Harry had gotten the idea that Professor Snape disliked him. By the end of the first Potions lesson, he knew he'd been wrong. Snape didn't dislike Harry -- he hated him."

In [8]:
# Run the reloaded pipeline on the text of the first sample in TRAIN_DATA.
doc = ner_hp_model(TRAIN_DATA[0][0])

In [9]:
# Print each entity found in doc as "<upper-cased label> - <entity text>".
for ent in doc.ents:
    print(f'{ent.label_.upper()} - {ent.text}')

PERSON - Harry
PERSON - Professor Snape
PERSON - Snape
PERSON - Harry
