In [None]:
import sys

import spacy

In [None]:
# Download the large version of the SpaCy NER model
!python -m spacy download en_core_web_lg

# !python -m spacy download en_core_web_sm ---- for if we want to use a small model

In [None]:
# Load the pretrained large English pipeline downloaded above; it is used as-is
# (no additional training) for the demonstration in the next cells.
demo_model = spacy.load("en_core_web_lg")

In [None]:
# Run the pretrained pipeline on an example sentence to show what the
# standard model recognizes before any custom training.
results = demo_model("My name is Mirna Ashour and I live in NYC")

In [None]:
# Use .ents to access the named entities found in the Doc object returned by the model
results.ents

(Mirna Ashour, NYC)

In [None]:
# displaCy, spaCy's built-in visualizer, renders the entities inline with color-coded labels.
# style="ent" selects the entity view; jupyter=True displays the markup directly in the notebook.
from spacy import displacy
displacy.render(results, style="ent", jupyter=True)

In [None]:
# Placeholder: this list is empty here and must be filled with annotated headlines
# before the conversion and training cells below will produce a useful model.
training_data = []
# Training data needs to be in the following format:

#   A list of dictionaries, each with two keys: 'text' and 'entities'
#   The 'text' key maps to a string carrying one headline
#   The 'entities' key maps to a list of tuples, each holding the start and end
#   character offsets of one labeled entity within 'text', followed by its label

# [{
#    'text': "Headline in one string",
#
#    'entities': [(span_start_num, span_end_num, 'entity_label'),
#                 (span_start_num, span_end_num, 'entity_label'),
#                 ...]
#   },
#   ...
# ]

In [None]:
# Import DocBin (the container the training docs are collected into and saved from)
# and tqdm (progress bar for the conversion loop).
from spacy.tokens import DocBin
from tqdm import tqdm

# A blank English pipeline; below it is only used to tokenize each headline via make_doc.
blank_model = spacy.blank("en")
# Collects the annotated docs that are later written to train.spacy.
doc_bin = DocBin()

In [None]:
# Prepare training data for the model by converting it into spaCy's DocBin format
from spacy.util import filter_spans

# Start from a fresh DocBin so that re-running this cell does not append the same
# documents a second time to the container created in the earlier cell.
doc_bin = DocBin()

for headline in tqdm(training_data):
    text = headline['text']
    doc = blank_model.make_doc(text)
    entities = []
    for start, end, label in headline['entities']:
        # "contract" shrinks the character range inward to the nearest token boundaries;
        # char_span returns None when the range cannot be aligned to any whole token.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            # Report exactly which annotation was dropped so it can be corrected in the data.
            print(
                f"Skipping entity {label!r} at chars {start}-{end} "
                f"({text[start:end]!r}): could not be aligned to token boundaries"
            )
        else:
            entities.append(span)
    # doc.ents cannot contain overlapping spans; filter_spans drops duplicates and
    # overlaps, keeping the longest span where several compete.
    doc.ents = filter_spans(entities)
    doc_bin.add(doc)

# Serialize all annotated docs to the file consumed by `spacy train`.
doc_bin.to_disk("train.spacy")

In [None]:
# Generate a config file for the model before training
!python -m spacy init fill-config base_config.cfg config.cfg

In [None]:
# Train the model and save to current directory under "train.spacy"
!python -m spacy train config.cfg --output ./ --paths.train ./train.spacy --paths.dev ./train.spacy

In [None]:
# Load the best-scoring pipeline saved by the training run ("model-best" in the output
# directory). It is now ready to use: call it on text to get back a Doc whose .ents
# hold the predicted entities.
custom_ner_model = spacy.load("model-best")