In [1]:
from __future__ import unicode_literals
import plac
import random
from pathlib import Path
import spacy
import json
import io
from spacy.util import minibatch, compounding
import pandas as pd
import re
import numpy as np
import srsly
import unidecode
from spacy.scorer import Scorer
from spacy.gold import GoldParse

In [10]:
spacy.require_gpu()

True

In [13]:
# Entity labels to register on the NER component (iterated over in main()).
LABELS = ['menuitem']
# Annotated corpus loaded from JSON. Presumably a list of
# [text, {"entities": [...]}] pairs, judging by how evaluate() and main()
# unpack the examples -- TODO confirm against the file.
TRAIN_DATA = srsly.read_json("drive/My Drive/annotated_data_phoenix.json")

In [14]:
len(TRAIN_DATA)

160993

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# Hold out 10% of the corpus as the test set (fixed seed for repeatability).
train, test = train_test_split(TRAIN_DATA, test_size=0.1, random_state=1)

# Then carve 11% of the remaining 90% (about 9.9% of the corpus) out as the
# validation set; `train` is rebound to what is left.
train, val = train_test_split(train, test_size=0.11, random_state=1)

In [17]:
len(train), len(val), len(test)

(128954, 15939, 16100)

In [18]:
# Keep only the first 100 training and the first 10 validation examples.
# NOTE(review): this looks like a debugging shortcut -- it rebinds `train` and
# `val`, so the full split is lost until the split cell is re-run. Confirm
# whether this cell is meant to run before training.
train = train[:100]
val = val[:10]

In [2]:
def evaluate(nlp, ner, dev_sents):
    """Score an entity recognizer against gold-annotated examples.

    Args:
        nlp: Language object; used here only to tokenize each text through
            ``make_doc``.
        ner: Entity recognizer component, applied directly to each tokenized
            doc.
        dev_sents: Iterable of ``(raw_text, annotations)`` pairs where
            ``annotations["entities"]`` holds the gold entity annotations.

    Returns:
        The ``scores`` of the ``Scorer`` once every example has been scored.
    """
    scorer = Scorer()

    for raw_text, annotations in dev_sents:
        doc = nlp.make_doc(raw_text)
        # Run the recognizer exactly once per doc. An earlier version followed
        # this with ``nlp.entity(doc)``, a second pass over the same doc that
        # doubled inference cost and required ``nlp`` to expose a pipe under
        # the name "ner" even though the component is already passed in.
        ner(doc)
        gold = GoldParse(doc, entities=annotations["entities"])
        scorer.score(doc, gold)
    return scorer.scores

def get_scores(nlp, ner, examples):
    """Evaluate the model on ``examples``, print P/R/F and return the scores.

    Args:
        nlp: Language object forwarded to ``evaluate``.
        ner: Entity recognizer component forwarded to ``evaluate``.
        examples: Iterable of ``(text, annotations)`` pairs. It is copied
            before shuffling, so the caller's sequence keeps its order.

    Returns:
        The score dictionary produced by ``evaluate``; its ``ents_p``,
        ``ents_r`` and ``ents_f`` entries are the ones printed.
    """
    # Shuffle a private copy. Shuffling ``examples`` itself reordered the
    # caller's list (e.g. the module-level `val` / `test`) as a hidden side
    # effect. The shuffle is kept so the global RNG is consumed as before.
    shuffled = list(examples)
    random.shuffle(shuffled)

    scores = evaluate(nlp, ner, shuffled)

    precision = '%.2f' % scores['ents_p']
    recall = '%.2f' % scores['ents_r']
    f_measure = '%.2f' % scores['ents_f']
    print(f"P = {precision}\nR = {recall}\nF = {f_measure}")

    return scores

In [29]:
@plac.annotations(
    model=("Model name. Defaults to blank 'en' model.", "option", "m", str),
    new_model_name=("New model name for model meta.", "option", "nm", str),
    output_dir=("Optional output directory", "option", "o", Path),
    n_iter=("Number of training iterations", "option", "n", int))
def main(model=None, new_model_name='model', output_dir='menuitem_ner_model', n_iter=50):
    """Set up the pipeline and entity recognizer, and train the new entity.

    Trains on the module-level ``train`` list (shuffled in place every epoch)
    and writes a checkpoint after each epoch.

    Args:
        model: Name or path of an existing spaCy model to start from. When
            ``None`` a blank English pipeline is created.
        new_model_name: Value stored in ``nlp.meta['name']`` before saving.
        output_dir: Base name of the checkpoint directories; epoch ``k`` is
            saved to ``drive/My Drive/{output_dir}_epoch_{k}``.
        n_iter: Number of passes over ``train``.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # Add entity recognizer to model if it's not in the pipeline;
    # otherwise fetch the existing one so labels can be added to it.
    ner_is_new = 'ner' not in nlp.pipe_names
    if ner_is_new:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe('ner')

    for label in LABELS:
        ner.add_label(label)   # add new entity label to entity recognizer

    random.seed(7)
    # Disable every other pipe so that only NER is trained.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        if ner_is_new:
            # A freshly created recognizer has no weights yet.
            optimizer = nlp.begin_training()
        else:
            # begin_training() would re-initialize the loaded recognizer and
            # discard its learned weights; resume_training() only creates the
            # optimizer and keeps the existing model.
            optimizer = nlp.resume_training()

        for itn in range(n_iter):
            print(f"Epoch {itn+1}/{n_iter}.\n------------")
            random.shuffle(train)
            losses = {}
            # Batch size compounds from 100 towards 500 within each epoch.
            batches = minibatch(train, size=compounding(100.0, 500.0, 1.2))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,
                           annotations,
                           sgd=optimizer,
                           drop=0.35,
                           losses=losses)
            print(f"loss -> {losses}")

            # Save a checkpoint for this epoch. Parent directories are created
            # as needed and an already existing directory is reused.
            checkpoint_dir = Path(f"drive/My Drive/{output_dir}_epoch_{itn+1}")
            checkpoint_dir.mkdir(parents=True, exist_ok=True)
            nlp.meta['name'] = new_model_name  # rename model
            nlp.to_disk(checkpoint_dir)
            print("Saved model to", checkpoint_dir)

main()

Created blank 'en' model
Epoch 1/50.
------------
loss -> {'ner': 638257.0937576294}
Saved model to drive/My Drive/menuitem_ner_model_epoch_1
Epoch 2/50.
------------
loss -> {'ner': 295657.8459300995}
Saved model to drive/My Drive/menuitem_ner_model_epoch_2
Epoch 3/50.
------------
loss -> {'ner': 233649.82171583176}
Saved model to drive/My Drive/menuitem_ner_model_epoch_3
Epoch 4/50.
------------
loss -> {'ner': 208049.1404337883}
Saved model to drive/My Drive/menuitem_ner_model_epoch_4
Epoch 5/50.
------------
loss -> {'ner': 195648.90961933136}
Saved model to drive/My Drive/menuitem_ner_model_epoch_5
Epoch 6/50.
------------
loss -> {'ner': 179508.0811343193}
Saved model to drive/My Drive/menuitem_ner_model_epoch_6
Epoch 7/50.
------------
loss -> {'ner': 170754.4791829586}
Saved model to drive/My Drive/menuitem_ner_model_epoch_7
Epoch 8/50.
------------
loss -> {'ner': 164743.40112113953}
Saved model to drive/My Drive/menuitem_ner_model_epoch_8
Epoch 9/50.
------------
loss -> {'n

In [37]:
# Score the saved checkpoints of epochs 35..50 on the validation and test sets.
for epoch in range(35, 51):
    output_dir_ = f"drive/My Drive/menuitem_ner_model_epoch_{epoch}"
    print(f"loading from {output_dir_}")
    nlp = spacy.load(output_dir_)
    ner = nlp.get_pipe("ner")

    # Same report for both held-out sets: heading, metrics, separator line.
    for heading, dataset in (("\nValidation metrics:", val),
                             ("\nTest metrics:", test)):
        print(heading)
        get_scores(nlp, ner, dataset)
        print("=========================")

loading from drive/My Drive/menuitem_ner_model_epoch_35

Validation metrics:
P = 95.20
R = 94.68
F = 94.94

Test metrics:
P = 95.12
R = 94.54
F = 94.83
loading from drive/My Drive/menuitem_ner_model_epoch_36

Validation metrics:
P = 94.89
R = 94.96
F = 94.92

Test metrics:
P = 94.79
R = 94.88
F = 94.84
loading from drive/My Drive/menuitem_ner_model_epoch_37

Validation metrics:
P = 95.19
R = 95.01
F = 95.10

Test metrics:
P = 95.14
R = 94.91
F = 95.03
loading from drive/My Drive/menuitem_ner_model_epoch_38

Validation metrics:
P = 94.84
R = 94.73
F = 94.78

Test metrics:
P = 94.77
R = 94.66
F = 94.72
loading from drive/My Drive/menuitem_ner_model_epoch_39

Validation metrics:
P = 94.85
R = 94.50
F = 94.67

Test metrics:
P = 94.76
R = 94.38
F = 94.57
loading from drive/My Drive/menuitem_ner_model_epoch_40

Validation metrics:
P = 95.17
R = 95.00
F = 95.09

Test metrics:
P = 95.08
R = 94.89
F = 94.99
loading from drive/My Drive/menuitem_ner_model_epoch_41

Validation metrics:
P = 94.96
R

In [5]:
# Score the epoch-50 checkpoint on a separately annotated file.
# NOTE(review): this rebinds `val`, replacing whatever validation data an
# earlier cell stored under that name.
val = srsly.read_json("annotated_data_ci_w_no_cuisine_menus.json")

output_dir_ = "menuitem_ner_model_epoch_50"
print("loading from " + output_dir_)
nlp = spacy.load(output_dir_)
ner = nlp.get_pipe("ner")

print("\nValidation metrics:")
get_scores(nlp, ner, val)
print("=========================")

loading from menuitem_ner_model_epoch_50

Validation metrics:
P = 87.68
R = 86.59
F = 87.13


In [4]:
def preprocess_reviews(review: str) -> str:
    """Normalize a raw review string before it is passed to the NER model.

    Steps, in order:
      1. Transliterate with ``unidecode`` and lowercase.
      2. Replace "&" with "and", "-" with a space and " w/" with " with ".
      3. Remove the characters ``*``, ``"``, ``$`` and ``#``.
      4. Collapse runs of one repeated non-word character (e.g. ".." or "!!")
         to a single occurrence.
      5. Collapse all whitespace to single spaces and strip the ends.

    Args:
        review: Raw review text.

    Returns:
        The normalized review.
    """
    review = (
        unidecode.unidecode(review + " ")
        .lower()
        .replace("&", "and")
        .replace("-", " ")
        .replace(" w/", " with ")
    )

    # Raw-string character class: the previous non-raw pattern relied on
    # invalid escape sequences (\*, \$), which Python warns about.
    review = re.sub(r'[*"$#]', '', review)
    # Drop a non-word character whenever the next character is identical.
    review = re.sub(r'(\W)(?=\1)', '', review)

    return ' '.join(review.split())

In [39]:
# test the saved model
# nlp = spacy.load('drive/My Drive/menuitem_ner_model_epoch_50')



review = "First time here for brunch .. located inside the Valley Ho hotel in Downtown Scottsdale area. This hotel has a retro cool vibe. We were seated by the bar in big lounge chairs.. perfect for relaxing with a drink but a bit awkward for brunch .. the service was great, the people watching, the relaxed vibe and the mimosas were exactly what we were looking for. I had the breakfast burrito with chorizo, green chili pork, potatoes wrapped in a flour tortilla and covered in enchilada sauce.. it was huge and delicious! We also split an order of the mini donuts.. they arrive warm with chocolate, caramel, raspberry and marshmallow for dipping .. We will definitely be back for brunch again soon .. a staycation at the Valley Ho is a must after seeing the pool area!!"
# Normalize the raw text with the preprocessing helper before tagging it.
review = preprocess_reviews(review)

# NOTE(review): `nlp` is not loaded in this cell (the spacy.load line above is
# commented out), so this uses whichever model an earlier cell left bound to
# `nlp` -- confirm which checkpoint that is.
doc = nlp(review)
# Print every predicted entity as "<label>  ->  <text>".
for ent in doc.ents:
    print(ent.label_, " -> ", ent.text)

menuitem  ->  mimosas
menuitem  ->  breakfast burrito
menuitem  ->  chorizo
menuitem  ->  green chili pork
menuitem  ->  potatoes
menuitem  ->  flour tortilla
menuitem  ->  enchilada
menuitem  ->  mini
menuitem  ->  donuts
menuitem  ->  chocolate
menuitem  ->  caramel
menuitem  ->  raspberry


In [27]:
# validation
from spacy.gold import GoldParse
from spacy.scorer import Scorer

def evaluate(ner_model, examples, dev_sents=None):
    """Score a loaded pipeline against gold-annotated examples.

    This definition rebinds the name ``evaluate``, which ``get_scores`` calls
    as ``evaluate(nlp, ner, examples)``. The optional third parameter keeps
    that three-argument call working after this cell has run; without it
    ``get_scores`` would raise ``TypeError``.

    Args:
        ner_model: Loaded pipeline; called on each text to get predictions and
            used through ``make_doc`` to build the gold reference.
        examples: Iterable of ``(text, annotations)`` pairs where
            ``annotations["entities"]`` holds the gold entities. In the legacy
            three-argument form this position receives the NER pipe, which is
            ignored.
        dev_sents: Examples when called in the legacy
            ``evaluate(nlp, ner, dev_sents)`` form; otherwise leave as ``None``.

    Returns:
        The ``scores`` of the ``Scorer`` once every example has been scored.
    """
    if dev_sents is not None:
        # Legacy call shape: the whole pipeline is applied below, so the pipe
        # passed in second position is not needed.
        examples = dev_sents

    scorer = Scorer()
    for input_, annot in examples:
        doc_gold_text = ner_model.make_doc(input_)
        gold = GoldParse(doc_gold_text, entities=annot["entities"])
        pred_value = ner_model(input_)
        scorer.score(pred_value, gold)
    return scorer.scores

# example run

# NOTE(review): this scores the model on all of TRAIN_DATA, the corpus the
# training split was drawn from, so these figures are presumably not held-out
# validation metrics despite the "# validation" header of this cell -- confirm.
examples = TRAIN_DATA

# NOTE(review): 'output_epoch_10' does not follow the checkpoint naming used by
# main() ("drive/My Drive/<output_dir>_epoch_<k>") -- confirm where this model
# was produced.
ner_model = spacy.load('output_epoch_10')
results = evaluate(ner_model, examples)
print(results)

{'uas': 0.0, 'las': 0.0, 'las_per_type': {'': {'p': 0.0, 'r': 0.0, 'f': 0.0}}, 'ents_p': 94.72118865171166, 'ents_r': 94.26966235621343, 'ents_f': 94.49488612336701, 'ents_per_type': {'menuitem': {'p': 94.72118865171166, 'r': 94.26966235621343, 'f': 94.49488612336701}}, 'tags_acc': 0.0, 'token_acc': 100.0, 'textcat_score': 0.0, 'textcats_per_cat': {}}
