# Task
Given a plain-text recipe essay that contains no bulleted ingredient lines or numbered recipe steps, create a model that extracts the ingredient names along with their quantities.

# Import Libraries



In [2]:
import json
import spacy
import random
from spacy import displacy
from pathlib import Path
from tqdm import tqdm

# Load Data

In [15]:
def load_data(path):
    """Read a JSON file and return its parsed content.

    Args:
        path: location of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file's content.
    """
    # Decode explicitly as UTF-8 so non-ASCII text in the annotations
    # is read the same way regardless of the platform's default encoding.
    with open(path, encoding="utf-8") as f:
        return json.load(f)

def clean(data):
    """Keep only the annotated examples that carry at least one entity.

    Args:
        data: a mapping whose 'annotations' value is a sequence of items;
            element 1 of each item is a mapping with an 'entities' entry.

    Returns:
        A list of the items from data['annotations'] whose 'entities'
        entry is non-empty, in their original order.
    """
    return [
        example
        for example in data['annotations']
        if len(example[1]['entities']) != 0
    ]

# Training

In [16]:
def ner(train_data, model, n_iter, output_dir):
    """Train the 'ner' component of a spaCy pipeline and optionally save it.

    Args:
        train_data: iterable of (text, annotations) pairs. Each annotations
            mapping may hold an 'entities' list whose items carry the entity
            label at index 2 (spaCy's (start, end, label) span format).
        model: name or path handed to ``spacy.load``; None starts from a
            blank English pipeline.
        n_iter: number of passes over the training examples.
        output_dir: directory the trained pipeline is written to; None
            skips saving.

    Returns:
        The trained spaCy pipeline object.
    """
    # Work on a private list so shuffling never reorders the caller's data
    # and a one-shot iterable can be traversed more than once.
    examples = list(train_data)

    if model is not None:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')
        print("Created blank 'en' model")

    # Remember whether the NER component had to be created: a fresh component
    # needs weight initialisation, an existing one must keep its weights.
    ner_is_new = 'ner' not in nlp.pipe_names
    if ner_is_new:
        ner_pipe = nlp.create_pipe('ner')
        nlp.add_pipe(ner_pipe, last=True)
    else:
        ner_pipe = nlp.get_pipe('ner')

    # Register every label that occurs in the annotations.
    for _, annotations in examples:
        for ent in annotations.get('entities') or ():
            ner_pipe.add_label(ent[2])

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        if ner_is_new:
            optimizer = nlp.begin_training()
        else:
            # begin_training() would re-initialise the pretrained NER weights.
            optimizer = nlp.resume_training()
        for itn in range(n_iter):
            random.shuffle(examples)
            losses = {}
            for text, annotations in tqdm(examples):
                nlp.update(
                    [text],
                    [annotations],
                    drop=0.2,
                    sgd=optimizer,
                    losses=losses)
            print(losses)

    if output_dir is not None:
        output_dir = Path(output_dir)
        # Create missing parent directories too; tolerate an existing directory.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

    return nlp


In [17]:
# Run configuration: annotation file, base model (None -> blank 'en'),
# number of training passes, and where the trained pipeline is saved.
path = 'training_data.json'
model = None
n_iter = 100
output_dir = Path("/model")

data = load_data(path)
print(".....loading dataset")
train_data = clean(data)
print(".....training")
ner(train_data, model=model, n_iter=n_iter, output_dir=output_dir)

.....loading dataset
.....training


  0%|          | 0/40 [00:00<?, ?it/s]

Created blank 'en' model


100%|██████████| 40/40 [00:02<00:00, 13.67it/s]
  5%|▌         | 2/40 [00:00<00:02, 14.40it/s]

{'ner': 248.4913252251384}


100%|██████████| 40/40 [00:03<00:00, 12.90it/s]
  5%|▌         | 2/40 [00:00<00:02, 13.80it/s]

{'ner': 348.1156234299188}


100%|██████████| 40/40 [00:02<00:00, 13.39it/s]
  5%|▌         | 2/40 [00:00<00:02, 12.68it/s]

{'ner': 532.3225144368291}


100%|██████████| 40/40 [00:02<00:00, 14.20it/s]
  5%|▌         | 2/40 [00:00<00:02, 15.62it/s]

{'ner': 391.1139349626011}


100%|██████████| 40/40 [00:02<00:00, 14.23it/s]
  5%|▌         | 2/40 [00:00<00:02, 15.20it/s]

{'ner': 238.12006087375016}


100%|██████████| 40/40 [00:02<00:00, 14.07it/s]
  5%|▌         | 2/40 [00:00<00:02, 14.89it/s]

{'ner': 135.34738575914733}


100%|██████████| 40/40 [00:02<00:00, 14.29it/s]
  5%|▌         | 2/40 [00:00<00:02, 15.23it/s]

{'ner': 91.276739899145}


100%|██████████| 40/40 [00:02<00:00, 14.28it/s]
  5%|▌         | 2/40 [00:00<00:02, 13.05it/s]

{'ner': 83.3760928162656}


100%|██████████| 40/40 [00:02<00:00, 14.33it/s]
  5%|▌         | 2/40 [00:00<00:03, 10.78it/s]

{'ner': 49.27178377128427}


100%|██████████| 40/40 [00:02<00:00, 13.81it/s]
  5%|▌         | 2/40 [00:00<00:02, 13.39it/s]

{'ner': 47.12760450669182}


100%|██████████| 40/40 [00:02<00:00, 13.99it/s]
  5%|▌         | 2/40 [00:00<00:02, 13.95it/s]

{'ner': 33.65838257640198}


100%|██████████| 40/40 [00:03<00:00, 13.04it/s]
  5%|▌         | 2/40 [00:00<00:02, 14.15it/s]

{'ner': 39.57468420579015}


100%|██████████| 40/40 [00:02<00:00, 13.78it/s]
  5%|▌         | 2/40 [00:00<00:03, 12.62it/s]

{'ner': 32.18693456368458}


100%|██████████| 40/40 [00:02<00:00, 13.60it/s]
  5%|▌         | 2/40 [00:00<00:02, 14.33it/s]

{'ner': 18.50818784172261}


100%|██████████| 40/40 [00:03<00:00, 13.11it/s]
  5%|▌         | 2/40 [00:00<00:02, 13.39it/s]

{'ner': 15.659425695187911}


100%|██████████| 40/40 [00:02<00:00, 13.88it/s]
  5%|▌         | 2/40 [00:00<00:03, 12.57it/s]

{'ner': 9.814667849990165}


100%|██████████| 40/40 [00:02<00:00, 13.73it/s]
  5%|▌         | 2/40 [00:00<00:02, 13.69it/s]

{'ner': 8.303242869022151}


100%|██████████| 40/40 [00:02<00:00, 13.44it/s]
  5%|▌         | 2/40 [00:00<00:02, 14.98it/s]

{'ner': 8.474027240698595}


100%|██████████| 40/40 [00:03<00:00, 12.61it/s]
  5%|▌         | 2/40 [00:00<00:03, 11.33it/s]

{'ner': 4.490093213952122}


100%|██████████| 40/40 [00:03<00:00, 12.42it/s]
  5%|▌         | 2/40 [00:00<00:02, 12.91it/s]

{'ner': 9.376166845335135}


100%|██████████| 40/40 [00:03<00:00, 12.36it/s]
  5%|▌         | 2/40 [00:00<00:02, 13.88it/s]

{'ner': 4.452892942250787}


100%|██████████| 40/40 [00:03<00:00, 12.47it/s]
  5%|▌         | 2/40 [00:00<00:03, 12.62it/s]

{'ner': 2.380274379863873}


100%|██████████| 40/40 [00:03<00:00, 12.48it/s]
  5%|▌         | 2/40 [00:00<00:02, 13.38it/s]

{'ner': 13.794892313793573}


100%|██████████| 40/40 [00:03<00:00, 12.30it/s]
  2%|▎         | 1/40 [00:00<00:04,  9.25it/s]

{'ner': 0.020287739672074454}


100%|██████████| 40/40 [00:03<00:00, 12.45it/s]
  5%|▌         | 2/40 [00:00<00:02, 14.59it/s]

{'ner': 0.1267662503719665}


100%|██████████| 40/40 [00:03<00:00, 12.76it/s]
  5%|▌         | 2/40 [00:00<00:02, 13.80it/s]

{'ner': 0.06528633954872715}


100%|██████████| 40/40 [00:03<00:00, 12.37it/s]
  5%|▌         | 2/40 [00:00<00:02, 12.95it/s]

{'ner': 5.162481256834163}


100%|██████████| 40/40 [00:03<00:00, 13.25it/s]
  5%|▌         | 2/40 [00:00<00:03, 12.11it/s]

{'ner': 13.95970647294533}


100%|██████████| 40/40 [00:02<00:00, 13.65it/s]
  5%|▌         | 2/40 [00:00<00:02, 14.22it/s]

{'ner': 14.451414120026675}


100%|██████████| 40/40 [00:02<00:00, 14.71it/s]
  5%|▌         | 2/40 [00:00<00:02, 12.92it/s]

{'ner': 10.794809183820885}


100%|██████████| 40/40 [00:02<00:00, 14.40it/s]
  5%|▌         | 2/40 [00:00<00:02, 13.82it/s]

{'ner': 35.704136465865155}


100%|██████████| 40/40 [00:02<00:00, 14.38it/s]
  5%|▌         | 2/40 [00:00<00:02, 15.94it/s]

{'ner': 26.97951597397668}


100%|██████████| 40/40 [00:02<00:00, 14.11it/s]
  5%|▌         | 2/40 [00:00<00:02, 16.16it/s]

{'ner': 26.45259444799342}


100%|██████████| 40/40 [00:02<00:00, 14.27it/s]
  5%|▌         | 2/40 [00:00<00:02, 14.77it/s]

{'ner': 24.716636176121625}


100%|██████████| 40/40 [00:02<00:00, 14.02it/s]
  5%|▌         | 2/40 [00:00<00:02, 16.71it/s]

{'ner': 18.757650949479697}


100%|██████████| 40/40 [00:02<00:00, 13.88it/s]
  5%|▌         | 2/40 [00:00<00:02, 13.31it/s]

{'ner': 17.576360559416308}


100%|██████████| 40/40 [00:02<00:00, 14.19it/s]
  5%|▌         | 2/40 [00:00<00:03, 12.46it/s]

{'ner': 7.054584888184306}


100%|██████████| 40/40 [00:02<00:00, 14.24it/s]
  2%|▎         | 1/40 [00:00<00:04,  8.97it/s]

{'ner': 19.094796528482647}


100%|██████████| 40/40 [00:02<00:00, 13.80it/s]
  5%|▌         | 2/40 [00:00<00:02, 13.98it/s]

{'ner': 14.884098486540363}


100%|██████████| 40/40 [00:03<00:00, 13.11it/s]
  5%|▌         | 2/40 [00:00<00:02, 13.45it/s]

{'ner': 9.830594331180965}


100%|██████████| 40/40 [00:03<00:00, 11.76it/s]
  5%|▌         | 2/40 [00:00<00:02, 12.83it/s]

{'ner': 12.831098948755665}


100%|██████████| 40/40 [00:03<00:00, 11.33it/s]
  2%|▎         | 1/40 [00:00<00:03,  9.91it/s]

{'ner': 13.917445576489639}


100%|██████████| 40/40 [00:03<00:00, 11.20it/s]
  5%|▌         | 2/40 [00:00<00:03, 11.56it/s]

{'ner': 36.73754943665498}


100%|██████████| 40/40 [00:03<00:00, 11.42it/s]
  5%|▌         | 2/40 [00:00<00:03, 10.80it/s]

{'ner': 6.289427927334659}


100%|██████████| 40/40 [00:03<00:00, 11.21it/s]
  5%|▌         | 2/40 [00:00<00:03, 10.99it/s]

{'ner': 22.549944207769126}


100%|██████████| 40/40 [00:03<00:00, 10.09it/s]
  2%|▎         | 1/40 [00:00<00:04,  8.89it/s]

{'ner': 6.995161251032196}


100%|██████████| 40/40 [00:04<00:00,  9.89it/s]
  0%|          | 0/40 [00:00<?, ?it/s]

{'ner': 0.08731464864314924}


100%|██████████| 40/40 [00:03<00:00, 10.21it/s]
  5%|▌         | 2/40 [00:00<00:03, 10.32it/s]

{'ner': 9.263968251798968}


100%|██████████| 40/40 [00:03<00:00, 10.44it/s]
  5%|▌         | 2/40 [00:00<00:03, 10.80it/s]

{'ner': 6.0713845060700775}


100%|██████████| 40/40 [00:03<00:00, 10.76it/s]
  5%|▌         | 2/40 [00:00<00:03, 12.04it/s]

{'ner': 13.218769405594543}


100%|██████████| 40/40 [00:03<00:00, 10.69it/s]
  5%|▌         | 2/40 [00:00<00:03, 10.67it/s]

{'ner': 3.859329744469556}


100%|██████████| 40/40 [00:04<00:00,  9.91it/s]
  2%|▎         | 1/40 [00:00<00:04,  9.17it/s]

{'ner': 4.450644771603928}


100%|██████████| 40/40 [00:04<00:00,  9.74it/s]
  2%|▎         | 1/40 [00:00<00:04,  8.24it/s]

{'ner': 6.020556927503858}


100%|██████████| 40/40 [00:04<00:00,  9.85it/s]
  2%|▎         | 1/40 [00:00<00:03,  9.88it/s]

{'ner': 0.08278779235847467}


100%|██████████| 40/40 [00:04<00:00,  9.78it/s]
  2%|▎         | 1/40 [00:00<00:04,  9.21it/s]

{'ner': 7.34321606805419}


100%|██████████| 40/40 [00:04<00:00,  9.88it/s]
  2%|▎         | 1/40 [00:00<00:04,  9.60it/s]

{'ner': 7.560851057467505}


100%|██████████| 40/40 [00:04<00:00,  8.73it/s]
  2%|▎         | 1/40 [00:00<00:06,  6.25it/s]

{'ner': 7.565800091684299}


100%|██████████| 40/40 [00:04<00:00,  9.98it/s]
  5%|▌         | 2/40 [00:00<00:03, 10.23it/s]

{'ner': 23.411254159125093}


100%|██████████| 40/40 [00:04<00:00,  9.79it/s]
  2%|▎         | 1/40 [00:00<00:03,  9.86it/s]

{'ner': 9.882640176551408}


100%|██████████| 40/40 [00:04<00:00,  9.81it/s]
  5%|▌         | 2/40 [00:00<00:03, 11.11it/s]

{'ner': 1.897694636641512}


100%|██████████| 40/40 [00:03<00:00, 10.03it/s]
  2%|▎         | 1/40 [00:00<00:04,  9.62it/s]

{'ner': 6.5182612553868555}


100%|██████████| 40/40 [00:04<00:00,  9.79it/s]
  2%|▎         | 1/40 [00:00<00:03,  9.88it/s]

{'ner': 3.8499670890579445}


100%|██████████| 40/40 [00:03<00:00, 10.08it/s]
  2%|▎         | 1/40 [00:00<00:04,  8.74it/s]

{'ner': 3.9579950821973497}


100%|██████████| 40/40 [00:04<00:00,  9.86it/s]
  5%|▌         | 2/40 [00:00<00:03, 11.37it/s]

{'ner': 12.377547837000959}


100%|██████████| 40/40 [00:04<00:00,  9.92it/s]
  2%|▎         | 1/40 [00:00<00:03,  9.89it/s]

{'ner': 0.2214880894147197}


100%|██████████| 40/40 [00:04<00:00,  9.84it/s]
  0%|          | 0/40 [00:00<?, ?it/s]

{'ner': 0.00217604945587613}


100%|██████████| 40/40 [00:04<00:00,  9.93it/s]
  0%|          | 0/40 [00:00<?, ?it/s]

{'ner': 0.0037240017921570667}


100%|██████████| 40/40 [00:04<00:00,  9.96it/s]
  2%|▎         | 1/40 [00:00<00:03,  9.79it/s]

{'ner': 3.114549066168613}


100%|██████████| 40/40 [00:03<00:00, 10.08it/s]
  2%|▎         | 1/40 [00:00<00:04,  8.30it/s]

{'ner': 5.390239804055408e-09}


100%|██████████| 40/40 [00:04<00:00,  9.95it/s]
  2%|▎         | 1/40 [00:00<00:04,  9.68it/s]

{'ner': 4.72453744970065e-07}


100%|██████████| 40/40 [00:04<00:00,  9.96it/s]
  2%|▎         | 1/40 [00:00<00:03,  9.90it/s]

{'ner': 0.00012925591379631907}


100%|██████████| 40/40 [00:03<00:00, 10.25it/s]
  0%|          | 0/40 [00:00<?, ?it/s]

{'ner': 7.761133295044924e-06}


100%|██████████| 40/40 [00:03<00:00, 10.76it/s]
  2%|▎         | 1/40 [00:00<00:03,  9.94it/s]

{'ner': 5.486423510371444}


100%|██████████| 40/40 [00:03<00:00, 10.96it/s]
  2%|▎         | 1/40 [00:00<00:04,  7.99it/s]

{'ner': 1.6766750527438755e-06}


100%|██████████| 40/40 [00:03<00:00, 10.71it/s]
  5%|▌         | 2/40 [00:00<00:03, 10.61it/s]

{'ner': 0.015013274585371012}


100%|██████████| 40/40 [00:03<00:00, 10.70it/s]
  0%|          | 0/40 [00:00<?, ?it/s]

{'ner': 2.1317577471254756}


100%|██████████| 40/40 [00:03<00:00, 10.56it/s]
  0%|          | 0/40 [00:00<?, ?it/s]

{'ner': 7.768371851570531}


100%|██████████| 40/40 [00:03<00:00, 10.80it/s]
  5%|▌         | 2/40 [00:00<00:03, 10.66it/s]

{'ner': 11.543548167757445}


100%|██████████| 40/40 [00:03<00:00, 10.69it/s]
  5%|▌         | 2/40 [00:00<00:03, 11.24it/s]

{'ner': 10.356081899598799}


100%|██████████| 40/40 [00:03<00:00, 10.88it/s]
  5%|▌         | 2/40 [00:00<00:03, 10.45it/s]

{'ner': 0.6944005184065384}


100%|██████████| 40/40 [00:03<00:00, 10.69it/s]
  0%|          | 0/40 [00:00<?, ?it/s]

{'ner': 7.614213165395428}


100%|██████████| 40/40 [00:03<00:00, 10.85it/s]
  5%|▌         | 2/40 [00:00<00:03, 11.10it/s]

{'ner': 8.448490036481424}


100%|██████████| 40/40 [00:03<00:00, 10.63it/s]
  5%|▌         | 2/40 [00:00<00:03, 11.70it/s]

{'ner': 20.793379627224724}


100%|██████████| 40/40 [00:03<00:00, 10.69it/s]
  5%|▌         | 2/40 [00:00<00:03, 10.94it/s]

{'ner': 22.021879237865914}


100%|██████████| 40/40 [00:03<00:00, 10.97it/s]
  5%|▌         | 2/40 [00:00<00:03, 10.44it/s]

{'ner': 23.019719162876672}


100%|██████████| 40/40 [00:03<00:00, 10.59it/s]
  5%|▌         | 2/40 [00:00<00:03, 11.03it/s]

{'ner': 16.88416795940567}


100%|██████████| 40/40 [00:03<00:00, 10.79it/s]
  5%|▌         | 2/40 [00:00<00:03, 11.42it/s]

{'ner': 41.327365737793514}


100%|██████████| 40/40 [00:03<00:00, 10.73it/s]
  5%|▌         | 2/40 [00:00<00:03, 11.34it/s]

{'ner': 33.00790729345499}


100%|██████████| 40/40 [00:03<00:00, 10.55it/s]
  2%|▎         | 1/40 [00:00<00:04,  9.56it/s]

{'ner': 92.12846766085497}


100%|██████████| 40/40 [00:03<00:00, 10.90it/s]
  2%|▎         | 1/40 [00:00<00:03,  9.99it/s]

{'ner': 21.30191020570493}


100%|██████████| 40/40 [00:03<00:00, 10.82it/s]
  5%|▌         | 2/40 [00:00<00:03, 11.46it/s]

{'ner': 12.082708604539507}


100%|██████████| 40/40 [00:03<00:00, 10.78it/s]
  2%|▎         | 1/40 [00:00<00:04,  8.92it/s]

{'ner': 23.27990536620422}


100%|██████████| 40/40 [00:03<00:00, 10.77it/s]
  0%|          | 0/40 [00:00<?, ?it/s]

{'ner': 19.87145030164338}


100%|██████████| 40/40 [00:03<00:00, 10.56it/s]
  5%|▌         | 2/40 [00:00<00:03, 11.88it/s]

{'ner': 2.27461831964428}


100%|██████████| 40/40 [00:03<00:00, 10.62it/s]
  5%|▌         | 2/40 [00:00<00:03, 11.20it/s]

{'ner': 67.32446280512369}


100%|██████████| 40/40 [00:03<00:00, 10.61it/s]
  0%|          | 0/40 [00:00<?, ?it/s]

{'ner': 2.4253143646948416}


100%|██████████| 40/40 [00:03<00:00, 10.74it/s]
  2%|▎         | 1/40 [00:00<00:03,  9.95it/s]

{'ner': 0.4367676106784585}


100%|██████████| 40/40 [00:03<00:00, 10.65it/s]
  5%|▌         | 2/40 [00:00<00:03, 11.56it/s]

{'ner': 2.480389745751377}


100%|██████████| 40/40 [00:03<00:00, 10.96it/s]
  5%|▌         | 2/40 [00:00<00:03, 10.84it/s]

{'ner': 0.8667899559295147}


100%|██████████| 40/40 [00:03<00:00, 10.80it/s]

{'ner': 1.0901703871628619}
Saved model to /model





# Inference

In [4]:
# Reload the pipeline from the directory the training step saved it to.
output_dir = Path("/model")
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)

Loading from /content/model


In [10]:
# Sample free-form recipe narration that is run through the reloaded model below.
text = """For the spice mix I have eight guajillo chelates for flavor and four chiles de arbol to bring some heat from which I'm both going to remove the stems and seeds. I'm also gonna add about two teaspoons of whole cumin seeds"""
print(text)

For the spice mix I have eight guajillo chelates for flavor and four chiles de arbol to bring some heat from which I'm both going to remove the stems and seeds. I'm also gonna add about two teaspoons of whole cumin seeds


In [8]:
# Run the reloaded pipeline on the sample text and print each predicted entity span.
doc = nlp2(text)
for ent in doc.ents:
    print(ent.text)


eight guajillo chelates
four chiles de arbol
two teaspoons of whole cumin seeds


In [9]:
# Render the entities on the doc that was already processed; calling
# nlp2(doc.text) again would only repeat the same prediction.
displacy.render(doc, style='ent', jupyter=True)