# Custom Food Entity Recognition

### Importing Dependencies

In [4]:
import spacy

In [5]:
import pandas as pd
import numpy as np

## Data Preprocessing

In [2]:
## createing a small dataset with two attributes 
## name column consists of Food name
## reviews column consists of the corresponding food review 

In [6]:
df = pd.DataFrame(columns=["name","reviews"])

In [7]:
df["name"] = ["rice","chicken","biriyani","mashroom","sandwich","pizza","burger","daal","fish","kaachi"]
df["reviews"] = ["for healthy lunch rice is must",
                 "chicken is my favourite food",
                 "i love biriyani",
                 "mashroom is a tasty food",
                 "i prefer sandwich for breakfast",
                 "pizza is my favourite",
                 "i like burger",
                 "daal is my weakness",
                 "fish is a healthy item",
                 "kaachi is one of my fav food"]

In [8]:
df.head()

Unnamed: 0,name,reviews
0,rice,for healthy lunch rice is must
1,chicken,chicken is my favourite food
2,biriyani,i love biriyani
3,mashroom,mashroom is a tasty food
4,sandwich,i prefer sandwich for breakfast


In [9]:
df.tail()

Unnamed: 0,name,reviews
5,pizza,pizza is my favourite
6,burger,i like burger
7,daal,daal is my weakness
8,fish,fish is a healthy item
9,kaachi,kaachi is one of my fav food


In [10]:
import sys
print(sys.executable)

/usr/bin/python3


In [20]:
import wordcloud

ModuleNotFoundError: No module named 'wordcloud'

In [11]:
!/usr/bin/python3 -m pip install wordcloud

Defaulting to user installation because normal site-packages is not writeable


In [12]:
from wordcloud import WordCloud, STOPWORDS
from spacy.util import minibatch, compounding

import matplotlib.pyplot as plt
import re
import random

In [13]:
# NER
nlp0 = spacy.load('en_core_web_sm')

In [14]:
# Get All Components of this NLP Object
nlp0.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [15]:
ner0 = nlp0.get_pipe('ner')

In [16]:
# Example
ex1 = "James went to London to buy Ibuprofen last year 2019"


In [17]:
docx = nlp0(ex1)

In [18]:
type(docx)

spacy.tokens.doc.Doc

In [19]:
# Check for entities
for entity in docx.ents:
    print(entity,entity.label_)

London GPE
Ibuprofen ORG
last year 2019 DATE


In [20]:
ex2 = "In spain at madrid i order sandwich for my breakfast"

In [21]:
doc2 = nlp0(ex2)

In [22]:
for entity in doc2.ents:
    print(entity,entity.label_)

spain GPE
madrid GPE


In [23]:
def process_review(review):
    processed_token = []
    for token in review.split():
        token = ''.join(e.lower() for e in token if e.isalnum())
        processed_token.append(token)
    return ' '.join(processed_token)

In [24]:
# food Names
all_foods = df['name'].unique().tolist()

all_foods = [x.lower() for x in all_foods]

all_foods

['rice',
 'chicken',
 'biriyani',
 'mashroom',
 'sandwich',
 'pizza',
 'burger',
 'daal',
 'fish',
 'kaachi']

In [25]:
df['reviews']

0     for healthy lunch rice is must
1       chicken is my favourite food
2                    i love biriyani
3           mashroom is a tasty food
4    i prefer sandwich for breakfast
5              pizza is my favourite
6                      i like burger
7                daal is my weakness
8             fish is a healthy item
9       kaachi is one of my fav food
Name: reviews, dtype: object

In [26]:
for i,name in df.iterrows():
    print(name)

name                                 rice
reviews    for healthy lunch rice is must
Name: 0, dtype: object
name                            chicken
reviews    chicken is my favourite food
Name: 1, dtype: object
name              biriyani
reviews    i love biriyani
Name: 2, dtype: object
name                       mashroom
reviews    mashroom is a tasty food
Name: 3, dtype: object
name                              sandwich
reviews    i prefer sandwich for breakfast
Name: 4, dtype: object
name                       pizza
reviews    pizza is my favourite
Name: 5, dtype: object
name              burger
reviews    i like burger
Name: 6, dtype: object
name                      daal
reviews    daal is my weakness
Name: 7, dtype: object
name                         fish
reviews    fish is a healthy item
Name: 8, dtype: object
name                             kaachi
reviews    kaachi is one of my fav food
Name: 9, dtype: object


In [27]:
##converting the training data into the spacy format data for training

In [28]:
count = 0
TRAIN_DATA = []
for _, item in df.iterrows():
    ent_dict = {}
    if count < 10:
        review = process_review(item['reviews'])
        #Locate foods and their positions once and add to the visited items.
        visited_items = []
        entities = []
        for token in review.split():
            if token in all_foods:
                for i in re.finditer(token, review):
                    if token not in visited_items:
                        entity = (i.span()[0], i.span()[1], 'FOOD')
                        visited_items.append(token)
                        entities.append(entity)
        if len(entities) > 0:
            ent_dict['entities'] = entities
            train_item = (review, ent_dict)
            TRAIN_DATA.append(train_item)
            count+=1

In [29]:
TRAIN_DATA

[('for healthy lunch rice is must', {'entities': [(18, 22, 'FOOD')]}),
 ('chicken is my favourite food', {'entities': [(0, 7, 'FOOD')]}),
 ('i love biriyani', {'entities': [(7, 15, 'FOOD')]}),
 ('mashroom is a tasty food', {'entities': [(0, 8, 'FOOD')]}),
 ('i prefer sandwich for breakfast', {'entities': [(9, 17, 'FOOD')]}),
 ('pizza is my favourite', {'entities': [(0, 5, 'FOOD')]}),
 ('i like burger', {'entities': [(7, 13, 'FOOD')]}),
 ('daal is my weakness', {'entities': [(0, 4, 'FOOD')]}),
 ('fish is a healthy item', {'entities': [(0, 4, 'FOOD')]}),
 ('kaachi is one of my fav food', {'entities': [(0, 6, 'FOOD')]})]

In [30]:
#creating a empty nlp model with English language

In [31]:
nlp = spacy.blank("en")

nlp.pipe_names

[]

In [32]:
from spacy.training import Example

In [33]:
n_iter = 30

def train_ner(training_data):
    
    """Steps
    Create a Blank NLP  model object
    Create and add NER to the NLP model
    Add Labels from your training data
    Train  
    """
    TRAIN_DATA = training_data
    
    # create blank Language class
    nlp = spacy.blank("en")  
    print("Created blank 'en' model")
    
    ner = nlp.create_pipe("ner")
    nlp.add_pipe('ner')
    ner.add_label("label")
    
#     if "ner" not in nlp.pipe_names:
#         ner = nlp.add_pipe("ner")
#         #nlp.add_pipe(ner, last=True)
#     # otherwise, get it so we can add labels
#     else:
#         ner = nlp.get_pipe("ner")
        
        
    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
            
    nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        # batch up the examples using spaCy's minibatch
        #for batch in spacy.util.minibatch(TRAINING_DATA, size=2):
        for batch in spacy.util.minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)):
            for text, annotations in batch:
                # create Example
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                # Update the model
                nlp.update([example], losses=losses, drop=0.5)

#         batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
#         for batch in batches:
#             texts, annotations = zip(*batch)
#             example = []
#             # Update the model with iterating each text
#             for i in range(len(texts)):
#                 doc = nlp.make_doc(texts[i])
#                 example.append(Example.from_dict(doc, annotations[i]))
        
        print("Losses", losses)
    return nlp

In [34]:
# Let training the model

nlp2 = train_ner(TRAIN_DATA)

Created blank 'en' model
Losses {'ner': 32.12554767727852}
Losses {'ner': 20.396196506917477}
Losses {'ner': 15.503781765699387}
Losses {'ner': 12.066822669934481}
Losses {'ner': 12.47229163285374}
Losses {'ner': 7.940938097555772}
Losses {'ner': 6.787303852563127}
Losses {'ner': 5.859258757431729}
Losses {'ner': 2.343737853891999}
Losses {'ner': 2.320204331446069}
Losses {'ner': 0.4368409196031431}
Losses {'ner': 1.5692322826774254}
Losses {'ner': 4.058147088082767}
Losses {'ner': 0.0016758037000260265}
Losses {'ner': 0.00015153878378842213}
Losses {'ner': 1.8249410618941284}
Losses {'ner': 0.0028785403970550873}
Losses {'ner': 0.017999622452597244}
Losses {'ner': 8.599518349506943e-06}
Losses {'ner': 0.0029200013593541838}
Losses {'ner': 0.01987721362405572}
Losses {'ner': 8.641709652340436e-05}
Losses {'ner': 0.0023726587143433663}
Losses {'ner': 2.100480817475467e-08}
Losses {'ner': 6.744607597911261e-05}
Losses {'ner': 4.610307161968854e-06}
Losses {'ner': 0.002055817556175286}
Lo

In [35]:
ex2 = "i was in vacation at spain.i order sandwich for my breakfast and love to had rice with daal in lunch"

docx2 = nlp2(ex2)

In [36]:
for entity in docx2.ents:
    print(entity,entity.label_)

vacation FOOD
sandwich FOOD
daal FOOD


In [37]:
for text,_ in TRAIN_DATA[:5]:
    doc = nlp2(text)
    result = [(ent,ent.label_) for ent in doc.ents]
    print(result)

[(rice, 'FOOD')]
[(biriyani, 'FOOD')]
[(daal, 'FOOD')]
[(mashroom, 'FOOD')]
[(chicken, 'FOOD')]


In [38]:
## For another custom dataset with annotated format 

In [39]:
import spacy
nlp = spacy.load("en_core_web_sm") 

# Getting the ner component
f_ner = nlp.get_pipe('ner')

In [40]:
# New label to add
LABEL = "FOOD"

In [41]:
# Training examples in the required format

TRAIN_FOOD_DATA =[ ("Pizza is a common fast food.", {"entities": [(0, 5, "FOOD")]}),
              ("Pasta is an italian recipe", {"entities": [(0, 5, "FOOD")]}),
              ("China's noodles are very famous", {"entities": [(8,14, "FOOD")]}),
              ("Shrimps are famous in China too", {"entities": [(0,7, "FOOD")]}),
              ("Lasagna is another classic of Italy", {"entities": [(0,7, "FOOD")]}),
              ("Sushi is extemely famous and expensive Japanese dish", {"entities": [(0,5, "FOOD")]}),
              ("Unagi is a famous seafood of Japan", {"entities": [(0,5, "FOOD")]}),
              ("Tempura , Soba are other famous dishes of Japan", {"entities": [(0,7, "FOOD")]}),
              ("Udon is a healthy type of noodles", {"entities": [(0,4, "ORG")]}),
              ("Chocolate soufflé is extremely famous french cuisine", {"entities": [(0,17, "FOOD")]}),
              ("Flamiche is french pastry", {"entities": [(0,8, "FOOD")]}),
              ("Burgers are the most commonly consumed fastfood", {"entities": [(0,7, "FOOD")]}),
              ("Burgers are the most commonly consumed fastfood", {"entities": [(0,7, "FOOD")]}),
              ("Frenchfries are considered too oily", {"entities": [(0,11, "FOOD")]})
           ]

In [43]:
# Add the new label to ner
f_ner.add_label(LABEL)

1

In [42]:
# Resume training

optimizer = nlp.resume_training()
move_names = list(f_ner.move_names)

In [44]:
# List of pipes  want to train

pipe_exceptions = ["ner", "trf_wordpiecer"]

In [45]:
# List of pipes which should remain unaffected in training

other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

In [46]:
# Importing requirements

from spacy.util import minibatch, compounding
from spacy.training import Example
import random

In [47]:
# Begin training by disabling other pipeline components
with nlp.disable_pipes(*other_pipes) :
    for itn in range(20):
        random.shuffle(TRAIN_FOOD_DATA)
        losses = {}
        # batch up the examples using spaCy's minibatch
        #for batch in spacy.util.minibatch(TRAINING_DATA, size=2):
        for batch in spacy.util.minibatch(TRAIN_DATA, size=compounding(1.0, 4.0, 1.001)):
            for text, annotations in batch:
                # create Example
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                # Update the model
                nlp.update([example],sgd = optimizer,losses=losses, drop=0.35)
        print("losses :",losses)

losses : {'ner': 18.603962196651075}
losses : {'ner': 16.27411495364244}
losses : {'ner': 15.04985213852342}
losses : {'ner': 10.774062501142907}
losses : {'ner': 5.8182633712076495}
losses : {'ner': 1.478158594066599}
losses : {'ner': 0.9667060405807044}
losses : {'ner': 0.08595987373109353}
losses : {'ner': 0.02540045509293462}
losses : {'ner': 0.007016814447979113}
losses : {'ner': 0.07796324311125406}
losses : {'ner': 0.5097430888220879}
losses : {'ner': 0.005890884539387035}
losses : {'ner': 8.208592893052143e-06}
losses : {'ner': 0.0008825195149548191}
losses : {'ner': 3.133192460109532e-06}
losses : {'ner': 0.00012609776739607237}
losses : {'ner': 0.0029466021012051874}
losses : {'ner': 0.000274822629082245}
losses : {'ner': 1.5526551290265886e-05}


In [48]:
# Testing the NER

test_text = "I ate Sushi yesterday. Maggi is a common fast food "

doc = nlp(test_text)
print("Entities in '%s'" % test_text)

for ent in doc.ents:
    print(ent,ent.label_)

Entities in 'I ate Sushi yesterday. Maggi is a common fast food '
Sushi GPE
Maggi FOOD


In [49]:
# Output directory

from pathlib import Path
output_dir=Path('/home/robin/assignment Folder/saved_model/')

In [50]:
# Saving the model to the output directory

if not output_dir.exists():
    output_dir.mkdir()
    
nlp.meta['name'] = 'my_ner'  # rename model
nlp.to_disk(output_dir)
print("Saved model to", output_dir)

Saved model to /home/robin/assignment Folder/saved_model


In [52]:
# Loading the model from the directory

print("Loading from", output_dir)

nlp2 = spacy.load(output_dir)

assert nlp2.get_pipe("ner").move_names == move_names

doc2 = nlp2(' Dosa is an extremely famous south Indian dish')

for ent in doc2.ents:
    print(ent.label_, ent.text)

Loading from /home/robin/assignment Folder/saved_model


AssertionError: 

In [53]:
nlp2 = spacy.load(output_dir)

#assert nlp2.get_pipe("ner").move_names == move_names

doc2 = nlp2(' Dosa is an extremely famous south Indian dish')

for ent in doc2.ents:
    print(ent.label_, ent.text)

FOOD Dosa
NORP Indian
FOOD dish


# Let's try with another Custom Dataset

In [159]:
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm

# Data Preprocessing

In [160]:
# load a new spacy model
nlp = spacy.blank("en")

# create a DocBin object
db = DocBin()

**In this dataset collected information from differents sites and stored them in a text file. Then annotated the text file with NER Text Annontator and exported the input data as json format.This custom food dataset contains four classes as Food, Place, Country and Person**

In [162]:
import json
f = open('food.json')

TRAIN_DATA = json.load(f)

In [163]:
TRAIN_DATA

{'classes': ['FOOD', 'PLACE', 'COUNTRY', 'PERSON'],
 'annotations': [['Sushi is a traditional Japanese food made by combining vinegar rice and seafood. There is a type of fermented sushi, known as nare-zushi, but the most typical types of sushi are nigiri-zushi and temaki-zushi.You can find sushi all around Japan, but the sushi from restaurants in high class areas like Ginza or close to fishing ports is especially delicious.',
   {'entities': [[0, 5, 'FOOD'],
     [23, 31, 'COUNTRY'],
     [63, 67, 'FOOD'],
     [72, 80, 'FOOD'],
     [110, 115, 'FOOD'],
     [126, 136, 'FOOD'],
     [168, 173, 'FOOD'],
     [178, 190, 'FOOD'],
     [195, 211, 'FOOD'],
     [238, 243, 'COUNTRY'],
     [264, 275, 'PLACE'],
     [301, 306, 'PLACE'],
     [319, 332, 'PLACE']]}],
  ['Tempura is a Japanese dish made from seafood, fresh vegetables and other ingredients dipped in a flour and egg batter and fried in oil.',
   {'entities': [[0, 7, 'FOOD'],
     [13, 21, 'COUNTRY'],
     [37, 44, 'FOOD'],
     [

In [54]:
# Converting the JSON data into Spacy NER Format
# saved the annotated train data as spacy format

In [164]:
for text, annot in tqdm(TRAIN_DATA['annotations']): 
    doc = nlp.make_doc(text) 
    ents = []
    for start, end, label in annot["entities"]:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents 
    db.add(doc)

# save the docbin object
db.to_disk("./custom_food_training_data.spacy")

100%|█████████████████████████████████████████████| 9/9 [00:00<00:00, 38.28it/s]


In [55]:
# configure the model with English language
# Using Ner from Spacy pipe line
# using efficiency optimize and train the model with cpu

In [166]:
! /usr/bin/python3 -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency

2022-06-19 15:07:41.463372: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-19 15:07:41.463447: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [56]:
# train the configured model with training dataset and for validation dataset use the same training dataset

In [167]:
! /usr/bin/python3 -m spacy train config.cfg --output ./ --paths.train ./custom_food_training_data.spacy --paths.dev ./custom_food_training_data.spacy

2022-06-19 15:12:29.118148: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-19 15:12:29.118188: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[2022-06-19 15:12:32,047] [INFO] Set up nlp object from config
[2022-06-19 15:12:32,062] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-06-19 15:12:32,069] [INFO] Created vocabulary
[2022-06-19 15:12:32,070] [INFO] Finished initializing nlp object
[2022-06-19 15:12:32,357] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  --

# Model Evaluation

In [57]:
# Saved the model and load the best model for model evaluation

In [168]:
food_ner = spacy.load("/home/robin/assignment Folder/model-best")

In [58]:
# testing the model with an unseen text using the best saved model

In [170]:
test_doc = food_ner('''The Place I love tempura. Having been to Tokyo a few times by now, 
my favourite go to restaurant for tempura has always been Tempura Tsunahachi in Shinjuku as it is very value for money.
On my most recent trip, I decided that it is time to try a more upscale tempura restaurant and Tempura Kondo it shall be. 
Located in Ginza, the 2-Michelin Star restaurant is very popular among locals.We went for both the Sumire (6,500yen) and Tsubami (8,500yen) lunch menu. 
For tempura dishes, the former has 2 prawns, 4 vegetables and 3 fishes with rice, miso soup and fruits; the latter has 2 prawns, 5 vegetables and 3 fishes.
Tsubaki also comes with Kakiage, a mixture bits of scallop and prawns in batter.''')

In [171]:
spacy.displacy.render(test_doc, style="ent", jupyter=True)

In [174]:
test_doc2 = food_ner('''There are many things in general you can try, from salads to hot dishes, from sweets to soups. But if you want something traditional, cheap and fast – it`s definitely souvlaki. 
The iconic dish of Greece is a fast food that you usually can find anywhere on the streets, in the bars, in small take-out shops and cafes. You will see souvlaki over the whole country. 
And if you are lucky enough to be invited to a house party of Greek people – ask for mousaka. It is the best when it`s homemade.

It’s hard to speak about Italian food, it’s better to enjoy it. The traditional colors of Italian cuisine – red,
green and yellow always create a great mixture of flavor and taste. Usually all the dishes are based on tomato, bread and olive oil.
The secret of the cuisine is not in a sophisticated preparation process, it is more in the freshness of the products.

''')

In [175]:
spacy.displacy.render(test_doc2, style="ent", jupyter=True)