In [None]:
import pandas as pd
# Load the restaurant reviews dataset into a DataFrame.
DATA_PATH = '../input/nlp-course/restaurant.json'
df = pd.read_json(DATA_PATH)

In [None]:
# Preview the first rows to check that the file loaded as expected.
df.head()

In [None]:
# Print a summary of the DataFrame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Menu items supplied with the exercise; they seed the phrase matcher used to
# find food mentions in the reviews.
menu = [
    "Cheese Steak", "Cheesesteak", "Steak and Cheese", "Italian Combo",
    "Tiramisu", "Cannoli", "Chicken Salad", "Chicken Spinach Salad",
    "Meatball", "Pizza", "Pizzas", "Spaghetti",
    "Bruchetta", "Eggplant", "Italian Beef", "Purista",
    "Pasta", "Calzones", "Calzone", "Italian Sausage",
    "Chicken Cutlet", "Chicken Parm", "Chicken Parmesan", "Gnocchi",
    "Chicken Pesto", "Turkey Sandwich", "Turkey Breast", "Ziti",
    "Portobello", "Reuben", "Mozzarella Caprese", "Corned Beef",
    "Garlic Bread", "Pastrami", "Roast Beef", "Tuna Salad",
    "Lasagna", "Artichoke Salad", "Fettuccini Alfredo", "Chicken Parmigiana",
    "Grilled Veggie", "Grilled Veggies", "Grilled Vegetable", "Mac and Cheese",
    "Macaroni", "Prosciutto", "Salami",
]

# Getting the train data

In [None]:
# Lowercase copies of the menu items.
menu_lower = [item.lower() for item in menu]

import spacy
nlp = spacy.load('en_core_web_lg')

# Phrase patterns only need to be tokenized (the matcher compares the LOWER
# attribute of each token), so build them with nlp.make_doc instead of running
# the whole pipeline on every menu item.
menu_doc = [nlp.make_doc(item) for item in menu_lower]

In [None]:
import string
# Number of reviews (taken from the top of the DataFrame) used to build the
# training data.
N_REVIEWS = 100

# Slice first, then clean: only the reviews that are actually kept get their
# trailing whitespace stripped.
text_list = [review.rstrip() for review in df['text'].iloc[:N_REVIEWS]]

In [None]:
# Sanity check: number of reviews kept.
len(text_list)

In [None]:
# nlp.pipe streams the texts through the pipeline in batches, which is faster
# than calling nlp() once per review; list() materialises the resulting Docs.
text_doc = list(nlp.pipe(text_list))

In [None]:
# Break every review into sentences (trailing whitespace removed) so that only
# the sentences mentioning a food item need to be kept as training examples.
sent_list = [
    sentence.text.rstrip()
    for review_doc in text_doc
    for sentence in review_doc.sents
]
    

In [None]:
# Sanity check: total number of sentences extracted from the reviews.
len(sent_list)

In [None]:
# sent_list holds plain strings; turn each sentence into its own Doc so the
# matcher can be applied per sentence. nlp.pipe processes them in batches,
# which is faster than calling nlp() once per sentence.
sent_doc = list(nlp.pipe(sent_list))

In [None]:

from spacy.matcher import PhraseMatcher 

# Phrase matcher keyed on the LOWER token attribute, i.e. case-insensitive matching.
matcher=PhraseMatcher(nlp.vocab,attr='LOWER')
# Register all menu item patterns under the single rule name 'Menu'.
matcher.add('Menu',menu_doc)


In [None]:
# Show every matched menu phrase together with its character offsets
# (start, end) inside the sentence it was found in.
for doc in sent_doc:
    for _match_id, start, end in matcher(doc):
        span = doc[start:end]
        print(span, span.start_char, span.end_char)

In [None]:
from spacy.util import filter_spans

# Build the NER training examples in the (text, {"entities": [...]}) format,
# where each entity is (start_char, end_char, label). Only sentences that
# mention at least one menu item are kept.
TRAIN_DATA = []
for doc in sent_doc:
    spans = [doc[start:end] for _match_id, start, end in matcher(doc)]
    if not spans:
        continue
    # Two menu patterns can hit overlapping tokens (e.g. "cheese steak and
    # cheese" matches both "cheese steak" and "steak and cheese"). Conflicting
    # entity offsets are rejected during NER training, so keep only the
    # longest non-overlapping spans.
    spans = filter_spans(spans)
    entities = [(span.start_char, span.end_char, "FOOD") for span in spans]
    TRAIN_DATA.append((doc.text, {"entities": entities}))


In [None]:
# Number of training examples, followed by the examples themselves for inspection.
print(len(TRAIN_DATA))
TRAIN_DATA
    

# Training the model

In [None]:
# Fetch the named-entity recognizer from the loaded pipeline so that a new
# label can be added to it.
ner=nlp.get_pipe('ner')

In [None]:
# Register the new entity type with the NER component.
LABEL = "FOOD"
ner.add_label(LABEL)

# Obtain an optimizer for continuing training of the loaded model.
optimizer = nlp.resume_training()
move_names = list(ner.move_names)

# Pipes that take part in training ...
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]

# ... and every remaining pipe, which is disabled while training runs.
other_pipes = [name for name in nlp.pipe_names if name not in pipe_exceptions]

In [None]:
# Importing requirements
from spacy.util import minibatch, compounding
import random

N_ITER = 30      # number of passes over the training data
DROPOUT = 0.35   # dropout rate passed to every update

# Train with all pipes listed in other_pipes disabled, so only the remaining
# pipes (the NER) are updated.
with nlp.disable_pipes(*other_pipes):
    # Generator of batch sizes, starting at 1.0 and compounding by a factor of
    # 1.001 up to a maximum of 4.0; it is shared across all iterations.
    sizes = compounding(1.0, 4.0, 1.001)
    for itn in range(N_ITER):
        # Shuffle the examples before each pass.
        random.shuffle(TRAIN_DATA)
        # Batch up the examples using spaCy's minibatch.
        batches = minibatch(TRAIN_DATA, size=sizes)
        # Dictionary that accumulates the losses of this iteration.
        losses = {}
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=DROPOUT, losses=losses)
        # losses is accumulated over the whole iteration, so report it once
        # per iteration rather than after every minibatch.
        print("Losses", losses)

# Predicting on NEW data

In [None]:
from spacy import displacy

In [None]:
# List the names of the pipeline components.
nlp.pipe_names

In [None]:
# Try the NER on a sentence whose dishes are not part of the menu list.
test_text = (
    "I just love the cheese maggi with toppings "
    "& the Biryani too."
)
doc = nlp(test_text)

# Render the recognised entities inline.
displacy.render(doc, style="ent")


In [None]:
# Several sentences run together without spaces after the full stops.
test_text = (
    "Whenever we visit we order Fish curry and the kids love it!!!."
    "The Ghee Dosa is delicious.Yesterday was fun"
)
doc = nlp(test_text)

# Render the recognised entities inline.
displacy.render(doc, style="ent")


In [None]:
# Negative review mentioning dishes that are not part of the menu list.
test_text = (
    "The food is horrible.Vada Pav tastes like hard cardboard."
    "Never gonna eat Idli here,it tastes horrible! "
)
doc = nlp(test_text)

# Render the recognised entities inline.
displacy.render(doc, style="ent")


In [None]:
# Review that mixes food mentions with a money amount.
test_text = (
    "The Chapathi is green and sticky,such a waste of $15 ."
    "I would rather stay hungry than to put Roti,Beef in my mouth "
)
doc = nlp(test_text)

# Render the recognised entities inline.
displacy.render(doc, style="ent")


In [None]:
# Alternative, hand-labelled set of training data (disabled; kept for reference)


# # New label to add
# LABEL = "FOOD"

# # Training examples in the required format
# TRAIN_DATA =[ ("Pizza is a common fast food.", {"entities": [(0, 5, "FOOD")]}),
#               ("Cheese Pasta is an italian recipe.I love Cheese Pasta", {"entities": [(0, 12, "FOOD"),(41,53,"FOOD")]}),
#               ("China's noodles are very famous", {"entities": [(8,15, "FOOD")]}),
#               ("Shrimps are famous in China too", {"entities": [(0,7, "FOOD")]}),
#               ("Lasagna is another classic of Italy", {"entities": [(0,7, "FOOD")]}),
#               ("Sushi is extremely famous and expensive Japanese dish", {"entities": [(0,5, "FOOD")]}),
#               ("Unagi is a famous seafood of Japan", {"entities": [(0,5, "FOOD")]}),
#               ("Tempura , Soba are other famous dishes of Japan", {"entities": [(0,7, "FOOD")]}),
#               ("Udon is a healthy type of noodles", {"entities": [(0,4, "FOOD")]}),
#               ("Chocolate soufflé is extremely famous french cuisine", {"entities": [(0,17, "FOOD")]}),
#               ("Flamiche is french pastry", {"entities": [(0,8, "FOOD")]}),
#               ("Burgers are the most commonly consumed fastfood", {"entities": [(0,7, "FOOD")]}),
#               ("Burgers are the most commonly consumed fastfood", {"entities": [(0,7, "FOOD")]}),
#               ("Frenchfries are considered too oily", {"entities": [(0,11, "FOOD")]})
#            ]