#### Food Reviews Tagging

In [5]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import re
stopwords = stopwords.words('english')

In [6]:
food = pd.read_excel('Food_vocab_9_1776.xlsx',index_col=0)
food.head()

Unnamed: 0,dish,restaurant
0,Masala Dosa,elco
1,Kulfi Falooda,elco
2,Pani Puri,elco
3,Pav Bhaji,elco
4,Sev Puri,elco


In [7]:
food.restaurant.value_counts()

Shalimar       474
gurukripa      256
amarjuice      219
poptates       214
spice5         176
elco           146
delhidarbar    109
dominos         97
prithvi         85
Name: restaurant, dtype: int64

In [8]:
reviews = pd.read_excel('generalized_review_tokens_11.xlsx',index_col=0)
print(reviews.shape)
reviews.head()

(86430, 4)


Unnamed: 0,Review Number,Restaurant,sentence no,words
0,0,1441 Pizzeria,0,I
1,0,1441 Pizzeria,0,visited
2,0,1441 Pizzeria,0,the
3,0,1441 Pizzeria,0,Fort
4,0,1441 Pizzeria,0,outlet


In [9]:
def clean_words(text):
    text = re.sub('\s+','ENDPAD',text)
    return text

In [10]:
reviews['words'] = reviews.words.apply(clean_words)

In [11]:
reviews.Restaurant.value_counts()

1441 Pizzeria        12885
Prithvi cafe          9751
Delhi Darbar          7856
Aaswad                7792
Cafe Mondegar         7639
5 Spice               7462
Shalimar              7457
Guru Kripa            7374
Elco                  6681
Pop Tate's            6321
Amar Juice Centre     5212
Name: Restaurant, dtype: int64

In [12]:
class SentenceGetter(object):
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(rn,r,sn,w) for rn,r,sn,w in zip(s["Review Number"].values.tolist(),s["Restaurant"].values.tolist(),s["sentence no"].values.tolist(),s["words"].values.tolist())]
        self.grouped = self.data.groupby(['Review Number','sentence no']).apply(agg_func)
        self.sentences = [s for s in self.grouped]

#### Make Data

In [13]:
getter = SentenceGetter(reviews)
data = getter.sentences

In [14]:
sentences =[' '.join([w[3] for w in sentence]) for sentence in data ]
sn = [sentence[0][2] for sentence in data ]
restaurants = [list(set([rest[1] for rest in sentence])) for sentence in data ]
rns = [list(set([rn[0] for rn in sentence])) for sentence in data ]

In [15]:
df1 = pd.DataFrame(rns,columns=['review no'])
df2 = pd.DataFrame(restaurants,columns=['restaurant'])
df3 = pd.DataFrame(sn,columns=['sentence no'])
df4 = pd.DataFrame(sentences,columns=['review'])
df= pd.concat([df1,df2,df3,df4],axis=1)
print(df.shape)
df.tail()

(7172, 4)


Unnamed: 0,review no,restaurant,sentence no,review
7167,1098,Amar Juice Centre,1,Open till late in the night when very limited ...
7168,1098,Amar Juice Centre,2,The Simple Pizzas are a must haves .
7169,1099,Amar Juice Centre,0,The food offered is very tasty .
7170,1099,Amar Juice Centre,1,One should definitely try the Masala Pav that ...
7171,1099,Amar Juice Centre,2,Great place to have snacks / dinner late at ni...


#### Label Food

In [16]:
def lower(text):
    text = [ word.lower() for word in text.split()]
    return ' '.join(text)

In [17]:
vocab = food.dish.apply(lower).tolist()
vocab = list(set(vocab))

In [18]:
def clean(text):
    text = re.sub(r"\[(.*?)\]","",text) # remove [word]
    text = re.sub(r"\((.*?)\)","",text) # remove (word)
    text = re.sub(r"\-.+","",text)     # remove after - , eg:  Burger - non veg   
    return text
food_vocab = [clean(dish) for dish in vocab]
foods = [[food for food in name.split()] for name in food_vocab]
fwords=[]
for words in foods:
    for word in words:
        fwords.append(word)

In [19]:
'paneer' in food_vocab

False

In [20]:
'paneer' in fwords

True

In [21]:
len(fwords)

4497

In [90]:
dishes = list(set([word for word in fwords if word not in stopwords  and word.isalpha()==True]))
waste = list(set([word for word in fwords if word in stopwords or word.isalpha()==False]))
print("TOTAL UNIQUE DISHES ARE :- ",len(dishes))
print(waste)

TOTAL UNIQUE DISHES ARE :-  670
['at', '&', '4', '380', "'", 'a', "'2", 'biryani....', 'with', 'of', 'in', 'do', '+', 'all', '65', 'on', "reg'", 'can', '1', 'tate’s', 'from', 'and', "med'", 'only', "buddha's", '5', '3']


##### ADD CUSTOM DISHES HERE

In [91]:
with open("remove_dishes.txt",'r') as file:
    remove_dishes = file.read().split()
with open("special_dishes.txt",'r') as file:
    special_dishes = file.read().split()

In [92]:
#for dish in remove_dishes:
#    if dish in dishes:
#        dishes.remove(dish)
dishes += special_dishes

In [93]:
len(dishes)

692

In [None]:
# custom = ['food','lunch','dinner','breakfast','snacks','chips'] ## WORKING ON THIS

In [47]:
print(dishes)

['kachori', 'nargis', 'panini', 'tikka', 'mumtaz', 'plan', 'eggplant', 'patiala', 'junglee', 'oriental', 'tikki', 'custard', 'green', 'nachos', 'movie', 'paradise', 'chatpata', 'scotch', 'paper', 'milly', 'paratha', 'talumien', 'seekh', 'koliwada', 'extra', 'premium', 'finger', 'soda', 'crush', 'pik', 'mashed', 'yum', 'stewed', 'croissant', 'veggies', 'brownies', 'mixed', 'strawberry', 'korma', 'char', 'steamed', 'slice', 'chowmein', 'afghani', 'dahi', 'black', 'gulab', 'cripsy', 'paneer', 'sticks', 'set', 'falooda', 'pack', 'med', 'specials', 'spy', 'biryani', 'peppy', 'pahadi', 'kari', 'taco', 'dog', 'jully', 'peas', 'shami', 'leg', 'chips', 'brain', 'delight', 'sada', 'pakoda', 'n', 'shalimar', 'brown', 'pista', 'chocolate', 'uttapam', 'kabab', 'treasure', 'fiesta', 'waldorf', 'hariyali', 'frankie', 'zafrani', 'gluten', 'double', 'pops', 'golden', 'sour', 'makhni', 'karacali', 'new', 'baileys', 'fry', 'shake', 'rose', 'chole', 'treat', 'cheesecake', 'tawa', 'bailley', 'milkshake', '

In [94]:
hotel_names = list(set(df.restaurant.apply(lower)))
hotel_names.remove("amar juice centre")

In [None]:
# %%time
# tagseqs=[]
# for review in reviews:
#     words = review.split()
#     tag_seq=[]
#     for i,word in enumerate(words):
#         f=0
#         for dish in dishes:
#             if word == dish:
#                 f=1
#                 tag_seq.append('E')
#                 break
#         if f==0:
#             tag_seq.append('O')
#     tagseqs.append(' '.join(tag_seq))

In [50]:
"n" in dishes

True

In [95]:
import difflib

In [96]:
# sample = df.sample(1000)

In [97]:
c=0
def algo_tag(text):
    words = text.split()
    tag_seq=[]
    for i,word in enumerate(words):
        f=0
        for dish in dishes:
            score = difflib.SequenceMatcher(None,word,dish)
            if float(score.ratio()) > 0.90:
                #print(score.ratio(),word,dish)
                f=1
                tag_seq.append('E')
                break
        if f==0:
            tag_seq.append('O')
        global c
    c+=1
    if c%1000==0:
        print("Done ",c)
    return ' '.join(tag_seq)

In [98]:
%%time
df['algo_2'] = df.review.apply(lower).apply(algo_tag)

Done  1000
Done  2000
Done  3000
Done  4000
Done  5000
Done  6000
Done  7000
Wall time: 12min 42s


## removal of remove_list words

In [99]:
remove_dishes += hotel_names

In [100]:
df.head(20)  #sample is the dataset on which algo tag was applied without removing remove_list words


Unnamed: 0,review no,restaurant,sentence no,review,algo_2
0,0,1441 Pizzeria,0,I visited the Fort outlet and I wish to keep v...,O O O O O O O O O O O O
1,0,1441 Pizzeria,1,I happened to walk into the outlet on a Wednes...,O O O O O O O O O O O O O O
2,0,1441 Pizzeria,2,We were delighted the the BOGO offer .,O O O O O O O O
3,0,1441 Pizzeria,3,I chose the make your own pizza with unlimited...,O O O O O O E O O E O
4,0,1441 Pizzeria,4,I literary turned into a kid making my pizza .,O O O O O O O O E O
5,0,1441 Pizzeria,5,The joy to choose your toppings and see it bei...,O O O O O E O O O O O O O O O O O O O O O O O
6,0,1441 Pizzeria,6,To my surprise they had a jain sauce variant a...,O O O O O O E E O O O O O O E O O O O O O
7,0,1441 Pizzeria,7,They bake the pizza in wood fired oven and per...,O O O E O O O O O O O O O O E O O O O
8,1,1441 Pizzeria,0,I am so glad that 1441 pizzeria has opened up ...,O O O O O O O O O O O O O O O O E O O
9,1,1441 Pizzeria,1,Its the first pizza joint in chembur to have t...,O O O E O O O O O O O O O E O O O E O O


In [103]:
##this is the cell 
newlist = []
for i,rev in zip(df['algo_2'],df['review']) : #takes dataframe created after normal algo_tag execution without removing removelist  
    i_list = i.split()
    rev_list = rev.split()
    
    for j in range(1,len(i_list)-1) :
        
#         #new two lines not tested if error comment next four lines
        if rev_list[0] in remove_dishes and i_list[1] != 'E' : 
            i_list[0] = 'O'
        if rev_list[-1] in remove_dishes and i_list[-2] != 'E' : 
            i_list[-1] = 'O'
#         elif (i_list[1] == 'E' and rev_list[1] in remove_dishes) :
#             i_list[j] = 'O'
#         #end of not tested 
        
        if rev_list[j] in remove_dishes :
            if i_list[j-1] != 'E' and i_list[j+1] != 'E' :
                i_list[j] = 'O'
#             ## next two lines not tested (elif part) , comment in case of error
#             elif (i_list[j-1] == 'E'and rev_list[j-1] in remove_dishes) or (i_list[j+1] == 'E' and rev_list[j+1] in remove_dishes) :
#                 i_list[j] = 'O'
#             ##end  not tested elif
    
    newlist.append(' '.join(i_list))

df['algo_clean'] = newlist

In [None]:
'''
i_list = .split()
for j in range(1,len(i_list)-1) :
    if i_list[j] in remove_dishes :
        if i_list[j-1] != 'E' and i_list[j+1] != 'E' :
            i_list[j] = 'O'
'''



In [104]:
df.head(20) 

Unnamed: 0,review no,restaurant,sentence no,review,algo_2,algo_clean
0,0,1441 Pizzeria,0,I visited the Fort outlet and I wish to keep v...,O O O O O O O O O O O O,O O O O O O O O O O O O
1,0,1441 Pizzeria,1,I happened to walk into the outlet on a Wednes...,O O O O O O O O O O O O O O,O O O O O O O O O O O O O O
2,0,1441 Pizzeria,2,We were delighted the the BOGO offer .,O O O O O O O O,O O O O O O O O
3,0,1441 Pizzeria,3,I chose the make your own pizza with unlimited...,O O O O O O E O O E O,O O O O O O E O O E O
4,0,1441 Pizzeria,4,I literary turned into a kid making my pizza .,O O O O O O O O E O,O O O O O O O O E O
5,0,1441 Pizzeria,5,The joy to choose your toppings and see it bei...,O O O O O E O O O O O O O O O O O O O O O O O,O O O O O E O O O O O O O O O O O O O O O O O
6,0,1441 Pizzeria,6,To my surprise they had a jain sauce variant a...,O O O O O O E E O O O O O O E O O O O O O,O O O O O O E E O O O O O O E O O O O O O
7,0,1441 Pizzeria,7,They bake the pizza in wood fired oven and per...,O O O E O O O O O O O O O O E O O O O,O O O E O O O O O O O O O O O O O O O
8,1,1441 Pizzeria,0,I am so glad that 1441 pizzeria has opened up ...,O O O O O O O O O O O O O O O O E O O,O O O O O O O O O O O O O O O O O O O
9,1,1441 Pizzeria,1,Its the first pizza joint in chembur to have t...,O O O E O O O O O O O O O E O O O E O O,O O O E O O O O O O O O O O O O O E O O


In [105]:
diff = []
for i,j,rev in zip(df['algo_2'],df['algo_clean'],df['review']):
    if i != j :
        diff.append((rev,i,j))

In [106]:
len(diff)

479

In [107]:
#results
for i in diff :
    print(i[0])
    print(i[1])
    print(i[2])
    print()

They bake the pizza in wood fired oven and perfectly thin rolled base becomes crisp to bite into .
O O O E O O O O O O O O O O E O O O O
O O O E O O O O O O O O O O O O O O O

I am so glad that 1441 pizzeria has opened up in chembur with its traditional wood fire oven .
O O O O O O O O O O O O O O O O E O O
O O O O O O O O O O O O O O O O O O O

Its the first pizza joint in chembur to have this kind of wood fire oven baking of pizza . ENDPAD
O O O E O O O O O O O O O E O O O E O O
O O O E O O O O O O O O O O O O O E O O

They have a very special segment where you can actually go ahead and made your own pizza from the scratch with your own choice of toppings and sauces .
O O O O E O O O O O O O O O O O E O O O O O O E O E O E O
O O O O O O O O O O O O O O O O E O O O O O O E O E O E O

They have two types of pizza categories which is classic and gourmet and if you are a chicken lover gourmet pizza toppings is what you should pick in .
O O O O O E O O O E O E O O O O O E O E E E O O O O 

In [53]:
'the' in remove_dishes

False

In [108]:
score = difflib.SequenceMatcher(None,"taste","tastes")
score.ratio()

0.9090909090909091

In [110]:
df.values

array([[0, '1441 Pizzeria', 0,
        'I visited the Fort outlet and I wish to keep visitinggg ..',
        'O O O O O O O O O O O O', 'O O O O O O O O O O O O'],
       [0, '1441 Pizzeria', 1,
        'I happened to walk into the outlet on a Wednesday and guess what !',
        'O O O O O O O O O O O O O O', 'O O O O O O O O O O O O O O'],
       [0, '1441 Pizzeria', 2, 'We were delighted the the BOGO offer .',
        'O O O O O O O O', 'O O O O O O O O'],
       ...,
       [1099, 'Amar Juice Centre', 0, 'The food offered is very tasty .',
        'O E O O O O O', 'O E O O O O O'],
       [1099, 'Amar Juice Centre', 1,
        'One should definitely try the Masala Pav that they offer .',
        'O O O O O E E O O O O', 'O O O O O E E O O O O'],
       [1099, 'Amar Juice Centre', 2,
        'Great place to have snacks / dinner late at night as well .',
        'O O O O E O E O O O O O O', 'O O O O E O E O O O O O O']],
      dtype=object)

In [111]:
seq =[]
for sentence in df.algo_clean:
    seq.extend(sentence.split())

In [112]:
len(seq)

86430

In [113]:
df.head()

Unnamed: 0,review no,restaurant,sentence no,review,algo_2,algo_clean
0,0,1441 Pizzeria,0,I visited the Fort outlet and I wish to keep v...,O O O O O O O O O O O O,O O O O O O O O O O O O
1,0,1441 Pizzeria,1,I happened to walk into the outlet on a Wednes...,O O O O O O O O O O O O O O,O O O O O O O O O O O O O O
2,0,1441 Pizzeria,2,We were delighted the the BOGO offer .,O O O O O O O O,O O O O O O O O
3,0,1441 Pizzeria,3,I chose the make your own pizza with unlimited...,O O O O O O E O O E O,O O O O O O E O O E O
4,0,1441 Pizzeria,4,I literary turned into a kid making my pizza .,O O O O O O O O E O,O O O O O O O O E O


#### TRY START

In [114]:
res =  [[w[1] for w in sentence] for sentence in data ]
rvnos =[[str(w[0]) for w in sentence] for sentence in data ]
sns =  [[w[2] for w in sentence] for sentence in data ]
r = []
for sentence in res:
    r.extend(sentence)
rn = []
for sentence in rvnos:
    rn.extend(sentence)
review_words =[]
rv = df.review.tolist()
for word in rv:
    review_words.extend(word.split())
sn=[]
for sentence in sns:
    sn.extend(sentence)

In [119]:
df1 = pd.DataFrame(rn,columns=['review no'])
df2 = pd.DataFrame(r,columns=['restaurant'])
df3 = pd.DataFrame(sn,columns=['sentence no'])
df4 = pd.DataFrame(review_words,columns=['words'])
df5 = pd.DataFrame(seq,columns=['algo_tag'])
tagged_words = pd.concat([df1,df2,df3,df4,df5],axis=1)

In [120]:
tagged_words.tail(20)

Unnamed: 0,review no,restaurant,sentence no,words,algo_tag
86410,1099,Amar Juice Centre,1,the,O
86411,1099,Amar Juice Centre,1,Masala,E
86412,1099,Amar Juice Centre,1,Pav,E
86413,1099,Amar Juice Centre,1,that,O
86414,1099,Amar Juice Centre,1,they,O
86415,1099,Amar Juice Centre,1,offer,O
86416,1099,Amar Juice Centre,1,.,O
86417,1099,Amar Juice Centre,2,Great,O
86418,1099,Amar Juice Centre,2,place,O
86419,1099,Amar Juice Centre,2,to,O


In [121]:
tagged_words.to_excel('generalized_review_tokens_11_tagged_v3.xlsx')

#### TRY END

### POS dish

In [None]:
dishes

In [None]:
from nltk import pos_tag

In [None]:
pos =pos_tag(dishes)
print(len(dishes))

In [None]:
others = []
common =[]
for word in pos:
    if word[1]!='NN' and word[1]!='JJ' and  word[1]!='NNS' and word[1]!='NNP' and  word[1]!='JJS' and word[1]!='FW' and word[1] != 'RB':
        others.append(word)
    else:
        common.append(word)

In [None]:
len(others)

In [None]:
print(others)

In [None]:
for word in others:
    if word[1] == 'VBD':
        print(word)

In [None]:
len(common)

In [None]:
print(common)

#### IMPROVEMENT

In [None]:
from nltk import word_tokenize

In [None]:
test = "I live on potato wedgies- and the-lol dkc,dsc,c! are mangoes like dw"

In [None]:
pos_tag(word_tokenize(test))

In [None]:
from fuzzywuzzy import fuzz 
from fuzzywuzzy import process 

In [None]:
word1 ="pizza topping"
word2 ="pizza"

In [None]:
fuzz.partial_ratio(word1,word2)