In [1]:
import pandas as pd
import re
import unidecode
import pickle
import srsly
from urllib.parse import unquote

### Preprocess each menu item

In [2]:
# Load the pickled object holding the restaurant menus; later cells read its
# 'menu' and 'url' entries.
# NOTE(review): pickle.load can execute arbitrary code stored in the file, so
# only run this on a pickle that comes from a trusted source.
with open("restaurants_menus_df.pkl", "rb") as f:
    menus = pickle.load(f)

In [4]:
def remove_any_brackets(item):
    """Return `item` with every [...] and (...) group, brackets included, removed.

    Nesting is tracked separately for square and round brackets. A closing
    bracket that has no matching opener is kept when it appears outside any
    open group; an opener that is never closed hides the rest of the string.
    """
    square_depth = 0
    round_depth = 0
    kept = []
    for ch in item:
        if ch == '[':
            square_depth += 1
            continue
        if ch == '(':
            round_depth += 1
            continue
        if ch == ']' and square_depth:
            square_depth -= 1
            continue
        if ch == ')' and round_depth:
            round_depth -= 1
            continue
        # ordinary character (or unmatched closer): keep it only outside brackets
        if not (square_depth or round_depth):
            kept.append(ch)
    return ''.join(kept)

In [5]:
def preprocess_menu_items(item):
    """Normalise a raw menu-item name into a lower-case ASCII phrase.

    Steps: drop bracketed text, transliterate to ASCII, lower-case, rewrite
    "&" as "and", "-" as a space and " w/" as " with ", strip the symbols
    * " $ #, remove quantity expressions such as "12 oz" and any word that
    contains a digit, then remove periods and squeeze whitespace.

    Args:
        item: raw menu-item name.

    Returns:
        The cleaned name, or "" when the bracket-stripped name is longer than
        70 or shorter than 3 characters, or when fewer than 3 characters
        remain after cleaning.
    """
    # remove content of brackets and detect 0 length string
    item = remove_any_brackets(item).strip()
    len_item = len(item)

    # names longer than 70 or shorter than 3 characters are not considered
    if len_item > 70 or len_item < 3:
        return ""

    item = (unidecode.unidecode(item + " ")
            .lower()
            .replace(".", ". ")
            .replace("&", "and")
            .replace("-", " ")
            .replace(" w/", " with "))

    # Raw strings: sequences such as '\d' or '\*' are invalid escapes in an
    # ordinary (or f-) string literal and trigger warnings on current Python.
    # The pattern text itself is unchanged.
    to_remove = [
        r'\*|"|\$|#',  # remove * and " and $ and #
        # a number followed by a unit, e.g. "12 oz " or "2 pieces of "
        r'\d+\s*(lb|pounds|pound|oz|ounces|ounce|inches|inch'
        r'|grams|gram|pcs|pieces|piece|each|cup'
        r'|bowl|scoops|scoop|pot|liters|liter'
        r'|or less|off)\s*((of)*)\.*\s+',
        r'\s*\S*[0-9]\S*',  # remove any word with digits in it
    ]

    for pattern in to_remove:
        item = re.sub(pattern, ' ', item)

    # periods were padded with a space above so words split cleanly; drop them now
    item = ' '.join(item.replace(".", "").split())

    return item if len(item) >= 3 else ""

In [6]:
# Preprocessed menu-item strings to exclude: the extraction cells in this notebook
# drop any item whose preprocessed form appears in this list.
# NOTE(review): a few entries are repeated ("sides", "large", "original",
# "utensils"), which is harmless for membership tests, and "donpx," carries a
# comma inside the string literal - confirm that entry is intended.
menu_items_to_remove = ["cup","way","sol","uni","can","mix","hot","mac","red","hat","nem","pop","nan",
                        "res","the","cafe","inch","thin","soda","cake","bowl","tune","live","mild",
                        "club","cola","lime","beer","sole","well","solo","coka","fire","roll","dark",
                        "wine","chef","sake","diet","soup","fool","pils","coke","pick","sides","super","spicy",
                        "large","order","unity","pique","sides","small","juice","combo","coffe","toast",
                        "limes","liver","lemon","sauce","fried","green","limca","fruit","jumbo","meats",
                        "cocoa","basic","pound","plate","coast","drink","black","white","house","water",
                        "plain","large","lunch","sunny","truly","pepsi","baked","chips","crush","banks",
                        "fanta","shake","royal","garden","powers","crusts","virtue","waters","people","single",
                        "friday","labneh","uptown","liters","juices","corona","crimes","robust","tender",
                        "pieces","pizzas","salumi","loaded","sunset","scoops","gloves","sunday","medium",
                        "coffee","farmer","parlor","clever","donpx,","sprite","extras","simple","heater","taste",
                        "makers","bottle","drinks","deluxe","unique","chef's","lunch a","lunch b","lunch c",
                        "lunch d","lunch e","lunch f","lunch g","lunch h","lunch i","lunch j","lunch k",
                        "lunch l","lunch m","lunch n","lunch o","lunch p","lunch q","lunch r","lunch s",
                        "lunch t","lunch u","lunch v","lunch w","lunch x","lunch y","lunch z","pop ups",
                        "buffalo","napkins","chopped","phoenix","cluster","patriot","one egg","the egg",
                        "ketchup","baskets","genesis","average","v juice","chamber","or less","two egg",
                        "absolut","chronic","biscuit","imports","degrees","supreme","century","mondays",
                        "regular","special","doubles","t shirt","classic","awesome","western","original",
                        "utensils","seasonal","one meat","triad in","toppings","specials","desserts",
                        "can coke","thums up","original","pick two","exclusiv","can soda","saturday",
                        "the kind","diabetes","sandwich","can cola","cocacola","downtown","birthday",
                        "utensils","two rice","official","rotating","can pops","thursday","coke can",
                        "soda pop","paradise","festival","take off","tuesdays","new york","chutneys",
                        "principe","full pot","manhattan","benchmark","roll of garbage bags","garbage bag each",
                        "sani spritz spray","toilet paper","kids cups no spill locking lid","kleenex box","sani wipes"]

In [7]:
# Inputs and accumulators for the menu-item extraction loop in the next cell.
s=menus['menu']  # menu entries; the loop reads each entry's 'food_items' mapping
rest = menus['url']  # matching URL values; the loop keeps each one from index 5 onward

cat = set()  # lower-cased keys of the 'food_items' mappings
items = set()  # unique preprocessed item names
items_ = []  # preprocessed item names, one per kept occurrence
og_items = []  # original (unprocessed) names, aligned with items_
rest_name = []  # URL suffix of the owning restaurant, aligned with items_

In [8]:
# Walk every restaurant's menu, preprocess each item name and keep the ones that
# survive preprocessing and are not in the removal list.
# The removal list is converted to a set once, so each membership test is a hash
# lookup instead of a scan over the whole list.
_items_to_remove = set(menu_items_to_remove)

for j, i in zip(rest, s):
    # record the lower-cased keys of this menu's 'food_items' mapping
    cat.update(k.lower() for k in i['food_items'].keys())
    for v in i['food_items'].values():
        for fi in v:
            food_item = preprocess_menu_items(fi['name'])
            if food_item != "" and food_item not in _items_to_remove:
                og_items.append(fi['name'])
                items.add(food_item)
                items_.append(food_item)
                # NOTE(review): drops the first 5 characters of the URL, presumably a
                # fixed prefix - verify against the data. Unlike the later
                # `rest_names_w_menus` cell, the URL is not unquoted here.
                rest_name.append(j[5:])

In [9]:
# Number of unique preprocessed menu items. The trailing comments record the
# counts seen in earlier runs (the v2/v3 labels presumably refer to preprocessing revisions).
len(items) # 47061
# v2 46768
# v3 46743 # remove "spicy" and replace w/ -> with

46743

In [10]:
# get top menu items
from ast import literal_eval

rest_info = pd.read_csv("restaurants_info.csv")

# Set copy of the removal list so the per-item membership test is a hash lookup
# rather than a scan over the whole list.
_top_items_to_remove = set(menu_items_to_remove)

top_menu_items = set()

for top_menu_items_str in rest_info.top_food_items:
    # each value is parsed with literal_eval (presumably the string form of a
    # list of item names), then every name goes through the same preprocessing
    # and filtering as the regular menu items
    top_menu_items.update(
        name
        for name in map(preprocess_menu_items, literal_eval(top_menu_items_str))
        if name != "" and name not in _top_items_to_remove
    )

In [11]:
len(top_menu_items)  # number of unique preprocessed top menu items

10878

In [12]:
# Merge the top menu items into the menu-item set (in-place set union).
# NOTE(review): a later cell rebinds `items` to a list; re-running this cell after
# that one raises TypeError because a list does not support `|=` with a set.
items |= top_menu_items

In [15]:
# Write the merged unique items to CSV as a single 'menu_items' column.
# NOTE(review): `items` is a set here, so the row order of the file is not
# guaranteed to be the same from one run to the next.
pd.DataFrame({'menu_items':list(items)}).to_csv('preprocessed_unique_menu_items_W_top_food_items_v4.csv', index=False)

In [13]:
len(items)  # unique item count after the union with top_menu_items

52932

### Using PhraseMatcher from spacy to get annotations

In [15]:
import spacy
from spacy.matcher import PhraseMatcher
from spacy.util import filter_spans

In [16]:
def preprocess_reviews(review: str) -> str:
    """Normalise raw review text before phrase matching.

    Transliterates to ASCII, lower-cases, rewrites "&" as "and", "-" as a
    space and " w/" as " with ", strips the characters * " $ #, collapses a
    run of one repeated non-word character to a single occurrence and
    squeezes whitespace.

    Args:
        review: raw review text.

    Returns:
        The cleaned, lower-case review with single spaces between tokens.
    """
    review = (unidecode.unidecode(review + " ")
              .lower()
              .replace("&", "and")
              .replace("-", " ")
              .replace(" w/", " with "))

    # Raw string: '\*' and '\$' are invalid escapes in an ordinary string literal
    # and trigger warnings on current Python. The pattern text is unchanged.
    review = re.sub(r'\*|"|\$|#', '', review)
    # drop a non-word character when the same character follows it:
    # ....??? -> .? or ???? -> ? or !!!!!! -> !
    review = re.sub(r'(\W)(?=\1)', '', review)

    review = ' '.join(review.split())

    return review

In [17]:
%time
test = '   HI $40 & #20. are mising 2000 !!!????from the get---go... ****disclaimer* "don\'t tell anyone", '
print(preprocess_reviews(test))

Wall time: 0 ns
hi 40 and 20. are mising 2000 !?from the get go. disclaimer don't tell anyone,


In [18]:
# Build a PhraseMatcher whose patterns are the tokenised menu-item strings.
# NOTE(review): this rebinds `items` from a set to a list, so earlier cells that
# use set operations on `items` cannot be re-run after this one.
items = list(items)

nlp = spacy.blank('en')
# only the tokenizer is run on the item strings to produce the pattern Docs
nlp_menu_item_docs = list(nlp.tokenizer.pipe(items))
phrase_matcher = PhraseMatcher(nlp.vocab)
# NOTE(review): `add(key, None, *docs)` looks like the spaCy v2 call form; spaCy v3
# expects the patterns as one list argument - confirm against the installed version.
phrase_matcher.add('menuitem', None, *nlp_menu_item_docs)

In [42]:
# get rest name with menu
# URL-decode each menu URL and keep everything from index 5 onward
# (presumably stripping a fixed 5-character prefix - verify against the data).
rest_names_w_menus = menus.url.apply(lambda x: unquote(x)[5:])

In [19]:
# NOTE(review): this replaces the list derived from menus.url with two hard-coded
# names, so a top-to-bottom run annotates only these two restaurants. The saved
# outputs further down (309 restaurants) do not match a two-restaurant run -
# confirm whether this cell is a debugging leftover.
rest_names_w_menus = ["café-orchid-chicago-2", "lao-sze-chuan-chicago-2"]

In [20]:
%%time
# Annotate every review of every listed restaurant with the character offsets of
# the menu-item phrases found in it.
ann_data = []

reviews_w_no_menuitem = 0      # reviews in which no menu item matched
total_menuitems_entities = 0   # total matched spans over all reviews
rest = 0                       # restaurants whose review file was fully processed
skipped_restaurants = []       # (name, error) for restaurants that failed

# The loop variable is deliberately not called `rest_name`: that name already
# holds the per-item restaurant list built during menu extraction.
for restaurant_slug in rest_names_w_menus:
    try:
        review_data = srsly.read_jsonl("restaurant wise reviews/Chicago, IL/" + restaurant_slug + ".jsonl")
        for review_dict in review_data:
            review = preprocess_reviews(review_dict['review'])

            doc = nlp(review)
            matches = phrase_matcher(doc)
            spans = [doc[start : end] for _, start, end in matches]
            # resolve overlapping matches before converting to character offsets
            spans = filter_spans(spans)
            entities = [[span.start_char, span.end_char, "menuitem"] for span in spans]

            num_entities = len(spans)
            if num_entities:
                total_menuitems_entities += num_entities
            else:
                reviews_w_no_menuitem += 1
            ann_data.append([review, {"entities" : entities}])
        rest += 1
    except Exception as err:
        # Still best-effort (a restaurant without a readable review file is
        # skipped), but the failure is recorded instead of silently discarded,
        # and KeyboardInterrupt is no longer swallowed.
        skipped_restaurants.append((restaurant_slug, repr(err)))

if skipped_restaurants:
    print(f"skipped {len(skipped_restaurants)} restaurant(s); first few: {skipped_restaurants[:5]}")

Wall time: 6.38 s


In [44]:
# Summary of the annotation run: reviews with no match and total matched spans,
# then restaurants processed without error, then (cell output) annotated reviews.
print(reviews_w_no_menuitem, total_menuitems_entities)
print(rest)
len(ann_data)
# values recorded from an earlier run:
# 23464 795276
# 309
# 160993

23464 795276
309


160993

In [None]:
# v1 -> lower and convert "&" to "and"
# v2 -> preprocess_reviews()
# v3 -> not replacing "." with ". ", collapsing repeated punctuation into a single character

In [21]:
# Persist the annotations as JSON: a list of
# [review_text, {"entities": [[start_char, end_char, "menuitem"], ...]}] pairs.
srsly.write_json("annotated_data_ci_w_no_cuisine_menus.json", ann_data)

In [None]:
# TODO : change filter_spans implementation?