In [1]:
import spacy
from spacy.util import filter_spans
import unidecode
from collections import defaultdict
import re
import srsly
from datetime import datetime
from glob import glob
import os

In [4]:
def preprocess_reviews(review: str) -> str:
    """Normalise a raw review string before it is passed to the NER pipeline.

    Steps, in order:
      1. transliterate to ASCII with ``unidecode`` and lower-case the text;
      2. replace ``&`` with ``and``, ``-`` with a space and `` w/`` with `` with ``;
      3. delete the characters ``*``, ``"``, ``$`` and ``#``;
      4. collapse a run of one repeated non-word character to a single
         occurrence (e.g. ``!!!`` -> ``!``, consecutive spaces -> one space);
      5. squeeze all remaining whitespace to single spaces and trim both ends.

    Args:
        review: Raw review text.

    Returns:
        The cleaned, lower-cased review text.
    """
    review = (
        unidecode.unidecode(review + " ")
        .lower()
        .replace("&", "and")
        .replace("-", " ")
        .replace(" w/", " with ")
    )

    # Raw string + character class: the previous non-raw literal relied on the
    # invalid escape sequences "\*" and "\$", which newer Python versions warn about.
    review = re.sub(r'[*"$#]', '', review)
    # Drop a non-word character whenever the very next character is identical.
    review = re.sub(r'(\W)(?=\1)', '', review)

    return ' '.join(review.split())

In [5]:
# Load the spaCy pipeline from the local model directory; the entities it predicts
# (doc.ents) are what the annotation loop below records for each review.
nlp = spacy.load('../NER/menuitem_ner_model_epoch_50')

In [8]:
# detecting food items for all reviews
#
# For every per-restaurant .jsonl file: clean each review, run the NER pipeline on
# it and write the record back out (cleaned text + detected entity strings) under
# "reviews with detected entities/<location>/<restaurant>.jsonl".
entities_root = "reviews with detected entities"

for file_name in glob("../NER/restaurant wise reviews/*/*.jsonl"):
    data = srsly.read_jsonl(file_name)
    data_ann = []
    for d in data:
        review = preprocess_reviews(d.get("review"))
        doc = nlp(review)

        # De-duplicate while keeping first-seen order, so the written files are
        # identical from run to run (a plain set() has no stable order).
        entities = list(dict.fromkeys(ent.text for ent in filter_spans(doc.ents)))

        r_data = {k: v for k, v in d.items() if k != "review"}
        r_data["review"] = doc.text
        r_data["entities"] = entities

        data_ann.append(r_data)

    # Use os.path instead of splitting on "\\": glob() only returns backslash
    # separated paths on Windows, so the old split raised IndexError elsewhere.
    location_dir, jsonl_name = os.path.split(file_name)
    file_dir = os.path.join(entities_root, os.path.basename(location_dir))

    # Creates the root and the location folder in one call; no-op if present.
    os.makedirs(file_dir, exist_ok=True)

    srsly.write_jsonl(os.path.join(file_dir, jsonl_name), data_ann)

## Number of reviews pre- vs. post-COVID

In [7]:
# Per-location containers: location name -> {restaurant -> monthly counts}.
# Each location gets its own fresh, independent dict.
num_reviews = {city: {} for city in ("Chicago, IL", "Phoenix, AZ")}
num_reviews_food = {city: {} for city in ("Chicago, IL", "Phoenix, AZ")}

In [8]:
pre_covid_start_date = datetime(year = 2019, month=12, day=1)
pre_covid_end_date = datetime(year = 2020, month=2, day=29)  # not used in this cell

# For every annotated restaurant file, count reviews per calendar month from
# pre_covid_start_date onwards: all reviews ("total") and reviews with at least
# one detected entity ("food"). Results go to num_reviews[location][restaurant].
#
# NOTE(review): counts are keyed by month number only, so the same month from two
# different years would be merged - fine only while the data spans under a year.
for file_name in glob("reviews with detected entities/*/*.jsonl"):
    data = srsly.read_jsonl(file_name)

    # os.path works with both "/" and "\\"; the previous split("\\") raised
    # IndexError on any platform where glob() returns "/"-separated paths.
    location_dir, jsonl_name = os.path.split(file_name)
    location = os.path.basename(location_dir)
    # splitext removes only the extension, so names containing "." stay intact.
    business_name = os.path.splitext(jsonl_name)[0]

    monthwise_num_reviews = defaultdict(int)
    monthwise_num_reviews_food = defaultdict(int)

    for d in data:
        date = datetime.strptime(d.get("date"), "%Y-%m-%d")
        if date < pre_covid_start_date:
            continue

        month = date.month
        monthwise_num_reviews[month] += 1

        # A missing or empty entity list means no food item was detected.
        if d.get("entities"):
            monthwise_num_reviews_food[month] += 1

    num_reviews[location][business_name] = {"total" : dict(monthwise_num_reviews),
                                            "food" : dict(monthwise_num_reviews_food)}

In [3]:
import pandas as pd

In [92]:
# Aggregate the per-restaurant monthly counts into one "total" row and one "food"
# row per location, with one column per month in the order Dec -> May.
report_months = [12, 1, 2, 3, 4, 5]
data_df = {"location": [], "type": [], **{month: [] for month in report_months}}

for loc, v in num_reviews.items():

    total = defaultdict(int)
    food = defaultdict(int)
    for i in v.values():
        for month, count in i.get("total", {}).items():
            total[month] += count
        for month, count in i.get("food", {}).items():
            food[month] += count

    for row_type, counts in (("total", total), ("food", food)):
        data_df["location"].append(loc)
        data_df["type"].append(row_type)
        # Walk the fixed month list instead of the dict: every column then gets
        # exactly one value per row (0 when a month has no reviews) and months
        # outside the report window (e.g. June) are ignored rather than raising.
        for month in report_months:
            data_df[month].append(counts.get(month, 0))

reviews_count_df = pd.DataFrame(data_df).set_index(["location", "type"])

In [93]:
# Show the per-location monthly counts: all reviews ("total") and reviews with at
# least one detected entity ("food").
reviews_count_df

Unnamed: 0_level_0,Unnamed: 1_level_0,12,1,2,3,4,5
location,type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"Chicago, IL",total,5485,5280,5068,3195,1482,1653
"Chicago, IL",food,4739,4564,4357,2773,1283,1390
"Phoenix, AZ",total,5226,5460,5994,3696,1549,1545
"Phoenix, AZ",food,4314,4530,4861,3022,1284,1224


In [10]:
# Same aggregation as the previous table, but a restaurant's counts are only
# included when it has a non-zero count in each of March, April and May.
report_months = [12, 1, 2, 3, 4, 5]
required_months = (3, 4, 5)
data_df = {"location": [], "type": [], **{month: [] for month in report_months}}

for loc, v in num_reviews.items():

    total = defaultdict(int)
    food = defaultdict(int)
    accumulators = {"total": total, "food": food}

    for i in v.values():
        for f_t, dict_m in i.items():
            target = accumulators.get(f_t)
            if target is None:
                continue
            # NOTE(review): the filter is evaluated on each dict separately, so a
            # restaurant can contribute to "total" but not to "food" - confirm
            # that this asymmetry is intended.
            if not all(dict_m.get(month) for month in required_months):
                continue
            for month, count in dict_m.items():
                target[month] += count

    for row_type, counts in (("total", total), ("food", food)):
        data_df["location"].append(loc)
        data_df["type"].append(row_type)
        # Fixed month order keeps every column the same length and aligned with
        # its row; absent months count as 0 and months outside the window are
        # ignored instead of raising KeyError.
        for month in report_months:
            data_df[month].append(counts.get(month, 0))

reviews_count_df = pd.DataFrame(data_df).set_index(["location", "type"])

In [106]:
# Show the counts restricted to restaurants with reviews in each of months 3, 4 and 5.
reviews_count_df

Unnamed: 0_level_0,Unnamed: 1_level_0,12,1,2,3,4,5
location,type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"Chicago, IL",total,3159,3067,2862,1961,1175,1257
"Chicago, IL",food,2605,2497,2356,1622,990,1026
"Phoenix, AZ",total,2950,3089,3325,2194,1201,1177
"Phoenix, AZ",food,2228,2389,2465,1685,917,884


In [35]:
from urllib.parse import unquote

In [64]:
# Load restaurant metadata and normalise the two columns used for the cuisine join.
rest_info = pd.read_csv("../databases/restaurants_info.csv")

# Series.str has no .apply (the old `.str.apply` raised AttributeError); apply the
# function to the Series itself. The URL is percent-decoded and its first five
# characters are dropped - presumably a "/biz/" prefix; verify against the CSV.
rest_info["url"] = rest_info["url"].apply(lambda x: unquote(x)[5:])

# "Pizza, Italian" -> ["pizza", "italian"]
rest_info["categories"] = rest_info["categories"].str.lower().str.split(", ")

In [57]:
# For each location, add every restaurant's monthly "total" review count to each
# of its categories, then keep the ten most-reviewed categories per month.
# Output columns: <month> = category names, "<month>_" = their review counts.
data_df = {}
cuisine_columns = [12, "12_", 1, "1_", 2, "2_", 3, "3_", 4, "4_", 5, "5_"]

for loc, v in num_reviews.items():
    print(loc)
    total = defaultdict(lambda: defaultdict(int))

    for rest, i in v.items():
        matches = rest_info.loc[rest_info.url == rest, "categories"]
        cuisines_list = matches.iloc[0] if len(matches) else None

        # Explicit check instead of a bare `except:`: report restaurants with no
        # metadata row or no category list, and let real errors surface.
        if not isinstance(cuisines_list, list):
            print(rest)
            continue

        for month, count in i.get("total", {}).items():
            for cuisine in cuisines_list:
                total[month][cuisine] += count

    total = {month: dict(sorted(counts.items(), key=lambda k_v: k_v[1], reverse=True)[:10])
             for month, counts in total.items()}
    data_df[loc] = pd.DataFrame({**{month: list(top.keys()) for month, top in total.items()},
                                 **{str(month) + "_": list(top.values()) for month, top in total.items()}})[cuisine_columns]

Chicago, IL
dave-and-busters-chicago-4
Phoenix, AZ


In [58]:
# Write one cuisine table per location to its own CSV file.
for city, csv_name in (("Chicago, IL", "cuisines_chicago.csv"),
                       ("Phoenix, AZ", "cuisines_phoenix.csv")):
    data_df[city].to_csv(csv_name)

## getting ingredients

In [14]:
import pickle
# NOTE(review): pickle.load can execute arbitrary code while deserialising - only
# load this file when it comes from a trusted source.
with open("../NER/restaurants_menus_df.pkl", "rb") as f:
    menus = pickle.load(f)

In [15]:
# Peek at the `menu` attribute of the loaded object.
menus.menu

0       {'food_items': {'Pizza': [{'name': '16" new yo...
1       {'food_items': {'Breads': [{'name': 'grinder r...
2       {'food_items': {'Salads': [{'desc': 'Chopped R...
3       {'food_items': {'Wine': [{'name': 'sparkling c...
4       {'food_items': {'Add-Ons': [{'desc': 'Apple-Gr...
                              ...                        
1066    {'food_items': {'Paninis': [{'desc': 'Ham, bac...
1067    {'food_items': {'Appetizers': [{'desc': 'Marin...
1068    {'food_items': {'-': [{'desc': "Chairman's Res...
1069    {'food_items': {'Sides': [{'desc': 'Choice of ...
1070    {'food_items': {'Salad': [{'name': 'house sala...
Name: menu, Length: 1071, dtype: object

In [5]:
import re

In [17]:
def remove_any_brackets(item):
    """Return ``item`` without any text enclosed in ``[...]`` or ``(...)``.

    Square and round brackets are tracked with separate nesting counters. A
    character is kept only while both counters are zero. A closing bracket with
    no matching opener is treated as an ordinary character, and an opener that
    is never closed suppresses everything after it.
    """
    open_count = {"[": 0, "(": 0}
    opener_for = {"]": "[", ")": "("}
    kept = []

    for ch in item:
        if ch in open_count:
            open_count[ch] += 1
        elif ch in opener_for and open_count[opener_for[ch]] > 0:
            open_count[opener_for[ch]] -= 1
        elif not any(open_count.values()):
            kept.append(ch)

    return "".join(kept)

def preprocess_menu_items(item):
    """Clean one menu-description fragment and split it into candidate phrases.

    Bracketed text is removed first. A fragment whose stripped length is below 3
    or above 70 characters is rejected. Otherwise the text is transliterated to
    ASCII, ``-`` becomes a space, the characters ``*``, ``"``, ``$`` and ``#``,
    quantity expressions (a number followed by a unit such as ``oz`` or
    ``pieces``) and any token containing a digit are blanked out, full stops are
    dropped, whitespace is squeezed, and the result is split on ``and`` / ``with``.

    Args:
        item: Raw text fragment.

    Returns:
        A list of phrases, or the empty string ``""`` when the fragment is
        rejected (before or after cleaning). The empty string is kept for
        backward compatibility; iterating over it yields nothing.
    """
    # remove content of brackets and reject fragments that are too short or too long
    item = remove_any_brackets(item).strip()
    if not 3 <= len(item) <= 70:
        return ""

    # The appended space lets patterns that end in \s+ match at the end of the text.
    item = unidecode.unidecode(item + " ").replace(".", ". ").replace("-", " ")

    # Raw strings: the earlier f-strings had no placeholders and relied on
    # invalid escape sequences such as "\d" and "\s".
    to_remove = [
        r'\*|"|\$|#',  # remove * and " and $ and #
        r'\d+\s*(lb|pounds|pound|oz|ounces|ounce|inches|inch'
        r'|grams|gram|pcs|pieces|piece|each|cup'
        r'|bowl|scoops|scoop|pot|liters|liter'
        r'|or less|off)\s*((of)*)\.*\s+',  # remove "<number> <unit> [of]"
        r'\s*\S*[0-9]\S*',  # remove any word with digits in it
    ]
    for pattern in to_remove:
        item = re.sub(pattern, ' ', item)

    item = ' '.join(item.replace(".", "").split())
    if len(item) < 3:
        return ""

    return re.split(r'\s+and\s+|\s*with\s+', item)

In [286]:
# Build the set of ingredient phrases: split every menu item description on
# separators ("and", "with", ",", "//", "|", "/", "+"), clean each fragment with
# preprocess_menu_items and keep the non-empty phrases.
ingred = set()
for menu in menus.menu:
    for section_items in menu["food_items"].values():
        fragments = []
        for entry in section_items:
            text = " ".join(entry.get("desc", "").split()).lower()
            text = text.replace("&", " and ").replace("w/", " with ")
            fragments += re.split(r'\s+and\s+|\s*with\s+|,|\/\/|\||\/|\+', text)

        for fragment in fragments:
            ingred.update(phrase for phrase in preprocess_menu_items(fragment) if phrase != "")

In [287]:
# Sort before writing: the collection is built as a set of strings, whose
# iteration order changes between kernel sessions, so an unsorted dump produced
# a differently ordered CSV on every run.
pd.DataFrame(sorted(ingred), columns=["ingredient"]).to_csv("ingredients.csv", index=False, columns=["ingredient"])

## food items trend

In [33]:
from rapidfuzz import fuzz
from rapidfuzz import process

In [323]:
# Convert the ingredient collection (built as a set earlier in the notebook) to a list.
ingred = list(ingred)

In [340]:
# Keep only phrases of at most 30 characters as candidates for the fuzzy match below.
ingred_list = [i for i in ingred if len(i) <= 30]

In [351]:
# Manually add "the burger" to the match candidates. The membership guard keeps
# the cell idempotent: re-running it no longer appends duplicate entries.
if "the burger" not in ingred_list:
    ingred_list.append("the burger")

In [357]:
# Number of candidate ingredient phrases.
len(ingred_list)

41169

In [356]:
# Re-write the annotated reviews dated on/after 2019-12-01, dropping every detected
# entity that fuzzy-matches (score >= 95) a known ingredient phrase. Output goes to
# "reviews with ingreds removed/<location>/<restaurant>.jsonl".
filtered_root = "reviews with ingreds removed"
first_review_date = datetime(year = 2019, month=12, day=1)

# entity text -> True when it matches an ingredient. The same entity strings
# recur across reviews, so each one is matched against ingred_list only once.
is_ingredient = {}

for file_name in glob("reviews with detected entities/*/*.jsonl"):
    data = srsly.read_jsonl(file_name)
    data_ann = []
    for d in data:
        date = datetime.strptime(d.get("date"), "%Y-%m-%d")
        if date < first_review_date:
            continue

        entities = []
        for ent in d.get("entities"):
            if ent not in is_ingredient:
                # extractOne returns None when nothing reaches the score cutoff
                is_ingredient[ent] = process.extractOne(ent, ingred_list, score_cutoff=95.0) is not None
            if not is_ingredient[ent]:
                entities.append(ent)

        r_data = {k: v for k, v in d.items() if k != "entities"}
        r_data["entities"] = entities

        data_ann.append(r_data)

    # os.path handles "/" and "\\" alike; splitting on "\\" only worked on Windows.
    location_dir, jsonl_name = os.path.split(file_name)
    file_dir = os.path.join(filtered_root, os.path.basename(location_dir))

    # Creates the root and the location folder in one call; no-op if present.
    os.makedirs(file_dir, exist_ok=True)

    srsly.write_jsonl(os.path.join(file_dir, jsonl_name), data_ann)

In [29]:
# Rank the detected items by number of mentions per month for each location.
# Output columns: <month> = item names, "<month>_" = mention counts; shorter
# months are padded with "" so that all columns have the same length.
top_food_items_df = {}

# Detected phrases that are left out of the ranking.
ignored_items = ["meal", "happy hour", "a special", "date night", "authentic",
                 "the street", "deep dish", "the chicago"]
ordered_columns = [12, "12_", 1, "1_", 2, "2_", 3, "3_", 4, "4_", 5, "5_"]

for loc in ["Chicago, IL", "Phoenix, AZ"]:
    mention_counts = defaultdict(lambda: defaultdict(int))

    for file_name in glob(f"reviews with ingreds removed/{loc}/*.jsonl"):
        for d in srsly.read_jsonl(file_name):
            month = datetime.strptime(d.get("date"), "%Y-%m-%d").month
            for item in d.get("entities"):
                if item in ignored_items:
                    continue
                mention_counts[month][item] += 1

    # Most-mentioned first; ties keep their first-seen order (stable sort).
    top_food_items_dict = {
        month: dict(sorted(counts.items(), key=lambda pair: pair[1], reverse=True))
        for month, counts in mention_counts.items()
    }
    max_rows = max(len(ranked) for ranked in top_food_items_dict.values())

    names = {}
    values = {}
    for month, ranked in top_food_items_dict.items():
        padding = [""] * (max_rows - len(ranked))
        names[month] = list(ranked.keys()) + padding
        values[str(month) + "_"] = list(ranked.values()) + padding

    top_food_items_df[loc] = pd.DataFrame({**names, **values})[ordered_columns]
                   

In [33]:
# Write one ranked food-item table per location to its own CSV file.
for city, csv_name in (("Chicago, IL", "top_food_chicago_all.csv"),
                       ("Phoenix, AZ", "top_food_phoenix_all.csv")):
    top_food_items_df[city].to_csv(csv_name)

## Ratings

In [30]:
# Count reviews per (month, rating) for each location and lay them out as a table
# with one row per rating level (highest first) and one column per month.
ratings_df = {}

for loc in ["Chicago, IL", "Phoenix, AZ"]:
    ratings_dict = defaultdict(lambda: defaultdict(int))

    for file_name in glob(f"reviews with ingreds removed/{loc}/*.jsonl"):
        data = srsly.read_jsonl(file_name)

        for d in data:
            month = datetime.strptime(d.get("date"), "%Y-%m-%d").month
            ratings_dict[month][d.get("rating")] += 1

    # Every rating value seen in any month, highest first. With ratings 1-5 all
    # present this is [5, 4, 3, 2, 1], as before.
    rating_levels = sorted({rating for counts in ratings_dict.values() for rating in counts},
                           reverse=True)

    # Look each level up explicitly (0 when absent). The previous positional
    # list(v.values()) broke or shifted the rows whenever a month had no review
    # for one of the rating levels.
    ratings_df[loc] = pd.DataFrame({"Rating": rating_levels,
                                    **{month: [counts.get(rating, 0) for rating in rating_levels]
                                       for month, counts in ratings_dict.items()}})[["Rating",12,1,2,3,4,5]]

In [32]:
# Write one ratings table per location to its own CSV file (without the index).
for city, csv_name in (("Chicago, IL", "Ratings_chicago.csv"),
                       ("Phoenix, AZ", "Ratings_phoenix.csv")):
    ratings_df[city].to_csv(csv_name, index=False)

## Deliveries and Takeouts

In [63]:
# Per location and month: number of reviews in total and number of reviews that
# contain at least one word fuzzy-matching (score >= 95) a delivery/takeout tag.
del_takeout_df = {}
# NOTE(review): reviews are compared word by word (str.split), so the two-word tag
# "take home" is only ever scored against single tokens - confirm whether phrase
# matching was intended.
del_takeout_tag = ["delivered", "delivery", "deliveries", "deliver",
                   "takeout", "takeouts", "take home"]

for loc in ["Chicago, IL", "Phoenix, AZ"]:
    del_takeout_dict = defaultdict(lambda: defaultdict(int))

    for file_name in glob(f"reviews with ingreds removed/{loc}/*.jsonl"):
        data = srsly.read_jsonl(file_name)

        for d in data:
            month = datetime.strptime(d.get("date"), "%Y-%m-%d").month
            del_takeout_dict[month]["Total"] += 1

            # Generator form stops at the first matching word instead of scoring
            # every word of the review; extractOne returns None below the cutoff.
            if any(process.extractOne(word, del_takeout_tag, score_cutoff=95.0) is not None
                   for word in d.get("review").split()):
                del_takeout_dict[month]["del_takeout"] += 1

    # Read both counters by key (0 when absent). The previous sorted/positional
    # values() list had a single element for a month without any delivery/takeout
    # review, which no longer lined up with the two row labels.
    del_takeout_df[loc] = pd.DataFrame({"": ["del/takeout", "Total"],
                                        **{month: [counts.get("del_takeout", 0), counts.get("Total", 0)]
                                           for month, counts in del_takeout_dict.items()}})[["",12,1,2,3,4,5]]

In [65]:
# Write one delivery/takeout table per location to its own CSV file (without the index).
for city, csv_name in (("Chicago, IL", "del_takeouts_chicago.csv"),
                       ("Phoenix, AZ", "del_takeouts_phoenix.csv")):
    del_takeout_df[city].to_csv(csv_name, index=False)