In [1]:
import pandas as pd
import numpy as np

In [2]:
full_combo = pd.read_csv("full_combo.csv")
full_combo["ZipCode"].astype(int)
list_columns = ["Popular Foods", "Items", "Cuisine"]
for column in list_columns:
    full_combo[column] = full_combo[column].apply(lambda s: s.split(", ") if isinstance(s, str) else s)
full_combo

Unnamed: 0,Name,ZipCode,Rating,Price Category,Review Comment,Popular Foods,Items,Inspection Date,Inspection Grade,Inspection Critical,Cuisine,Address,grade and rating,items with no cuisine,cuisine with no items
0,218,10013,4.2,1.0,,,,,,,,"218 Grand St, New York, 10013",False,False,False
1,251 Ginza Sushi,10016,,,,,"[soda, spring water and soda, snapple ice tea,...",,,,"[japanese, sushi]","251 E 35th St, New York, 10016",False,False,False
2,3 Guys Restaurant,10021,,,“Quick lunch and good service”,,"[cold cereal, cold cereal with banana, hot oat...",,,,"[cafe, japanese, american, diner]","960 Madison Ave, New York, 10021",False,False,False
3,4 Caminos Mexican Restaurant,11385,4.1,2.0,,,,,,,,"67-22 Fresh Pond Rd, Queens, 11385",False,False,False
4,5 Brothers Gourmet Deli,10036,,,,,"[fresh homemade beef brisket sandwich, fresh t...",,,,"[sandwiches, deli food, american, burgers]","689 10th Ave, New York, 10036",False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
799,ilili Restaurant,10001,4.5,3.0,,,"[lunch prix fixe special, falafel wrap, grille...",,,,"[middle eastern, mediterranean, tapas]","236 5th Ave, New York, 10001",False,False,False
800,taïm mediterranean kitchen,10038,4.3,1.0,,,,,,,,"75 Maiden Ln, New York, 10038",False,False,False
801,არგო,11223,4.6,2.0,,,,,,,,"1985 Coney Island Ave, Brooklyn, 11223",False,False,False
802,უბანი,11209,4.7,2.0,,,,,,,,"Georgian Cuisine, 8309 3rd Ave, 11209",False,False,False


#### extracting cuisine (basic) 
from restaurant name and review comment

In [3]:
cuisines = []
for line in full_combo["Cuisine"]:
    if isinstance(line, list):
        for cuisine in line:
            if cuisine not in cuisines:
                cuisines.append(cuisine.lower())
cuisines 

['japanese',
 'sushi',
 'cafe',
 'american',
 'diner',
 'sandwiches',
 'deli food',
 'burgers',
 'smoothies juices',
 'bakery pastries',
 'contemporary',
 'chinese',
 'asian',
 'latin american',
 'puerto rican',
 'pizza',
 'fast food',
 'coffee',
 'tea',
 'caf',
 'latin',
 'indian',
 'noodles',
 'spanish',
 'mediterranean',
 'tapas',
 'teahouses',
 'soups',
 'localorganic',
 'french',
 'american new',
 'lunch specials',
 'healthy',
 'thai',
 'seafood',
 'salads',
 'greek',
 'soul food',
 'southwestern',
 'mexican',
 'italian',
 'wine bar',
 'wings',
 'chicken',
 'pasta',
 'other',
 'ethiopian',
 'dominican',
 'steak',
 'bagels',
 'pretzels',
 'kosher',
 'vegetarian',
 'middle eastern',
 'lebanese',
 'moroccan',
 'cocktails',
 'bar',
 'hot dogs',
 'bistro',
 'european',
 'irish',
 'bar food',
 'steakhouse',
 'barbecue',
 'korean',
 'turkish',
 'alcohol',
 'brazilian',
 'south american',
 'pub',
 'gastropub',
 'argentinean',
 'nepali',
 'tibetan',
 'coffee tea',
 'vegan',
 'cajun creole'

In [4]:
def find_cuisines(name):
    cuisines_in_name = []
    if isinstance(name, str):
        for word in name.lower().split():
            if word in cuisines:
                cuisines_in_name.append(word.title())
    return cuisines_in_name

In [5]:
full_combo["Cuisine"] = full_combo.apply(
    lambda row: row["Cuisine"] if isinstance(row["Cuisine"], list) else find_cuisines(row["Name"]), axis=1)
full_combo["Cuisine"] = full_combo.apply(
    lambda row: row["Cuisine"] if isinstance(row["Cuisine"], list) else find_cuisines(row["Review Comment"]), axis=1)

# note: it can be assumed that the name of a restaurant will contain the cuisine name only if it is relevant - i.e., one would expect "John's Chinese Place" (serving Chinese cuisine) but not "John's not Korean Place" (serving Chinese cuisine). In contrast, such an assumption cannot be made about a review comment - the reviewer may have been under a false impression of the cuisine that should be expected, and this could be mentioned in their review (e.g. "If this place claimed to be Elvish, these sour sauces would make sense!") - resulting in some inaccuracies. Since this dataset is a little small and isn't 100% full of the data desired, and as this is, after all, just an exercise with what is essentially dummy data, this use was left in. In production, in real-world cases, a more complex approach would be required to ensure the extraction is accurate.  

## finding the main cuisine
As the lists are an accumulation of several sources: restaurant name, Trip Advisor, Menu Data, and Inspection Data, they can be somewhat confusing to make a specific recommendation. 

For example, "3 Guys Restaurant" is described as [American, Cafe, Japanese, Diner] - while the menu items are similar to: cereal, spinach omelette, belgian waffle nutella. One of the cuisine descriptors is not appropriate - namely, "Japanese".  

This will be done in several steps:
1. clean and transform the menu item data using TF-IDF 
2. clean the cuisine categories ensuring each menu has a specific designation of cuisine 
3. train models 
4. select best model to be used for the final dataset 

#### Creating Cuisine Categories
since the cuisines are lists, and the task here is a categorization task, we need each category to be somewhat unique. In order to do so, the lists should be minimized. 



The minimization will be following the assumption that the rarest cuisine is the most relevant one to the description - for example, a Mexican-Korean fusion restaurant will show up as ["Mexican", "Korean"] will be considered "Korean" if count("Mexican") > count("Korean"). Many examples from the dataset include combinations of cuisnies where this designation will be more accurate - such as ["American", "Burgers"], ["Burgers, "Sandwiches"], ["Deli Foods", "Eastern European"] - where either designation would be alright. 

In [6]:
full_combo["Cuisine"]

0                                              []
1                               [japanese, sushi]
2               [cafe, japanese, american, diner]
3                                       [Mexican]
4      [sandwiches, deli food, american, burgers]
                          ...                    
799        [middle eastern, mediterranean, tapas]
800                               [Mediterranean]
801                                            []
802                                            []
803                                            []
Name: Cuisine, Length: 804, dtype: object

In [16]:
## in order to clean up the cuisine names, several actions are taken here: :
# cleaning "&", removing the stopword "food", remove meaningless cuisines, merge all the cafe categories
# as well as differentiating between american cuisine and south-/latin-american cuisine by treating "american" as a stopword in those cases only. 
menu_freq = {}
def clean_cuisine(s):
    clean = s.lower().replace("&", "").strip() #replace(" food", "").
    # if "american" in clean:
    #     if clean != "american":  
    #         clean= " ".join([i for i in clean.split("american") if len(i) > 0]).strip()
    if "latin" in clean:
        clean = "latin american"
    if "american new" in clean:
        clean = "american"
    not_informaitve = ["other", "lunch specials", "mixed buffet"]
    for i in not_informaitve:
        if i == clean:
            clean = ""
    cafe_indicators = ["coffee", "tea", "cafe", "caf"]
    for i in cafe_indicators:
        if i in clean:
            clean = "cafe"
    return clean

In [19]:
full_combo["Cuisine"] = full_combo["Cuisine"].apply(lambda l: [clean_cuisine(item) for item in l if clean_cuisine(item) != ""] if isinstance(l, list) else l)

In [20]:
for cuisine_list in full_combo["Cuisine"]: 
    if isinstance(cuisine_list, list):
        for cuisine in cuisine_list:
            if cuisine in menu_freq:
                menu_freq[cuisine] += 1
            else:
                menu_freq[cuisine] = 1
cuisine_freq = pd.DataFrame.from_dict(menu_freq, orient='index')
cuisine_freq.reset_index(inplace=True)
cuisine_freq.columns = ["Cuisine", "Frequency"]
cuisine_freq.sort_values(["Frequency"], ascending = False, inplace=True)
cuisine_freq.reset_index(drop=True, inplace=True)
cuisine_freq

Unnamed: 0,Cuisine,Frequency
0,american,336
1,italian,238
2,pizza,194
3,cafe,168
4,sandwiches,142
...,...,...
102,pitas,2
103,mixed buffet,2
104,english,2
105,malaysia,2


In [21]:
full_combo["Cuisine Type Count"] = full_combo["Cuisine"].apply(lambda l: len(l) if isinstance(l, list) else 0)
full_combo["Cuisine"] = full_combo.apply(lambda row: None if row["Cuisine Type Count"] == 0 else row["Cuisine"], axis=1 )

In [22]:
full_combo["Cuisine"][full_combo["Cuisine Type Count"] == 1].value_counts()

Cuisine
[italian]             26
[diner]               17
[mexican]             16
[american]            16
[chinese]             15
[caribbean]           14
[latin american]      10
[indian]              10
[pizza]               10
[bar]                  9
[cafe]                 7
[thai]                 7
[dominican]            5
[spanish]              5
[french]               4
[burgers]              4
[asian]                3
[kosher]               2
[turkish]              2
[middle eastern]       2
[korean]               2
[eastern european]     2
[bangladeshi]          2
[greek]                2
[irish]                1
[hot dogs]             1
[jamaican]             1
[seafood]              1
[smoothies juices]     1
[peruvian]             1
[pub]                  1
[pasta]                1
[deli]                 1
[sandwiches]           1
[soul food]            1
[african]              1
[mediterranean]        1
Name: count, dtype: int64

In [12]:
## these categories are noisy; some items will have next to no examples. there needs to be some fine-tuning of the categories. since there are a few hundred cuisine lists with a total of 82 individual cuisines, it seems that running idxmax on a tf-idf matrix should be the best method to determine relevance; tf-idf is sensitive to both raw item frequency and to item specifcity, so it should produce better results than the raw frequency solution.
## It should be noted that in a real-life business scenario, a combination of a computational solution and some manual review should be appropriate - making up for potential "blind spots" in the data (such as combining "Russian", "Polish" and "Eastern European" into a single category). This would be appropriate in a case when the accuracy of the categories affects the business goals - which is (fortunately) not the case for this exercise.
## fitting tf-idf with unigrams and bigrams was tested, but bigrams were deemed unhelpful for this task 
menu_rest_data["Cuisine - String"] = menu_rest_data["Cuisine"].apply(lambda x: " ".join([clean_cuisine(i) for i in x]) if isinstance(x, list) else x)
vectorizer = TfidfVectorizer(ngram_range=(1, 1))
cuisine_tfidf_matrix = vectorizer.fit_transform(menu_rest_data["Cuisine - String"])
cuisine_tfidf = pd.DataFrame(cuisine_tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

NameError: name 'menu_rest_data' is not defined

In [None]:
cuisine_totals = cuisine_tfidf.sum(axis=0).sort_values(ascending=False)
top_15_freq = cuisine_freq.head(15)
top_15_tfidf = cuisine_totals.head(15)

plt.figure(figsize=(12, 6))
plt.suptitle("Top 15 Restaurants - ", fontsize=14, fontweight='bold')
plt.subplot(1, 2, 1)
sns.barplot(x=top_15_freq["Frequency"],  y=top_15_freq["Cuisine"], hue=top_15_freq["Cuisine"], palette='deep', legend=False)
plt.ylabel('Cuisines')
plt.title('Top 15 Cuisines - Frequency')
plt.subplot(1, 2, 2)
sns.barplot(x=top_15_tfidf.values,  y=top_15_tfidf.index, hue=top_15_tfidf.index,  legend=False, palette='deep')
plt.ylabel('Cuisines')
plt.title('Top 15 Cuisines - TF-IDF')
plt.show()
## the difference in ranking shows that TF-IDF performs better than just frequency: "sandwiches" and "american" are less prominent, while "chinese" is ranked more highly. In simple, actionable terms - the list on the right would be a preferable priority of choosing cuisines when compared to the list on the left. For example, ["diner", "sandwiches", "burgers"] will be consolidated into "burgers" if the lowest frequency is used alone; it will be consolidated as "diner" if highest TF-IDF score is used. Since there is no way to definitively say that the cuisine name chosen is the most accurate one for the specific restaurant without a human being reviewing the menu, this will do. 

In [None]:
menu_rest_data["Cuisine - TF-IDF"] = cuisine_tfidf.idxmax(axis=1)

In [None]:
def find_cuisine_from_cuisine_list(cuisine_list):
    if isinstance(cuisine_list, list):
        clean_list = [clean_cuisine(cuisine) for cuisine in cuisine_list]
        best_match = ""
        for cuisine in cuisine_freq["Cuisine"]:
            if cuisine in clean_list:
                best_match = cuisine
                break
        return best_match
menu_rest_data["Cuisine - Least Frequent"] = menu_rest_data["Cuisine"].apply(find_cuisine_from_cuisine_list)

In [None]:
menu_rest_data[["Cuisine", "Cuisine - Least Frequent", "Cuisine - TF-IDF"]]