In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Part 1: Showing the Dataset

In [3]:
df = pd.read_csv('zomato_res.csv')
df

Unnamed: 0,Additional_outlet_count,Call,Cost_for_two,Cuisines,Delivery_Time_min_order,Features,Home_Delivery,Operational_hours,Rating_votes,Restaurant_Location,Restaurant_Name,Restaurant_Type,View_Menu
0,1 more outlet in Mumbai,True,"₹1,500","Finger Food, Continental, European, Italian",,"Food Hygiene Rated Restaurants In Mumbai, Best...",False,12noon – 1am (Mon-Sun),4.9 132...,Kamala Mills Compound,Lord of the Drinks,"Lounge,Casual Dining",True
1,1 more outlet in Mumbai,True,₹800,Pizza,,"Value For Money, Best of Mumbai",False,11am – 12:30AM (Mon-Sun),4.6 597...,Malad West,Joey's Pizza,Quick Bites,True
2,,True,"₹2,500",Seafood,,"Super Seafood, Best of Mumbai",False,"Closed (Mon),12noon – 3pm, 7pm – 12midnight...",4.5 143...,"Linking Road, Bandra West",Bastian,"Casual Dining,Bar",True
3,,True,"₹1,800","Finger Food, Continental",,"Where's The Party?, Best of Mumbai, Food Hygie...",False,12noon – 1am (Mon-Sun),4.9 327...,Lower Parel,Tamasha,"Lounge,Bar",True
4,2 more outlets in Mumbai,True,₹450,"North Indian, Street Food, Fast Food, Chinese",45 min ...,,True,"12noon – 4pm, 7pm – 11:45pm (Mon-Sun)",4.1 142...,Vashi,Bhagat Tarachand,Casual Dining,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7280,,True,₹600,"South Indian, North Indian, Chinese, Fast Food...",45 min ...,,True,8am – 12midnight (Mon-Sun),3.5 115...,Chembur,Hotel Shreedevi,Casual Dining,True
7281,,True,₹550,"Fast Food, North Indian",20 min ...,,True,"9am – 9pm (Mon, Wed, Thu, Fri, Sat, Sun), Clos...",3.5 121...,"Pali Hill, Bandra West",Mac Craig,Quick Bites,True
7282,,True,₹350,"Fast Food, Beverages, Rolls",,,False,8am – 1:30am (Mon-Sun),3.5 213...,Juhu,Ice n Rolls,Quick Bites,True
7283,,True,"₹2,000","Seafood, Chinese",40 min ...,,True,12noon – 12:30AM (Mon-Sun),2.6 195...,"Hotel King's International, Juhu",Temple Flower - Hotel Kings International,Casual Dining,True


In [4]:
['Restaurant_Name'].get_feature_names()

AttributeError: 'list' object has no attribute 'get_feature_names'

# Part 2: Data Cleaning

In [39]:
# Checking the Null Value
df.isnull().sum()

Additional_outlet_count    6285
Call                          0
Cost_for_two                  0
Cuisines                      1
Delivery_Time_min_order    4771
Features                   5695
Home_Delivery                 0
Operational_hours            12
Rating_votes                  0
Restaurant_Location          19
Restaurant_Name               0
Restaurant_Type             656
View_Menu                     0
dtype: int64

In [3]:
# Removing the Null Value of Cuisines and Features --> Because these two will be our main indicator
df = df.dropna(subset=['Cuisines', 'Features', 'Restaurant_Type'])
df.isnull().sum()

Additional_outlet_count    1076
Call                          0
Cost_for_two                  0
Cuisines                      0
Delivery_Time_min_order     549
Features                      0
Home_Delivery                 0
Operational_hours             0
Rating_votes                  0
Restaurant_Location           0
Restaurant_Name               0
Restaurant_Type               0
View_Menu                     0
dtype: int64

# Part 3: Convert Data Into Vector

In [4]:
# # Checking on how much distinct Cuisines
ext = CountVectorizer(tokenizer= lambda x: x.split(','))
zcuisines = ext.fit_transform(df['Cuisines'].head(1316))
len(ext.get_feature_names())

168

In [124]:
# Inserting the vector into an array and check the shape
print(zcuisines.toarray())
print(zcuisines.toarray().shape)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]]
(1316, 168)


In [None]:
# Reset index in order for the restaurant that have the original index between 1317 and onwards
#to be included in the Cosine Similarity

#Formula:
# df = df.reset_index()
# del df['index']

# Part 4: Cosine Similarity

Cosine Similarity Formula = $ \displaystyle \frac {\sum (_{n}A x _{n}B)} {\sqrt {\sum (_{n}A)^2} x \sqrt {\sum (_{n}B)^2}} $

In [131]:
# Measure the similarity between Cuisines using Cosine similarity
cos_score = cosine_similarity(zcuisines)
cos_score[0]

array([1., 0., 0., ..., 0., 0., 0.])

# Part 5: Recommendation

In [132]:
# Checking the Index of the favorite restaurant input by the user, assuming it is 'Waters 24 - Vintage Cafe' in this case
resto_input = "Waters 24 - Vintage Cafe"
resto_index = df[df['Restaurant_Name'] == resto_input].index[0]
resto_index

1315

In [134]:
# Check if the index of favorite restaurant has 100% similarity or not
a = list(enumerate(cos_score[resto_index]))
a[resto_index]

(1315, 1.0000000000000002)

In [135]:
# Ranking List
resto_sug = sorted(a, key=lambda x:x[1], reverse=True)
resto_sug.remove(a[resto_index])
resto_sug[:10]

[(141, 1.0000000000000002),
 (12, 0.8660254037844388),
 (146, 0.8660254037844388),
 (431, 0.816496580927726),
 (461, 0.816496580927726),
 (498, 0.816496580927726),
 (520, 0.816496580927726),
 (528, 0.816496580927726),
 (583, 0.816496580927726),
 (602, 0.816496580927726)]

In [129]:
# Restaurant Recommendation for the user (in this case'Waters 24 - Vintage Cafe')
for i in resto_sug[:10]:
    print(df.iloc[i[0]]['Restaurant_Name'])

Bayview Cafe
BKC | DIVE.
The DanSing Bottle
Hotel Sai Palace Garden
Chawlas²
The Bunker- Family Dining & Bar
Cinnamon N Nutmeg
Cinnamon N Nutmeg
Kake Di Rasoi
Red Sun Multicuisine Family Restaurant


NameError: name 'ext' is not defined