In [34]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Part 1: Showing the Dataset

In [36]:
df = pd.read_csv('zomato_res.csv')
df

Unnamed: 0,Additional_outlet_count,Call,Cost_for_two,Cuisines,Delivery_Time_min_order,Features,Home_Delivery,Operational_hours,Rating_votes,Restaurant_Location,Restaurant_Name,Restaurant_Type,View_Menu
0,1 more outlet in Mumbai,True,"₹1,500","Finger Food, Continental, European, Italian",,"Food Hygiene Rated Restaurants In Mumbai, Best...",False,12noon – 1am (Mon-Sun),4.9 132...,Kamala Mills Compound,Lord of the Drinks,"Lounge,Casual Dining",True
1,1 more outlet in Mumbai,True,₹800,Pizza,,"Value For Money, Best of Mumbai",False,11am – 12:30AM (Mon-Sun),4.6 597...,Malad West,Joey's Pizza,Quick Bites,True
2,,True,"₹2,500",Seafood,,"Super Seafood, Best of Mumbai",False,"Closed (Mon),12noon – 3pm, 7pm – 12midnight...",4.5 143...,"Linking Road, Bandra West",Bastian,"Casual Dining,Bar",True
3,,True,"₹1,800","Finger Food, Continental",,"Where's The Party?, Best of Mumbai, Food Hygie...",False,12noon – 1am (Mon-Sun),4.9 327...,Lower Parel,Tamasha,"Lounge,Bar",True
4,2 more outlets in Mumbai,True,₹450,"North Indian, Street Food, Fast Food, Chinese",45 min ...,,True,"12noon – 4pm, 7pm – 11:45pm (Mon-Sun)",4.1 142...,Vashi,Bhagat Tarachand,Casual Dining,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7280,,True,₹600,"South Indian, North Indian, Chinese, Fast Food...",45 min ...,,True,8am – 12midnight (Mon-Sun),3.5 115...,Chembur,Hotel Shreedevi,Casual Dining,True
7281,,True,₹550,"Fast Food, North Indian",20 min ...,,True,"9am – 9pm (Mon, Wed, Thu, Fri, Sat, Sun), Clos...",3.5 121...,"Pali Hill, Bandra West",Mac Craig,Quick Bites,True
7282,,True,₹350,"Fast Food, Beverages, Rolls",,,False,8am – 1:30am (Mon-Sun),3.5 213...,Juhu,Ice n Rolls,Quick Bites,True
7283,,True,"₹2,000","Seafood, Chinese",40 min ...,,True,12noon – 12:30AM (Mon-Sun),2.6 195...,"Hotel King's International, Juhu",Temple Flower - Hotel Kings International,Casual Dining,True


# Part 2: Data Cleaning

In [37]:
# Checking the Null Value
df.isnull().sum()

Additional_outlet_count    6285
Call                          0
Cost_for_two                  0
Cuisines                      1
Delivery_Time_min_order    4771
Features                   5695
Home_Delivery                 0
Operational_hours            12
Rating_votes                  0
Restaurant_Location          19
Restaurant_Name               0
Restaurant_Type             656
View_Menu                     0
dtype: int64

In [38]:
# Removing the Null Value of Cuisines and Features --> Because these two will be our main indicator
df = df.dropna(subset=['Cuisines', 'Features', 'Restaurant_Type'])
df.isnull().sum()

Additional_outlet_count    1076
Call                          0
Cost_for_two                  0
Cuisines                      0
Delivery_Time_min_order     549
Features                      0
Home_Delivery                 0
Operational_hours             0
Rating_votes                  0
Restaurant_Location           0
Restaurant_Name               0
Restaurant_Type               0
View_Menu                     0
dtype: int64

# Part 3: Combining the Indicators

In [30]:
# print(f'rows = {df.shape[0]} | columns = {df.shape[1]}')
# Combining all of the column that are selected as indicators
# df['multi'] = (pd.Series(df[['Cuisines', 'Features', 'Restaurant_Type']].values.tolist()).str.join(','))
# df

Unnamed: 0,Additional_outlet_count,Call,Cost_for_two,Cuisines,Delivery_Time_min_order,Features,Home_Delivery,Operational_hours,Rating_votes,Restaurant_Location,Restaurant_Name,Restaurant_Type,View_Menu,multi
0,1 more outlet in Mumbai,True,"₹1,500","Finger Food, Continental, European, Italian",,"Food Hygiene Rated Restaurants In Mumbai, Best...",False,12noon – 1am (Mon-Sun),4.9 132...,Kamala Mills Compound,Lord of the Drinks,"Lounge,Casual Dining",True,"Finger Food, Continental, European, Italian,Fo..."
1,1 more outlet in Mumbai,True,₹800,Pizza,,"Value For Money, Best of Mumbai",False,11am – 12:30AM (Mon-Sun),4.6 597...,Malad West,Joey's Pizza,Quick Bites,True,"Pizza,Value For Money, Best of Mumbai,Quick Bites"
2,,True,"₹2,500",Seafood,,"Super Seafood, Best of Mumbai",False,"Closed (Mon),12noon – 3pm, 7pm – 12midnight...",4.5 143...,"Linking Road, Bandra West",Bastian,"Casual Dining,Bar",True,"Seafood,Super Seafood, Best of Mumbai,Casual D..."
3,,True,"₹1,800","Finger Food, Continental",,"Where's The Party?, Best of Mumbai, Food Hygie...",False,12noon – 1am (Mon-Sun),4.9 327...,Lower Parel,Tamasha,"Lounge,Bar",True,"Finger Food, Continental,Where's The Party?, B..."
5,,True,"₹1,600","Modern Indian, European, Pizza, Grill",,"Best of Mumbai, Food Hygiene Rated Restaurants...",False,1pm – 1am (Mon-Sun),4.8 198...,"Linking Road, Bandra West",JLWA,"Casual Dining,Bar",True,"Modern Indian,Best of Mumbai,Casual Dining,Bar"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,5 more outlets in Mumbai,True,₹600,"South Indian, Beverages",50 min ...,15% off on all orders,True,8am – 12midnight (Mon-Sun),3.7 117...,Prabhadevi,YumYumSouth,Quick Bites,True,
7034,,True,₹300,"Mughlai, Biryani, North Indian, Kebab",45 min ...,10% off on all orders,True,11am – 11pm (Mon-Sun),3.6 243...,Vashi,Lazeezo,Quick Bites,True,
7042,,True,₹500,"Mughlai, Lucknowi, Biryani",,10% off on your first order,False,7pm – 4am (Mon-Sun),2.2 112...,Malad West,Ibrahim's,Quick Bites,True,
7049,,True,₹400,"North Indian, Rolls",45 min ...,20% off on all orders,True,"11:30am – 4pm, 6pm – 11:30pm (Mon, Wed, Thu...",3.2 11 ...,Chakala,Foodiee Live,Quick Bites,True,


In [6]:
# We have a problem showing that only the first 613 rows have their indicators merged.
# So I decided to split it into 3 parts with the multiplier of 613

#### Saving as "Contoh" ####
# df.to_excel('contoh.xlsx', sheet_name='sheet1')

#### Re-open Contoh, then make a copy on it to include the 614th-1226th row to a new dataset
# main = pd.read_excel('contoh.xlsx', sheet_name="sheet1")
# main.drop(0:612, axis=0)

In [39]:
A = pd.read_excel('A.xlsx', sheet_name='PartA')
B = pd.read_excel('A.xlsx', sheet_name='PartB')
C = pd.read_excel('A.xlsx', sheet_name='PartC')

In [40]:
B['multi'] = (pd.Series(df[['Cuisines', 'Features', 'Restaurant_Type']].values.tolist()).str.join(','))
C['multi'] = (pd.Series(df[['Cuisines', 'Features', 'Restaurant_Type']].values.tolist()).str.join(','))

In [41]:
D = pd.concat([A, B, C])

In [22]:
# Checking on how much distinct multi (combined between cuisines, restaurant types, and features)
ext = CountVectorizer(tokenizer= lambda x: x.split(','))
zmulti = ext.fit_transform(D['multi'].head(1316))
len(ext.get_feature_names())

339

In [11]:
# Inserting the vector into an array and check the shape
print(zmulti.toarray())
print(zmulti.toarray().shape)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(1316, 339)


In [12]:
# Reset index in order for the restaurant that have the original index between 1317 and onwards
#to be included in the Cosine Similarity

#Formula:
D = D.reset_index()
del D['index']

# Part 4: Cosine Similarity

Cosine Similarity Formula = $ \displaystyle \frac {\sum (_{n}A x _{n}B)} {\sqrt {\sum (_{n}A)^2} x \sqrt {\sum (_{n}B)^2}} $

In [13]:
# Measure the similarity between Cuisines using Cosine similarity
cos_score = cosine_similarity(zmulti)
cos_score[0]

array([1.        , 0.1767767 , 0.31622777, ..., 0.375     , 0.        ,
       0.11785113])

# Part 5: Recommendation

In [14]:
# Checking the Index of the favorite restaurant input by the user, assuming it is 'Waters 24 - Vintage Cafe' in this case
resto_input = "Joey's Pizza"
resto_index = D[D['Restaurant_Name'] == resto_input].index[0]
resto_index

1

In [15]:
# Check if the index of favorite restaurant has 100% similarity or not
a = list(enumerate(cos_score[resto_index]))
a[resto_index]

(1, 1.0)

In [16]:
# Ranking List
resto_sug = sorted(a, key=lambda x:x[1], reverse=True)
resto_sug.remove(a[resto_index])
resto_sug[:10]

[(613, 1.0),
 (1225, 1.0),
 (183, 0.6123724356957946),
 (901, 0.6123724356957946),
 (83, 0.5773502691896258),
 (279, 0.5773502691896258),
 (405, 0.5773502691896258),
 (464, 0.5773502691896258),
 (546, 0.5773502691896258),
 (741, 0.5773502691896258)]

In [17]:
# Restaurant Recommendation for the user (in this case'Waters 24 - Vintage Cafe')
for i in resto_sug[:10]:
    print(D.iloc[i[0]]['Restaurant_Name'])

Konkan Chilly
Hyderabad Xpress
Quench - All Day Pub
99 Bollywood Bar
Olive Bar & Kitchen
266 - The Wine Room And Bar
Hariprasad
Smaaash
Above & Beyond
Gutt Gully


In [23]:
ext.get_feature_names()

[' afghani',
 ' all-day dining',
 ' american',
 ' andhra',
 ' arabian',
 ' arabian nights',
 ' asian',
 ' awadhi',
 ' bakery',
 ' bar food',
 ' bbq',
 ' beer in a bar',
 ' bengali',
 ' best food',
 ' best of mumbai',
 ' beverages',
 ' biryani',
 ' brilliant biryanis',
 ' british',
 ' bubble tea',
 ' burger',
 ' burmese',
 ' cafe',
 ' cafe food',
 ' cantonese',
 ' charcoal chicken',
 ' chettinad',
 ' chinese',
 ' coffee',
 ' continental',
 ' corporate favorites',
 ' custom bakes',
 ' dessert picks',
 ' desserts',
 ' dumplings',
 ' european',
 ' european & mediterranean',
 ' fast food',
 ' finger food',
 ' food hygiene rated restaurants in mumbai',
 ' french',
 ' fusion',
 ' german',
 ' gigs and events',
 ' goan',
 ' gourmet pizza',
 ' great buffets',
 ' greek',
 ' grill',
 ' gujarati',
 ' happy hours',
 ' healthy food',
 ' hot pot',
 ' hot stuff by bira 91',
 ' hyderabadi',
 ' ice cream',
 ' indian',
 ' indonesian',
 ' international',
 ' international coffee day',
 ' irani cafés',
 ' ir