In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
dataset=pd.read_csv("zomato.csv")

In [3]:
df=pd.DataFrame(dataset)

In [33]:
df.head()

Unnamed: 0,name,online_order,book_table,rate,location,cuisines,cost,reviews_list,city,Mean Rating
0,Jalsa,True,True,4.1,Banashankari,"North Indian, Mughlai, Chinese",800.0,a to you to the the are just there on the of a...,Banashankari,3.99
1,Spice Elephant,True,False,4.1,Banashankari,"Chinese, North Indian, Thai",800.0,had been here for with out to be a for all of ...,Banashankari,3.97
2,San Churro Cafe,True,False,3.8,Banashankari,"Cafe, Mexican, Italian",800.0,is not that and its not a and the is not that ...,Banashankari,3.58
3,Addhuri Udupi Bhojana,False,False,3.7,Banashankari,"South Indian, North Indian",300.0,and been there and was will if its the at on o...,Banashankari,3.45
4,Grand Village,False,False,3.8,Basavanagudi,"North Indian, Rajasthani",600.0,very in is of with and of of its you some very...,Banashankari,3.58


In [5]:
#deleting unnecessary columns
df=df.drop(['url','dish_liked','phone'],axis=1)

In [6]:
#Removing the duplicates
df.duplicated().sum()
df.drop_duplicates(inplace=True)

In [7]:
df.head()

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,Banashankari,"Cafe, Casual Dining","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,Banashankari,Quick Bites,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,Basavanagudi,Casual Dining,"North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51674 entries, 0 to 51716
Data columns (total 14 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   address                      51674 non-null  object
 1   name                         51674 non-null  object
 2   online_order                 51674 non-null  object
 3   book_table                   51674 non-null  object
 4   rate                         43907 non-null  object
 5   votes                        51674 non-null  int64 
 6   location                     51653 non-null  object
 7   rest_type                    51447 non-null  object
 8   cuisines                     51629 non-null  object
 9   approx_cost(for two people)  51329 non-null  object
 10  reviews_list                 51674 non-null  object
 11  menu_item                    51674 non-null  object
 12  listed_in(type)              51674 non-null  object
 13  listed_in(city)              51

In [9]:
df.duplicated().sum()

0

In [10]:
#Remove the Nan values from the dataset
df.isnull().sum()

address                           0
name                              0
online_order                      0
book_table                        0
rate                           7767
votes                             0
location                         21
rest_type                       227
cuisines                         45
approx_cost(for two people)     345
reviews_list                      0
menu_item                         0
listed_in(type)                   0
listed_in(city)                   0
dtype: int64

In [11]:
df.dropna(how='any',inplace=True)

In [12]:
df.isnull().sum().count()

14

In [13]:
df.head(2)

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari


In [14]:
#Changing the column names
df=df.rename(columns={'approx_cost(for two people)':'cost','listed_in(type)':'type','listed_in(city)':'city'})

In [15]:
#Some Transformations
df['cost']=df['cost'].astype(str) #Changing the cost to string
df['cost']=df['cost'].apply(lambda x : x.replace(',','.'))#Using lambda function to replace ',' from cost
df['cost']=df['cost'].astype(float)

In [16]:
df.cost.unique()

array([800.  , 300.  , 600.  , 700.  , 550.  , 500.  , 450.  , 650.  ,
       400.  , 900.  , 200.  , 750.  , 150.  , 850.  , 100.  ,   1.2 ,
       350.  , 250.  , 950.  ,   1.  ,   1.5 ,   1.3 , 199.  ,  80.  ,
         1.1 , 160.  ,   1.6 , 230.  , 130.  ,   1.7 ,   1.4 ,   1.35,
         2.2 ,   2.  ,   1.8 ,   1.9 , 180.  , 330.  ,   2.5 ,   2.1 ,
         3.  ,   2.8 ,   3.4 ,  50.  ,  40.  ,   1.25,   3.5 ,   4.  ,
         2.4 ,   2.6 ,   1.45,  70.  ,   3.2 , 560.  , 240.  , 360.  ,
         6.  ,   1.05,   2.3 ,   4.1 , 120.  ,   5.  ,   3.7 ,   1.65,
         2.7 ,   4.5 ])

In [17]:
df.rate.unique()

array(['4.1/5', '3.8/5', '3.7/5', '3.6/5', '4.6/5', '4.0/5', '4.2/5',
       '3.9/5', '3.1/5', '3.0/5', '3.2/5', '3.3/5', '2.8/5', '4.4/5',
       '4.3/5', 'NEW', '2.9/5', '3.5/5', '2.6/5', '3.8 /5', '3.4/5',
       '4.5/5', '2.5/5', '2.7/5', '4.7/5', '2.4/5', '2.2/5', '2.3/5',
       '3.4 /5', '-', '3.6 /5', '4.8/5', '3.9 /5', '4.2 /5', '4.0 /5',
       '4.1 /5', '3.7 /5', '3.1 /5', '2.9 /5', '3.3 /5', '2.8 /5',
       '3.5 /5', '2.7 /5', '2.5 /5', '3.2 /5', '2.6 /5', '4.5 /5',
       '4.3 /5', '4.4 /5', '4.9/5', '2.1/5', '2.0/5', '1.8/5', '4.6 /5',
       '4.9 /5', '3.0 /5', '4.8 /5', '2.3 /5', '4.7 /5', '2.4 /5',
       '2.1 /5', '2.2 /5', '2.0 /5', '1.8 /5'], dtype=object)

In [18]:
#Removing the rows that has 'NEW' as the rating
df=df.loc[df.rate!='NEW']

In [19]:
#Removing the rows that has '-' as the rating
df=df.loc[df.rate!='-']

In [20]:
#Removing the rows that has '/5' as the rating
def remove_slash_and_convert(text):
    if '/5' in text:
        text = text.replace('/5', '')
    return text.strip()

In [21]:
df['rate'] = df['rate'].apply(remove_slash_and_convert).astype(float)

In [22]:
df.rate.unique()

array([4.1, 3.8, 3.7, 3.6, 4.6, 4. , 4.2, 3.9, 3.1, 3. , 3.2, 3.3, 2.8,
       4.4, 4.3, 2.9, 3.5, 2.6, 3.4, 4.5, 2.5, 2.7, 4.7, 2.4, 2.2, 2.3,
       4.8, 4.9, 2.1, 2. , 1.8])

In [23]:
#Adjust the column names
df.name=df.name.apply(lambda x:x.title())
    #to make data compatible with the algorithms
df.online_order.replace(('Yes','No'),('True','False'),inplace=True) 
df.book_table.replace(('Yes','No'),('True','False'),inplace=True)

In [24]:
restaurants = list(df['name'].unique())
df['Mean Rating']=0

In [25]:
for restaurant in restaurants:
    mean_rating = df['rate'][df['name'] == restaurant].mean()
    df.loc[df['name'] == restaurant, 'Mean Rating'] = mean_rating

# Initialize Min-Max scaler
scaler = MinMaxScaler(feature_range=(1, 5))

# Scale the 'Mean Rating' column and round to 2 decimal places
df['Mean Rating'] = scaler.fit_transform(df[['Mean Rating']]).round(2)

# Text Preprocessing

In [26]:
#Lower casing
df['reviews_list']=df['reviews_list'].str.lower()

#Removal of Punctuations
import string
PUNCT_REMOVE=string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('','',PUNCT_REMOVE))

df["reviews_list"]=df["reviews_list"].apply(lambda text: remove_punctuation(text))

#Removal of Stopwords
STOPWORDS=set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word in STOPWORDS])

df["reviews_list"]=df["reviews_list"].apply(lambda text: remove_stopwords(text))

#Removal of URLS
def remove_urls(text):
    url_pattern =re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'',text)

df['reviews_list']=df['reviews_list'].apply(lambda text:remove_urls(text))
df[['reviews_list','cuisines']].sample(5)

Unnamed: 0,reviews_list,cuisines
15455,is is should your because this in i have it fo...,"Kebab, Biryani"
16390,we a of and there is no it is and just the was...,"Street Food, Fast Food, Rolls, Desserts"
44170,this is on the of been here a of their is the ...,"Cafe, Continental"
13098,and is a the were to i here be but was not i a...,"Maharashtrian, North Indian"
14072,i had and the was very and in i am having this...,"Kerala, Biryani, South Indian, North Indian, C..."


In [27]:
#Restaurant Names
restaurant_names = list(df['name'].unique())
def get_top_words(column,top_nu_of_words,nu_of_word):
    vec=CountVectorizer(ngram_range=nu_of_word,stop_words='english')
    bag_of_words=vec.fit_transform(column)
    sum_words=bag_of_words.sum(axis=0)
    words_freq=[(word,sum_words[0,idx]) for word, idx in vec.vocalubary_.items()]
    words_freq=sorted(words_freq,key= lambda x:x[1] , reverse=True)
    return words_freq[:top_nu_of_words]

df=df.drop(['address','rest_type', 'type', 'menu_item', 'votes'],axis=1)

df_percent=df.sample(frac=0.5)

# TF-IDF Vectorization

In [28]:
df_percent.set_index('name',inplace=True)
indices=pd.Series(df_percent.index)

#creating tf-idf matrix

tfidf= TfidfVectorizer(analyzer='word',ngram_range=(1,2),min_df=0,stop_words='english')
tfidf_matrix=tfidf.fit_transform(df_percent['reviews_list'])

cosine_similarities=linear_kernel(tfidf_matrix,tfidf_matrix)

In [47]:
def recommend(name, cosine_similarities = cosine_similarities):
    
    # Create a list to put top restaurants
    recommend_restaurant = []
    
    # Find the index of the hotel entered
    idx = indices[indices == name].index[0]
    
    # Find the restaurants with a similar cosine-sim value and order them from bigges number
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)
    
    # Extract top 30 restaurant indexes with a similar cosine-sim value
    top30_indexes = list(score_series.iloc[0:31].index)
    
    # Names of the top 30 restaurants
    for each in top30_indexes:
        recommend_restaurant.append(list(df_percent.index)[each])
    
    # Creating the new data set to show similar restaurants
    df_new = pd.DataFrame(columns=['cuisines', 'Mean Rating'])
    
    # Create the top 30 similar restaurants with some of their columns
    for each in recommend_restaurant:
        df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))
    
    # Drop the same named restaurants and sort only the top 10 by the highest rating
    df_new = df_new.drop_duplicates(subset=['cuisines','Mean Rating'], keep=False)
    df_new = df_new.sort_values(by='Mean Rating', ascending=False).head(10)
    
    print('TOP %s RESTAURANTS LIKE %s WITH SIMILAR REVIEWS: ' % (str(len(df_new)), name))
    
    return df_new

In [48]:
recommend('Bottle & Glass')

  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.

TOP 10 RESTAURANTS LIKE Bottle & Glass WITH SIMILAR REVIEWS: 


  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))


Unnamed: 0,cuisines,Mean Rating
Fenny'S Lounge And Kitchen,"Mediterranean, Pizza, Continental, Seafood, Salad",4.51
1131 Bar + Kitchen,"Continental, Asian, Italian, North Indian",4.48
Salut,"Continental, Finger Food, Seafood, Pizza",3.97
By The Blue - Grand Mercure,"North Indian, Mughlai",3.97
Bottle & Glass,"Continental, Chinese, Italian",3.84
Ss Bucket Biryani,"Biryani, North Indian, Chinese",3.8
Samaikya,"Andhra, Seafood, Biryani",3.71
Throwback - Pub & Dine,"Italian, North Indian, Continental",3.71
Atithi,North Indian,3.63
The Bong Palate,"Bengali, North Indian, Chinese",3.58


In [49]:
recommend('Salut')

  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.

TOP 10 RESTAURANTS LIKE Salut WITH SIMILAR REVIEWS: 


  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))
  df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating']][df_percent.index == each].sample()))


Unnamed: 0,cuisines,Mean Rating
Brik Oven,"Cafe, Pizza, Beverages",4.61
Olive Bar And Kitchen,"Mediterranean, European, Salad",4.51
Hoppipola,"Continental, Mediterranean, European",4.26
Kaifu,"Chinese, Asian",4.1
Cafe Coffee Day The Square,Cafe,3.97
Nolimmits Lounge And Club,"Chinese, Continental, North Indian, Burger, Pizza",3.9
Taco Bell,"Mexican, American, Fast Food",3.81
Kullad Cafe,"North Indian, Cafe, Fast Food, Beverages",3.71
Karama Restaurant,"Arabian, North Indian, Beverages",3.67
The Waffles Hut,Desserts,3.64
