# Importing Packages

In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')
import re
import string
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
from nltk.corpus import stopwords
from sklearn.preprocessing import MinMaxScaler
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Reading dataset

In [2]:
zomato_dt = pd.read_csv("zomato.csv")
zomato_dt.head()

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,https://www.zomato.com/SanchurroBangalore?cont...,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,+91 9663487993,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,https://www.zomato.com/bangalore/addhuri-udupi...,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,+91 9620009302,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,https://www.zomato.com/bangalore/grand-village...,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,+91 8026612447\r\n+91 9901210005,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


# Understanding dataset

In [3]:
zomato_dt.shape

(51717, 17)

In [4]:
zomato_dt.columns

Index(['url', 'address', 'name', 'online_order', 'book_table', 'rate', 'votes',
       'phone', 'location', 'rest_type', 'dish_liked', 'cuisines',
       'approx_cost(for two people)', 'reviews_list', 'menu_item',
       'listed_in(type)', 'listed_in(city)'],
      dtype='object')

In [5]:
zomato_dt.isnull().sum()#checking null values

url                                0
address                            0
name                               0
online_order                       0
book_table                         0
rate                            7775
votes                              0
phone                           1208
location                          21
rest_type                        227
dish_liked                     28078
cuisines                          45
approx_cost(for two people)      346
reviews_list                       0
menu_item                          0
listed_in(type)                    0
listed_in(city)                    0
dtype: int64

In [6]:
zomato=zomato_dt.drop(['url','dish_liked','phone'], axis = 1)#dropping unwanted columns

In [7]:
zomato.duplicated().sum()

43

In [8]:
zomato.drop_duplicates(inplace= True)#removing duplicates

In [9]:
zomato.duplicated().sum()

0

In [10]:
zomato.isnull().sum()

address                           0
name                              0
online_order                      0
book_table                        0
rate                           7767
votes                             0
location                         21
rest_type                       227
cuisines                         45
approx_cost(for two people)     345
reviews_list                      0
menu_item                         0
listed_in(type)                   0
listed_in(city)                   0
dtype: int64

In [11]:
zomato.dropna(how ='any',inplace =True)#Remove the NaN values

In [12]:
zomato.isnull().sum()

address                        0
name                           0
online_order                   0
book_table                     0
rate                           0
votes                          0
location                       0
rest_type                      0
cuisines                       0
approx_cost(for two people)    0
reviews_list                   0
menu_item                      0
listed_in(type)                0
listed_in(city)                0
dtype: int64

In [13]:
zomato= zomato.rename(columns={'approx_cost(for two people)':'cost' , 'listed_in(type)' : 'type' , 'listed_in(city)':'city' })#Changing the column names

In [14]:
zomato.columns

Index(['address', 'name', 'online_order', 'book_table', 'rate', 'votes',
       'location', 'rest_type', 'cuisines', 'cost', 'reviews_list',
       'menu_item', 'type', 'city'],
      dtype='object')

# Data Transformations

In [15]:
zomato['cost']=zomato['cost'].astype(str)

In [16]:
zomato['cost']=zomato['cost'].apply(lambda x:x.replace(',','.'))

In [17]:
zomato['cost']=zomato['cost'].astype(float)

In [18]:
zomato=zomato.loc[zomato.rate !='NEW']

In [19]:
zomato=zomato.loc[zomato.rate !='-'].reset_index(drop = True)

# Data Cleaning

In [20]:
remove_slash = lambda x:x.replace('/5', '') if type(x)==np.str else x

In [21]:
zomato.rate = zomato.rate.apply(remove_slash).str.strip().astype("float")

In [22]:
zomato["online_order"].head()

0    Yes
1    Yes
2    Yes
3     No
4     No
Name: online_order, dtype: object

In [23]:
zomato['name']= zomato['name'].apply(lambda x:x.title())

# Changing column name

In [24]:
zomato.online_order.replace(('Yes','No'),(True, False),inplace=True)

In [25]:
zomato.book_table.replace(('Yes','No'),(True, False),inplace=True)

In [26]:
restaurants=list(zomato['name'].unique())
zomato['Mean Rating'] = 0

In [27]:
for i in range(len(restaurants)):
    zomato['Mean Rating'][zomato['name'] == restaurants[i]] = zomato['rate'][zomato['name'] == restaurants[i]].mean()

In [28]:
scaler = MinMaxScaler(feature_range = (1,5))
zomato[['Mean Rating']]=scaler.fit_transform(zomato[["Mean Rating"]]).round(2)

In [29]:
zomato['reviews_list']=zomato['reviews_list'].str.lower()

# Removing punctuation

In [30]:
Punc_remove = string.punctuation

In [31]:
Punc_remove 

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [32]:
def remove_punctuation(text):
    return text.translate(str.maketrans('','', Punc_remove))

In [33]:
zomato['reviews_list']=zomato['reviews_list'].apply(lambda text: remove_punctuation(text))

# Removing stopwords

In [34]:
Stop_words = set(stopwords.words('english'))

In [35]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tgl146\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in Stop_words])

In [37]:
zomato['reviews_list']= zomato['reviews_list'].apply(lambda text: remove_stopwords(text))

# Removing URL

In [38]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'',text)

In [39]:
zomato["reviews_list"]= zomato["reviews_list"].apply(lambda text: remove_urls(text))

In [40]:
zomato.columns

Index(['address', 'name', 'online_order', 'book_table', 'rate', 'votes',
       'location', 'rest_type', 'cuisines', 'cost', 'reviews_list',
       'menu_item', 'type', 'city', 'Mean Rating'],
      dtype='object')

In [41]:
zomato[['reviews_list', 'cuisines']].sample(5)

Unnamed: 0,reviews_list,cuisines
3229,rated 30 ratedn never visited place ordered fo...,"Andhra, North Indian"
34921,rated 40 ratedn nice place guysits worth money...,"Continental, North Indian"
5905,rated 50 ratedn great servicei pleased rated 2...,Bakery
9427,rated 30 ratedn amazing food one best shawarma...,"Rolls, Lebanese"
41215,rated 30 ratedn amoeba affordable place bowlin...,"Continental, Chinese"


In [42]:
restaurant_names = list(zomato['name'].unique())
def get_top_words(column, top_nu_of_words, nu_of_word):
    vec = CountVectorizer(ngram_range= nu_of_word, stop_words='english')
    bag_of_words = vec.fit_transform(column)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:top_nu_of_words]
    

In [43]:
zomato = zomato.drop(['address','rest_type',  'menu_item', 'type','votes'],axis = 1)

In [44]:
df_percent = zomato.sample(frac=0.5)

In [45]:
df_percent.set_index('name', inplace=True)
indices = pd.Series(df_percent.index)

# Creating tf-idf matrix
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_percent['reviews_list'])

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [46]:
def recommend(name, cosine_similarities = cosine_similarities):
    
    # Create a list to put top restaurants
    recommend_restaurant = []
    
    # Find the index of the hotel entered
    idx = indices[indices == name].index[0]
    
    # Find the restaurants with a similar cosine-sim value and order them from bigges number
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)
    
    # Extract top 30 restaurant indexes with a similar cosine-sim value
    top30_indexes = list(score_series.iloc[0:31].index)
    
    # Names of the top 30 restaurants
    for each in top30_indexes:
        recommend_restaurant.append(list(df_percent.index)[each])
    
    # Creating the new data set to show similar restaurants
    df_new = pd.DataFrame(columns=['cuisines', 'Mean Rating', 'cost'])
    
    # Create the top 30 similar restaurants with some of their columns
    for each in recommend_restaurant:
        df_new = df_new.append(pd.DataFrame(df_percent[['cuisines','Mean Rating', 'cost']][df_percent.index == each].sample()))
    
    # Drop the same named restaurants and sort only the top 10 by the highest rating
    df_new = df_new.drop_duplicates(subset=['cuisines','Mean Rating', 'cost'], keep=False)
    df_new = df_new.sort_values(by='Mean Rating', ascending=False).head(10)
    
    print('TOP %s RESTAURANTS LIKE %s WITH SIMILAR REVIEWS: ' % (str(len(df_new)), name))
    
    return df_new
recommend('Pai Vihar')

TOP 8 RESTAURANTS LIKE Pai Vihar WITH SIMILAR REVIEWS: 


Unnamed: 0,cuisines,Mean Rating,cost
Atithi,"North Indian, Chinese, Street Food",3.63,800.0
Cinnamon,"North Indian, Chinese, Biryani",3.62,550.0
Magix'S Parattha Roll,"Fast Food, North Indian, Chinese, Mughlai, Rolls",3.52,400.0
Prasiddhi Food Corner,"Fast Food, North Indian, South Indian",3.45,200.0
Shrusti Coffee,"Cafe, South Indian",3.45,150.0
Mayura Sagar,"Chinese, North Indian, South Indian",3.32,250.0
Melange - Hotel Ekaa,"North Indian, Chinese, Continental, Mangalorean",2.81,900.0
Tamarind,"Chinese, North Indian, Continental",2.16,750.0
