In [1]:
import numpy as np
import pandas as pd

In [2]:
zomato = pd.read_csv("zomato.csv")
zomato.head()

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,https://www.zomato.com/SanchurroBangalore?cont...,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,+91 9663487993,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,https://www.zomato.com/bangalore/addhuri-udupi...,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,+91 9620009302,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,https://www.zomato.com/bangalore/grand-village...,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,+91 8026612447\r\n+91 9901210005,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


In [3]:
zomato.describe()

Unnamed: 0,votes
count,51717.0
mean,283.697527
std,803.838853
min,0.0
25%,7.0
50%,41.0
75%,198.0
max,16832.0


In [4]:
zomato.columns

Index(['url', 'address', 'name', 'online_order', 'book_table', 'rate', 'votes',
       'phone', 'location', 'rest_type', 'dish_liked', 'cuisines',
       'approx_cost(for two people)', 'reviews_list', 'menu_item',
       'listed_in(type)', 'listed_in(city)'],
      dtype='object')

In [5]:
# Let's delete unnecessary columns
# here we drop column "dish_liked", "phone", "url"
zomato2 = zomato.drop(['url','dish_liked','phone'], axis = 1)

In [6]:
# Let's check for duplicated values
print(zomato2.duplicated().sum())

43


In [7]:
# now let's drop the duplicate values
zomato2.drop_duplicates(inplace=True)

In [8]:
# now let's have a look at the null values in the dataset
print(zomato2.isnull().sum())

address                           0
name                              0
online_order                      0
book_table                        0
rate                           7767
votes                             0
location                         21
rest_type                       227
cuisines                         45
approx_cost(for two people)     345
reviews_list                      0
menu_item                         0
listed_in(type)                   0
listed_in(city)                   0
dtype: int64


In [9]:
# let's drop all the null values
zomato2.dropna(how='any', inplace=True)

In [10]:
# now change the names of some columns to make the columns easier to use in process
zomato = zomato2.rename(columns={'approx_cost(for two people)':'cost','listed_in(type)':'type', 'listed_in(city)':'city'})

In [11]:
zomato.columns

Index(['address', 'name', 'online_order', 'book_table', 'rate', 'votes',
       'location', 'rest_type', 'cuisines', 'cost', 'reviews_list',
       'menu_item', 'type', 'city'],
      dtype='object')

In [12]:
# Some Transformations
zomato['cost'] = zomato['cost'].astype(str) #changing the cost to string

In [13]:
zomato['cost'] = zomato['cost'].apply(lambda x:x.replace(',','.')) # Using lambda function to replace ',' from cost

In [14]:
zomato['cost'] = zomato['cost'].astype(float)

In [15]:
# removing '/5' from Rates
zomato = zomato.loc[zomato.rate != 'NEW']
zomato = zomato.loc[zomato.rate != '-'].reset_index(drop=True)
remove_slash = lambda x:x.replace('/5', '') if type(x) == np.str else x
zomato.rate = zomato.rate.apply(remove_slash).str.strip().astype('float')

In [16]:
# Adjust the column names
zomato.name = zomato.name.apply(lambda x:x.title())
zomato.online_order.replace(('Yes','No'), (True, False), inplace=True)
zomato.book_table.replace(('Yes','No'), (True, False), inplace=True)

In [17]:
# computing mean rating
restaurants = list(zomato['name'].unique())
zomato['Mean Rating'] = 0

In [18]:
for i in range(len(restaurants)):
    zomato['Mean Rating'][zomato['name'] == restaurants[i]] = zomato['rate'][zomato['name'] == restaurants[i]].mean()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zomato['Mean Rating'][zomato['name'] == restaurants[i]] = zomato['rate'][zomato['name'] == restaurants[i]].mean()


In [19]:
zomato.tail()

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,cuisines,cost,reviews_list,menu_item,type,city,Mean Rating
41232,"136, SAP Labs India, KIADB Export Promotion In...",The Farm House Bar N Grill,False,False,3.7,34,Whitefield,"Casual Dining, Bar","North Indian, Continental",800.0,"[('Rated 4.0', 'RATED\n Ambience- Big and spa...",[],Pubs and bars,Whitefield,3.7
41233,"139/C1, Next To GR Tech Park, Pattandur Agraha...",Bhagini,False,False,2.5,81,Whitefield,"Casual Dining, Bar","Andhra, South Indian, Chinese, North Indian",800.0,"[('Rated 4.0', 'RATED\n A fine place to chill...",[],Pubs and bars,Whitefield,2.283333
41234,"Four Points by Sheraton Bengaluru, 43/3, White...",Best Brews - Four Points By Sheraton Bengaluru...,False,False,3.6,27,Whitefield,Bar,Continental,1.5,"[('Rated 5.0', ""RATED\n Food and service are ...",[],Pubs and bars,Whitefield,3.6
41235,Sheraton Grand Bengaluru Whitefield Hotel & Co...,Chime - Sheraton Grand Bengaluru Whitefield Ho...,False,True,4.3,236,"ITPL Main Road, Whitefield",Bar,Finger Food,2.5,"[('Rated 4.0', 'RATED\n Nice and friendly pla...",[],Pubs and bars,Whitefield,4.3
41236,"ITPL Main Road, KIADB Export Promotion Industr...",The Nest - The Den Bengaluru,False,False,3.4,13,"ITPL Main Road, Whitefield","Bar, Casual Dining","Finger Food, North Indian, Continental",1.5,"[('Rated 5.0', 'RATED\n Great ambience , look...",[],Pubs and bars,Whitefield,3.4


In [20]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range = (1,5))
zomato[['Mean Rating']] = scaler.fit_transform(zomato[['Mean Rating']]).round(2)

In [21]:
# Lower casing
zomato['review_list'] = zomato['reviews_list'].str.lower()

In [22]:
# now let's define a function to remove punctuation from the reviews
import string
PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('','',PUNCT_TO_REMOVE))
zomato['reviews_list'] = zomato['reviews_list'].apply(lambda text: remove_punctuation(text))

In [23]:
# now let's remove the stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stop_words])
zomato['reviews_list'] = zomato['reviews_list'].apply(lambda text: remove_stopwords(text))

In [24]:
# now let's remove the urls from the reviews
import re
def removeurls(text):
    url_pattern = re.compile(r'https?://S+|www\.\S+')
    return url_pattern.sub(r'', text)

In [25]:
zomato['reviews_list'] = zomato['reviews_list'].apply(lambda text: removeurls(text))

zomato[['reviews_list', 'cuisines']].sample(10)

Unnamed: 0,reviews_list,cuisines
29207,Rated 30 RATEDn Only went drinks pretty averag...,"Continental, North Indian, Chinese"
32981,Rated 50 RATEDn My time goto place ice cream W...,"Desserts, Ice Cream"
35387,Rated 50 RATEDn I always number times nice pla...,"Mithai, Street Food"
35025,Rated 40 RATEDn WowiekazawinAmazing place Wish...,"Finger Food, Continental, Pizza"
20494,Rated 30 RATEDn It first visit Arabian hotel o...,"Arabian, Chinese, North Indian"
29793,,"North Indian, Chinese"
12446,Rated 40 RATEDn Had SaturdaynSaturday niight K...,"Continental, Finger Food"
6387,Rated 40 RATEDn This small place well known cr...,North Indian
4390,Rated 10 RATEDn I friend mine lunch absolutely...,"Cafe, Burger, Italian, Fast Food"
21869,Rated 40 RATEDn Food 45nAmbience 45 Typical Ch...,Chinese


In [26]:
# let's process the names of restaurants
restaurants_names = list(zomato['name'].unique())
def get_top_words(column, top_nu_words, nu_of_word):
    vec = CountVectorizer(ngram_range= nu_of_word, stop_words='english')
    bag_of_words = vec.fit_transform(column)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key = lambda x:x[1], reverse=True)
    return words_freq[:top_nu_of_words]

In [27]:
# now let's drop the unnecessary columns
zomato = zomato.drop(['address', 'rest_type', 'type', 'menu_item', 'votes'], axis=1)

In [28]:
# now randomly sample the data
df_percent = zomato.sample(frac=0.5)

df_percent.set_index('name', inplace=True)
indices = pd.Series(df_percent.index)

In [29]:
# now let's build the restaurant recommendation system
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# now let's perform tf-idf vectorization on the dataset
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df=0, stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_percent['reviews_list'])
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

MemoryError: Unable to allocate 3.17 GiB for an array with shape (20618, 20618) and data type float64

In [None]:
# Now the final step is to create a function to recommend restaurants
def recommend(name, cosine_similarities = cosine_similarities):
    
    # Create a list to put top restaurants
    recommendations = []
    
    # Find the index of the hotel entered
    idx = indices[indices == name].index[0]
    
    # Find the restaurants with a similar cosine-sim value and order them from bigges number
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending=False)
    
    # Extract top 30 restaurant indexes with a similar cosine-sim value
    top30_indexes = list(score_series.iloc[0:31].index)
    
    # Names of the top 30 restaurants
    for each in top30_indexes:
        recommendations.append(list(df_percent.index)[each])
    
    # Creating the new data set to show similar restaurants
    df = pd.DataFrame(columns=['cuisines', 'Mean Rating', 'cost'])
    
    # Create the top 30 similar restaurants with some of their columns
    for each in recommendations:
        df = df.append(pd.DataFrame(df_percent[['cuisines','Mean Rating', 'cost']][df_percent.index == each].sample()))

    # Drop the same named restaurants and sort only the top 10 by the highest rating
    df = df.drop_duplicates(subset=['cuisines','Mean Rating', 'cost'], keep=False)
    df = df.sort_values(by='Mean Rating', ascending=False).head(10)
    print('TOP %s RESTAURANTS LIKE %s WITH SIMILAR REVIEWS: ' % (str(len(df)), name))   
    return df