In [2]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')
import re
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


In [20]:
zomato_real = pd.read_csv(r'C:\Users\triti\Downloads\zomato.csv\zomato.csv')
zomato_real.head()

Unnamed: 0,url,address,name,online_order,book_table,rate,votes,phone,location,rest_type,dish_liked,cuisines,approx_cost(for two people),reviews_list,menu_item,listed_in(type),listed_in(city)
0,https://www.zomato.com/bangalore/jalsa-banasha...,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,Yes,Yes,4.1/5,775,080 42297555\r\n+91 9743772233,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800,"[('Rated 4.0', 'RATED\n A beautiful place to ...",[],Buffet,Banashankari
1,https://www.zomato.com/bangalore/spice-elephan...,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,Yes,No,4.1/5,787,080 41714161,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800,"[('Rated 4.0', 'RATED\n Had been here for din...",[],Buffet,Banashankari
2,https://www.zomato.com/SanchurroBangalore?cont...,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,Yes,No,3.8/5,918,+91 9663487993,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800,"[('Rated 3.0', ""RATED\n Ambience is not that ...",[],Buffet,Banashankari
3,https://www.zomato.com/bangalore/addhuri-udupi...,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,No,No,3.7/5,88,+91 9620009302,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300,"[('Rated 4.0', ""RATED\n Great food and proper...",[],Buffet,Banashankari
4,https://www.zomato.com/bangalore/grand-village...,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,No,No,3.8/5,166,+91 8026612447\r\n+91 9901210005,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600,"[('Rated 4.0', 'RATED\n Very good restaurant ...",[],Buffet,Banashankari


## Data cleaning and feature engineering 
For this step we need to do a lot of stuff with the data such as:

    Deleting Unnecessary Columns
    Removing the Duplicates
    Remove the NaN values from the dataset
    Changing the column names
    Data Transformations
    Data Cleaning
    Adjust the column names

In [21]:
# Deleting unnecessary columns
zomato = zomato_real.drop(['url', 'dish_liked', 'phone'], axis = 1)

# Removing the duplicates
zomato.duplicated().sum()
zomato.drop_duplicates(inplace = True)

# Remove the NaN values
zomato.isnull().sum()
zomato.dropna(how = 'any', inplace = True)

# Changing the column names
zomato = zomato.rename(columns = {'approx_cost(for two people)' : 'cost', 
                                  'listed_in(type)' : 'type',
                                 'listen_in(city)' : 'city'})

# Some transformations
zomato['cost'] = zomato['cost'].astype(str)
zomato['cost'] = zomato['cost'].apply(lambda x: x.replace(',', '.'))
zomato['cost'] = zomato['cost'].astype(float)

# Removing '/5' from Rates
zomato = zomato.loc[zomato.rate != 'NEW']
zomato = zomato.loc[zomato.rate != '-'].reset_index(drop = True)
remove_slash = lambda x: x.replace('/5', '') if type(x) == np.str else x
zomato.rate = zomato.rate.apply(remove_slash).str.strip().astype('float')

# Adjust the column names
zomato.name = zomato.name.apply(lambda x: x.title())
zomato.online_order.replace(('Yes', 'No'), (True, False), inplace = True)
zomato.book_table.replace(('Yes', 'No'), (True, False), inplace = True)

# Computing Mean Rating
restaurants = list(zomato['name'].unique())

zomato['Mean Rating'] = 0

for i in range(len(restaurants)):
    zomato['Mean Rating'][zomato['name'] == restaurants[i]] == zomato['rate'][zomato['name'] == restaurants[i]].mean()
    
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range = (1,5))
zomato[['Mean Rating']] = scaler.fit_transform(zomato[['Mean Rating']]).round(2)

## Some text preprocessing 
This step includes:

    Lower casing
    Removal of Punctuations
    Removal of Stopwords
    Removal of URLs
    Spelling correction

In [5]:
# Lower Casing
zomato['reviews_list'] = zomato['reviews_list'].str.lower()

# Removal of punctuations
import string
PUNCT_TO_REMOVE = string.punctuation

def remove_punctuation(text):
    return text.translate(str.maketrans('','', PUNCT_TO_REMOVE))

zomato['reviews_list'] = zomato['reviews_list'].apply(lambda text: remove_punctuation(text))

# Removal of stopwords 
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

zomato['reviews_list'] = zomato['reviews_list'].apply(lambda text: remove_stopwords(text))

# Removal of URLs
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

zomato['reviews_list'] = zomato['reviews_list'].apply(lambda text: remove_urls(text))

zomato[['reviews_list', 'cuisines']].sample(5)


Unnamed: 0,reviews_list,cuisines
40857,rated 30 ratedn went evening coffee filter cof...,"North Indian, Continental, South Indian, Asian"
29712,,Biryani
23776,rated 50 ratedn im sooo delighted see place ar...,"Beverages, Sandwich"
11657,rated 40 ratedn 45nwhat absolutely beautiful e...,"Nepalese, Tibetan, Momos"
27358,rated 40 ratedn food good ambience nice thanks...,"North Indian, Mughlai, Chinese, South Indian"


In [34]:
zomato.head(5)

Unnamed: 0,address,name,online_order,book_table,rate,votes,location,rest_type,cuisines,cost,reviews_list,menu_item,type,listed_in(city),Mean Rating
0,"942, 21st Main Road, 2nd Stage, Banashankari, ...",Jalsa,True,True,4.1,775,Banashankari,Casual Dining,"North Indian, Mughlai, Chinese",800.0,rated 40 ratedn beautiful place dine inthe int...,[],Buffet,Banashankari,1.0
1,"2nd Floor, 80 Feet Road, Near Big Bazaar, 6th ...",Spice Elephant,True,False,4.1,787,Banashankari,Casual Dining,"Chinese, North Indian, Thai",800.0,rated 40 ratedn dinner family turned good choo...,[],Buffet,Banashankari,1.0
2,"1112, Next to KIMS Medical College, 17th Cross...",San Churro Cafe,True,False,3.8,918,Banashankari,"Cafe, Casual Dining","Cafe, Mexican, Italian",800.0,rated 30 ratedn ambience good enough pocket fr...,[],Buffet,Banashankari,1.0
3,"1st Floor, Annakuteera, 3rd Stage, Banashankar...",Addhuri Udupi Bhojana,False,False,3.7,88,Banashankari,Quick Bites,"South Indian, North Indian",300.0,rated 40 ratedn great food proper karnataka st...,[],Buffet,Banashankari,1.0
4,"10, 3rd Floor, Lakshmi Associates, Gandhi Baza...",Grand Village,False,False,3.8,166,Basavanagudi,Casual Dining,"North Indian, Rajasthani",600.0,rated 40 ratedn good restaurant neighbourhood ...,[],Buffet,Banashankari,1.0


## Stopwords

In [31]:
from nltk.corpus import stopwords
en_stops = set(stopwords.words('english'))

all_words = ['There', 'is', 'a', 'tree','near','the','river']
for word in all_words: 
    if word not in en_stops:
        print(word)

There
tree
near
river


In [22]:
# Restaurant Names
restaurant_names = list(zomato['name'].unique())
def get_top_words(column, top_nu_of_words, nu_of_word):
    vec = CountVectorizer(ngram_range = nu_of_word, stop_words = 'english')
    bag_of_words = vec.fit_transform(column)
    sum_of_words = bag_of_words.sum(axis = 0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key = lambda x: x[1], reverse = True)
    return words_freq[:top_nu_of_words]

zomato = zomato.drop(['address', 'rest_type', 'type', 'menu_item', 'votes'], axis = 1)

# Randomly sample 60% of your dataframe
df_percent = zomato.sample(frac = 0.5)

## TF-IDF Vectorization

In [24]:
df_percent.set_index('name', inplace = True)
indices = pd.Series(df_percent.index)

# Creating tf-idf matrix
tfidf = TfidfVectorizer(analyzer = 'word', ngram_range = (1,2), min_df = 0, stop_words = 'english')
tfidf_matrix = tfidf.fit_transform(df_percent['reviews_list'])

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

## Function to recommend restaurants

In [38]:
def recommend(name, cosine_similarities = cosine_similarities):
    
    # creating a list to put top restaurants into
    recommend_restaurant = []
    
    # find index of the hotel entered
    idx = indices[indices == name].index[0]
    
    # find restaurants with a similar cosine-sim value and order them from biggs number
    score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending = False)
    
    # extract top 30 restaurant indices with a similar cosaine-similarity value
    top30_indices = list(score_series.iloc[0:31].index)
    
    # names of the top 30 restaurants
    for each in top30_indices:
        recommend_restaurant.append(list(df_percent.index)[each])
        
    # create new dataset to show similar restaurants
    df_new = pd.DataFrame(columns = ['cuisines', 'Mean Rating', 'cost'])
    
    # create top 30 similar restaurants with some of their columns
    for each in recommend_restaurant:
        df_new = df_new.append(pd.DataFrame(df_percent[['cuisines', 'Mean Rating', 'cost']][df_percent.index == each].sample()))
        
    # Drop the same named restaurants and sort only the top 10 by the highest rating
    df_new = df_new.drop_duplicates(subset = ['cuisines', 'Mean Rating', 'cost'], keep = False)
    df_new = df_new.sort_values(by = 'Mean Rating', ascending = False).head(10)
    
    print('Top %s Restaurants like %s with similar reviews: '%(str(len(df_new)), name))
    
    return df_new

recommend('Pai Vihar')

Top 10 Restaurants like Pai Vihar with similar reviews: 


Unnamed: 0,cuisines,Mean Rating,cost
Queen Pearls,Pizza,1.0,500.0
Amaravati Authentic Andhra Restaurant,"Andhra, Chinese, North Indian, Biryani",1.0,600.0
Karavali Grand,"Mangalorean, Seafood, North Indian, Chinese",1.0,600.0
Wow Paratha,North Indian,1.0,400.0
Andhra Ruchulu,"Andhra, North Indian",1.0,800.0
Andhra Ruchulu,"Andhra, North Indian",1.0,400.0
Karavali Family Restaurant,"Seafood, Biryani, South Indian, North Indian, ...",1.0,500.0
Chicken Corner,"Biryani, North Indian, Fast Food, Chinese",1.0,400.0
King Of Spices,"South Indian, North Indian, Chinese, Biryani, ...",1.0,500.0
Red Chilli,"Chinese, Thai",1.0,600.0
