# Restaurant Recommendation System
## Prepare Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

## Load Data
### GoogleReview
#### View Sample, Check Null Data For Review

In [2]:
# Read the cleaned Google review export into a DataFrame.
restaurants_review_data = pd.read_csv(
    'GoogleReview_data_cleaned.csv',
    low_memory=False,
)

# Report the frame size and the number of missing values in each column.
print("Review data:", restaurants_review_data.shape)
print("\nCheck NULL values in Reviews:\n-----------------------------")
print(restaurants_review_data.isna().sum())
print("-----------------------------")

# Last expression of the cell: preview of the first rows.
restaurants_review_data.head()

Review data: (222020, 5)

Check NULL values in Reviews:
-----------------------------
Author        0
Rating        0
Review        0
Restaurant    0
Location      0
dtype: int64
-----------------------------


Unnamed: 0,Author,Rating,Review,Restaurant,Location
0,Jia Pin Lee,4.0,Came here for the High Tea. Great service espe...,Cuisines Restaurant,Ipoh
1,Chui Yi Lum,2.0,"5 stars for the service, even though some of t...",Cuisines Restaurant,Ipoh
2,liezel wong,1.0,"Hi, thank you for your service. But! i feel so...",Cuisines Restaurant,Ipoh
3,Nazri Nor,1.0,I have the worse buffer dinner ever so far. Th...,Cuisines Restaurant,Ipoh
4,Fakru Imran's Channel,5.0,"That's are Known 5 Elmark "" 9H72 "" & KDK "" 3 K...",Cuisines Restaurant,Ipoh


## Data Preprocessing

In [None]:
# Thresholds used to filter the review table.
MIN_AUTHOR_RATINGS = 10      # an author must have MORE than this many ratings
MIN_RESTAURANT_RATINGS = 20  # a restaurant must have AT LEAST this many ratings

# Keep only authors with more than MIN_AUTHOR_RATINGS ratings.
# (This counts review rows per author, not distinct restaurants.)
x = restaurants_review_data.groupby('Author')['Rating'].count() > MIN_AUTHOR_RATINGS
quality_author = x[x].index

restaurants_review_data = restaurants_review_data[restaurants_review_data['Author'].isin(quality_author)]

# Keep only restaurants with at least MIN_RESTAURANT_RATINGS ratings.
# The count is taken AFTER the author filter, so only ratings from the kept authors count.
y = restaurants_review_data.groupby('Restaurant')['Rating'].count() >= MIN_RESTAURANT_RATINGS
famous_restaurants = y[y].index

restaurants_review_data = restaurants_review_data[restaurants_review_data['Restaurant'].isin(famous_restaurants)]

# The two filters above leave gaps in the row labels; renumber the rows 0..n-1.
# drop=True discards the old labels instead of adding them back as an 'index' column.
restaurants_review_data = restaurants_review_data.reset_index(drop=True)

# One row per restaurant (the first review row of each restaurant is the one kept).
# drop=True keeps a stray 'index' column out of the recommendation tables built from this frame.
# Original author's note: no. of restaurants = 420
restaurants_data = (
    restaurants_review_data
    .drop_duplicates(subset=['Restaurant'])
    .reset_index(drop=True)
)

# One row per author (the first review row of each author is the one kept).
# Original author's note: no. of authors in this dataset = 253
authors_data = restaurants_review_data.drop_duplicates(subset=['Author'])
authors_data.shape


## Data Visualization

In [None]:
# Number of restaurants per city (counts the rows of restaurants_data grouped by 'Location').
state_counts = restaurants_data.groupby('Location')['Restaurant'].count()

# Draw on the Axes returned by pandas rather than through the pyplot state machine.
ax = state_counts.plot(kind='bar', figsize=(10, 8))
ax.set_title('Number of Restaurant(s) by City', fontsize=20)
ax.set_xlabel('City', fontsize=17)
ax.set_ylabel('Number of Restaurant(s)', fontsize=17)

# Write each count above its bar; bars are placed at x = 0, 1, 2, ...
for i, count in enumerate(state_counts):
    ax.text(x=i, y=count, s=str(count), ha='center', va='bottom', fontsize=15)
plt.show()

In [None]:
# A restaurant has many ratings in restaurants_review_data, so similarity cannot be
# computed on the long table directly; reshape it into a Restaurant x Author matrix.
#
# Orientation:
#   index='Restaurant', columns='Author' -> each row is a restaurant, each cell is the
#       (mean) rating that author gave to that restaurant.
#   index='Author', columns='Restaurant' would instead give one row per author.
#
# (restaurant, author) pairs without any rating are filled with 0.
pt = (
    restaurants_review_data
    .pivot_table(index='Restaurant', columns='Author', values='Rating', aggfunc='mean')
    .fillna(0)
)

In [None]:
# Pairwise cosine similarity between the restaurant rows of the rating matrix `pt`;
# entry [i][j] compares restaurant pt.index[i] with restaurant pt.index[j].
similarity_scores_cs = cosine_similarity(pt.to_numpy())

def recommendByCos(restaurantName, isFilter, topN=None, city=None):
    """Recommend restaurants similar to ``restaurantName`` using cosine similarity.

    Reads the module-level ``pt`` (rating matrix), ``similarity_scores_cs``
    (pairwise similarity of the rows of ``pt``) and ``restaurants_data``
    (one row per restaurant).

    Parameters
    ----------
    restaurantName : str
        Name of the query restaurant; must be present in ``pt.index``.
    isFilter : bool
        False -> return the best-scoring restaurants, limited to ``topN`` rows.
        True  -> return only restaurants located in ``city`` whose score is not 0.
    topN : int, optional
        Number of rows to return when ``isFilter`` is False. When omitted, the
        module-level ``numberOfSuggestion`` is used (``None`` means "all rows").
    city : str, optional
        City to keep when ``isFilter`` is True. When omitted, the module-level
        ``cityToFilter`` is used.

    Returns
    -------
    pandas.DataFrame
        Copy of the matching rows of ``restaurants_data`` with an added ``Score``
        column, sorted by ``Score`` in descending order. The query restaurant
        itself is never part of the result.

    Raises
    ------
    IndexError
        If ``restaurantName`` is not in ``pt.index``.
    """
    matches = np.where(pt.index == restaurantName)[0]
    if matches.size == 0:
        raise IndexError(f"Restaurant {restaurantName!r} was not found in the rating matrix")
    index = matches[0]

    # Score of every restaurant against the query, keyed by restaurant name.
    # The query is removed by name: slicing off the first sorted element is not
    # reliable because another restaurant can tie with the self-similarity.
    scores = pd.Series(similarity_scores_cs[index], index=pt.index).drop(restaurantName)

    # Attach the scores in one vectorised lookup; the query row is excluded so it
    # cannot be recommended to itself.
    newData = restaurants_data[restaurants_data['Restaurant'] != restaurantName].copy()
    newData['Score'] = newData['Restaurant'].map(scores)
    newData = newData.sort_values('Score', ascending=False)

    if not isFilter:
        limit = numberOfSuggestion if topN is None else topN
        return newData[:limit]

    wantedCity = cityToFilter if city is None else city
    newData = newData[newData['Location'] == wantedCity]
    # A score of exactly 0 means no similarity at all with the query: not worth recommending.
    return newData[newData['Score'] != 0.0]

In [None]:
# Interactive driver for the cosine-similarity recommender.
# recommendByCos reads the module-level names `numberOfSuggestion` and `cityToFilter`,
# so they are assigned here before the call.
print("Recommend Using Cosine Similarity")
restaurant = input('Enter a restaurant name: ')
numberOfSuggestion = None
prompt = input('Do you want the recommendations based on city?(y/n)')
if prompt == 'y':
    # City mode: every non-zero-scoring restaurant located in the chosen city.
    cityToFilter = input('Enter a city (KL/Petaling Jaya/Miri/Ipoh): ')
    restaurants = recommendByCos(restaurant, True)
else:
    # Top-N mode: the best-scoring restaurants regardless of city.
    numberOfSuggestion = int(input('Enter the number of recommendations you want: '))
    restaurants = recommendByCos(restaurant, False)

num_rows, num_cols = restaurants.shape

if num_rows == 0:
    print('There is no recommend restaurants from the city you provided')
else:
    print(restaurants)

In [None]:
# Pairwise linear kernel (plain dot product) between the restaurant rows of `pt`;
# entry [i][j] compares restaurant pt.index[i] with restaurant pt.index[j].
similarity_scores_lk = linear_kernel(pt.to_numpy())

def recommendByLin(restaurantName, isFilter, topN=None, city=None):
    """Recommend restaurants similar to ``restaurantName`` using the linear kernel.

    Reads the module-level ``pt`` (rating matrix), ``similarity_scores_lk``
    (pairwise dot products of the rows of ``pt``) and ``restaurants_data``
    (one row per restaurant).

    Parameters
    ----------
    restaurantName : str
        Name of the query restaurant; must be present in ``pt.index``.
    isFilter : bool
        False -> return the best-scoring restaurants, limited to ``topN`` rows.
        True  -> return only restaurants located in ``city`` whose score is not 0.
    topN : int, optional
        Number of rows to return when ``isFilter`` is False. When omitted, the
        module-level ``numberOfSuggestion`` is used (``None`` means "all rows").
    city : str, optional
        City to keep when ``isFilter`` is True. When omitted, the module-level
        ``cityToFilter`` is used.

    Returns
    -------
    pandas.DataFrame
        Copy of the matching rows of ``restaurants_data`` with an added ``Score``
        column, sorted by ``Score`` in descending order. The query restaurant
        itself is never part of the result.

    Raises
    ------
    IndexError
        If ``restaurantName`` is not in ``pt.index``.
    """
    matches = np.where(pt.index == restaurantName)[0]
    if matches.size == 0:
        raise IndexError(f"Restaurant {restaurantName!r} was not found in the rating matrix")
    index = matches[0]

    # Score of every restaurant against the query, keyed by restaurant name.
    # The query is removed by name: a dot product is not normalised, so the query's
    # score with itself is not guaranteed to be the largest in its row, and slicing
    # off the first sorted element could discard the best match instead.
    scores = pd.Series(similarity_scores_lk[index], index=pt.index).drop(restaurantName)

    # Attach the scores in one vectorised lookup; the query row is excluded so it
    # cannot be recommended to itself.
    newData = restaurants_data[restaurants_data['Restaurant'] != restaurantName].copy()
    newData['Score'] = newData['Restaurant'].map(scores)
    newData = newData.sort_values('Score', ascending=False)

    if not isFilter:
        limit = numberOfSuggestion if topN is None else topN
        return newData[:limit]

    wantedCity = cityToFilter if city is None else city
    newData = newData[newData['Location'] == wantedCity]
    # A score of exactly 0 means no overlap at all with the query: not worth recommending.
    return newData[newData['Score'] != 0.0]

In [None]:
# # A function for retrieving the restaurant info from the restaurants_data dataframe
# def retrieveRestaurant(dataframe):
#     recommendedRestaurant = dataframe
#     restaurants = pd.DataFrame(columns=['Author', 'Rating', 'Review', 'Restaurant', 'Location'])
#     for i in range(len(recommendedRestaurant)):
#         restaurants = restaurants.append(restaurants_data[restaurants_data['Restaurant'] == recommendedRestaurant[i]], ignore_index = True)
#     return restaurants

In [None]:
# Interactive driver for the linear-kernel recommender.
# recommendByLin reads the module-level names `numberOfSuggestion` and `cityToFilter`,
# so they are assigned here before the call.
print("Recommend Using Linear Kernel")
restaurant = input('Enter a restaurant name: ')
numberOfSuggestion = None
prompt = input('Do you want the recommendations based on city?(y/n)')

if prompt != 'y':
    # Top-N mode: ask how many rows to return.
    numberOfSuggestion = int(input('Enter the number of recommendations you want: '))
    restaurants = recommendByLin(restaurant, False)
else:
    # City mode: ask which city to restrict the results to.
    cityToFilter = input('Enter a city (KL/Petaling Jaya/Miri/Ipoh): ')
    restaurants = recommendByLin(restaurant, True)

# Both branches produce a DataFrame; unpack its dimensions once.
num_rows, num_cols = restaurants.shape

if num_rows != 0:
    print(restaurants)
else:
    print('There is no recommend restaurants from the city you provided')