In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords 
from nltk.tokenize import WordPunctTokenizer

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# This is from Google Review
# df = pd.read_csv('GoogleReview_data_cleaned.csv')

# This is from Tripasor Review
df = pd.read_csv('TripAdvisor_data_cleaned.csv')
df = df[['Author', 'Rating', 'Review', 'Restaurant', 'Location']]

df.head()

Unnamed: 0,Author,Rating,Review,Restaurant,Location
0,kmc1e2018,5.0,"David, Thanga, Mikail and Chef Steven gave bri...",Chambers Grill,KL
1,"MizOthmanKuala Lumpur, Malaysia",5.0,We visited for family celebration and as usual...,Chambers Grill,KL
2,"Relax36268533224Hjarno, Denmark",5.0,Perfect dinner after a long journey.After trav...,Chambers Grill,KL
3,tstrry,5.0,Had a great Tomahawk for me and hubby thanks t...,Chambers Grill,KL
4,Naim_123456789,5.0,"Excellent ambient. Excellent service by David,...",Chambers Grill,KL


In [4]:
#Check Null values in Dataframe
df.isnull().sum()

Author        0
Rating        0
Review        0
Restaurant    0
Location      0
dtype: int64

In [5]:
df.shape

(139764, 5)

In [6]:
# Copy/Prepare data
df_data = df[['Author', 'Review', 'Rating', 'Restaurant', 'Location']]
df.head()

Unnamed: 0,Author,Rating,Review,Restaurant,Location
0,kmc1e2018,5.0,"David, Thanga, Mikail and Chef Steven gave bri...",Chambers Grill,KL
1,"MizOthmanKuala Lumpur, Malaysia",5.0,We visited for family celebration and as usual...,Chambers Grill,KL
2,"Relax36268533224Hjarno, Denmark",5.0,Perfect dinner after a long journey.After trav...,Chambers Grill,KL
3,tstrry,5.0,Had a great Tomahawk for me and hubby thanks t...,Chambers Grill,KL
4,Naim_123456789,5.0,"Excellent ambient. Excellent service by David,...",Chambers Grill,KL


In [7]:
# Building User-based Collaborative Filtering
author_restaurant_matrix = df_data.pivot_table(index = 'Author', columns = ['Restaurant'], values = 'Rating').fillna(0)
author_restaurant_matrix.head(20)

Restaurant,101 Food Center,103 Coffee Workshop,16th St Cafe,1825 Gallery Hotel,1919 Restaurant,1919 Restaurant and Gallery,1957,1Stop Station Cafe,20 Chulia Lane Cafe,21 Bistro,...,restoran saravanna,s11kopitiam,sarvana bhavan,twenty-one kitchen+bar,wan chai,Ăn Viet,淡杯叻沙 Tampoi Laksa,老行家 Expert (RH Plaza),阿胜乐乐,龍門客棧 Dragon Door Inn Steamboat Restaurant
Author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Anne L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Charleston C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Suresh RKuala Lumpur, Malaysia",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"-Cheng-Hui-__-Singapore, Singapore",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"-IDDQD-Munich, Germany",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-Miss-Williams-123,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"-Miss-Williams-123London, United Kingdom",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-alfred-alfred-Tsingapore,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-aquamaryn-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"-blacktop_3-Toronto, Canada",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
restaurant_author_matrix = author_restaurant_matrix.transpose()
restaurant_author_matrix.head(15)

Author,Anne L,Charleston C,"Suresh RKuala Lumpur, Malaysia","-Cheng-Hui-__-Singapore, Singapore","-IDDQD-Munich, Germany",-Miss-Williams-123,"-Miss-Williams-123London, United Kingdom",-alfred-alfred-Tsingapore,-aquamaryn-,"-blacktop_3-Toronto, Canada",...,邱 伟,金钱 倩,阳雪 李,란영 김,민정 장,잭 용,태규 김,현호 이,혜수 신,혜진 서
Restaurant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
101 Food Center,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
103 Coffee Workshop,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16th St Cafe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1825 Gallery Hotel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1919 Restaurant,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1919 Restaurant and Gallery,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1957,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1Stop Station Cafe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20 Chulia Lane Cafe,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21 Bistro,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Identify Ratings For Restaurants Per Author
# restaurant_ratings = restaurant_author_matrix['5525 Gunner']
restaurant_ratings = restaurant_author_matrix['邱 伟']
restaurant_ratings.head(10)

Restaurant
101 Food Center                0.0
103 Coffee Workshop            0.0
16th St Cafe                   0.0
1825 Gallery Hotel             0.0
1919 Restaurant                0.0
1919 Restaurant and Gallery    0.0
1957                           0.0
1Stop Station Cafe             0.0
20 Chulia Lane Cafe            0.0
21 Bistro                      0.0
Name: 邱 伟, dtype: float64

In [20]:
# restaurant_ratings = restaurant_author_matrix['几米林Jimmy']
restaurant_ratings = restaurant_author_matrix['Anne L']
restaurant_ratings.head(10)

Restaurant
101 Food Center                0.0
103 Coffee Workshop            0.0
16th St Cafe                   0.0
1825 Gallery Hotel             0.0
1919 Restaurant                0.0
1919 Restaurant and Gallery    0.0
1957                           0.0
1Stop Station Cafe             0.0
20 Chulia Lane Cafe            0.0
21 Bistro                      0.0
Name: Anne L, dtype: float64

In [21]:
# Find correlation between authors based on the ratings of restaurants
similar_authors = restaurant_author_matrix.corrwith(restaurant_ratings)

# Create a dataframe with similar movies as the index column and correlation as another column
similar_authors = pd.DataFrame(similar_authors, columns = ['correlation'])
similar_authors.head(10)

Unnamed: 0_level_0,correlation
Author,Unnamed: 1_level_1
Anne L,-0.000841
Charleston C,-0.000841
"Suresh RKuala Lumpur, Malaysia",-0.000841
"-Cheng-Hui-__-Singapore, Singapore",-0.000841
"-IDDQD-Munich, Germany",-0.000841
-Miss-Williams-123,-0.000841
"-Miss-Williams-123London, United Kingdom",-0.000841
-alfred-alfred-Tsingapore,-0.000841
-aquamaryn-,0.562319
"-blacktop_3-Toronto, Canada",-0.000841


In [22]:
# Recommending Restaurants Based on The Most Similar Authors
most_similar_authors = similar_authors.sort_values('correlation', ascending = False).iloc[1:11]
most_similar_authors

Unnamed: 0_level_0,correlation
Author,Unnamed: 1_level_1
marview,0.562319
"CmozziePenang Island, Malaysia",0.562319
Mimi G,0.562319
E7174,0.562319
sunnyc658,0.562319
dumpling_18Sydney,0.562319
"B HLondon, United Kingdom",0.562319
"yychan69Manchester, United Kingdom",0.562319
"B0b1314Adelaide, Australia",0.562319
FinnMcCoolAsiaAsia,0.562319


In [23]:
# Extract Author of the most similar users
authors = most_similar_authors.index.values.tolist()
authors[0]

'marview'

In [24]:
recommendation = df_data[df_data['Author'] == authors[0]]
recommendation.head(10)

Unnamed: 0,Author,Review,Rating,Restaurant,Location
92845,marview,"The food was served quickly, the staff very he...",4.0,Boatman Restaurant,Penang


In [25]:
# Data Frame Slicing by Condition
recommendation = df_data.loc[(df_data['Author'] == authors[0]) & (df_data['Rating'] > 0), ['Restaurant', 'Rating']]
recommendation.head(10)

Unnamed: 0,Restaurant,Rating
92845,Boatman Restaurant,4.0


In [16]:
#Build the ratings matrix using pivot_table function
author_restaurant_matrix = df_data.pivot_table(index = 'Author', columns = ['Restaurant'], values = 'Rating').fillna(0)
author_restaurant_matrix.head()

Restaurant,'D' Selera Kelate,16th St. Cafe,1919 Restaurant Ipoh,20 Chulia Lane Cafe,21 Bistro,218 Hainan Lor Mee,27@cove,28 Food Centre,3 :15 Auntie Hong's Cooking,33 Blue Room,...,寒舍 HANSHE @Perling,心安素食斋料食馆 Xin An Vegetarian Cafe,怡保古早味猪肠粉 Ipoh Traditional Style Chee Cheong Fun,我家餐館Our Kitchen Nyonya Restaurant,海皇粿条仔Restaurant Hi Wan,田園粥火锅 Farmland Porridge Steamboat,相聚火锅 The Gather BBQ Steamboat(新犀鸟阁 New Hornbill BBQ Steamboat）,越南小廚 V NAM KITCHEN,青山角 / Green Hill Corner,食得福美食中心Cedar Point Food Centre
Author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
# cikgusally,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#Ativ Mindworks,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#GJBlane RICE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#JL_King_Of_Music,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
#MyNameIsMuna #MUNALICIOUS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
#Import the mean_squared_error function
from sklearn.metrics import mean_squared_error

#Function that computes the root mean squared error (or RMSE)
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [27]:
#Function to compute the RMSE score obtained on the testing set by a model
def score(cf_model):
    
    #Construct a list of author-restaurant tuples from the testing dataset
    id_pairs = zip(df_data['Author'], df_data['Restaurant'])
    
    #Predict the rating for every author-restaurant tuple
    y_pred = np.array([cf_model(author, restaurant) for (author, restaurant) in id_pairs])
    
    #Extract the actual ratings given by the users in the test data
    y_true = np.array(df_data['Rating'])
    
    #Return the final RMSE score
    return rmse(y_true, y_pred)

In [36]:
#User Based Collaborative Filter using Mean Ratings
def cf_author_mean(author, restaurant):
    
    #Check if restaurant exists in r_matrix
    if restaurant in author_restaurant_matrix:
        #Compute the mean of all the ratings given to the restaurant
        mean_rating = author_restaurant_matrix[restaurant].mean()
    
    else:
        #Default to a rating of 3.0 in the absence of any information
        mean_rating = 3.0
    
    return mean_rating

#Compute RMSE for the Mean model
score(cf_author_mean)

4.338755362155678

In [32]:
#User Based Collaborative Filter using Weighted Mean
#Create a dummy ratings matrix with all null values imputed to 0
# author_restaurant_matrix_dummy = df_data.pivot_table(index = 'Author', columns = ['Restaurant'], values = 'Rating').fillna(0)
# author_restaurant_matrix_dummy.head()

In [40]:
# # Import cosine_score 
# from sklearn.metrics.pairwise import cosine_similarity

# #Compute the cosine similarity matrix using the dummy ratings matrix
# cosine_sim = cosine_similarity(author_restaurant_matrix, author_restaurant_matrix)

In [42]:
#Convert into pandas dataframe 
# cosine_sim = pd.DataFrame(cosine_sim, index=author_restaurant_matrix.index, columns=author_restaurant_matrix.index)

# cosine_sim.head(10)