# Personalized Recommender System Using Matrix Factorization Model for Restaurants in Toronto, Ontario, Canada

### Quang Ho

In [1]:
# Import python library

import numpy as np
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score

from nltk.corpus import stopwords 
from nltk.tokenize import WordPunctTokenizer

### Check Business dataset

In [2]:
# view top record of business dataset
business_df = pd.read_csv('C://Users/Quang.DESKTOP-BET5N5G/Documents/Training/Ryerson/CKME136 - Capstone course/\
Literature Reviews/Recommender System/Yelp_datasets/yelp_business.csv', encoding="latin-1")
# business_df.head()

business_df = business_df[['business_id', 'name', 'address', 'city', 'state', 'stars', 'review_count', 'categories']]
business_df = business_df.rename(columns = {'stars':'restaurant_rating'})
business_df.dropna(inplace=True)
business_df.head()

Unnamed: 0,business_id,name,address,city,state,restaurant_rating,review_count,categories
0,FYWN1wneV18bWNgQjJ2GNg,"""Dental by Design""","""4855 E Warner Rd, Ste B9""",Ahwatukee,AZ,4.0,22,Dentists;General Dentistry;Health & Medical;Or...
1,He-G7vWjzVUysIKrfNbPUQ,"""Stephen Szabo Salon""","""3101 Washington Rd""",McMurray,PA,3.0,11,Hair Stylists;Hair Salons;Men's Hair Salons;Bl...
2,KQPW8lFf1y5BT2MxiSZ3QA,"""Western Motor Vehicle""","""6025 N 27th Ave, Ste 1""",Phoenix,AZ,1.5,18,Departments of Motor Vehicles;Public Services ...
3,8DShNS-LuFqpEWIp0HxijA,"""Sports Authority""","""5000 Arizona Mills Cr, Ste 435""",Tempe,AZ,3.0,9,Sporting Goods;Shopping
4,PfOCPjBrlQAnz__NXj9h_w,"""Brick House Tavern + Tap""","""581 Howe Ave""",Cuyahoga Falls,OH,3.5,116,American (New);Nightlife;Bars;Sandwiches;Ameri...


In [3]:
business_df.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'restaurant_rating',
       'review_count', 'categories'],
      dtype='object')

In [4]:
# total records
business_df.shape

(174565, 8)

In [5]:
# find null Values...
business_df.isnull().sum().sort_values(ascending=False)

categories           0
review_count         0
restaurant_rating    0
state                0
city                 0
address              0
name                 0
business_id          0
dtype: int64

In [6]:
# are all business Id's unique?
business_df.business_id.is_unique #business_id is all unique

True

In [7]:
# count business in cities
business_df.state.value_counts()

AZ     52214
NV     33086
ON     30208
NC     12956
OH     12609
       ...  
VA         1
CS         1
WHT        1
PKN        1
MT         1
Name: state, Length: 67, dtype: int64

In [8]:
# Filtering by Toronto, business categories and having at least 10 reviews
bus_tor_df = business_df[business_df['city'].str.contains("Toronto")]
bus_tor_fd_df = bus_tor_df[bus_tor_df['categories'].str.contains("Food|Coffee|Tea|\
Restaurants|Bakeries|Bars|Sports Bar|Pubs|Nighlife")]
# bus_tor_fd_df = bus_tor_fd_df[bus_tor_fd_df.review_count >= 10]


In [9]:
# Top 20 most reviewed businesses
bus_tor_fd_df[['business_id', 'name', 'address', 'review_count', 'city', 'restaurant_rating']].sort_values(ascending=False, by="review_count")[0:20]

Unnamed: 0,business_id,name,address,review_count,city,restaurant_rating
107580,r_BrIgzYcwo1NAuG9dLbpg,"""Pai Northern Thai Kitchen""","""18 Duncan Street""",1494,Toronto,4.5
158973,aLcFhMe6DDJ430zelCpd2A,"""Khao San Road""","""11 Charlotte St""",1270,Toronto,4.0
9526,RtUvSWO_UZ8V3Wpj0n077w,"""KINKA IZAKAYA ORIGINAL""","""398 Church St""",1184,Toronto,4.0
131553,N93EYZy9R0sdlEvubu94ig,"""Banh Mi Boys""","""392 Queen Street W""",973,Toronto,4.0
80828,iGEvDk6hsizigmXhDKs2Vg,"""Seven Lives Tacos Y Mariscos""","""69 Kensington Avenue""",945,Toronto,4.5
5191,RwRNR4z3kY-4OsFqigY5sw,"""Uncle Tetsu's Japanese Cheesecake""","""598 Bay Street""",848,Toronto,3.5
49153,-av1lZI1JDY_RZN2eTMnWg,"""Salad King Restaurant""","""340 Yonge Street""",808,Toronto,3.5
110247,zgQHtqX0gqMw1nlBZl2VnQ,"""Momofuku Noodle Bar""","""190 University Avenue, Ground Floor""",759,Toronto,3.0
68285,uAAWlLdsoUf872F1FKiX1A,"""Bang Bang Ice Cream and bakery""","""93a Ossington Avenue""",746,Toronto,4.0
36440,f5O7v_X_jCg2itqacRfxhg,"""Sansotei Ramen""","""179 Dundas Street W""",701,Toronto,4.0


In [10]:
# get only the list of food service industry in ONT
# bus_ont_fd_list = bus_ont_fd_df[['business_id']] 
# bus_ont_fd_list.head()

bus_tor_fd_list = bus_tor_fd_df[['business_id']] 
bus_tor_fd_list.head()

Unnamed: 0,business_id
15,l09JfMeQ6ynYs5MCJtrcmQ
20,lHYiCS-y8AFjUitv6MGpxg
24,VSGcuYDV3q-AAZ9ZPq4fBQ
25,1K4qrnfyzKzGgJPBEcJaNQ
27,AtdXq_gu9NTE5rx4ct_dGg


In [11]:
# total records
bus_tor_fd_list.shape

(9623, 1)

In [12]:
'''
# Check the current directory
import os
os.getcwd()
'''

'\n# Check the current directory\nimport os\nos.getcwd()\n'

In [13]:
# get % sample of dataset

# bus_tor_fd_pct = bus_tor_fd_list.sample(frac = .1) # sample as % of dataset

bus_tor_fd_pct = bus_tor_fd_list.sample(n = 200) # sample as exact number of businesses
bus_tor_fd_pct.shape

(200, 1)

### Check Review dataset

In [14]:
# view top records of review dataset
review_df = pd.read_csv('C://Users/Quang.DESKTOP-BET5N5G/Documents/Training/Ryerson/CKME136 - Capstone course/\
Literature Reviews/Recommender System/Yelp_datasets/yelp_review.csv', encoding="latin-1")
# review_df.head()


review_df = review_df[['user_id', 'business_id', 'stars', 'text']]
review_df = review_df.rename(columns = {'stars':'user_rating'})
review_df.dropna(inplace=True)
review_df.head()


Unnamed: 0,user_id,business_id,user_rating,text
0,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5,Super simple place but amazing nonetheless. It...
1,bv2nCi5Qv5vroFiqKGopiw,VR6GpWIda3SfvPC-lg9H3w,5,Small unassuming place that changes their menu...
2,bv2nCi5Qv5vroFiqKGopiw,CKC0-MOWMqoeWf6s-szl8g,5,Lester's is located in a beautiful neighborhoo...
3,bv2nCi5Qv5vroFiqKGopiw,ACFtxLv8pGrrxMm6EgjreA,4,Love coming here. Yes the place always needs t...
4,bv2nCi5Qv5vroFiqKGopiw,s2I_Ni76bjJNK9yG60iD-Q,4,Had their chocolate almond croissant and it wa...


In [15]:
review_df.shape

(5261668, 4)

### Filter review dataset by food service industry in Toronto

In [16]:
review_tor_fd_spl = pd.merge(review_df, bus_tor_fd_pct, on="business_id")
review_tor_fd_spl.shape

(8748, 4)

In [17]:
review_tor_fd_spl.head

<bound method NDFrame.head of                      user_id             business_id  user_rating  \
0     u0LXt3Uea_GidxRW1xcsfg  oWTn2IzrprsRkPfULtjZtQ            3   
1     XuCbLgo9j1q5dDh9251vkg  oWTn2IzrprsRkPfULtjZtQ            5   
2     20D8uXLZxn9yeLp4VA01kw  oWTn2IzrprsRkPfULtjZtQ            5   
3     13xeC5yKJO4BDyQOI3xxDQ  oWTn2IzrprsRkPfULtjZtQ            5   
4     SRA86Xyf9fbOAhICWttMLQ  oWTn2IzrprsRkPfULtjZtQ            4   
...                      ...                     ...          ...   
8743  0uacsMYppmm3Kl1bpWIOxQ  LXpHtaR1mSgoopDd5dKlEA            4   
8744  tOzTi_wIS2Gajy90pscoMw  LXpHtaR1mSgoopDd5dKlEA            1   
8745  FTNaQZ3t0dsVWw1WZUQGFg  J7zTx8hrZ3p1h_7JCdFthg            2   
8746  u9em9Z6dLsjanNj5bZQMjg  J7zTx8hrZ3p1h_7JCdFthg            5   
8747  aOwOsph3nlgDv2xUHzQ1tA  J7zTx8hrZ3p1h_7JCdFthg            4   

                                                   text  
0     Came here for a burger as one of my friends sa...  
1     Best burger place I

## Building the Machine Learning Model

In [18]:
# import the library for processing text
import string
import re
from nltk.corpus import stopwords
stop = []
for word in stopwords.words('english'):
    s = [char for char in word if char not in string.punctuation]
    stop.append(''.join(s))

In [19]:
# check stopwords
print(stopwords.words('english'))


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

## Filtering and Cleaning data

In [20]:
# create a function to clean text

def text_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
    # Check characters to see if they are in punctuation
    # nopunc = [char for char in mess if char not in string.punctuation]
    text = [char for char in mess if char not in string.punctuation]
    
    # Join the characters again to form the string.
    # nopunc = ''.join(nopunc)
    text = ''.join(text)
    
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)   
    
    # Now just remove any stopwords
    return " ".join([word for word in text.split() if word.lower() not in stop])



In [21]:
tor_fd_spl = review_tor_fd_spl[['user_id', 'business_id', 'user_rating', 'text']]

In [22]:
%%time
tor_fd_spl['text'] = tor_fd_spl['text'].apply(text_process)

Wall time: 3.99 s


## Split the sample dataset into train and test datasets

In [23]:
from sklearn import model_selection
# test_side = 0.3

#Split train test
vld_size=0.3

X_train, X_valid, y_train, y_valid = \
train_test_split(tor_fd_spl['text'],  review_tor_fd_spl['business_id'], test_size = vld_size, random_state = 7)

In [24]:
userid_df = tor_fd_spl[['user_id','text']]
business_df = tor_fd_spl[['business_id', 'text']]
userid_df.head()

Unnamed: 0,user_id,text
0,u0LXt3Uea_GidxRW1xcsfg,Came burger one friends said good burgers peop...
1,XuCbLgo9j1q5dDh9251vkg,Best burger place Ive Toronto Hands know laws ...
2,20D8uXLZxn9yeLp4VA01kw,ever seen maenad True Blood get whipped frenzy...
3,13xeC5yKJO4BDyQOI3xxDQ,Holy shit place good fancy burger choices pret...
4,SRA86Xyf9fbOAhICWttMLQ,gotta saywe BLESSED good burger joints GTA spe...


In [25]:
business_df.head()

Unnamed: 0,business_id,text
0,oWTn2IzrprsRkPfULtjZtQ,Came burger one friends said good burgers peop...
1,oWTn2IzrprsRkPfULtjZtQ,Best burger place Ive Toronto Hands know laws ...
2,oWTn2IzrprsRkPfULtjZtQ,ever seen maenad True Blood get whipped frenzy...
3,oWTn2IzrprsRkPfULtjZtQ,Holy shit place good fancy burger choices pret...
4,oWTn2IzrprsRkPfULtjZtQ,gotta saywe BLESSED good burger joints GTA spe...


In [26]:
# review user 'u0LXt3Uea_GidxRW1xcsfg'
userid_df[userid_df['user_id']=='u0LXt3Uea_GidxRW1xcsfg']['text']


0      Came burger one friends said good burgers peop...
418    Chinese version Cheesecake factory Came friend...
503    reading reviews place thought would take famil...
Name: text, dtype: object

In [27]:
# combine all reviews of each user to form a single paragraph
userid_df = userid_df.groupby('user_id').agg({'text': ' '.join})

# combine all reviews for each business to form a single paragraph
business_df = business_df.groupby('business_id').agg({'text': ' '.join})

In [28]:
# check the result for user 'u0LXt3Uea_GidxRW1xcsfg'
userid_df.loc['Wtc1Sc-O99FJpO3o2gh59A']['text']


'Wow Best baguette pastries weve almost anywhere almond croissant better Zanes shops Montreal Paris compared holidays counter service always friendly helpful fancy tarts delicious really really enjoyed pumpkin tart thanksgiving even went back second one Enjoy'

In [30]:
# review business 'oWTn2IzrprsRkPfULtjZtQ'
business_df.loc['oWTn2IzrprsRkPfULtjZtQ']['text']



### Apply the TFIDF Vectorizer to extract features from the text

In [31]:
# from sklearn.feature_extraction.text import TfidfVectorizer

#userid vectorizer
userid_vectorizer = TfidfVectorizer(tokenizer = WordPunctTokenizer().tokenize, max_features=4000)
userid_vectors = userid_vectorizer.fit_transform(userid_df['text'])

#Business id vectorizer
businessid_vectorizer = TfidfVectorizer(tokenizer = WordPunctTokenizer().tokenize, max_features=4000)
businessid_vectors = businessid_vectorizer.fit_transform(business_df['text'])

In [32]:
userid_vectors.shape

(6572, 4000)

In [33]:
businessid_vectors.shape

(200, 4000)

### Latent Factor Collaborative Filtering Optimization

In [34]:
# Create a matrix of users and businesss with ratings

userid_rating_matrix = pd.pivot_table(tor_fd_spl, values='user_rating', index=['user_id'], columns=['business_id'])

In [35]:
userid_rating_matrix.head()

business_id,-6GJMc_Ln7csmvWWvY2iug,-L88B-PrtzwujhLuYc2k_g,0-k_-9YoWXpmtiaLdKeQJQ,0KVx4FX9trgpM6pzH2C8_A,1jNteKQ2JuF6Sk1SI9X23Q,2uhQm_4-IxWLjbKgsrVd4w,2zzRaxhpouEjGaDh9R6meA,3JJWG-Eyj-czxaXaE7LVaA,3dG6AB7o5QMVQzga0zotiA,3hI6dSt3PiF0y6aXA0QeHw,...,y1AM7MeCN3HaS7SHnxcBfQ,y35FkLcNKlGt23tRwI8VoQ,y769FA7VJOW4QQYdYlR62A,yKuAM6kL5sSKtfoe9a3GpQ,yUO7zPWGbumX42UzjbuBmw,ywPaE5Ai8SK8TeUoeMgJ6Q,zQJh5xUJAlTlMrGIyYPY0g,zg5rJfgT4jhzg1d6r2twnA,zu8Wj0gkbdnia7Dt4grMkQ,zzvlwkcNR1CCqOPXwuvz2A
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--BumyUHiO_7YsHurb9Hkw,,,,,,,,,,,...,,,,,,,,,,
--Qh8yKWAvIP4V4K8ZPfHA,,,,,,,,,2.0,,...,,,,,,,,,,
-0jN99BBgMyJz7N3RetvAg,,,,,,,,,,,...,,,,,,,,,,
-1wbglcr6x1qrUbqP1YAIA,,,,,,,,,,,...,,,,,,,,,,
-2kCxY7_aw5hOz7fJnGMbQ,,,,,,,,,,,...,,,,,,,,,,


In [36]:
userid_rating_matrix.shape

(6572, 200)

In [37]:
P = pd.DataFrame(userid_vectors.toarray(), index=userid_df.index, columns=userid_vectorizer.get_feature_names())
Q = pd.DataFrame(businessid_vectors.toarray(), index=business_df.index, columns=businessid_vectorizer.get_feature_names())

In [38]:
Q.head()

Unnamed: 0_level_0,1,10,100,1010,1015,10pm,11,1130,11am,12,...,york,yorkville,young,yum,yummy,yuzu,zaatar,zangi,zero,zucchini
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-6GJMc_Ln7csmvWWvY2iug,0.011107,0.01996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.039655,...,0.0,0.0,0.0,0.0,0.012307,0.0,0.0,0.0,0.0,0.0
-L88B-PrtzwujhLuYc2k_g,0.030874,0.002522,0.0,0.0,0.0,0.0,0.00915,0.0,0.0,0.006681,...,0.004237,0.0,0.0,0.003607,0.00311,0.0,0.0,0.0,0.005482,0.0
0-k_-9YoWXpmtiaLdKeQJQ,0.017295,0.012087,0.012044,0.0,0.0,0.0,0.003132,0.0,0.0,0.009148,...,0.0,0.0,0.002983,0.0,0.002129,0.0,0.0,0.0,0.003753,0.0
0KVx4FX9trgpM6pzH2C8_A,0.023472,0.00703,0.012259,0.0,0.0,0.0,0.0,0.0,0.0,0.027935,...,0.0,0.0,0.012143,0.010055,0.00867,0.0,0.0,0.0,0.0,0.0
1jNteKQ2JuF6Sk1SI9X23Q,0.009062,0.010179,0.0,0.0,0.0,0.0,0.003693,0.0,0.0,0.005393,...,0.00342,0.0,0.0,0.002912,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
def matrix_factorization(R, P, Q, steps=25, gamma=0.001,lamda=0.02):
    for step in range(steps):
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    eij=R.loc[i,j]-np.dot(P.loc[i],Q.loc[j])
                    P.loc[i]=P.loc[i]+gamma*(eij*Q.loc[j]-lamda*P.loc[i])
                    Q.loc[j]=Q.loc[j]+gamma*(eij*P.loc[i]-lamda*Q.loc[j])
        e=0
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    e= e + pow(R.loc[i,j]-np.dot(P.loc[i],Q.loc[j]),2)+\
                    lamda*(pow(np.linalg.norm(P.loc[i]),2)+pow(np.linalg.norm(Q.loc[j]),2))
        if e<0.001:
            break
        
    return P, Q


In [40]:
%%time
P, Q = matrix_factorization(userid_rating_matrix, P, Q, steps=25, gamma=0.001,lamda=0.02)

Wall time: 34min 42s


In [41]:
Q.head()

Unnamed: 0_level_0,1,10,100,1010,1015,10pm,11,1130,11am,12,...,york,yorkville,young,yum,yummy,yuzu,zaatar,zangi,zero,zucchini
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-6GJMc_Ln7csmvWWvY2iug,0.017132,0.034855,0.000511,0.000206,0.006185,8.4e-05,0.005804,0.000134,0.00014,0.059111,...,0.001657,0.000253,0.000348,0.000961,0.018579,5.2e-05,0.000193,9e-05,0.00012,0.000266
-L88B-PrtzwujhLuYc2k_g,0.084794,0.022455,0.003713,0.000201,0.000285,0.000191,0.032171,0.000293,0.000186,0.031082,...,0.009332,0.002626,0.000797,0.046546,0.022675,7e-05,0.000347,0.000525,0.022764,0.000203
0-k_-9YoWXpmtiaLdKeQJQ,0.059746,0.071194,0.054612,0.000202,0.000413,0.000126,0.009131,0.002521,0.000365,0.077925,...,0.002202,0.000439,0.00511,0.001401,0.039977,0.00013,4.6e-05,0.000725,0.008293,0.00015
0KVx4FX9trgpM6pzH2C8_A,0.075105,0.013967,0.021428,0.000155,0.000228,0.000123,0.000247,0.000167,0.000195,0.036572,...,0.000313,8e-05,0.021557,0.018982,0.035051,7.8e-05,1.1e-05,0.000221,0.00026,0.000135
1jNteKQ2JuF6Sk1SI9X23Q,0.039287,0.028941,0.005218,9.6e-05,0.000244,0.000112,0.012825,0.000168,0.000136,0.010727,...,0.033456,0.002979,0.003696,0.012473,0.00887,9.2e-05,5.5e-05,0.000436,0.000229,0.000105


In [42]:
Q.iloc[0].sort_values(ascending=False).head(10)

pub       0.383736
place     0.332432
food      0.331529
beer      0.329724
good      0.272876
wings     0.262686
nachos    0.251123
staff     0.243039
pubs      0.238149
really    0.229426
Name: -6GJMc_Ln7csmvWWvY2iug, dtype: float64

### Export Model

In [43]:
# Store P, Q and vectorizer in pickle file
import pickle
output = open('tor_recommendation_model.pkl', 'wb')
pickle.dump(P,output)
pickle.dump(Q,output)
pickle.dump(userid_vectorizer,output)
output.close()

## Prediction based on input text

In [44]:
%%time
words = "I look for newyork steak"
test_df= pd.DataFrame([words], columns=['text'])
test_df['text'] = test_df['text'].apply(text_process)
test_vectors = userid_vectorizer.transform(test_df['text'])
test_v_df = pd.DataFrame(test_vectors.toarray(), index=test_df.index, columns=userid_vectorizer.get_feature_names())

predictItemRating=pd.DataFrame(np.dot(test_v_df.loc[0],Q.T),index=Q.index,columns=['Rating'])
topRecommendations=pd.DataFrame.sort_values(predictItemRating,['Rating'],ascending=[0])[:3]
topRecommendations

Wall time: 19 ms


Unnamed: 0_level_0,Rating
business_id,Unnamed: 1_level_1
eQCYur0033RkZdh5VjbF8A,0.262493
XDO88u3vWV64uekyHQUjCw,0.261403
oAq3-dXyDT3k4Mluc1ftTA,0.247535


In [45]:

for i in topRecommendations.index:
    print('Business Name: ' + bus_tor_fd_df[bus_tor_fd_df['business_id']==i]['name'].iloc[0])
    print('Address: '+bus_tor_fd_df[bus_tor_fd_df['business_id']==i]['address'].iloc[0])
    print(bus_tor_fd_df[bus_tor_fd_df['business_id']==i]['city'].iloc[0])

    print('Business Categories: '+ bus_tor_fd_df[bus_tor_fd_df['business_id']==i]['categories'].iloc[0])
    print('Star Rating: '+ str(bus_tor_fd_df[bus_tor_fd_df['business_id']==i]['restaurant_rating'].iloc[0])+ \
          '; Number of review: '+ str(bus_tor_fd_df[bus_tor_fd_df['business_id']==i]['review_count'].iloc[0]))
    print('')

Business Name: "Oliver & Bonacini CafÃ© Grill"
Address: "33 Yonge Street"
Toronto
Business Categories: Nightlife;Canadian (New);Restaurants;American (Traditional);Breakfast & Brunch;Italian;Wine Bars;Sandwiches;Bars
Star Rating: 3.5; Number of review: 204

Business Name: "Dairy Freeze"
Address: "1601 St Clair Avenue W"
Toronto
Business Categories: Diners;Fast Food;Breakfast & Brunch;Burgers;Restaurants
Star Rating: 3.0; Number of review: 39

Business Name: "Yamato Japanese Restaurant"
Address: "24 Bellair Street"
Toronto
Business Categories: Sushi Bars;Japanese;Restaurants
Star Rating: 3.5; Number of review: 134



## Measuring the accuracy of the Matrix Factorization Recommender System

In [46]:
f = open('tor_recommendation_model.pkl', 'rb')
P, Q, userid_vectorizer = pickle.load(f), pickle.load(f), pickle.load(f)


In [47]:
test_df = pd.DataFrame([words], columns=['text'])
test_df['text'] = test_df['text'].apply(text_process)
test_vectors = userid_vectorizer.transform(test_df['text'])
test_v_df = pd.DataFrame(test_vectors.toarray(), index=test_df.index,
                         columns=userid_vectorizer.get_feature_names())

In [48]:
test_df = X_valid.to_frame()
test_df['text'] = test_df['text'].apply(text_process)
test_vectors = userid_vectorizer.transform(test_df['text'])
test_v_df = pd.DataFrame(test_vectors.toarray(), index=test_df.index,
                         columns=userid_vectorizer.get_feature_names())

In [49]:
y_pred = []
for key, row in test_v_df.iterrows():
    predictItemRating=pd.DataFrame(np.dot(row,Q.T),index=Q.index,columns=['Rating'])
    topRecommendations=pd.DataFrame.sort_values(predictItemRating,['Rating'],ascending=[0])[:1]
    y_pred.append(topRecommendations.index[0])

In [50]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score #, classification_report, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
print('Accuracy for validation set is: ', accuracy_score(y_valid, y_pred))



Accuracy for validation set is:  0.5211428571428571
