# Yelp Recommendation system

The Yelp Data Set can be found [Yelp Dataset Challenge](https://www.yelp.com/dataset_challenge).
<br/>
I'd like to use business.json and user.json to build my recommendation system

# Process business data set

In [20]:
import json
import pandas as pd
file_business,file_review, file_user = [
    './yelp_dataset/business.json',
    './yelp_dataset/review.json',
    './yelp_dataset/user.json'
]

In [24]:
with open(file_business, encoding="utf8") as f:
    df_business = pd.DataFrame(json.loads(line) for line in f)

In [27]:
df_business.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state
0,2818 E Camino Acequia Drive,{'GoodForKids': 'False'},1SWheh84yJXfytovILXOAQ,"Golf, Active Life",Phoenix,,0,33.522143,-112.018481,Arizona Biltmore Golf Club,85016,5,3.0,AZ
1,30 Eglinton Avenue W,"{'RestaurantsReservations': 'True', 'GoodForMe...",QXAEGFB4oINsVuTFxEYKFQ,"Specialty Food, Restaurants, Dim Sum, Imported...",Mississauga,"{'Monday': '9:0-0:0', 'Tuesday': '9:0-0:0', 'W...",1,43.605499,-79.652289,Emerald Chinese Restaurant,L5R 3E7,128,2.5,ON
2,"10110 Johnston Rd, Ste 15","{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...",gnKjwL_1w79qoiV3IC_xQQ,"Sushi Bars, Restaurants, Japanese",Charlotte,"{'Monday': '17:30-21:30', 'Wednesday': '17:30-...",1,35.092564,-80.859132,Musashi Japanese Restaurant,28210,170,4.0,NC
3,"15655 W Roosevelt St, Ste 237",,xvX2CttrVhyG2z1dFg_0xw,"Insurance, Financial Services",Goodyear,"{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ...",1,33.455613,-112.395596,Farmers Insurance - Paul Lorenz,85338,3,5.0,AZ
4,"4209 Stuart Andrew Blvd, Ste F","{'BusinessAcceptsBitcoin': 'False', 'ByAppoint...",HhyxOkGAM07SRYtlQ4wMFQ,"Plumbing, Shopping, Local Services, Home Servi...",Charlotte,"{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ...",1,35.190012,-80.887223,Queen City Plumbing,28217,4,4.0,NC


## Find the Resturants information in "Las Vegas"

In [29]:
# city Las Vegas: bool
cond_city = df_business['city'] == "Las Vegas"

# isnull: bool
cond_category_not_null = ~df_business["categories"].isnull()

# restaurant in categories
# first, apply(str) convert categories to strings,
# then check if it contains Restaurants
cond_category_restaurant = df_business["categories"].apply(str).str.contains("Restaurants")

# Create filtered DataFrame, and name it df_filtered
df_filtered = df_business[cond_city & cond_category_not_null & cond_category_restaurant]

## Keep useful columns

* only keep some useful columns
    * business_id
    * name
    * categories
    * stars

In [30]:
selected_features = [u'business_id', u'name', u'categories', u'stars']

# Make a DataFrame that contains only the abovementioned columns, and name it as df_selected_business
df_selected_business = df_filtered[selected_features]

# Rename the column name "stars" to "avg_stars" to avoid naming conflicts with review dataset
df_selected_business.rename(columns={"stars":"avg_stars"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


## Save results to csv files

In [32]:
# Save to ./data/selected_business.csv for your next task
df_selected_business.to_csv("./selected_business.csv", index=False, encoding="utf-8")

## Load Review Dataset

In [33]:
with open(file_review, encoding="utf8") as f:
    df_review = pd.DataFrame(json.loads(line) for line in f)

## Join on business_id
filter data by date, keep comments from last 1 years (better perform filter before join, so data is smaller)

In [35]:
# Prepare the business dataframe and set index to column "business_id", and name it as df_left
df_left = df_selected_business.set_index('business_id')

# Prepare the review dataframe and set index to column "business_id", and name it as df_right
# Also filter date after '2018-02-01'
df_review = df_review[df_review['date']>'2018-02-01'].set_index('business_id')

# Join df_left and df_right. What type of join?
df_final = df_left.join(df_review,how='inner')

# You may want to reset the index 
df_final = df_final.reset_index()

In [37]:
# Save to data/last_2_years_restaurant_reviews.csv for your next task
df_final.to_csv('./last_1_years_restaurant_reviews.csv',index=False)

# Yelp Restaurant Recommender system

In [1]:
# load library

import pandas as pd
import numpy as np

from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from time import time

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("ggplot")

In [6]:
# load data
df = pd.read_csv("./last_1_years_restaurant_reviews.csv")
df.head()

Unnamed: 0,business_id,name,categories,avg_stars,cool,date,funny,review_id,stars,text,useful,user_id
0,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Cajun/Creole, Seafood, Steakhouses, Restaurants",4.0,0,2018-05-08 16:39:25,0,tTD3wegtsiHOjVTLgsb7FA,5.0,On yelp 5 stars = Woohoo! as good as it gets! ...,0,joe4i-lcCFd2wViA2agGKw
1,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Cajun/Creole, Seafood, Steakhouses, Restaurants",4.0,0,2018-06-19 23:47:56,0,aie6cd-6-nbQ5SOMgfSteg,5.0,Excellent food and service!! Everyone enjoyed ...,0,CW2rG0leS6vuVHLyvUx4Cg
2,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Cajun/Creole, Seafood, Steakhouses, Restaurants",4.0,0,2018-04-05 00:49:01,0,HGegJEZMSilsyt7RNcxwCw,5.0,We had a great experience! Super friendly and...,0,hOQ70lGgr6hYRMcoS330Kw
3,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Cajun/Creole, Seafood, Steakhouses, Restaurants",4.0,0,2018-04-28 18:00:09,0,jR8Xh9MM9kxjD3aNMsKVYg,3.0,Very average mediocre steakhouse. The ambiance...,0,xXl-Q04KyQz6VOQyyFl5XA
4,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"Cajun/Creole, Seafood, Steakhouses, Restaurants",4.0,0,2018-06-27 01:10:57,0,VE6LgMGd35y0_yACyeXn0Q,2.0,This was our group's 2nd Vegas visit in a row ...,0,w-egZTgNCg0_aCx0Ci32Gg


# Collaborative Filtering Recommender

## Get Relevant Columns
* business_id
* user_id
* stars

In [5]:
# Get business_id, user_id, stars for recommender
df_stars = df[['business_id', 'user_id', 'stars']]

## Basic Information

In [7]:
# stars distribution
df_stars['stars'].value_counts()

5.0    97498
4.0    34304
1.0    25830
3.0    17700
2.0    13656
Name: stars, dtype: int64

In [8]:
# number of unique business_id (restaurant)
df_stars['business_id'].nunique()

4249

In [9]:
# number of unique user_id (user)
df_stars['user_id'].nunique()

102800

In [10]:
# user comments counts
df_user_counts = df_stars['user_id'].value_counts()
df_user_counts.head(10)

I-4KVZ9lqHhk8469X9FvhA    268
bLbSNkLggFnqwNNzzq-Ijw    133
3tCzdXFXke958utEjcSdmg    125
keBv05MsMFBd0Hu98vXThQ    123
NrpzAH3EoNhWUR8OysUhBQ    123
U4INQZOPSUaj8hMjLlZ3KA    122
G2ToQ5eT91v2hkIT7I4X7g    103
YE54kKTuqJJPNYWIKIpOEQ     92
qKUEhJUl0Z9MFW7R7BTvRQ     77
Xsyw11gZiltqWL5RvWGKJg     77
Name: user_id, dtype: int64

## Exclude Users With Less Reviews

There are many users that haven't given many reviews, exclude these users from the item-item similarity recommender.

In [17]:
# how many users only comments once
df_user_counts[df_user_counts == 1].sum()

71200

In [18]:
# users comment over 5 times
# count users comment over 5 times as active user
df_users = df_user_counts[df_user_counts > 5]
df_users.count()

4133

In [19]:
# sum of all comments
df_users.sum()

44701

In [20]:
# reorder dataframe
df_stars_cleaned = df_stars.set_index('user_id').loc[df_users.index,:].reset_index()
df_stars_cleaned.head()

Unnamed: 0,user_id,business_id,stars
0,I-4KVZ9lqHhk8469X9FvhA,-AD5PiuJHgdUcAK-Vxao2A,5.0
1,I-4KVZ9lqHhk8469X9FvhA,-_yEVC3_3M6YOsamYfNFEw,4.0
2,I-4KVZ9lqHhk8469X9FvhA,-hzQQ8tbXThpp0k5E_y1Dw,4.0
3,I-4KVZ9lqHhk8469X9FvhA,-oUM2uISux96lMGeawHIOA,5.0
4,I-4KVZ9lqHhk8469X9FvhA,-oUM2uISux96lMGeawHIOA,5.0


In [24]:
df_stars_cleaned['business_id'].nunique()

3626

## Utility Matrix

In [21]:
# Use pivot table to represent Utility matrix
df_utility = pd.pivot_table(data=df_stars_cleaned,
                            values='stars', 
                            index='user_id', 
                            columns='business_id',
                            fill_value=0) 

In [22]:
df_utility.head()

business_id,--9e1ONYQuAa-CB_Rrw7Tw,-0RkJ_uIduNLWQrphbADRw,-1m9o3vGRA8IBPNvNqKLmA,-3zffZUHoY8bQjGfPSoBKQ,-46pFijv3f2jZH-_Ze6CQA,-8R_-EkGpUhBk55K9Dd4mg,-AD5PiuJHgdUcAK-Vxao2A,-ADtl9bLp8wNqYX1k3KuxA,-Bf8BQ3yMk8U2f45r2DRKw,-BmqghX1sv7sgsxOIS2yAg,...,zmltWmTpoBt5sCU-5Kzj-Q,znWHLW1pt19HzW1VY6KfCA,zp-K5s3pGTWuuaVBWo6WZA,zpoZ6WyQUYff18-z4ZU1mA,zr42_UsWfaIF-rcp37OpwA,zsQk990PubOHjr1YcLkQFw,zttcrQP4MxNS5X5itzStXg,zuwba6QEBIDZT0tJZmNhdQ,zwNC-Ow4eIMan2__bS9-rg,zx_j6OuuHHa2afVoAZuLpA
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-1D4lTzn2HzQlW7OZJo16Q,0,0.0,0,0,0,0,0,0.0,0,0,...,0,0,0,0.0,0,0,0,0,0,0
-4JBueG4IRIh2Ie5EEIJSg,0,0.0,0,0,0,0,0,0.0,0,0,...,0,0,0,0.0,0,0,0,0,0,0
-6gYApoyPvez737nxfnGqw,0,0.0,0,0,0,0,0,0.0,0,0,...,0,0,0,0.0,0,0,0,0,0,0
-6h_wj5ZOXZQyDyFdhd5BA,0,0.0,0,0,0,0,0,0.0,0,0,...,0,0,0,0.0,0,0,0,0,0,0
-6hzTZbyE_vAnlLe4lEkpA,0,0.0,0,0,0,0,0,0.0,0,0,...,0,0,0,0.0,0,0,0,0,0,0


In [23]:
df_utility.shape

(4133, 3626)

## Similarity Matrix

In [26]:
# Item-Item Similarity Matrix, I choose cosine_similarity here
item_sim_mat = cosine_similarity(df_utility.T) ## similarity matrics is a hyper parameter

In [27]:
item_sim_mat.shape

(3626, 3626)

## Calculate Neighborhood

In [46]:
# Find Neighborhoos with highest similarity
least_to_most_sim_indexes = np.argsort(item_sim_mat, axis=1)
# Hyperparameter: number of neighborhoods
neighborhood_size = 10 # can tune neighborhood size , hyper parameter
neighborhoods = least_to_most_sim_indexes[:, -neighborhood_size:] ## least_to_most_sim_indexes[:, -neighborhood_size: -1]

## Prediction for a user

In [47]:
# Let's pick a lucky user
user_id = 'qKUEhJUl0Z9MFW7R7BTvRQ'

n_users = df_utility.shape[0]
n_items = df_utility.shape[1]

start_time = time()

items_rated_by_this_user = df_utility.loc[user_id,:].values.nonzero()[0]
# Just initializing so we have somewhere to put rating preds
out = np.zeros(n_items)
for item_to_rate in range(n_items):
    relevant_items_idx = np.intersect1d(neighborhoods[item_to_rate],
                                    items_rated_by_this_user,
                                    assume_unique=True)  # assume_unique speeds up intersection op
    relevant_items = df_utility.columns[relevant_items_idx].values
#     print (relevant_items_idx)
#     print (relevant_items)
#     print (df_utility.loc[user_id, relevant_items].values)
#     print (item_sim_mat[item_to_rate, relevant_items_idx])
#     print (item_sim_mat[item_to_rate, relevant_items_idx].sum())
    out[item_to_rate] = df_utility.loc[user_id, relevant_items].values.dot(item_sim_mat[item_to_rate, relevant_items_idx].T) / item_sim_mat[item_to_rate, relevant_items_idx].sum()

pred_ratings = np.nan_to_num(out)
# print(pred_ratings)
print("Execution time: %f seconds" % (time()-start_time))



Execution time: 11.180140 seconds


## Final Recommendations

In [48]:
# Hyperparameter: Recommend n movies
n = 10

# Get item indexes sorted by predicted rating
item_index_sorted_by_pred_rating = list(np.argsort(pred_ratings))

# Find items that have been rated by user
item_index_rated_by_this_user = df_utility.loc[user_id,:].values.nonzero()[0]

# We want to exclude the items that have been rated by user
unrated_items_by_pred_rating = [item for item in item_index_sorted_by_pred_rating
                                if item not in item_index_rated_by_this_user]

df_utility.columns[unrated_items_by_pred_rating[-n:]].values

array(['0VjHFdczi6Nln_nn8bucJQ', 'Mv2SeBkY0DASmwO1-mM-iQ',
       'xxaqaD9IVsqxiNF8FoHdYA', '5iAFiMPfitcYDMeII5_dnQ',
       'XtE_J8DZXgjFbW1xVkqmUw', 'jSu2A7DibDjmUd3wUt6fPg',
       'gDyOfw49K5Sr4-vKVVwDfA', 'ejcVeQztxXY6NX2O2SE0Rw',
       'Gd2OE3FIWzv5boI99bM6Iw', 'Vg1C_1eqwIwkZLIXGMTW3g'], dtype=object)

## Evaluation
* may have overfitting risk
* can get a rough estimation of the performance of my recommender

In [49]:
# get item id that is rated by this user
item_rated_by_this_user = df_utility.columns[item_index_rated_by_this_user].values

# prediction rating score
y_pred = pred_ratings[item_index_rated_by_this_user]

# true rating score
y_true = df_utility.loc[user_id, item_rated_by_this_user].values

# RMSE
np.sqrt(np.mean((y_pred - y_true)**2))

0.5564217521152803

# Content-based recommender

## Extract Features

In [52]:
# group by business_id, then average numerical features

df_average = df.groupby(['business_id']).mean()
df_average.head()

Unnamed: 0_level_0,avg_stars,cool,funny,stars,useful
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
--9e1ONYQuAa-CB_Rrw7Tw,4.0,1.113475,0.666667,4.29078,1.304965
-0BxAGlIk5DJAGVkpqBXxg,3.0,0.0,0.0,3.75,0.0
-0RkJ_uIduNLWQrphbADRw,4.5,1.666667,0.666667,4.301587,1.825397
-1m9o3vGRA8IBPNvNqKLmA,4.5,0.320856,0.165775,4.561497,0.743316
-3zffZUHoY8bQjGfPSoBKQ,4.0,2.555556,1.733333,4.355556,2.711111


In [53]:
# group by business_id, extract categories data

categories_series = df.groupby(['business_id']).categories.apply(np.unique)
categories_series.head()

business_id
--9e1ONYQuAa-CB_Rrw7Tw    [Cajun/Creole, Seafood, Steakhouses, Restaurants]
-0BxAGlIk5DJAGVkpqBXxg                    [Food, Restaurants, Coffee & Tea]
-0RkJ_uIduNLWQrphbADRw    [Coffee & Tea, Cafes, Food, Restaurants, Break...
-1m9o3vGRA8IBPNvNqKLmA    [Bars, Cocktail Bars, Restaurants, African, Se...
-3zffZUHoY8bQjGfPSoBKQ    [Nightlife, Bars, American (New), Seafood, Res...
Name: categories, dtype: object

In [54]:
# convert categories data to string remove `[]`
categories_series = categories_series.str.join('').apply(lambda x: x[1:-1])

In [56]:
# business_id, categories table
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
categories_mat = vectorizer.fit_transform(categories_series).toarray()
categories = vectorizer.get_feature_names()
df_categories = pd.DataFrame(categories_mat,
                             columns=categories, 
                             index=categories_series.index)
df_categories.head()

Unnamed: 0_level_0,acai,acarons,accessories,acos,active,activities,acupuncture,adoption,adult,afes,...,wraps,yelp,yoga,yogur,yogurt,your,yourself,zakaya,zechuan,zoos
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--9e1ONYQuAa-CB_Rrw7Tw,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-0BxAGlIk5DJAGVkpqBXxg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-0RkJ_uIduNLWQrphbADRw,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-1m9o3vGRA8IBPNvNqKLmA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
-3zffZUHoY8bQjGfPSoBKQ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Dimensionality Reduction

In [57]:
# use svd to reduce dimension
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=150,
                   random_state=42)

svd.fit(categories_mat)
categories_svd = svd.transform(categories_mat)
df_categories_svd = pd.DataFrame(categories_svd,
                                 index=categories_series.index)

print(svd.explained_variance_ratio_.sum()) # total variance explained
df_categories_svd.head()

0.8999518166452352


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,140,141,142,143,144,145,146,147,148,149
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--9e1ONYQuAa-CB_Rrw7Tw,0.193714,-0.075249,-0.376415,0.526724,-0.366509,0.289663,-0.545589,0.002152,-0.261248,0.007109,...,-0.00446,-0.042583,0.003215,-0.039412,0.03617,-0.067481,-0.032062,-0.040863,0.049589,-0.072978
-0BxAGlIk5DJAGVkpqBXxg,0.605673,0.05651,0.679714,-0.188741,-0.036146,-0.209562,-0.488897,0.328494,0.028942,0.322817,...,0.058874,0.004971,0.000863,-0.006043,-0.027285,-0.034147,-0.024304,0.019209,-0.003411,0.059326
-0RkJ_uIduNLWQrphbADRw,1.25374,-0.543638,0.523794,-0.149007,-0.112086,-0.419827,-0.113909,1.022559,0.291423,0.26213,...,0.00674,-0.00673,0.006152,-0.006164,0.003069,-0.0034,-0.015504,0.049191,-0.094721,-0.012769
-1m9o3vGRA8IBPNvNqKLmA,1.415311,1.229378,0.074165,-0.049185,-0.051545,-0.185445,0.077454,-0.206934,-0.178,-0.557226,...,0.038001,-0.008299,0.023659,-0.031058,-0.030717,0.006331,-0.004666,0.006079,-0.015482,0.030894
-3zffZUHoY8bQjGfPSoBKQ,0.98651,0.876561,-0.900745,0.646937,-0.437903,0.150016,-0.093438,-0.174853,0.004386,-0.610071,...,0.019433,-0.008534,-0.004273,-0.021365,-0.04859,-0.024665,0.000635,0.010591,0.005858,0.043859


In [58]:
# join two to generate feature data for each business_id

df_business = df_average.join(df_categories_svd)