# **Yelp User Rec - Item Based Collaborative Filtering (Oregon)**



In [1]:
import pandas as pd
import numpy as np
import itertools
from io import StringIO
import re

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Importing Data** 


Json Files downloaded from:
https://www.kaggle.com/yelp-dataset/yelp-datasetbeing 

Datasets used: 
- yelp_academic_dataset_business.json
- yelp_academic_dataset_user.json
- yelp_academic_dataset_review.json

Files uploaded to path: '/content/drive/MyDrive/DataX/'

and read through Google Drive. 

In [3]:
business = []
with open('/content/drive/MyDrive/DataX/yelp_academic_dataset_business.json', 'r') as f:
    while True:
        lines = list(itertools.islice(f, 1000))
        if lines:
            lines_str = ''.join(lines)
            business.append(pd.read_json(StringIO(lines_str), lines=True))
        else:
            break
df_business = pd.concat(business)

In [4]:
user = []
with open('/content/drive/MyDrive/DataX/yelp_academic_dataset_user.json', 'r') as f:
    while True:
        lines = list(itertools.islice(f, 1000))
        if lines:
            lines_str = ''.join(lines)
            user.append(pd.read_json(StringIO(lines_str), lines=True))
        else:
            break
df_users = pd.concat(user)

In [5]:
review = []
with open('/content/drive/MyDrive/DataX/yelp_academic_dataset_review.json', 'r') as f:
    while True:
        lines = list(itertools.islice(f, 1000))
        if lines:
            lines_str = ''.join(lines)
            review.append(pd.read_json(StringIO(lines_str), lines=True))
        else:
            break
df_reviews = pd.concat(review)

# **Data Wrangling**

In [6]:
#Filtering the Data Frame to be businesses located only in Oregon

df_or = df_business[df_business['state'] == 'OR']
price_point = []
for attributes in df_or.attributes:
  if attributes == None:
    price_point.append(None)
  else:
    t_f = 'RestaurantsPriceRange2' in attributes
    if t_f:
      price = attributes['RestaurantsPriceRange2']
      if price == 'None':
        price_point.append(None)
      else:
        price = int(price)
        price_point.append(price)
    else:
      price_point.append(None)

In [7]:
restaurant_t_f = []
for categories in df_or.categories:
  if categories == None:
    restaurant_t_f.append(False)
  elif 'Restaurants' in categories: #or 'Food' in categories
    restaurant_t_f.append(True)
  else:
    restaurant_t_f.append(False)

df_or['Restaurant'] = restaurant_t_f

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [8]:
df_or['price_point'] = price_point
reviews_business = pd.merge(df_reviews, df_or, on = 'business_id')
counts = reviews_business['user_id'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [9]:
sum_stars = {}

for i in range(len(reviews_business)):
  user = reviews_business.iloc[i, 1]
  stars = reviews_business.iloc[i, 3]
  if user in sum_stars.keys():
    curr_val = sum_stars[user]
    sum_stars[user] = curr_val + stars
  else:
    sum_stars[user] = stars

In [10]:
mean_stars = []

for user in reviews_business['user_id']:
  mean = sum_stars[user] / counts[user]
  mean_stars.append(mean)
  
reviews_business['normalized_stars'] = reviews_business['stars_x'] - mean_stars

In [11]:
#This cell can be adjusted to include predictions for less users. 
#If you just want to see the model in action without spending too much time waiting for the cells to load.
#Feel free to uncomment the next line and comment out the current line:

#user_reviews_business = reviews_business[reviews_business['user_id'].isin(counts.index[counts > 99])]

user_reviews_business = reviews_business[reviews_business['user_id'].isin(counts.index[counts > 9])]

# **User-Business Ratings Matrix**

In [32]:
user_ratings = pd.pivot_table(user_reviews_business, values = 'stars_x', index = 'user_id', columns = ['business_id'])
user_ratings = user_ratings.dropna(thresh = 3, axis = 1)

In [33]:
#Normalized version
norm_user_ratings = pd.pivot_table(user_reviews_business, values = 'normalized_stars', index = 'user_id', columns = ['business_id'])
norm_user_ratings = norm_user_ratings.dropna(thresh = 3, axis = 1)

In [34]:
norm_user_ratings

business_id,--6COJIAjkQwSUZci_4PJQ,--UNNdnHRhsyFUbDgumdtQ,-0SmuXTzQITMmr_jfXrLrA,-0p-JeIeAY_u6NEEUMgGNg,-1Dcv3siosFTgDJhN4EgGg,-1h2qkElNfKjUPw6brMbIw,-2NkJMCUBnh3NEkmWQ1oWw,-3VQI-QXI7fr4TWH0d8p5A,-3acxwTxBUc6l3xCMrL1_Q,-3hnEon-pSQRGPDr80s3Gw,-3jJSQtMfsGacIUHaFcZbQ,-3zYQHzHqKA739pRcj9gWQ,-497StkBbvRrxUs6RwPA2w,-4Sgl3Plr1k0JpvaWAzTMA,-5-s0IGhN633QhhS_TSR9A,-5wOIgIfCMPI5LJ6uSlkig,-5zKNFxuoPm8L9OQ0LtzvA,-6FX2iidcEY5OMOY_Qu8hg,-6Pmtqzqcv2wwu9zatwnSQ,-79l25_zCDRHP9pk8q5xMg,-7m5rxrbMalZR2BK6J4zeQ,-8w_P2oWCY3PjX-C-iyPZQ,-921gYzvMFJmxJw9SMZ5mA,-99rAyLj7X3iRbwyPvT1CA,-9ap9pStLtFBYoMLRl7hVA,-9tGFWH-V_KwmGuGcBl3uA,-ALQON0qCeJcxptieqdUGA,-Bj2in9Cp19RxmLKP2MYMw,-C0K8n7G7qIGT5OWyPhNiA,-CPd_oNhKSF9R4cjNg0yzQ,-DUBAZshvsnJQ5_2z-gOcQ,-EMS05MOf72VAqiHi4Ymxw,-Eu-BXVKSvzQXngtlCJeGg,-F60MZqS1gebduEO4ZXrkQ,-F6NlEBUUf5PznCycJcOVg,-FNjIbIqybETW-eUaNMbLA,-GMkoMATTeESkf6bhcNQ3g,-Ger2pFfA-vwXbPyrfzxLQ,-Ghon5H01IghsBVEz2qQGA,-GpSholMbEzviScVV4QOIA,...,ziQ9b-nxL0zU5QOqrLmR_w,zicLQKifCIW7ICZxSAFERg,zim9z0Bv8OQRRZ7tnYq7EA,zimZZDsjRcuRPgZWkyCRwg,zioLxtBc9THNS2TOn9xW1w,zjLnKz70GU94A1s_0iclQg,zjQ-AscaT-2pyxiukpxiFQ,zkGmrEHkgmODkAOvLf6FXw,zkz1nksuZJwmtYeQTgLYUQ,zlIwd-Ph3P768CqLo2ihJg,zlg8Z8COVU1Ju9QNqdZA3w,zm6stMrEtf8CWXsXc36waQ,zmeiE_ZNndPcto-q-o_tDA,zmiVoJuRBYiZGWAgeUOIUw,zn6v37LzwfjdU_CwD5YJKA,zniMHN5RGmywJqRMt68tKQ,znlwoZXOLzC1uV8OA9K77A,znuiSxCNF6MyJFVmYUqpJg,zo1tR7KEU5iRuqMDhvH1Ew,zo95i58Pi_9DG1Sq8Nr1TA,zoH3uQn5-HP9Y4-zzlgkfQ,zpVL0nqN4WiD2knhhW-rOg,zqDaDcrPJbecQhT4z6okIg,zqLq_lgtov34FaYZidQxTQ,zqZuLJk_mOEuC131cXbeWg,zqbwECf-hsC0-X06f1CZeg,zsZx06ssEyPrOicFvmp7VA,zsc9WODKzlEYBxzx8orw6Q,zt_Dy0aW6LLY_k2Uo-TxDw,ztvWkrzw8DLOIBikIuAChw,zuGlW9RwNNYD2iI9BlwxDQ,zugu5Y9MeSAZ_1k1RfXMwQ,zvN-ArlsIr-INMapSnCARQ,zwCnvvz9eEgRsnmqijRDbA,zwhSGiftT_yzKSEmMCol6Q,zx1nvEj6m3GEChxvsPjOKA,zyBC3BUkH9klhPhMyQmxAQ,zyauuvAYdVweBK4L7wBRmw,zz5OUCGJvH-v9bDLxAjVsg,zzpmoTVq4yn86U7ArHyFBQ
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
-1pmeSv4fMJ1Sqt_k6isKg,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,-1.701923,,,,,,,,,,,,,,,,,,,,,,,,,,
-3ATrOPUw9ZFTYkMIFfmwQ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
-3s52C4zL_DHRK0ULG6qtg,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-0.043478,,,,,,,...,,,,,,,,,,0.956522,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
-53qy8hpIJRRvsEg6DVVzg,,,,,,,,,,,,,,,,,,,,,,,,,,,,-2.140845,,,,,,,,,,,,,...,,,,,,,,,,,,,,,0.359155,,,,,,,,,,,,,,,,,,,,,,,,,-2.140845
-A9ICz4e9hgrNK4-Fs3TmQ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
z_COVSPM6GIfxjkH9Ir7Tw,,,,,,,,1.060302,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.060302,,,,,,-0.939698,,,,,
zeKm4W2yTzrZzr1IOMDcvw,,,,,,,,,,,,,,,,,,,,,,,0.991379,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
zn70PhXYTU56znyMbJvfeg,,,,,,,,,,,,0.39959,-0.60041,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-1.600410,0.39959,
ztHM83LXuSKmj4H48TkTUQ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,0.927632,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-0.072368,,


# **Cosine Similarity Matrix**

In [130]:
#Code obtained from: https://medium.com/analytics-vidhya/speed-up-cosine-similarity-computations-in-python-using-numba-c04bc0741750
#Using jit, speeds up computational run-time

import math
from numba import jit
@jit(nopython=True)
def cosine_similarity(u_norm:np.ndarray, v_norm:np.ndarray, u:np.ndarray, v:np.ndarray):
    uv = 0
    uu = 0
    vv = 0
    for i in range(u_norm.shape[0]):
      val = u_norm[i]*v_norm[i]
      if ~math.isnan(val):
        uv += val
        uu += u[i]*u[i]
        vv += v[i]*v[i]
    cos_theta = 0
    if uu!=0 and vv!=0:
        cos_theta = uv/np.sqrt(uu*vv)
    return cos_theta

In [131]:
#This cell will take a long time to run depending on the number of user we are creating predictions for.
#This cell is calculating the cosine similarity between each and every business

norm_ratings_array = []
for i in range(len(norm_user_ratings.columns)):
  norm_ratings_array.append(norm_user_ratings.iloc[:, i].to_numpy())

ratings_array = []
for i in range(len(user_ratings.columns)):
  ratings_array.append(user_ratings.iloc[:, i].to_numpy())

similarity_matrix = []
for i in range(len(norm_user_ratings.columns)):
  corr = []
  for j in range(len(norm_user_ratings.columns)):
    similarity = cosine_similarity(norm_ratings_array[i], norm_ratings_array[j], ratings_array[i], ratings_array[j])
    corr.append(similarity)
  similarity_matrix.append(corr)

# **Rating Predictions**

In [145]:
#Creating this dictionary allows us to get the business name from its index.
#we are able to achieve the same function using df.iloc, however we are using a dictionary as this greatly speeds up the search runtime.

business_index = {}
i = 0
for business in user_ratings.columns:
  business_index[i] = business
  i += 1

In [161]:
def user_predictions(user, n_top=5):
  user_rated = user_ratings.loc[user, :]
  user_pred = []
  rated_places = user_ratings.columns[(user_ratings > 0).loc[user]]
  relevant_similarity_index = [user_ratings.columns.get_loc(x) for x in rated_places]
  for i in range(len(user_rated)):
    if user_rated[i] > 0:
      user_pred.append(-1.0)
    else:
      business_scores = similarity_matrix[i]
      actual_index = {}
      for index in relevant_similarity_index:
        actual_index[business_scores[index]] = index
      rel_business_scores = list(actual_index.keys())
      sorted_scores = sorted(rel_business_scores, reverse = True)
      score = 0.000000000001
      total_weight = 0.000000000001
      if n_top > len(sorted_scores):
        n_top = len(sorted_scores)
      for k in range(n_top):
        if sorted_scores[k] > 0:
          weight = sorted_scores[k]
          business_rating = user_rated[actual_index[weight]]
          score += (weight * business_rating)
          total_weight += weight
      predicted_score = score / total_weight
      user_pred.append(predicted_score)
  return user_pred

In [134]:
def recommend(user, top_business):
  recommendations = pd.DataFrame(index = [user])
  recommendations["User's_name"] = float("NaN")
  recommendations['business_name'] = float("NaN")
  recommendations['business_categories'] = float("NaN")
  recommendations['business_stars'] = float("NaN")
  recommendations['price_point'] = float("NaN")
  #recommendations['predicted_score'] = float("NaN")
  for i in range(len(recommendations)):
    recommendations.iloc[i, 0] = df_users[df_users['user_id'] == user]['name'].item()
    recommendations.iloc[i, 1] = df_or[df_or['business_id'] == top_business]['name'].item()
    recommendations.iloc[i, 2] = df_or[df_or['business_id'] == top_business]['categories'].item()
    recommendations.iloc[i, 3] = df_or[df_or['business_id'] == top_business]['stars'].item()
    recommendations.iloc[i, 4] = df_or[df_or['business_id'] == top_business]['price_point'].item()
    #recommendations.iloc[i, 5] = rec_ratings.loc[user, business]
  return recommendations

# **End Function**

In [162]:
def restaurant_recommender(user, restaurant=None, stars=None, price_point=None):
  temp_table = df_or.copy()
  if restaurant != None:
    temp_table = temp_table[temp_table['Restaurant'] == restaurant]
  if stars != None:
    temp_table = temp_table[temp_table['stars'] >= stars]
  if price_point != None:
    temp_table = temp_table[temp_table['price_point'] == price_point]
  chosen_businesses = [user_ratings.columns.get_loc(business) for business in user_ratings.columns if business in list(temp_table['business_id'])]
  business_predictions = user_predictions(user)
  top_index = 0
  for i in range(len(business_predictions)):
    if i not in chosen_businesses:
      business_predictions[i] = -1.0
    if business_predictions[i] > business_predictions[top_index]:
      top_index = i
  best_business = user_ratings.columns[top_index]
  to_return = recommend(user, best_business)
  #pd.set_option("display.max_rows", None, "display.max_columns", None)
  return to_return

In [136]:
user_ratings.index

Index(['-1pmeSv4fMJ1Sqt_k6isKg', '-3ATrOPUw9ZFTYkMIFfmwQ',
       '-3s52C4zL_DHRK0ULG6qtg', '-53qy8hpIJRRvsEg6DVVzg',
       '-A9ICz4e9hgrNK4-Fs3TmQ', '-IQoG8bdizVi-Hm2j_9kJg',
       '-MfY7TdPO7scYRULM_zUVg', '-OSEuBmsr31bGCJ1Nueb1Q',
       '-PYZjg-9bhZVFrT5rdKJew', '-QAPcuddGxs5BfgqnS9lig',
       ...
       'zDBOdWtl2PsNY38IeoE5cQ', 'zJyGifHlb3uwFyiaBFk3ZQ',
       'zSJSbya4yryJbfWBtvnqJA', 'zWGj3oOA_X8QPho_l5BRKg',
       'zWzEwzH6WBGpyrQkvkmywg', 'z_COVSPM6GIfxjkH9Ir7Tw',
       'zeKm4W2yTzrZzr1IOMDcvw', 'zn70PhXYTU56znyMbJvfeg',
       'ztHM83LXuSKmj4H48TkTUQ', 'ztvRQSJ2Be-7TtAYHuMu_w'],
      dtype='object', name='user_id', length=849)

# **Recommendation:**

Input User ID:

Filters: Restaurants, Stars, Price Point

In [163]:
restaurant_recommender('ztvRQSJ2Be-7TtAYHuMu_w', True, 4, 1)

Unnamed: 0,User's_name,business_name,business_categories,business_stars,price_point
ztvRQSJ2Be-7TtAYHuMu_w,Kody,21st Century Pizza,"Food Delivery Services, Pizza, Food, Restaurants",4.0,1.0
