This notebook builds a recommendation engine driven by review text. The idea is to predict the rating a user would give a business by understanding the user's behaviour and the distinctive things each business offers. To gauge that behaviour we apply Natural Language Processing to the reviews.
---


(At each stage we save the arrays and other objects to disk so that they are not lost if the system crashes due to RAM exhaustion.)


In [0]:
# Mount Google Drive under /content/drive/ so the dataset paths configured below
# resolve; force_remount=True re-mounts even if a mount already exists.
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)


Mounted at /content/drive/


Loading the required libraries

In [0]:
import json
import numpy as np
import scipy
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from sklearn.kernel_ridge import KernelRidge
import matplotlib.pyplot as plt
import math
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression



Setting up the path variables

In [0]:
# Drive folders used by the rest of the notebook, plus the city being modelled.
# path1     -> where the user-preference arrays are saved and reloaded
# path      -> raw Yelp files (business.json, review.json) and most intermediate files
# path_data -> train_noE.csv / test_noE.csv and the predictions of the second experiment
path1 = "/content/drive/My Drive/Sample_trial/"
path = "/content/drive/My Drive/yelp_dataset/"
path_data = "/content/drive/My Drive/Datasets/"
# Compared against the 'city' field of every business record; also used as a file-name prefix.
city_name = "Las Vegas"

Generating the set of unique business IDs for the selected city

In [0]:
# IDs of every business whose 'city' field equals `city_name`.
businesses = set()

# business.json is read as one JSON record per line.
with open(path + 'business.json') as data_file:
    businesses.update(
        record['business_id']
        for record in map(json.loads, data_file)
        if record['city'] == city_name
    )

Counting the number of reviews given by each user and the number of reviews received by each business

In [0]:
# Copy every review of a selected business into a city-specific file and, in the
# same pass, count how many such reviews each user wrote and each business received.
user_count = {}
restaurant_count = {}

# Both files are managed by a single `with`, so the output is flushed and closed
# even if json.loads raises on a malformed line part-way through.
with open(path + 'review.json') as data_file, \
        open(path + city_name + '_reviews.json', 'w') as all_output:
    for line in data_file:
        review = json.loads(line)
        if review['business_id'] not in businesses:
            continue
        # Write the raw line unchanged so the output keeps the one-record-per-line format.
        all_output.write(line)
        user_id, restaurant_id = review['user_id'], review['business_id']
        user_count[user_id] = user_count.get(user_id, 0) + 1
        restaurant_count[restaurant_id] = restaurant_count.get(restaurant_id, 0) + 1


Filtering out the users and businesses that have fewer than 25 corresponding reviews

In [0]:
# Keep only reviews whose author and whose business each have at least
# `active_user_threshold` reviews in the city-level file. The same threshold is
# applied to users and to businesses.
active_user_threshold = 25
eligible_users = {user_id for (user_id, count) in user_count.items() if count >= active_user_threshold}
eligible_restaurants = {rest_id for (rest_id, count) in restaurant_count.items() if count >= active_user_threshold}
print (len(eligible_users), 'users are eligible, out of a total of', len(user_count))
print (len(eligible_restaurants), 'restaurants are eligible, out of a total of', len(restaurant_count))

total_count, eligible_review_count = 0, 0
# Input and output share one `with`, so the filtered file is flushed and closed
# even if json.loads raises on a malformed line.
with open(path + city_name + '_reviews.json') as data_file, \
        open(path + city_name + '_reviews_filtered.json', 'w') as filter_output:
    for line in data_file:
        total_count += 1
        review = json.loads(line)
        if review['user_id'] in eligible_users and review['business_id'] in eligible_restaurants:
            filter_output.write(line)
            eligible_review_count += 1

print (eligible_review_count, 'reviews eligible, out of a total of', total_count)

Extracting the ratings matrix

In [0]:
# Write a tab-separated "skinny" file with one line per filtered review:
# user_id, business_id, stars, date. It is read back later with pd.read_csv(sep='\t').
# Both files are managed by `with`, so the output is closed even if a line fails to parse.
with open(path + city_name + '_reviews_filtered.json') as data_file, \
        open(path + city_name + '_reviews_ratings_only.txt', 'w') as skinny_output:
    for line in data_file:
        review = json.loads(line)
        fields = (review['user_id'], review['business_id'], str(review['stars']), str(review['date']))
        skinny_output.write('\t'.join(fields) + '\n')


We create a mapping for each user_id and business_id to be able to use those as index for the arrays

In [0]:
# Assign every user_id and business_id a dense 0-based matrix index, in order of
# first appearance in the filtered review file.
user_index_map, rest_index_map = {}, {}

with open(path + city_name + '_reviews_filtered.json') as data_file:
    for record in map(json.loads, data_file):
        # setdefault only inserts unseen keys; at that moment len(map) is the next free index.
        user_index_map.setdefault(record['user_id'], len(user_index_map))
        rest_index_map.setdefault(record['business_id'], len(rest_index_map))

unique_user_counter, unique_rest_counter = len(user_index_map), len(rest_index_map)

print ('unique users:', unique_user_counter)
print ('unique restaurants:', unique_rest_counter)

# Persist both mappings so later cells can reload them from disk.
for suffix, mapping in (('_filtered_user_index.json', user_index_map),
                        ('_filtered_restaurants_index.json', rest_index_map)):
    with open(path + city_name + suffix, 'w') as idx_file:
        json.dump(mapping, idx_file)

unique users: 8635
unique restaurants: 10721


In [0]:
# Load the tab-separated ratings file written earlier (it has no header row).
header = ['user_id', 'business_id', 'rating','date']
df = pd.read_csv(path + city_name + '_reviews_ratings_only.txt', sep='\t', names=header)

n_users = df.user_id.unique().shape[0]
n_items = df.business_id.unique().shape[0]
total_count = len(df)
print ('Number of users = ' + str(n_users) + ' | Number of items = ' + str(n_items))

print ('load the index mappings')
with open(path + city_name + '_filtered_user_index.json') as user_idx_file:
    user_id_map = json.load(user_idx_file)
with open(path + city_name + '_filtered_restaurants_index.json') as rest_idx_file:
    item_id_map = json.load(rest_idx_file)

print ('split and construct matrix')
# A fixed seed makes the random 75/25 split, and therefore every RMSE reported
# below, reproducible across kernel restarts.
split_seed = 42
train_data, test_data = train_test_split(df, test_size=0.25, random_state=split_seed)

Number of users = 8635 | Number of items = 10721
load the index mappings
split and construct matrix


In [0]:
# Reverse lookups: matrix index -> original user_id / business_id string.
inverse_user_map = {index: uid for uid, index in user_id_map.items()}
inverse_item_map = {index: bid for bid, index in item_id_map.items()}

Creating a ratings matrix and calculating the sparsity of the data

In [0]:
# Dense user x business rating matrices for each split; a cell stays 0 when the
# split holds no rating for that (user, business) pair.
train_data_matrix = np.zeros((n_users, n_items))
test_data_matrix = np.zeros((n_users, n_items))

# total_count ends up as the number of review rows written across both splits.
total_count = 0

for split_frame, split_matrix in ((train_data, train_data_matrix),
                                  (test_data, test_data_matrix)):
    for row in split_frame.itertuples():
        split_matrix[user_id_map[row.user_id], item_id_map[row.business_id]] = row.rating
        total_count += 1


In [0]:
# Summary of the rating data built in the previous cell.
print ('total users:', n_users)
print ('total items:', n_items)
print ('total reviews:', total_count)
# NOTE(review): the value printed as "sparsity" is review rows / (users * items),
# i.e. the fraction of cells that are filled, not the fraction that are empty.
print ('sparsity:', total_count / (n_users * n_items * 1.0))

total users: 8635
total items: 10721
total reviews: 456288
sparsity: 0.004928802424520395


In [0]:
# train_data_matrix = scipy.sparse.csr_matrix(train_data.values)
# test_data_matrix = scipy.sparse.csr_matrix(test_data.values)

Concatenating reviews for each user and for each business

In [0]:
# One text "document" per user and per business: all of their review texts,
# each preceded by a single space, stored at the entity's matrix index.
user_concat = [''] * len(user_id_map)
restaurant_concat = [''] * len(item_id_map)

with open(path + city_name + '_reviews_filtered.json') as data_file:
    for record in map(json.loads, data_file):
        piece = ' ' + record['text']
        user_concat[user_id_map[record['user_id']]] += piece
        restaurant_concat[item_id_map[record['business_id']]] += piece

# Both corpora are stored as JSON lists (despite the .txt extension).
for suffix, corpus in (('_users_concat_reviews.txt', user_concat),
                       ('_restaurants_concat_reviews.txt', restaurant_concat)):
    with open(path + city_name + suffix, 'w') as corpus_file:
        json.dump(corpus, corpus_file)

Creating a TF-IDF vector for each of the concatenated reviews

In [0]:
# Reload the two corpora (JSON lists of concatenated review text) from disk.
with open(path + city_name + '_users_concat_reviews.txt') as data_file:
	user_corpus = json.load(data_file)

with open(path + city_name + '_restaurants_concat_reviews.txt') as data_file:
	restaurant_corpus = json.load(data_file)

# directly to tf-idf matrix
# English stop words are dropped; min_df = 1 keeps every remaining term.
vectorizer = TfidfVectorizer(min_df = 1, stop_words = 'english')

# The vocabulary and IDF weights are fitted on the restaurant corpus only ...
print ('constructing tf-idf matrix for the restaurants')
restaurants_X = vectorizer.fit_transform(restaurant_corpus)

# ... and the user corpus is projected onto that same vocabulary, so both
# matrices share their column (term) axis. The order of these two calls matters.
print ('constructing tf-idf matrix for the users')
user_X = vectorizer.transform(user_corpus)

# NOTE(review): these are presumably scipy sparse matrices, which np.save stores
# as a pickled object; later cells reload with allow_pickle=True and call .item().
with open(path + city_name + '_user_X.np', 'wb') as file:
	np.save(file, user_X)

with open(path + city_name + '_restaurants_X.np', 'wb') as file:
	np.save(file, restaurants_X)
 

constructing tf-idf matrix for the restaurants
constructing tf-idf matrix for the users


In [0]:
# Reload the TF-IDF matrices saved by the previous cell.
# np.save stored each matrix as a pickled object, so np.load returns a 0-d object
# array wrapping it; .item() unwraps the matrix itself. Without it the
# linear_kernel call below receives scalar object arrays instead of 2-D matrices.
# allow_pickle=True unpickles the file: only load files written by this notebook.
user_X = np.load(path + city_name + '_user_X.np', allow_pickle=True).item()
restaurants_X = np.load(path + city_name + '_restaurants_X.np', allow_pickle=True).item()

Computing the similarities between vectors for user and vectors for restaurants

In [0]:
# Dot product of every user vector with every restaurant vector (users x restaurants).
# NOTE(review): this equals cosine similarity only if the TF-IDF rows are
# unit-length, which is presumably the vectorizer's default normalisation — confirm.
cosine_similarities = linear_kernel(user_X, restaurants_X)

In [0]:
# Checkpoint the user x restaurant similarity matrix; the prediction cells reload it.
with open(path + city_name + '_cosine_similarities.np', 'wb') as file:
	np.save(file, cosine_similarities)

In [0]:
# Reload the restaurant TF-IDF matrix; .item() unwraps it from the 0-d object
# array that np.load returns for a pickled object.
restaurants_X = np.load(path + city_name + '_restaurants_X.np',allow_pickle=True)
restaurants_X = restaurants_X.item()
# Alias used by the user-preference cell below.
train_matrix = train_data_matrix

Creating a user preference matrix

In [0]:
# Each user's preference vector is the rating-weighted sum of the TF-IDF vectors
# of the restaurants that user rated in the training matrix.
n_profile_rows = train_matrix.shape[0]
user_preference = np.zeros((n_profile_rows, restaurants_X.shape[1]))
for i in range(n_profile_rows):
    rated_cols = np.nonzero(train_matrix[i])[0]
    rated_stars = train_matrix[i, rated_cols]
    # Only the rated restaurants' rows are densified, one user at a time.
    rated_vectors = restaurants_X[rated_cols].toarray()
    user_preference[i] = rated_stars.dot(rated_vectors)


In [0]:
# Checkpoint the un-normalised user preference matrix.
with open(path1 + city_name + '_user_preference.np', 'wb') as file:
	np.save(file, user_preference)

In [0]:
# Restore the un-normalised user preference matrix from its checkpoint.
user_preference = np.load(path1 + city_name + '_user_preference.np',allow_pickle=True)

Normalizing the user preference matrix

In [0]:
# Rescale each user row with sklearn's normalize (default arguments); the
# un-normalised values are overwritten in place of the same name.
user_preference = normalize(user_preference)

In [0]:
# Checkpoint the normalised user preference matrix.
with open(path1 + city_name + '_user_preference_normalize.np', 'wb') as file:
	np.save(file, user_preference)

Cell to call the garbage cleaner explicitly to clear the RAM

In [0]:
# Force a garbage-collection pass; gc.collect() returns the number of
# unreachable objects it found.
import gc
gc.collect()

0

In [0]:
# Restore the normalised user preference matrix from its checkpoint.
user_preference = np.load(path1 + city_name + '_user_preference_normalize.np',allow_pickle=True)


In [0]:
# Dot product of each normalised user preference vector with each restaurant
# TF-IDF vector. This overwrites the `cosine_similarities` computed earlier from
# the users' own review text.
print ('computing the similarity')
cosine_similarities = linear_kernel(user_preference, restaurants_X)

computing the similarity


In [0]:
# Checkpoint the preference-based similarity matrix under its own file name.
with open(path + city_name + '_user_preference_cos.np', 'wb') as file:
	np.save(file, cosine_similarities)

In [0]:
# Sum of both splits; the prediction loop only uses which cells are non-zero,
# i.e. every (user, business) pair that has a rating in either split.
overall_matrix = test_data_matrix + train_data_matrix

# Which saved similarity matrix drives the ranking:
# '_cosine_similarities.np' or '_user_preference_cos.np'
sorting_method = '_cosine_similarities.np'

cosine_similarities = np.load(path + city_name + sorting_method)

# NOTE(review): test_predictions and train_predictions are not read by any cell
# visible in this notebook; only overall_predictions is filled in below.
test_predictions = np.zeros((n_users, n_items))
train_predictions = np.zeros((n_users, n_items))

overall_predictions = np.zeros((n_users,n_items))

Making predictions for the user using weighted regression method

In [0]:
# For every user: rank all restaurants by similarity, fit a kernel ridge
# regression from rank position to rating on the user's training ratings, then
# predict (clipped to [1, 5]) every restaurant the user rated in either split.
# Per-user progress printing is omitted so the output is not flooded with one
# line per user.
n_ranked_items = cosine_similarities.shape[1]
rank_positions = np.arange(n_ranked_items)

for user_idx in range(n_users):
    # rank_of[r] = position of restaurant r in ascending-similarity order.
    # A stable argsort breaks ties by restaurant index, which is the same order
    # as sorting (similarity, restaurant index) pairs.
    order = np.argsort(cosine_similarities[user_idx], kind='stable')
    rank_of = np.empty(n_ranked_items, dtype=np.int64)
    rank_of[order] = rank_positions

    # Restaurants this user rated in the training split, and the ratings themselves.
    nonzero_index = np.flatnonzero(train_data_matrix[user_idx])
    if nonzero_index.size == 0:
        # Nothing to fit on: KernelRidge.fit rejects an empty sample set, so the
        # user's predictions are left at their initial value of 0.
        continue
    nonzero_values = train_data_matrix[user_idx, nonzero_index].reshape(-1, 1)
    sorted_index = rank_of[nonzero_index].reshape(-1, 1)

    clf = KernelRidge(alpha=1.0)
    clf.fit(sorted_index, nonzero_values)

    # make predictions
    overall_nonzero_index = np.flatnonzero(overall_matrix[user_idx])
    new_index = rank_of[overall_nonzero_index].reshape(-1, 1)
    pred = np.clip(clf.predict(new_index), 1, 5)
    overall_predictions[user_idx, overall_nonzero_index] = pred.reshape(-1)

Function to calculate the RMSE

In [0]:
def rmse(prediction, ground_truth):
    """Root-mean-square error restricted to the rated cells.

    Only positions where `ground_truth` is non-zero are compared, so unrated
    (zero) cells do not contribute to the error.

    Args:
        prediction: array of predicted ratings, same shape as `ground_truth`.
        ground_truth: array of true ratings in which 0 marks "no rating".

    Returns:
        The RMSE over the non-zero cells of `ground_truth`, as a float.
    """
    rated_cells = ground_truth.nonzero()
    predicted = prediction[rated_cells].flatten()
    actual = ground_truth[rated_cells].flatten()

    return math.sqrt(mean_squared_error(predicted, actual))

In [0]:
# Both scores use the same prediction matrix; rmse() only looks at the cells
# that are non-zero in the matrix passed as ground truth.
print ('testing rmse:', rmse(overall_predictions, test_data_matrix))
print ('training rmse:', rmse(overall_predictions, train_data_matrix))


testing rmse: 1.4088356224819985
training rmse: 1.412949150924642


In [0]:
# Checkpoint the predictions; the file name embeds the similarity variant used.
with open(path + city_name + sorting_method + '_all_predictions.np', 'wb') as file:
	np.save(file, overall_predictions)

In [0]:
# Reload the predictions saved for the chosen similarity variant.
sorting_method = '_cosine_similarities.np'

overall_predictions = np.load(path + city_name + sorting_method + '_all_predictions.np',allow_pickle=True)


In [0]:
# Wrap the user x business prediction matrix in a DataFrame (integer row/column labels).
x = pd.DataFrame(overall_predictions)

In [0]:
# Replace integer row labels with the original user_id strings.
x.index = x.index.to_series().map(inverse_user_map)


In [0]:
# Replace integer column labels with the original business_id strings.
x.columns = x.columns.to_series().map(inverse_item_map)


In [0]:
# Reshape to long format: one row per (column label, row label) pair, i.e.
# (business_id, user_id), with the predicted value in a column named 'rating'.
x = x.unstack().reset_index(name='rating')

In [0]:
# Export the long-format predictions as CSV.
x.to_csv(path + "predictions.csv")

# With last ratings as the test set

In [0]:
# Load a pre-built train/test split from CSV files in path_data.
# NOTE(review): per the section title the test set presumably holds each user's
# last rating(s); how the files were produced is not shown here.
train = pd.read_csv(path_data + "train_noE.csv")
test = pd.read_csv(path_data + "test_noE.csv")

In [0]:
# Keep only the three columns the matrix builder reads, in positional order.
train = train[["user_id","business_id","rating"]]
test = test[["user_id","business_id","rating"]]

In [0]:
# Rebuild the dense user x business rating matrices from the pre-built split;
# a cell stays 0 when the split holds no rating for that (user, business) pair.
train_data_matrix = np.zeros((n_users, n_items))
test_data_matrix = np.zeros((n_users, n_items))

# total_count ends up as the number of review rows written across both splits.
total_count = 0

for split_frame, split_matrix in ((train, train_data_matrix),
                                  (test, test_data_matrix)):
    for row in split_frame.itertuples():
        split_matrix[user_id_map[row.user_id], item_id_map[row.business_id]] = row.rating
        total_count += 1

In [0]:
# Reload the restaurant TF-IDF matrix; .item() unwraps it from the 0-d object
# array that np.load returns for a pickled object.
restaurants_X = np.load(path + city_name + '_restaurants_X.np',allow_pickle=True)
restaurants_X = restaurants_X.item()
# Alias pointing at the rating matrix rebuilt from the pre-built split.
train_matrix = train_data_matrix

In [0]:
# Sum of both splits; the prediction loop only uses which cells are non-zero,
# i.e. every (user, business) pair that has a rating in either split.
overall_matrix = test_data_matrix + train_data_matrix

# Which saved similarity matrix drives the ranking:
# '_cosine_similarities.np' or '_user_preference_cos.np'
sorting_method = '_cosine_similarities.np'

cosine_similarities = np.load(path + city_name + sorting_method)

# NOTE(review): test_predictions and train_predictions are not read by any cell
# visible in this notebook; only overall_predictions is filled in below.
test_predictions = np.zeros((n_users, n_items))
train_predictions = np.zeros((n_users, n_items))

overall_predictions = np.zeros((n_users,n_items))

In [0]:
# For every user: rank all restaurants by similarity, fit a kernel ridge
# regression from rank position to rating on the user's training ratings, then
# predict (clipped to [1, 5]) every restaurant the user rated in either split.
n_ranked_items = cosine_similarities.shape[1]
rank_positions = np.arange(n_ranked_items)

for user_idx in range(n_users):
    # rank_of[r] = position of real restaurant index r in the list of restaurants
    # sorted by ascending similarity. A stable argsort breaks ties by restaurant
    # index, which is the same order as sorting (similarity, restaurant index) pairs.
    order = np.argsort(cosine_similarities[user_idx], kind='stable')
    rank_of = np.empty(n_ranked_items, dtype=np.int64)
    rank_of[order] = rank_positions

    # the real index of non-zero-rating restaurant
    nonzero_index = np.flatnonzero(train_data_matrix[user_idx])
    if nonzero_index.size == 0:
        # Nothing to fit on: KernelRidge.fit rejects an empty sample set, so the
        # user's predictions are left at their initial value of 0.
        continue

    # the ratings of this user, listed in the original order in the training matrix
    nonzero_values = train_data_matrix[user_idx, nonzero_index].reshape(-1, 1)

    # for each of the non-zero real restaurant index, the corresponding new index
    sorted_index = rank_of[nonzero_index].reshape(-1, 1)

    clf = KernelRidge(alpha=1.0)
    clf.fit(sorted_index, nonzero_values)

    # make predictions
    overall_nonzero_index = np.flatnonzero(overall_matrix[user_idx])
    new_index = rank_of[overall_nonzero_index].reshape(-1, 1)
    pred = np.clip(clf.predict(new_index), 1, 5)
    overall_predictions[user_idx, overall_nonzero_index] = pred.reshape(-1)

In [0]:
# Both scores use the same prediction matrix; rmse() only looks at the cells
# that are non-zero in the matrix passed as ground truth.
print ('testing rmse:', rmse(overall_predictions, test_data_matrix))
print ('training rmse:', rmse(overall_predictions, train_data_matrix))


testing rmse: 1.5558682235279286
training rmse: 1.405905385400033


In [0]:
# Checkpoint this experiment's predictions under path_data; the file name
# embeds the similarity variant used.
with open(path_data + city_name + sorting_method + '_all_predictions.np', 'wb') as file:
	np.save(file, overall_predictions)

In [0]:
# Reload this experiment's predictions for the chosen similarity variant.
sorting_method = '_cosine_similarities.np'

overall_predictions = np.load(path_data + city_name + sorting_method + '_all_predictions.np',allow_pickle=True)


In [0]:
# Wrap the user x business prediction matrix in a DataFrame (integer row/column labels).
predictions_reviews = pd.DataFrame(overall_predictions)

In [0]:
# Replace integer row labels with the original user_id strings.
predictions_reviews.index = predictions_reviews.index.to_series().map(inverse_user_map)


In [0]:
# Replace integer column labels with the original business_id strings.
predictions_reviews.columns = predictions_reviews.columns.to_series().map(inverse_item_map)


In [0]:
# Reshape to long format: one row per (column label, row label) pair, i.e.
# (business_id, user_id), with the predicted value in a column named 'rating'.
predictions_reviews = predictions_reviews.unstack().reset_index(name='rating')

In [0]:
# Name the long-format columns; business_id comes first because unstack() puts
# the former column labels in the outer index level.
predictions_reviews.columns = ["business_id","user_id","rating_reviews"]

In [0]:
# Export the review-text-based predictions as CSV under path_data.
predictions_reviews.to_csv(path_data + "predictions_reviews.csv")
