# Content-based filtering model

In [1]:
import pandas as pd
import numpy as np
import json
import re
import string
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from scipy.sparse import csr_matrix

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yehorhryha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the Yelp review dump (one JSON document per line) into a DataFrame.
# The context manager releases the file handle even if a line fails to parse,
# and the explicit encoding keeps decoding independent of the platform default.
with open("yelp_academic_dataset_review.json", encoding="utf-8") as data_file:
    data = [json.loads(line) for line in data_file]
review_df = pd.DataFrame(data)

In [4]:
# Compiled once: these patterns do not depend on the text being cleaned.
_PUNCTUATION_RE = re.compile(f'[{re.escape(string.punctuation)}]')
_DIGITS_RE = re.compile(r'\d+')


def _english_stop_words():
    """Return the English stop-word set, building it only on first use."""
    try:
        return _english_stop_words.cache
    except AttributeError:
        # Loading the word list and turning it into a set is identical for
        # every review, so do it once instead of once per call.
        _english_stop_words.cache = frozenset(stopwords.words("english"))
        return _english_stop_words.cache


def preprocess_text(text):
    """Normalise a raw review string for embedding lookup.

    Steps, in order: delete every ``string.punctuation`` character, lower-case
    the text, delete runs of digits, split on whitespace and drop English stop
    words.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (may be empty).
    """
    # NOTE(review): punctuation is stripped before stop-word filtering, so any
    # stop-word entries that contain an apostrophe can presumably never match
    # a token here - confirm whether that is intended.
    text = _PUNCTUATION_RE.sub('', text)
    text = text.lower()
    text = _DIGITS_RE.sub('', text)
    stop_words = _english_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)

In [5]:
# Store the normalised form of every review next to the raw text.
review_df['cleaned_text'] = review_df['text'].map(preprocess_text)

In [6]:
def load_glove_vectors(glove_file):
    """Read a whitespace-separated embeddings text file into a dictionary.

    Each non-empty line is expected to hold a token followed by its vector
    components.

    Parameters
    ----------
    glove_file : str
        Path to the embeddings file; it is read as UTF-8.

    Returns
    -------
    dict
        Maps each token to a ``float32`` NumPy array built from the remaining
        fields of its line. If a token occurs more than once, the last
        occurrence wins.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank or whitespace-only line (e.g. a trailing newline at the
                # end of the file): there is no token to index, so skip it
                # rather than failing on values[0].
                continue
            word, coefficients = values[0], values[1:]
            embeddings_index[word] = np.asarray(coefficients, dtype='float32')
    return embeddings_index

In [7]:
# Location of the pre-trained embeddings text file, then the token -> vector
# lookup built from it.
glove_file = 'glove.42B.300d.txt'
embeddings_index = load_glove_vectors(glove_file=glove_file)

In [8]:
def get_average_vector(text, embeddings_index, fallback_dim=100):
    """Average the embedding vectors of the known words in ``text``.

    Parameters
    ----------
    text : str
        Whitespace-separated tokens.
    embeddings_index : mapping
        Token -> vector lookup; tokens missing from it are ignored.
    fallback_dim : int, optional
        Length of the all-zero vector returned when no token of ``text`` is
        found in ``embeddings_index``. Defaults to 100, the value that was
        previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the matched vectors, or ``np.zeros(fallback_dim)``
        when nothing matched.

    Notes
    -----
    The default fallback length is independent of the embedding size. In this
    notebook the embeddings file is ``glove.42B.300d.txt`` and a later cell
    keeps only vectors of shape ``(300,)``, so reviews that hit the default
    fallback are filtered out there. Passing ``fallback_dim=300`` would let
    those all-zero vectors through that filter.
    """
    words = text.split()
    word_vectors = [embeddings_index[word] for word in words if word in embeddings_index]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    return np.zeros(fallback_dim)

In [9]:
# One averaged embedding per review; `args` forwards the lookup table as the
# second positional argument of get_average_vector.
review_df['review_vector'] = review_df['cleaned_text'].apply(
    get_average_vector, args=(embeddings_index,)
)

In [10]:
# Keep only reviews whose vector has the full embedding shape (300,).
review_df_filtered = review_df.loc[
    review_df['review_vector'].map(lambda vec: vec.shape == (300,))
]

In [11]:
# Business profile: element-wise mean of the vectors of all its reviews.
restaurant_vectors = (
    review_df_filtered
    .groupby('business_id')['review_vector']
    .apply(lambda vectors: np.vstack(list(vectors)).mean(axis=0))
)

In [12]:
# Display the per-business mean review vectors (Series indexed by business_id).
restaurant_vectors

business_id
---kPU91CF4Lq2-WlRu9Lw    [-0.025884377, -0.0106101185, 0.033193346, -0....
--0iUa4sNDFiZFrAdIWhZQ    [-0.003858616, -0.0048574656, 0.051284056, -0....
--30_8IhuyMHbSOcNWd6DQ    [-0.009566194, 0.10207854, -0.014852737, -0.08...
--7PUidqRWpRSpXebiyxTg    [-0.038481254, -0.015079212, -0.007890894, -0....
--7jw19RH9JKXgFohspgQw    [0.031710885, 0.018421413, -0.10190451, -0.099...
                                                ...                        
zznZqH9CiAznbkV6fXyHWA    [-0.07251118, -0.018787667, 0.07007505, -0.165...
zztOG2cKm87I6Iw_tleZsQ    [0.0117372675, 0.10366254, -0.0061136256, -0.1...
zzu6_r3DxBJuXcjnOYVdTw    [-0.021689672, 0.04261571, -0.008005011, -0.24...
zzw66H6hVjXQEt0Js3Mo4A    [0.10033436, 0.04741541, -0.099287316, -0.1292...
zzyx5x0Z7xXWWvWnZFuxlQ    [-0.051175635, 0.0117875505, -0.007867022, -0....
Name: review_vector, Length: 150346, dtype: object

In [13]:
# User profile: element-wise mean of the vectors of all reviews they wrote.
user_vectors = (
    review_df_filtered
    .groupby('user_id')['review_vector']
    .apply(lambda vectors: np.asarray(list(vectors)).mean(axis=0))
)

In [14]:
# Display the per-user mean review vectors (Series indexed by user_id).
user_vectors

user_id
---1lKK3aKOuomHnwAkAow    [-0.0655955, 0.0407599, -0.015283876, -0.15097...
---2PmXbF47D870stH1jqA    [-0.058902245, -0.018425545, 0.042226803, -0.1...
---UgP94gokyCDuB5zUssA    [-0.014254823, 0.04881997, 0.025271298, -0.131...
---fa6ZK37T9NjkGKI4oSg    [0.09433105, 0.090054296, -0.0765643, -0.08695...
---r61b7EpVPkb4UVme5tA    [-0.03895425, 0.026585538, 0.0327577, -0.11054...
                                                ...                        
zzz-M4QvkEpUWWPL9RTzLA    [-0.037178814, -0.0014889237, -0.03132782, -0....
zzzCg-_lpyYE82TlJCdwdw    [0.07843564, 0.16667348, -0.12678836, -0.07372...
zzzGgfvrSJ4AQeKtcgocIw    [-0.063432775, 0.040489618, -0.027906371, -0.1...
zzzMBVS73g3ZJ7qL8JyhiA    [0.005521926, 0.020848595, -0.045905232, -0.07...
zzzUFM4HFe0SFG0bPjntQA    [0.059421435, -0.052394625, -0.013942088, -0.0...
Name: review_vector, Length: 1987762, dtype: object

In [15]:
def recommend_restaurants(user_id, user_vectors, restaurant_vectors, reviews_df, top_n=5):
    """Recommend unvisited businesses whose profile is closest to the user's.

    Parameters
    ----------
    user_id : str
        Key into ``user_vectors``.
    user_vectors : pandas.Series
        Profile vector per user, indexed by user id.
    restaurant_vectors : pandas.Series
        Profile vector per business, indexed by business id.
    reviews_df : pandas.DataFrame
        Must contain ``user_id`` and ``business_id`` columns; used to find the
        businesses the user has already reviewed.
    top_n : int, optional
        Maximum number of business ids to return (default 5).

    Returns
    -------
    list
        Business ids ordered by decreasing cosine similarity to the user's
        vector. Empty when there is no business the user has not reviewed.

    Raises
    ------
    KeyError
        If ``user_id`` is not present in ``user_vectors``.
    """
    user_vector = user_vectors[user_id]
    visited_restaurants = reviews_df.loc[reviews_df['user_id'] == user_id, 'business_id'].unique()
    candidates = restaurant_vectors[~restaurant_vectors.index.isin(visited_restaurants)]
    if candidates.empty:
        # Nothing left to rank; cosine_similarity would reject an empty matrix.
        return []
    candidate_matrix = np.vstack(candidates.tolist())
    query = np.asarray(user_vector).reshape(1, -1)
    scores = cosine_similarity(query, candidate_matrix)[0]
    # Stable sort on the negated scores: highest similarity first, ties kept
    # in index order.
    ranking = np.argsort(-scores, kind='stable')[:top_n]
    return candidates.index[ranking].tolist()

In [16]:
# Example: content-based recommendations for a single user.
user_id = 'bcjbaE6dDog4jkNY91ncLQ'
recommended_restaurants = recommend_restaurants(
    user_id=user_id,
    user_vectors=user_vectors,
    restaurant_vectors=restaurant_vectors,
    reviews_df=review_df_filtered,
)
print(recommended_restaurants)

['34sfjPk297c7RVM67pU4tg', 'FiWGTyWJs4p4dLQc8P6-pg', 'ZU3LP5gHs3UybWmh9J13NA', 'chUeS0h9C8KGfyJO2Jqgew', 'XkfH7V4sy1bUSjLVR8E7gg']


# User-user collaborative filtering model

In [2]:
# Load the Yelp review dump (one JSON document per line) into a DataFrame.
# The context manager releases the file handle even if a line fails to parse,
# and the explicit encoding keeps decoding independent of the platform default.
with open("yelp_academic_dataset_review.json", encoding="utf-8") as data_file:
    data = [json.loads(line) for line in data_file]
review_df_filtered = pd.DataFrame(data)

In [3]:
# Load the Yelp user dump (one JSON document per line) into a DataFrame.
# The context manager releases the file handle even if a line fails to parse,
# and the explicit encoding keeps decoding independent of the platform default.
with open("yelp_academic_dataset_user.json", encoding="utf-8") as data_file:
    data = [json.loads(line) for line in data_file]
user_df = pd.DataFrame(data)

In [4]:
# Keep users whose profile reports at least 10 reviews.
filtered_user_df = user_df.loc[user_df['review_count'].ge(10)]

In [5]:
# Inner join on user_id: drops reviews written by users outside filtered_user_df.
review_df_filtered = pd.merge(
    review_df_filtered,
    filtered_user_df[['user_id']],
    how='inner',
    on='user_id',
)

In [6]:
# Load the Yelp business dump (one JSON document per line) into a DataFrame.
# The context manager releases the file handle even if a line fails to parse,
# and the explicit encoding keeps decoding independent of the platform default.
with open("yelp_academic_dataset_business.json", encoding="utf-8") as data_file:
    data = [json.loads(line) for line in data_file]
business_df = pd.DataFrame(data)

In [7]:
# Keep businesses whose record reports at least 10 reviews.
filtered_business_df = business_df.loc[business_df["review_count"].ge(10)]

In [8]:
# Inner join on business_id: drops reviews of businesses outside filtered_business_df.
review_df_filtered = pd.merge(
    review_df_filtered,
    filtered_business_df[['business_id']],
    how='inner',
    on='business_id',
)

In [9]:
# Work on a reproducible 10% random sample of the filtered reviews.
review_df_filtered_sample = review_df_filtered.sample(
    random_state=42,
    frac=0.1,
)

In [None]:
# Users x businesses table of star ratings; cells with no review become 0.
user_item_matrix = pd.pivot_table(
    review_df_filtered_sample,
    values='stars',
    index='user_id',
    columns='business_id',
).fillna(0)

# Pairwise cosine similarity between user rows, labelled with user ids on
# both axes.
user_item_sparse_matrix = csr_matrix(user_item_matrix.to_numpy())
user_similarity = cosine_similarity(user_item_sparse_matrix)
user_similarity_df = pd.DataFrame(
    data=user_similarity,
    index=user_item_matrix.index,
    columns=user_item_matrix.index,
)

  user_item_matrix = review_df_filtered_sample.pivot_table(index='user_id', columns='business_id', values='stars').fillna(0)


In [None]:
def get_user_based_recommendations(user_id, n_recommendations=5):
    """Recommend businesses rated highly by the users most similar to ``user_id``.

    Reads the module-level ``user_similarity_df`` and ``user_item_matrix``.

    Parameters
    ----------
    user_id : str
        Label present in both ``user_similarity_df`` and ``user_item_matrix``.
    n_recommendations : int, optional
        Used both as the number of neighbouring users to average over and as
        the maximum number of business ids returned (default 5).

    Returns
    -------
    list
        Business ids ordered by decreasing mean rating among the neighbours,
        excluding businesses the user has a rating above 0 for.

    Raises
    ------
    KeyError
        If ``user_id`` is not a label of the similarity or rating tables.
    """
    # Remove the user explicitly instead of assuming they sort into first
    # place: other users with an equally high similarity could otherwise push
    # the user themselves into the neighbour set.
    neighbour_scores = user_similarity_df[user_id].drop(labels=user_id)
    similar_users = neighbour_scores.sort_values(ascending=False).head(n_recommendations)
    similar_users_ratings = user_item_matrix.loc[similar_users.index]
    recommendations = similar_users_ratings.mean().sort_values(ascending=False)
    user_rated_businesses = user_item_matrix.loc[user_id]
    already_rated = user_rated_businesses[user_rated_businesses > 0].index
    recommendations = recommendations[~recommendations.index.isin(already_rated)]
    return recommendations.head(n_recommendations).index.tolist()

In [None]:
# Example: user-user collaborative filtering recommendations for a single user.
user_id = 'bcjbaE6dDog4jkNY91ncLQ'
recommended_restaurants = get_user_based_recommendations(user_id=user_id)
print(recommended_restaurants)

# Item-item collaborative filtering model

In [None]:
# Businesses x users view of the rating table, and the pairwise cosine
# similarity between business rows labelled with business ids on both axes.
item_user_matrix = user_item_matrix.transpose()
item_similarity = cosine_similarity(item_user_matrix)
item_similarity_df = pd.DataFrame(
    data=item_similarity,
    index=item_user_matrix.index,
    columns=item_user_matrix.index,
)

In [None]:
def get_item_based_recommendations(business_id, n_recommendations=5):
    """Return the businesses most similar to ``business_id``.

    Reads the module-level ``item_similarity_df``.

    Parameters
    ----------
    business_id : str
        Label present in ``item_similarity_df``. Note that this is a business
        id, not a user id.
    n_recommendations : int, optional
        Maximum number of business ids returned (default 5).

    Returns
    -------
    pandas.Index
        Business ids ordered by decreasing cosine similarity, never including
        ``business_id`` itself.

    Raises
    ------
    KeyError
        If ``business_id`` is not a label of ``item_similarity_df``.
    """
    # Remove the query business explicitly instead of assuming it sorts into
    # first place: businesses with an equally high similarity could otherwise
    # leave it in the result.
    candidate_scores = item_similarity_df[business_id].drop(labels=business_id)
    similar_items = candidate_scores.sort_values(ascending=False).head(n_recommendations)
    return similar_items.index

In [None]:
# Example: item-item recommendations for a single user.
# get_item_based_recommendations looks up a *business* id in the item
# similarity table, so passing the user id directly would fail the lookup.
# Seed it with the business this user rated highest in user_item_matrix.
user_id = 'bcjbaE6dDog4jkNY91ncLQ'
seed_business_id = user_item_matrix.loc[user_id].idxmax()
recommended_restaurants = get_item_based_recommendations(seed_business_id)
print(recommended_restaurants)