# Content-based filtering model

In [1]:
import re
import string
import sys
sys.path.append('../../')

from os.path import join as pjoin

import pandas as pd
import numpy as np
import faiss
import nltk

from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix, coo_matrix
from sklearn.decomposition import TruncatedSVD

from src.utils import read_json_df

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Andrii\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
DATA_PATH = "../../data/"
DATASET_PATH = pjoin(DATA_PATH, "yelp_dataset/")

In [3]:
review_df = read_json_df(pjoin(DATASET_PATH, "yelp_academic_dataset_review.json"))

In [4]:
def preprocess_text(text):
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    stop_words = set(stopwords.words("english"))
    word_tokens = text.split()
    filtered_text = [word for word in word_tokens if word not in stop_words]
    filtered_text = ' '.join(filtered_text)
    return filtered_text

In [5]:
review_df['cleaned_text'] = review_df['text'].apply(preprocess_text)

In [6]:
def load_glove_vectors(glove_file):
    embeddings_index = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = vector
    return embeddings_index

In [7]:
glove_file = 'glove.6B.100d.txt'
embeddings_index = load_glove_vectors(pjoin(DATA_PATH, glove_file))

In [8]:
def get_average_vector(text, embeddings_index):
    words = text.split()
    word_vectors = [embeddings_index[word] for word in words if word in embeddings_index]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        some_key = list(embeddings_index.keys())[0]
        return np.zeros(embeddings_index[some_key].shape)

In [9]:
review_df['review_vector'] = review_df['cleaned_text'].apply(lambda x: get_average_vector(x, embeddings_index))

In [10]:
review_df_filtered = review_df[review_df['review_vector'].apply(lambda x: not np.allclose(np.zeros(embeddings_index[list(embeddings_index.keys())[0]].shape), x))]

KeyboardInterrupt: 

In [12]:
restaurant_vectors = review_df.groupby('business_id')['review_vector'].apply(lambda x: np.mean(np.vstack(x.tolist()), axis=0))

In [13]:
restaurant_vectors

business_id
---kPU91CF4Lq2-WlRu9Lw    [-0.21569727, 0.25217083, 0.2632159, -0.253070...
--0iUa4sNDFiZFrAdIWhZQ    [-0.108681165, 0.20706482, 0.23127718, -0.1635...
--30_8IhuyMHbSOcNWd6DQ    [0.086936384, 0.19277348, 0.14629911, -0.14296...
--7PUidqRWpRSpXebiyxTg    [-0.17370927, 0.23072208, 0.1910035, -0.147103...
--7jw19RH9JKXgFohspgQw    [-0.030771453, 0.20263673, 0.13858281, -0.0736...
                                                ...                        
zznZqH9CiAznbkV6fXyHWA    [-0.16085109, 0.27624714, 0.19358303, -0.19220...
zztOG2cKm87I6Iw_tleZsQ    [-0.03555422, 0.13701583, 0.14510758, 0.026494...
zzu6_r3DxBJuXcjnOYVdTw    [-0.11649682, 0.2802446, 0.18056643, -0.085451...
zzw66H6hVjXQEt0Js3Mo4A    [-0.06203431, 0.25265533, 0.26369348, -0.11718...
zzyx5x0Z7xXWWvWnZFuxlQ    [-0.10935566, 0.21559715, 0.19723853, -0.15965...
Name: review_vector, Length: 150346, dtype: object

In [14]:
user_vectors = review_df.groupby('user_id')['review_vector'].apply(lambda x: np.mean(x.tolist(), axis=0))

In [15]:
user_vectors

user_id
---1lKK3aKOuomHnwAkAow    [-0.18244882, 0.407449, 0.27219525, -0.1172297...
---2PmXbF47D870stH1jqA    [-0.19804399, 0.26012298, 0.158197, -0.1401182...
---UgP94gokyCDuB5zUssA    [-0.02868935, 0.20637617, 0.292074, -0.1764314...
---fa6ZK37T9NjkGKI4oSg    [0.028195942, 0.11133146, 0.08044941, -0.03462...
---r61b7EpVPkb4UVme5tA    [-0.10271549, 0.18715751, 0.2808567, -0.157469...
                                                ...                        
zzz-M4QvkEpUWWPL9RTzLA    [-0.13413207, 0.2519072, 0.15054464, -0.180143...
zzzCg-_lpyYE82TlJCdwdw    [-0.0790559, 0.312502, 0.4163658, -0.28613174,...
zzzGgfvrSJ4AQeKtcgocIw    [-0.113682896, 0.27475154, 0.23168959, -0.1276...
zzzMBVS73g3ZJ7qL8JyhiA    [-0.009016817, 0.044815574, 0.20541243, -0.207...
zzzUFM4HFe0SFG0bPjntQA    [-0.14672713, 0.28231618, 0.19622217, -0.16690...
Name: review_vector, Length: 1987929, dtype: object

In [16]:
def recommend_restaurants(user_id, user_vectors, restaurant_vectors, reviews_df, top_n=5):
    user_vector = user_vectors[user_id]
    visited_restaurants = reviews_df[reviews_df['user_id'] == user_id]['business_id'].unique()
    unvisited_restaurant_vectors = restaurant_vectors[~restaurant_vectors.index.isin(visited_restaurants)]
    similarities = cosine_similarity([user_vector], unvisited_restaurant_vectors.tolist())
    similarity_scores = list(zip(unvisited_restaurant_vectors.index, similarities[0]))
    similarity_scores.sort(key=lambda x: x[1], reverse=True)
    return [restaurant for restaurant, score in similarity_scores[:top_n]]

In [17]:
user_id = 'bcjbaE6dDog4jkNY91ncLQ'
recommended_restaurants = recommend_restaurants(user_id, user_vectors, restaurant_vectors, review_df)
print(recommended_restaurants)

['b92dSNftvCSPWLC4ZgNXCg', 'ztnYPn2w0R4lEJL266apzg', 'ftpaEvx7cMJmHBdQKd4y-g', 'ZU3LP5gHs3UybWmh9J13NA', '0kEyHYpI6ixtq87LV6I1Aw']


# User-user collaborative filtering model

In [4]:
user_df = read_json_df(pjoin(DATASET_PATH, "yelp_academic_dataset_user.json"))

In [5]:
filtered_user_df = user_df  # user_df[user_df['review_count'] >= 10]

In [6]:
review_df_filtered = review_df.merge(filtered_user_df[['user_id']], on='user_id', how='inner')

In [7]:
business_df = read_json_df(pjoin(DATASET_PATH, "yelp_academic_dataset_business.json"))

In [8]:
filtered_business_df = business_df  # [business_df["review_count"] >= 10]

In [9]:
review_df_filtered = review_df_filtered.merge(filtered_business_df[['business_id']], on='business_id', how='inner')

In [10]:
review_df_filtered_sample = review_df_filtered # review_df_filtered.sample(frac=0.1, random_state=42)

In [11]:
user_ids = review_df_filtered_sample['user_id'].astype('category').cat.codes
business_ids = review_df_filtered_sample['business_id'].astype('category').cat.codes
stars = review_df_filtered_sample['stars']

user_item_sparse_matrix = coo_matrix((stars, (user_ids, business_ids)))
# In case of (user_ids, business_ids) pair repeating, coo_matrix sums their stars. Possibly we should drop earlier reviews to avoid this

user_item_sparse_matrix = user_item_sparse_matrix.tocsr()

In [16]:
user_ids.name = "user_code"
user_ids_df = pd.concat((user_ids, review_df_filtered_sample['user_id']), axis=1)

business_ids.name = "business_code"
business_ids_df = pd.concat((business_ids, review_df_filtered_sample['business_id']), axis=1)

In [19]:
user_item_sparse_matrix.shape

(1987897, 150346)

In [24]:
svd = TruncatedSVD(n_components=100)  # Reduce to 100 dimensions to be able to fit into RAM
reduced_matrix = svd.fit_transform(user_item_sparse_matrix)

reduced_matrix = reduced_matrix.astype(np.float32)

reduced_matrix.shape

(1987897, 100)

In [25]:
index = faiss.IndexFlatIP(reduced_matrix.shape[1])
faiss.normalize_L2(reduced_matrix)

index.add(reduced_matrix)

In [46]:
def get_user_based_recommendations(user_id, n_recommendations=5, n_neighbours=100):
    user_code = user_ids_df.loc[user_ids_df['user_id'] == user_id, 'user_code'].index[0]

    sim_scores, sim_user_codes = index.search(reduced_matrix[user_code].reshape(1, -1), n_neighbours)

    sim_scores, sim_user_codes = sim_scores[0], sim_user_codes[0]
    # remove the same user found
    sim_scores = sim_scores[sim_user_codes != user_code]
    sim_user_codes = sim_user_codes[sim_user_codes != user_code]

    similar_users_ratings = user_item_sparse_matrix[sim_user_codes].toarray()  # (n_neighbours, n_businesses)

    # In the next step, ideally we should mean only over those users who have reviews for the business for each business
    recommendations_codes = np.argsort(similar_users_ratings.mean(axis=0)) # (n_businesses)
    recommendations_codes = recommendations_codes[:n_recommendations] # ()

    res = []
    for code in recommendations_codes:
        business_id = business_ids_df.loc[business_ids_df['business_code'] == code, 'business_id'].values[0]
        res.append(business_id)
    
    return res


In [47]:
user_id = 'bcjbaE6dDog4jkNY91ncLQ'
recommended_restaurants = get_user_based_recommendations(user_id)
print(recommended_restaurants)

['---kPU91CF4Lq2-WlRu9Lw', 'elXs6FDsrICenolgVQlWEw', 'el_ROdKSbFJyAuX6zeazxg', 'elbgEUwbfQaBq8fNFAaTjQ', 'elbqy5pmTuhPo6-VZR5g8w']


# Item-item collaborative filtering model

In [None]:
item_user_matrix = user_item_matrix.T
item_similarity = cosine_similarity(item_user_matrix)
item_similarity_df = pd.DataFrame(item_similarity, index=item_user_matrix.index, columns=item_user_matrix.index)

In [None]:
def get_item_based_recommendations(business_id, n_recommendations=5):
    similar_items = item_similarity_df[business_id].sort_values(ascending=False)[1:n_recommendations+1]
    return similar_items.index

In [None]:
user_id = 'bcjbaE6dDog4jkNY91ncLQ'
recommended_restaurants = get_item_based_recommendations(user_id)
print(recommended_restaurants)