In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
from tqdm.notebook import tqdm
import re
import ast
from scipy.sparse import dok_matrix, csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import lil_matrix
from sklearn.decomposition import TruncatedSVD
import nmslib
import numpy as np

In [None]:
# Load the review table used by both approaches; the first CSV column becomes the row index.
merged_df = pd.read_csv('final_data.csv', index_col=0)

# Collaborative Filtering Recommendation System

## Approach 1

The dataset contains user reviews and ratings for a wide range of titles. From these reviews, keywords are extracted and used to define user similarity: each user is represented by the set of keywords associated with them. Dimensionality reduction (truncated SVD) is then applied to the user–keyword matrix, compressing the interaction space while retaining the core aspects of user preferences. Within this reduced space, each user's nearest neighbors are determined through cosine similarity. Finally, only titles that those neighbors rated above 3.5 are recommended, keeping the suggestions focused on quality.

In [None]:
# Read the saved CSV file of per-user keywords
csv_path = 'user_keywords.csv'
read_keywords = pd.read_csv(csv_path)
# Preview the first rows; at this point `keywords` still holds string-encoded lists
# (they are parsed with ast.literal_eval in the next cell).
read_keywords.head(5)

Unnamed: 0,User_id,keywords
0,A00109803PZJ91RLT7DPN,"['calder', 'rang', 'saga']"
1,A00117421L76WVWG4UX95,"['queen', 'harlem', 'novel']"
2,A0015610VMNR0JC9XVL1,"['richest', 'man', 'babylon', 'babylonian', 'p..."
3,A002258237PFYJV336T05,"['swan', 'place']"
4,A00264602WCXBHHFPLTQ4,"['berenstain', 'bear', 'much', 'vacat']"


In [None]:
# Parse the string-encoded keyword lists. The isinstance guard keeps this cell
# idempotent: once the column already holds lists, calling ast.literal_eval on
# them again would raise, so a second run of the cell used to fail.
read_keywords['keywords'] = read_keywords['keywords'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

all_keywords = set(keyword for keywords_list in read_keywords['keywords'] for keyword in keywords_list)
user_to_index = {user_id: i for i, user_id in enumerate(read_keywords['User_id'].unique())}
# Sort before enumerating: iteration order of a set of strings can change
# between interpreter runs, which would reshuffle the matrix columns.
# (Assumes every keyword is a string — TODO confirm.)
keyword_to_index = {keyword: i for i, keyword in enumerate(sorted(all_keywords))}

# Gather the (row, column) coordinates of every user/keyword pair first and
# build the sparse matrix in one shot, instead of writing single elements of
# a lil_matrix inside DataFrame.iterrows().
row_positions = []
col_positions = []
for user_id, keywords_list in zip(read_keywords['User_id'], read_keywords['keywords']):
    user_idx = user_to_index[user_id]
    for keyword in keywords_list:
        row_positions.append(user_idx)
        col_positions.append(keyword_to_index[keyword])

binary_matrix_csr = csr_matrix(
    (
        np.ones(len(row_positions), dtype=int),
        (np.asarray(row_positions, dtype=np.int64), np.asarray(col_positions, dtype=np.int64)),
    ),
    shape=(len(user_to_index), len(keyword_to_index)),
    dtype=int,
)
# Repeated (user, keyword) pairs are summed when building from coordinates;
# reset every stored entry to 1 so the matrix stays a 0/1 indicator.
binary_matrix_csr.sum_duplicates()
binary_matrix_csr.data.fill(1)
# Keep the previous name bound for any code that still refers to it
# (it is now the CSR matrix rather than a lil_matrix).
binary_matrix = binary_matrix_csr

# Fixed seed so the reduced user vectors are reproducible across runs.
svd = TruncatedSVD(n_components=100, random_state=42)
reduced_matrix = svd.fit_transform(binary_matrix_csr)

In [None]:
reduced_matrix.shape

(1008961, 100)

In [None]:
reduced_matrix

array([[ 4.77838804e-03,  3.73991940e-05,  5.46856952e-04, ...,
        -9.60583957e-03, -1.86519052e-03, -2.68493460e-04],
       [ 4.46301998e-02,  5.75956629e-02,  9.29968519e-02, ...,
         6.96698932e-03, -5.53384913e-05, -3.73734952e-03],
       [ 2.57221649e-02,  3.96952101e-02,  4.78884723e-03, ...,
        -4.12892741e-02, -9.69739300e-02, -3.77882021e-02],
       ...,
       [ 9.79402716e-02,  2.81335410e-02,  3.26817536e-02, ...,
         5.02869198e-02, -1.27994348e-02,  3.88027999e-02],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 1.07813665e-01,  8.44295607e-01, -5.02445349e-01, ...,
         7.86021307e-02,  1.37227970e-01,  4.16715120e-03]])

In [None]:
# Initialize a new nmslib index using the 'hnsw' method with cosine similarity as the metric
index = nmslib.init(method='hnsw', space='cosinesimil')

# Add all reduced user vectors to the index (row i of reduced_matrix gets id i by default — TODO confirm against nmslib docs)
# NOTE(review): reduced_matrix contains all-zero rows (one is visible in the array
# printed above); cosine similarity is undefined for a zero vector — confirm how
# nmslib handles such points.
index.addDataPointBatch(reduced_matrix)

# Build the index; this is the expensive step and prints progress bars
index.createIndex({'post': 2}, print_progress=True)

# Sanity-check query against the freshly built index
ids, distances = index.knnQuery(reduced_matrix[0], k=10)  # Example: Find 10 nearest neighbors for the first user



0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
***************************************************

0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
***************************************************

In [None]:
# The index was already built (and queried) in the previous cell via
# index.createIndex({'post': 2}, print_progress=True); rebuilding it here with
# the same parameters only repeats that expensive step, so it is not redone.

# Function to query nearest neighbors for each user
def query_all_neighbors(reduced_matrix, index, num_neighbors=10):
    """Run one k-NN query per row of ``reduced_matrix``.

    Args:
        reduced_matrix: 2-D array-like; row ``i`` is used as the i-th query vector.
        index: object exposing ``knnQuery(vector, k=...)`` that returns an
            ``(ids, distances)`` pair.
        num_neighbors: value passed as ``k`` to every query.

    Returns:
        ``(neighbor_ids, neighbor_distances)``: two lists, each with one entry
        per row, in row order.
    """
    row_count = reduced_matrix.shape[0]
    # Queries are issued in row order, one at a time.
    per_row_results = [
        index.knnQuery(reduced_matrix[row], k=num_neighbors)
        for row in range(row_count)
    ]

    neighbor_ids = [found_ids for found_ids, _ in per_row_results]
    neighbor_distances = [found_distances for _, found_distances in per_row_results]
    return neighbor_ids, neighbor_distances

# Get all neighbor IDs and their distances: one knnQuery per row of reduced_matrix.
# NOTE(review): all_neighbor_ids / all_neighbor_distances are not referenced by any
# later cell visible in this notebook — confirm this full sweep is still needed.
all_neighbor_ids, all_neighbor_distances = query_all_neighbors(reduced_matrix, index, num_neighbors=10)



0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
***************************************************

0%   10   20   30   40   50   60   70   80   90   100%
|----|----|----|----|----|----|----|----|----|----|
****************************************************


In [None]:
# Look up the matrix row of the example user.
# NOTE: this relies on `user_to_index` being the mapping built from read_keywords
# (same row order as reduced_matrix). The Approach 2 section rebinds
# `user_to_index` from merged_df, so re-running this cell after that section
# would index reduced_matrix with a different mapping.
target_user_index = user_to_index['AVCGYZL8FQQTD']  # Replace with the actual target user_id
target_user_vector = reduced_matrix[target_user_index]

# Query for the 10 nearest neighbors of the target user
ids, distances = index.knnQuery(target_user_vector, k=10)

In [None]:
def get_neighbor_titles(user_id, neighbor_ids, df, min_score=3.5):
    """Collect titles that neighbors rated highly and the target user has not reviewed.

    Args:
        user_id: id of the target user (matched against ``df['User_id']``).
        neighbor_ids: iterable of neighbor user ids.
        df: interactions frame with ``User_id``, ``Title`` and ``review/score`` columns.
        min_score: a neighbor review counts only if its score is strictly
            greater than this value (default 3.5, the previously hard-coded cut-off).

    Returns:
        List of unique titles, in order of first appearance among the
        neighbors' qualifying reviews.
    """
    # Titles the target user has already reviewed; never recommended back.
    user_titles = set(df.loc[df['User_id'] == user_id, 'Title'])

    # All reviews written by any of the neighbors.
    neighbor_interactions = df[df['User_id'].isin(neighbor_ids)]

    # Keep only reviews whose score is strictly above the threshold.
    scored_high = neighbor_interactions['review/score'] > min_score
    high_score_titles = neighbor_interactions.loc[scored_high, 'Title'].unique()

    # Filter while preserving order, so the result is deterministic
    # (a set difference would return the titles in arbitrary order).
    return [title for title in high_score_titles if title not in user_titles]

In [None]:
# Invert the user -> row mapping so row numbers returned by the k-NN query
# can be translated back into user ids.
index_to_user = dict((row, uid) for uid, row in user_to_index.items())
neighbor_user_ids = [index_to_user[neighbor_row] for neighbor_row in ids]
recommended_titles = get_neighbor_titles('AVCGYZL8FQQTD', neighbor_user_ids, merged_df)

In [None]:
def rank_titles(titles, interactions_df):
    """Order ``titles`` by their mean ``review/score`` in ``interactions_df``, best first.

    Titles that have no rows in ``interactions_df`` are absent from the result.

    Args:
        titles: collection of candidate titles.
        interactions_df: frame with ``Title`` and ``review/score`` columns.

    Returns:
        List of titles sorted by descending mean score.
    """
    is_candidate = interactions_df['Title'].isin(titles)
    candidate_rows = interactions_df[is_candidate]
    # Mean score per title over every review of that title in the frame.
    mean_score_by_title = candidate_rows.groupby('Title')['review/score'].mean()
    best_first = mean_score_by_title.sort_values(ascending=False)
    return best_first.index.tolist()

In [None]:
merged_df.head(1)

Unnamed: 0,Id,Title,User_id,review/helpfulness,review/score,review/time,review/summary,review/text,description,authors,publisher,publishedDate,categories,ratingsCount,compound,Sentiment
0,1882931173,Its Only Art If Its Well Hung!,AVCGYZL8FQQTD,7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...,,['Julie Strain'],,1996,['Comics & Graphic Novels'],2.0,0.9408,positive


In [None]:
# Take the first matching row by position. Label-based `[0]` only works when
# one of this user's rows happens to carry index label 0, and raises KeyError
# for any user whose rows do not.
merged_df.loc[merged_df['User_id'] == 'AVCGYZL8FQQTD', 'Title'].iloc[0]

'Its Only Art If Its Well Hung!'

In [None]:
# Translate neighbor rows into user ids, gather their well-rated titles the
# target user has not reviewed, then order those titles by mean score.
neighbor_user_ids = [index_to_user[neighbor_row] for neighbor_row in ids]
recommended_titles = get_neighbor_titles('AVCGYZL8FQQTD', neighbor_user_ids, merged_df)
ranked_recommendations = rank_titles(recommended_titles, merged_df)

print("Top 10 Recommendations:")
# Only the ten best-ranked titles are shown.
for i, title in enumerate(ranked_recommendations[:10], 1):
    print(f"TOP {i}: {title}")

Top 10 Recommendations:
TOP 1: Shadows of the Neanderthal: Illuminating the Beliefs that Limit Our Organizations
TOP 2: Adobe(R) Photoshop(R) 7 One Click Wow!
TOP 3: Out of One Eye: The Art of Kit Williams
TOP 4: The Art of Photoshop
TOP 5: One Flew Over the Cuckoo's Nest
TOP 6: The Speed of Trust: the One Thing That Changes Everything
TOP 7: The Art of Auto-fellatio: Oral Sex for One
TOP 8: Zen And The Art of Motorcycle Maintenance
TOP 9: Zen and the Art of Motorcycle Maintenance : An Inquiry Into Values
TOP 10: Zen and the Art of Motorcycle Maintenance


## Approach 2

The recommendation system operates on a dataset of user-item interactions by focusing on user reviews and ratings of various titles. Separate embeddings are created for users and items, allowing the model to learn dense representations of each entity based on interaction data. The embeddings are concatenated and passed through a fully connected hidden layer with nonlinear activation (ReLU), facilitating the capture of complex interactions between user and item features. A linear output layer predicts the rating a user would give to an item, as demonstrated in the forward pass of the model. Additionally, the model supports querying similar items based on their learned embeddings. By computing cosine similarity between item embeddings, the system can suggest items that are contextually similar to a given item, enhancing user discovery and engagement.

Reference: https://www.kaggle.com/code/anshkgoyal/book-recommendation-system was consulted as a guide to an effective implementation.

In [None]:
# Build contiguous integer ids for users and titles, in order of first appearance.
# NOTE: this rebinds `user_to_index`, which Approach 1 built from read_keywords;
# after this cell runs, the Approach 1 cells that index reduced_matrix through
# `user_to_index` would be using this mapping instead.
user_ids = merged_df['User_id'].unique()
item_ids = merged_df['Title'].unique()
user_to_index = dict(zip(user_ids, range(len(user_ids))))
item_to_index = dict(zip(item_ids, range(len(item_ids))))
# Reverse lookup used to turn embedding rows back into titles.
index_to_title = {position: title for title, position in item_to_index.items()}

# One (user, item, rating) triple per review row.
user_indices = torch.tensor([user_to_index[uid] for uid in merged_df['User_id']], dtype=torch.long)
item_indices = torch.tensor([item_to_index[name] for name in merged_df['Title']], dtype=torch.long)
ratings = torch.tensor(merged_df['review/score'].values, dtype=torch.float32)

dataset = TensorDataset(user_indices, item_indices, ratings)
dataloader = DataLoader(dataset, shuffle=True, batch_size=64)

class CollaborativeFilteringModel(nn.Module):
    """Rating predictor built from user/item embeddings and one hidden layer.

    Args:
        num_users: number of rows in the user embedding table.
        num_items: number of rows in the item embedding table.
        embedding_dim: width of each user and item embedding vector.
        hidden_dim: width of the hidden layer.
    """

    def __init__(self, num_users, num_items, embedding_dim, hidden_dim):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        # The hidden layer receives the user and item vectors concatenated.
        self.hidden_layer = nn.Linear(embedding_dim * 2, hidden_dim)
        self.relu = nn.ReLU()
        self.output_layer = nn.Linear(hidden_dim, 1)

    def forward(self, user_indices, item_indices):
        """Predict one rating per (user, item) pair.

        Args:
            user_indices: 1-D integer tensor of user rows.
            item_indices: 1-D integer tensor of item rows, same length.

        Returns:
            Tensor with one predicted rating per pair and a trailing axis of size 1.
        """
        user_embedded = self.user_embedding(user_indices)
        item_embedded = self.item_embedding(item_indices)
        concatenated = torch.cat([user_embedded, item_embedded], dim=1)
        hidden_output = self.relu(self.hidden_layer(concatenated))
        output = self.output_layer(hidden_output)
        return output

    def get_similar_titles(self, input_title_index, top_k=100, index_to_title_map=None):
        """Return the titles whose item embeddings are most cosine-similar to one item.

        The queried item is compared against every row of the item embedding
        table, itself included, so it normally comes back as the first entry.

        Args:
            input_title_index: row of the query item in the item embedding table.
            top_k: maximum number of titles to return; values larger than the
                number of items are clamped.
            index_to_title_map: optional mapping from embedding row to title.
                When omitted, the module-level ``index_to_title`` dict is used.

        Returns:
            List of titles ordered from most to least similar. The order among
            exactly tied similarities is unspecified.
        """
        if index_to_title_map is None:
            # Backward-compatible default: the notebook-level lookup table.
            index_to_title_map = index_to_title

        all_title_embeddings = self.item_embedding.weight
        device = all_title_embeddings.device
        k = max(0, min(int(top_k), all_title_embeddings.shape[0]))
        if k == 0:
            return []

        # Pure lookup: no gradients needed, whatever mode the caller is in.
        with torch.no_grad():
            query_index = torch.tensor([input_title_index], device=device)
            input_title_embedding = self.item_embedding(query_index)
            similarities = F.cosine_similarity(input_title_embedding, all_title_embeddings)
            # topk avoids sorting the whole similarity vector just to keep k entries.
            similar_title_indices = torch.topk(similarities, k).indices

        # One device-to-host transfer instead of one .item() call per index.
        return [index_to_title_map[idx] for idx in similar_title_indices.tolist()]

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_path="collaborative_filtering_model.pth"
# The checkpoint holds the whole pickled module (the loaded object is used
# directly as the model), so CollaborativeFilteringModel must already be
# defined, and full unpickling has to be requested explicitly: recent PyTorch
# releases default torch.load to weights_only=True, which rejects such files.
# SECURITY: unpickling can execute arbitrary code — only load checkpoint files
# you created yourself or otherwise trust.
model = torch.load(model_path, map_location=device, weights_only=False)
model.to(device)

CollaborativeFilteringModel(
  (user_embedding): Embedding(1008961, 100)
  (item_embedding): Embedding(206711, 100)
  (hidden_layer): Linear(in_features=200, out_features=32, bias=True)
  (relu): ReLU()
  (output_layer): Linear(in_features=32, out_features=1, bias=True)
)

In [None]:
# Evaluate the loaded model over every (user, item, rating) triple in `dataloader`.
# NOTE(review): `dataloader` wraps the full dataset built above; if the model was
# trained on the same rows, this is a training-set error, not a held-out estimate.
model.eval()
total_squared_error = 0.0
total_samples = 0
with torch.inference_mode():
    for user_batch, item_batch, rating_batch in tqdm(dataloader, desc="Evaluating..."):
        user_batch = user_batch.to(device)
        item_batch = item_batch.to(device)
        rating_batch = rating_batch.to(device)
        # squeeze(-1) removes only the trailing output axis, so a final batch
        # holding a single sample keeps shape (1,) and matches rating_batch.
        predictions = model(user_batch, item_batch).squeeze(-1)
        # Accumulate the summed squared error; averaging per-batch means would
        # over-weight a smaller final batch.
        total_squared_error += F.mse_loss(predictions, rating_batch, reduction='sum').item()
        total_samples += rating_batch.numel()

average_mse = total_squared_error / total_samples
rmse = average_mse ** 0.5
print(f'Mean Squared Error: {average_mse:.4f}')
print(f'Root Mean Squared Error: {rmse:.4f}')

Evaluating...:   0%|          | 0/38095 [00:00<?, ?it/s]

Mean Squared Error: 1.1099
Root Mean Squared Error: 1.0535


In [None]:
merged_df.head(1)

Unnamed: 0,Id,Title,User_id,review/helpfulness,review/score,review/time,review/summary,review/text,description,authors,publisher,publishedDate,categories,ratingsCount,compound,Sentiment
0,1882931173,Its Only Art If Its Well Hung!,AVCGYZL8FQQTD,7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...,,['Julie Strain'],,1996,['Comics & Graphic Novels'],2.0,0.9408,positive


In [None]:
# Predict the rating of one known (user, title) pair as a spot check.
model.eval()
user_index = torch.tensor([user_to_index['AVCGYZL8FQQTD']], dtype=torch.long, device=device)
item_index = torch.tensor([item_to_index['Its Only Art If Its Well Hung!']], dtype=torch.long, device=device)
# No gradients are needed for a single prediction; this matches the other
# inference cells, which also run under torch.inference_mode().
with torch.inference_mode():
    prediction = model(user_index, item_index).item()
print(f'Predicted Rating: {prediction:.4f}')

Predicted Rating: 3.9195


In [None]:
def get_collaborative_recommendations(model, title, num_recommendations=100):
    """Look up ``title`` and return the titles the model considers most similar.

    Args:
        model: object providing ``eval()`` and
            ``get_similar_titles(index, top_k=...)``.
        title: title to query; must be a key of the module-level ``item_to_index``.
        num_recommendations: forwarded to the model as ``top_k``.

    Returns:
        Whatever ``model.get_similar_titles`` returns for the title's index.

    Raises:
        KeyError: if ``title`` is not in ``item_to_index``.
    """
    # Resolve the title first so an unknown title fails before the model is touched.
    title_position = item_to_index[title]

    model.eval()
    with torch.inference_mode():
        return model.get_similar_titles(title_position, top_k=num_recommendations)

In [None]:
input_title = "Its Only Art If Its Well Hung!"
collab_recommendations = get_collaborative_recommendations(model, input_title, num_recommendations=1000)
# The query title is itself part of the similarity ranking. Remove it by value
# rather than assuming it always sits at position 0, then keep the ten best.
top_recommendations = [candidate for candidate in collab_recommendations if candidate != input_title][:10]
print("Top 10 Recommendations:")
for i, title in enumerate(top_recommendations, start=1):
    print(f"Top{i}: {title}")

Top 10 Recommendations:
Top1: Rambling Recollections of a Soldier of Fortune 1842
Top2: Power Restructuring In China And Russia (Social Change in Global Perspective)
Top3: The Berenstain Bears - Trouble With Pets/The Sitter/Too Much T.V./Lost In a Cave
Top4: Carlo Crivelli, ([The Great masters in painting and sculpture])
Top5: Death At The Crossroads
Top6: Cloudbearer's Shadow (Sword in Exile, Book 1)
Top7: Confessions of Georgia Nicolson
Top8: Bedside Cardiology (Bedside Cardiology (Constant))
Top9: Design for Gardens
Top10: The Professional Trainer: A Human Resource Training and Development Guide
