# 📚 Book Recommender System (BPR Model)

This project implements a neural collaborative filtering model for book
recommendation, trained with the **Bayesian Personalized Ranking (BPR)** objective.

The system:
- Trains on implicit feedback (user-book interactions)
- Learns user and book embeddings
- Evaluates ranking quality using HitRate and NDCG
- Provides interactive recommendations

## 1️⃣ Import Libraries

In [31]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import pandas as pd
import numpy as np
import random
import pickle

## 2️⃣ Load Dataset

In [32]:
# Raw interaction log; the cells below read its user_id and book_id columns.
ratings = pd.read_csv("data/ratings.csv")

# Baseline counts, printed so the effect of the sparsity filter is visible.
print("Before filtering:")
print("Total ratings:", ratings.shape[0])
print("Users:", ratings["user_id"].nunique())
print("Books:", ratings["book_id"].nunique())

Before filtering:
Total ratings: 981756
Users: 53424
Books: 10000


## 3️⃣ Filter Sparse Users & Books

In [33]:
# Minimum number of interactions a user / a book needs to be kept.
min_user_ratings = 5
min_book_ratings = 5

# Pass 1: keep only users with enough ratings.
user_counts = ratings.user_id.value_counts()
kept_users = user_counts[user_counts >= min_user_ratings].index
ratings = ratings[ratings["user_id"].isin(kept_users)]

# Pass 2: keep only books with enough ratings, counted AFTER the user filter.
# This is a single pass, so users are not re-checked once books are dropped.
book_counts = ratings.book_id.value_counts()
kept_books = book_counts[book_counts >= min_book_ratings].index
ratings = ratings[ratings["book_id"].isin(kept_books)]

print("\nAfter filtering:")
print("Total ratings:", ratings.shape[0])
print("Users:", ratings["user_id"].nunique())
print("Books:", ratings["book_id"].nunique())


After filtering:
Total ratings: 932940
Users: 35710
Books: 10000


## 4️⃣ Train / Test Split

In [34]:
# Leave-one-out split: for every user, one shuffled interaction is held out
# for testing and the remainder goes to training.
train_list = []
test_list = []

for _, interactions in ratings.groupby("user_id"):
    # Fixed random_state: the same shuffle is applied on every run.
    shuffled = interactions.sample(frac=1, random_state=42)
    held_out, remainder = shuffled.iloc[0], shuffled.iloc[1:]
    test_list.append(held_out)
    train_list.append(remainder)

train_df = pd.concat(train_list)
test_df = pd.DataFrame(test_list)  # one row (Series) per user

print("Train size:", len(train_df))
print("Test size:", len(test_df))

Train size: 897230
Test size: 35710


## 5️⃣ Encode Users & Books

In [35]:
# Contiguous 0-based indices, assigned in order of first appearance in the
# training data. Only ids that occur in train_df receive an index.
user2idx = {raw_id: pos for pos, raw_id in enumerate(train_df["user_id"].unique())}
book2idx = {raw_id: pos for pos, raw_id in enumerate(train_df["book_id"].unique())}

# Attach the encoded columns to both splits (train first, then test).
for split in (train_df, test_df):
    split["user_idx"] = split["user_id"].map(user2idx)
    split["book_idx"] = split["book_id"].map(book2idx)

# Test rows whose user or book never occurs in training map to NaN above
# and are discarded here.
test_df = test_df.dropna()

num_users = len(user2idx)
num_books = len(book2idx)

print("Encoded users:", num_users)
print("Encoded books:", num_books)

Encoded users: 35710
Encoded books: 10000


## 6️⃣ BPR Model Definition

In [85]:
class BPRModel(nn.Module):
    """Embedding model that scores a (user, book) pair by a dot product.

    Args:
        num_users: number of rows in the user embedding table.
        num_books: number of rows in the book embedding table.
        emb_dim: width of both embedding tables.
    """

    def __init__(self, num_users, num_books, emb_dim=256):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_dim)
        self.book_emb = nn.Embedding(num_books, emb_dim)

        # Replace the default initialisation with small Gaussian noise
        # (users first, then books).
        for table in (self.user_emb, self.book_emb):
            nn.init.normal_(table.weight, std=0.01)

    def forward(self, user, item):
        """Return one score per position: sum over dim 1 of user_vec * item_vec."""
        return torch.sum(self.user_emb(user) * self.book_emb(item), dim=1)

## 7️⃣ Training Setup

This section prepares everything required for model training:

• Build user interaction history  
• Define BPR training dataset  
• Initialize DataLoader  
• Initialize model and optimizer  


In [87]:
# Interaction history: encoded user index -> set of encoded book indices that
# user has in the training split. Used to reject already-seen books when
# sampling negatives and when recommending.
books_by_user = train_df.groupby("user_idx")["book_idx"]
user_seen = books_by_user.apply(set).to_dict()

class BPRDataset(Dataset):
    """One BPR training sample per training interaction.

    Each item is a ``(user, positive_book, negative_books)`` triple of long
    tensors, where the negatives are drawn uniformly at random (with
    replacement) from the books the user has not interacted with.

    Args:
        train_df: frame exposing ``user_idx`` and ``book_idx`` columns.
        num_books: size of the book index space; negatives are drawn from
            ``0 .. num_books - 1``.
        user_seen: mapping ``user_idx -> set of book_idx`` to exclude.
        num_negatives: number of negatives sampled per positive.
    """

    def __init__(self, train_df, num_books, user_seen, num_negatives=10):
        self.users = train_df.user_idx.values
        self.pos_items = train_df.book_idx.values
        self.num_books = num_books
        self.user_seen = user_seen
        self.num_negatives = num_negatives

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        """Return ``(user, pos_item, negatives)`` for interaction ``idx``.

        Raises:
            KeyError: if the user has no entry in ``user_seen``.
            ValueError: if negatives are requested but the user's seen set
                already covers the whole book index space.
        """
        user = self.users[idx]
        pos_item = self.pos_items[idx]
        seen = self.user_seen[user]  # looked up once, not once per draw

        # Rejection sampling below can only terminate if at least one book
        # is outside `seen`; fail loudly instead of spinning forever.
        # (Assumes `seen` only holds indices in [0, num_books).)
        if self.num_negatives > 0 and len(seen) >= self.num_books:
            raise ValueError(
                f"user {user} has interacted with all {self.num_books} books; "
                "no negative item can be sampled"
            )

        negatives = []
        while len(negatives) < self.num_negatives:
            neg_item = random.randint(0, self.num_books - 1)
            if neg_item not in seen:
                negatives.append(neg_item)

        return (
            torch.tensor(user, dtype=torch.long),
            torch.tensor(pos_item, dtype=torch.long),
            torch.tensor(negatives, dtype=torch.long)
        )

# Batches of (user, positive, negatives) triples, reshuffled every epoch.
train_dataset = BPRDataset(train_df, num_books, user_seen)
train_loader = DataLoader(train_dataset, batch_size=2048, shuffle=True)

# Use the GPU when one is available, otherwise the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

model = BPRModel(num_users, num_books, emb_dim=256)
model = model.to(device)

# Adam with a small weight decay applied to the model parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

epochs = 20

## 8️⃣ Model Training

In [88]:
# BPR training: push each positive's score above the scores of its sampled
# negatives by minimising -log(sigmoid(pos - neg)).
for epoch in range(epochs):
    model.train()
    epoch_loss = 0

    for batch in train_loader:
        batch_users, batch_pos, batch_neg = (part.to(device) for part in batch)

        optimizer.zero_grad()

        pos_score = model(batch_users, batch_pos)

        # Score all negatives in one forward pass: repeat each user once per
        # negative, flatten, score, then restore the (batch, negatives) shape.
        n_rows, n_neg = batch_neg.shape
        users_per_neg = batch_users.unsqueeze(1).expand(n_rows, n_neg).reshape(-1)
        neg_score = model(users_per_neg, batch_neg.reshape(-1)).reshape(n_rows, -1)

        margin = pos_score.unsqueeze(1) - neg_score
        loss = -F.logsigmoid(margin).mean()

        loss.backward()
        # Clip the global gradient norm to 5 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
        optimizer.step()

        epoch_loss += loss.item()

    mean_loss = epoch_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Loss: {mean_loss:.4f}")

Epoch 1, Loss: 0.6310
Epoch 2, Loss: 0.4497
Epoch 3, Loss: 0.4044
Epoch 4, Loss: 0.3922
Epoch 5, Loss: 0.3874
Epoch 6, Loss: 0.3847
Epoch 7, Loss: 0.3831
Epoch 8, Loss: 0.3821
Epoch 9, Loss: 0.3814
Epoch 10, Loss: 0.3806
Epoch 11, Loss: 0.3801
Epoch 12, Loss: 0.3797
Epoch 13, Loss: 0.3794
Epoch 14, Loss: 0.3790
Epoch 15, Loss: 0.3786
Epoch 16, Loss: 0.3784
Epoch 17, Loss: 0.3781
Epoch 18, Loss: 0.3780
Epoch 19, Loss: 0.3780
Epoch 20, Loss: 0.3779


## 9️⃣ Evaluation Metrics

Evaluate ranking quality using:

- HitRate@5
- HitRate@10
- NDCG@10

In [67]:
# Sampled ranking evaluation: for every held-out (user, book) pair, rank the
# true book against `num_negatives` randomly sampled unseen books and report
# HitRate@K and NDCG@10.
model.eval()

K_values = [5, 10]
num_negatives = 100

hit_rates = {k: 0 for k in K_values}
ndcg_10 = 0

all_book_indices = set(range(num_books))

# Extract the encoded ids once instead of materialising a row Series twice
# per iteration with test_df.iloc[i][...].
eval_users = test_df["user_idx"].astype(int).tolist()
eval_books = test_df["book_idx"].astype(int).tolist()

for user, true_book in zip(eval_users, eval_books):

    seen_books = user_seen[user]

    # Negatives come from books the user has not interacted with in training.
    # NOTE: sampling uses the global `random` state and no seed is set in this
    # cell, so the metrics vary slightly from run to run.
    negative_candidates = list(all_book_indices - seen_books - {true_book})
    sampled_negatives = random.sample(negative_candidates, num_negatives)

    candidate_books = sampled_negatives + [true_book]

    user_tensor = torch.tensor([user] * len(candidate_books)).to(device)
    book_tensor = torch.tensor(candidate_books).to(device)

    with torch.no_grad():
        scores = model(user_tensor, book_tensor)

    scores = scores.cpu()

    # Sort descending
    ranked_indices = torch.argsort(scores, descending=True).numpy()
    ranked_books = [candidate_books[idx] for idx in ranked_indices]

    # The true book occurs exactly once among the candidates, so its 1-based
    # rank fully determines both metrics.
    rank_position = ranked_books.index(true_book) + 1

    # Compute HitRate@K
    for K in K_values:
        if rank_position <= K:
            hit_rates[K] += 1

    # Compute NDCG@10 (single relevant item, so the ideal DCG is 1)
    if rank_position <= 10:
        ndcg_10 += 1 / np.log2(rank_position + 1)

# Normalize
for K in K_values:
    hit_rates[K] /= len(test_df)

ndcg_10 /= len(test_df)

print("\n================ EVALUATION METRICS ================")
print(f"HitRate@5  : {hit_rates[5]:.4f}")
print(f"HitRate@10 : {hit_rates[10]:.4f}")
print(f"NDCG@10    : {ndcg_10:.4f}")
print("====================================================\n")


HitRate@5  : 0.4924
HitRate@10 : 0.6698
NDCG@10    : 0.3964



## 🔟 Save Trained Model

In [76]:
# Persist only the parameters (state dict), not the module object itself.
trained_weights = model.state_dict()
torch.save(trained_weights, "bpr_model.pth")
print("Model saved.")

Model saved.


## 1️⃣1️⃣ Reload Model for Inference

In [None]:
# Rebuild the architecture with the same dimensions, then restore the saved
# parameters onto whichever device is in use.
model = BPRModel(num_users, num_books, emb_dim=256)
model = model.to(device)

saved_weights = torch.load("bpr_model.pth", map_location=device)
model.load_state_dict(saved_weights)
model.eval()  # inference mode

print("\nModel loaded successfully.")

## 1️⃣2️⃣ Interactive Recommendation System

In [72]:
# Book metadata; the recommenders below read book_id, title and authors.
book_data = pd.read_csv("data/books.csv")

# Inverse of book2idx: encoded book index -> original book_id.
idx2book = dict(zip(book2idx.values(), book2idx.keys()))

def recommend_books(user_id, top_k=10):
    """Print the top-scoring books the given user has not interacted with.

    Args:
        user_id: original (un-encoded) user id; must be a key of ``user2idx``.
        top_k: maximum number of recommendations to print. It is capped at
            the number of unseen books so seen books are never returned.

    Returns:
        None. Results, or a short explanatory message, are printed.
    """
    if user_id not in user2idx:
        print("User not found.")
        return

    user_idx = user2idx[user_id]
    seen_books = user_seen.get(user_idx, set())

    # Size the candidate list from the model itself so it always matches the
    # embedding table, independent of the `num_books` global.
    n_books = model.book_emb.num_embeddings

    user_tensor = torch.tensor([user_idx] * n_books, dtype=torch.long, device=device)
    book_tensor = torch.arange(n_books, device=device)

    with torch.no_grad():
        scores = model(user_tensor, book_tensor)

    scores = scores.cpu()

    # Mask every seen book in one indexed assignment.
    if seen_books:
        seen_idx = torch.tensor([int(b) for b in seen_books], dtype=torch.long)
        scores[seen_idx] = float("-inf")

    # torch.topk raises if k exceeds the number of scores, and asking for more
    # than the unseen count would return masked (seen) books.
    k = max(0, min(top_k, n_books - len(seen_books)))
    if k == 0:
        print("No new books available to recommend for this user.")
        return

    top_indices = torch.topk(scores, k).indices.tolist()
    recommended_ids = [idx2book[idx] for idx in top_indices]

    # Index the catalogue once; membership checks on the index are hash lookups.
    catalog = book_data.set_index("book_id")
    known_ids = [bid for bid in recommended_ids if bid in catalog.index]
    recommendations = catalog.loc[known_ids, ["title", "authors"]].reset_index()

    if recommendations.empty:
        print("No new books available to recommend for this user.")
        return

    print("\n===================================")
    print(f"Recommendations For User: {user_id}")
    print("===================================\n")

    for position, row in enumerate(recommendations.itertuples(index=False), start=1):
        print(f"{position}. {row.title}")
        print(f"   Author: {row.authors}")
        print(f"   Book ID: {row.book_id}\n")


def recommend_similar_books(book_title, top_k=10):
    """Interactively pick a book by title and print books with similar embeddings.

    The user is shown up to 10 title matches, selects one by number, and may
    then either stop or list the ``top_k`` books whose embeddings have the
    largest dot product with the selected book's embedding.

    Args:
        book_title: text to search for inside book titles. Matching is
            case-insensitive and literal (the text is not treated as a regex).
        top_k: maximum number of similar books to print.

    Returns:
        None. All results and messages are printed.
    """
    # Normalised title column, stored on the shared frame as before.
    book_data["clean_title"] = book_data["title"].str.lower().str.strip()

    # regex=False: the query is free text typed by the user, so characters
    # such as "(" or "+" must match literally instead of raising re.error.
    title_hits = book_data["clean_title"].str.contains(
        book_title.lower(), regex=False, na=False
    )
    matched = book_data[title_hits]

    if matched.empty:
        print("No matching books found.")
        return

    matched = matched.head(10)

    print("\nMatched Books:")
    print("===================================")

    for i, row in enumerate(matched.itertuples()):
        print(f"{i} - {row.title}")
        print(f"    Author: {row.authors}")
        print(f"    Book ID: {row.book_id}\n")

    # Validate the selection: non-numeric input used to raise ValueError,
    # out-of-range input IndexError, and a negative number silently picked
    # an entry counted from the end of the list.
    try:
        selected_idx = int(input("Enter the number of the book you want: "))
    except ValueError:
        print("\nInvalid selection. Please enter one of the listed numbers.")
        return

    if not 0 <= selected_idx < len(matched):
        print("\nInvalid selection. Please enter one of the listed numbers.")
        return

    selected_book = matched.iloc[selected_idx]
    book_id = selected_book["book_id"]

    print("\nYou selected:")
    print("-----------------------------------")
    print(f"Title : {selected_book['title']}")
    print(f"Author: {selected_book['authors']}")
    print(f"Book ID: {book_id}")
    print("-----------------------------------")

    print("\nOptions:")
    print("1 - Finalize this book")
    print("2 - Show similar books")

    choice = input("Select option (1 or 2): ").strip()

    if choice == "1":
        print("\nBook finalized. Returning to main menu.")
        return

    if book_id not in book2idx:
        print("\nThis book was filtered out during training.")
        return

    book_idx = book2idx[book_id]

    # Similarity = dot product between the selected book's embedding and
    # every book embedding.
    with torch.no_grad():
        book_embedding = model.book_emb.weight[book_idx]
        scores = torch.matmul(model.book_emb.weight, book_embedding).cpu()

    # Never recommend the selected book itself.
    scores[book_idx] = float("-inf")

    # torch.topk raises if k exceeds the number of scores; also leave out the
    # masked self entry.
    k = max(0, min(top_k, scores.numel() - 1))
    top_indices = torch.topk(scores, k).indices.tolist()
    recommended_ids = [idx2book[idx] for idx in top_indices]

    # Index the catalogue once; membership checks on the index are hash lookups.
    catalog = book_data.set_index("book_id")
    known_ids = [bid for bid in recommended_ids if bid in catalog.index]
    recommendations = catalog.loc[known_ids, ["title", "authors"]].reset_index()

    print("\n===================================")
    print(f"Books Similar To: {selected_book['title']}")
    print("===================================\n")

    if recommendations.empty:
        print("No similar books found in training set.\n")
    else:
        for position, row in enumerate(recommendations.itertuples(index=False), start=1):
            print(f"{position}. {row.title}")
            print(f"   Author: {row.authors}")
            print(f"   Book ID: {row.book_id}\n")


# Interactive menu: keeps prompting until the user chooses to exit.
while True:

    print("\n===== BOOK RECOMMENDER SYSTEM =====")
    print("1 - Recommend for Random User")
    print("2 - Recommend Similar Books")
    print("3 - Exit")

    choice = input("\nSelect option (1, 2, or 3): ").strip()

    if choice == "1":
        # Demo mode: pick any user known to the model.
        random_user = random.choice(list(user2idx.keys()))
        print("\nRandom demo user selected:", random_user)
        recommend_books(random_user)

    elif choice == "2":
        book_input = input("\nEnter part of a book title: ").strip()
        recommend_similar_books(book_input)

    elif choice == "3":
        print("\nExiting recommender system. Goodbye 👋")
        break

    else:
        # Previously an unknown option re-printed the menu with no feedback.
        print("\nInvalid option. Please enter 1, 2, or 3.")


===== BOOK RECOMMENDER SYSTEM =====
1 - Recommend for Random User
2 - Recommend Similar Books
3 - Exit



Select option (1, 2, or 3):  3



Exiting recommender system. Goodbye 👋


## 📊 Dataset Analysis

In [90]:
print("\n================ DATASET ANALYSIS ================\n")

# Use dedicated names for these counts: `num_users` / `num_books` are the
# sizes of the encoded id spaces used to build the model and must not be
# rebound from a different source (the full `ratings` frame) here.
total_users = ratings.user_id.nunique()
total_books = ratings.book_id.nunique()
num_ratings = len(ratings)

# Sparsity = share of the user x book matrix that has no rating.
total_possible = total_users * total_books
sparsity = 1 - (num_ratings / total_possible)

print("Dataset Statistics:")
print(f"Total Users        : {total_users}")
print(f"Total Books        : {total_books}")
print(f"Total Ratings      : {num_ratings}")
print(f"Sparsity           : {sparsity:.4f}")

print("\n--------------------------------------------------")

# Ratings per book, joined with titles; the inner join drops books that have
# no entry in books.csv.
books = pd.read_csv("data/books.csv")[["book_id", "title"]]
popularity = ratings.groupby("book_id").size().reset_index(name="rating_count")
popularity = popularity.merge(books, on="book_id", how="inner")

most_popular = popularity.sort_values("rating_count", ascending=False).head(10)
least_popular = popularity.sort_values("rating_count", ascending=True).head(10)

def print_section(df, header):
    """Print a ranked list of book titles with their rating counts.

    Args:
        df: frame with ``title`` and ``rating_count`` columns, already in
            the order to display.
        header: section heading printed above the list.
    """
    rule_width = 60

    print(f"\n{header}")
    print("=" * rule_width)
    pairs = zip(df["title"], df["rating_count"])
    for rank, (title, rating_count) in enumerate(pairs, start=1):
        print(f"{rank}. {title}  | Ratings: {rating_count}")
    print("-" * rule_width)

# Both ends of the popularity ranking.
for section_df, section_title in (
    (most_popular, "TOP 10 MOST POPULAR BOOKS"),
    (least_popular, "TOP 10 LEAST POPULAR BOOKS"),
):
    print_section(section_df, section_title)

# Summary of the per-book rating counts.
rating_counts = popularity["rating_count"]

print("\nDistribution Statistics:")
print(f"Average ratings per book: {rating_counts.mean():.2f}")
print(f"Median ratings per book : {rating_counts.median():.2f}")

print("\n====================================================\n")



Dataset Statistics:
Total Users        : 35710
Total Books        : 10000
Total Ratings      : 932940
Sparsity           : 0.9974

--------------------------------------------------

TOP 10 MOST POPULAR BOOKS
1. Harry Potter and the Half-Blood Prince (Harry Potter, #6)  | Ratings: 100
2. Dreamland  | Ratings: 100
3. In Our Time  | Ratings: 100
4. The Snows of Kilimanjaro and Other Stories  | Ratings: 100
5. A Moveable Feast  | Ratings: 100
6. To Have and Have Not  | Ratings: 100
7. Carter Beats the Devil  | Ratings: 100
8. Extremely Loud and Incredibly Close  | Ratings: 100
9. The View from Saturday  | Ratings: 100
10. Olivia Joules and the Overactive Imagination  | Ratings: 100
------------------------------------------------------------

TOP 10 LEAST POPULAR BOOKS
1. Ghostwritten  | Ratings: 21
2. Jesus Freaks: Stories of Those Who Stood for Jesus, the Ultimate Jesus Freaks (Jesus Freaks, #1)  | Ratings: 31
3. The Man From St. Petersburg  | Ratings: 33
4. Next  | Ratings: 36
5. Son