## Importing Dataset

In [77]:
%pip install kagglehub[pandas-datasets]

zsh:1: no matches found: kagglehub[pandas-datasets]
Note: you may need to restart the kernel to use updated packages.


In [78]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from collections import Counter
# from google.colab import drive
import kagglehub
from kagglehub import KaggleDatasetAdapter
# drive.mount('/content/drive/')

# Load the latest version
books_df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "arashnic/book-recommendation-dataset",
  "Books.csv",
)

ratings_df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "arashnic/book-recommendation-dataset",
  "Ratings.csv",
)

users_df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "arashnic/book-recommendation-dataset",
  "Users.csv",
)

  books_df = kagglehub.load_dataset(
  result = read_function(
  ratings_df = kagglehub.load_dataset(
  users_df = kagglehub.load_dataset(


### Only keeping books with more than three ratings

In [79]:
print(books_df.shape)
books_df = books_df[books_df.groupby('Book-Title')['Book-Title'].transform('count') > 5]
print(books_df.shape)

(271360, 8)
(3978, 8)


### Joining Books, Ratings, and Users tables together

In [80]:
df_ratings_books = pd.merge(ratings_df, books_df, on="ISBN", how='inner')
df = pd.merge(df_ratings_books, users_df, on='User-ID')
df['User-ID'] = df['User-ID'].astype(str)
df['Year-Of-Publication'] = pd.to_numeric(df['Year-Of-Publication'], errors='coerce')
df = df.dropna(subset=['Year-Of-Publication'])
df = df.dropna(subset=['Age'])
df = df[df['Book-Rating'] > 0]
df = df[df['Age'] <= 100]
df = df[df['Year-Of-Publication'] > 0]
df['Book-Rating'].describe()

count    13181.000000
mean         7.956301
std          1.778792
min          1.000000
25%          7.000000
50%          8.000000
75%          9.000000
max         10.000000
Name: Book-Rating, dtype: float64

### Combining my own data into the training set

In [81]:
personal_df = pd.read_csv("./fine-tuning-book-set.txt")
end_index = len(df)
df = pd.concat([df, personal_df], ignore_index=True, sort=False)

In [82]:
personal_df.head(2)

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Age
0,1234567890,451521951,5,The Count of Monte Cristo,Alexandre Dumas,1844,Signet Book,23
1,1234567890,684813637,5,1776,David McCullough,2005,Simon & Schuster,23


In [83]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [84]:
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
from sklearn.preprocessing import  StandardScaler

# User Tower -- User-ID, Age
# Item Tower -- ISBN, Book-Title, Book-Author, Publisher, Year-Of-Publication

class BookRecommenderDataset(Dataset):
    """
    A PyTorch Dataset class for book recommendation tasks.

    Parameters
    ----------
    dataframe : pd.DataFrame
        The input data containing user, item, and possibly interaction features.

    Attributes
    ----------
    data : pd.DataFrame
        The processed version of the input dataframe.
    encoders : dict
        A dictionary mapping column names to fitted label encoders.
    reverse_encoders : dict
        A dictionary mapping column names to reverse label encoders (index to label).
    scalers : dict
        A dictionary mapping column names to fitted scalers for numerical features.
    """

    def __init__(self, data):
        self.encoders = {} # {'Column name': {'value': idx, ...}, ...}
        self.reverse_encoders = {} # {'Column name': {idx: 'value', ...}, ...}
        self.scalers = {}
        self.data = data
        # self.data = data.sample(frac=0.20, random_state=42).reset_index(drop=True)
        self.preprocess(self.data)

    def preprocess(self, data):
        self.encode_information()

    def encode_information(self):
        """
        Maps {key: index} pairs and StandardScaler for real valued numbers
        """
        label_encoders = ['User-ID', 'ISBN', 'Book-Title', 'Book-Author', 'Publisher']
        standard_scalers = ['Age', 'Year-Of-Publication']

        for col in label_encoders:
            unique_vals = self.data[col].astype(str).unique()
            self.encoders[col] = {val: idx + 1 for idx, val in enumerate(unique_vals)}
            self.reverse_encoders[col] = {idx + 1: val for idx, val in enumerate(unique_vals)}
            self.data[col] = self.data[col].astype(str).map(self.encoders[col]).fillna(0).astype(int)

        for col in standard_scalers:
            self.scalers[col] = StandardScaler()
            self.data[[col]] = self.scalers[col].fit_transform(self.data[[col]])

        # Manually adding my own User-ID so I don't need to adjust nn.Embedding later
        # max_user_idx = max(self.encoders['User-ID'].values())
        # self.encoders['User-ID']["1234567890"] = max_user_idx + 1
        # self.reverse_encoders['User-ID'][max_user_idx + 1] = "1234567890"

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        return {
            "User-ID": torch.tensor(row["User-ID"], dtype=torch.long),
            "User-Age": torch.tensor(row["Age"], dtype=torch.float32),
            "Book-ISBN": torch.tensor(row["ISBN"], dtype=torch.long),
            "Book-Title": torch.tensor(row["Book-Title"], dtype=torch.long),
            "Book-Author": torch.tensor(row["Book-Author"], dtype=torch.long),
            "Book-Publisher": torch.tensor(row["Publisher"], dtype=torch.long),
            "Book-Year-Of-Publication": torch.tensor(row["Year-Of-Publication"], dtype=torch.float32),
        }


dataset = BookRecommenderDataset(df)


In [85]:
dataset[0]

{'User-ID': tensor(1),
 'User-Age': tensor(-0.1115),
 'Book-ISBN': tensor(1),
 'Book-Title': tensor(1),
 'Book-Author': tensor(1),
 'Book-Publisher': tensor(1),
 'Book-Year-Of-Publication': tensor(0.4470)}

In [86]:
train_size = int(0.7 * len(dataset))
test_size = len(dataset) - train_size
train_data, test_data = random_split(dataset, [train_size, test_size])

# Dataloaders
train_loader = DataLoader(train_data, batch_size=128, shuffle=True)
test_loader = DataLoader(test_data, batch_size=128, shuffle=False)

In [87]:
next(iter(train_loader))

{'User-ID': tensor([2266, 1241, 4740, 5530,  549, 1259,   74, 5446, 1971, 5419, 5096,  474,
         6252,  519, 6554, 5487, 2162, 2935,  698, 6015, 6232, 5242, 2920, 6167,
         3811,  328, 1472, 2422, 2839, 2931, 5406, 6181, 4954, 2728, 1995, 4871,
         2867, 3042, 3664, 3086, 4828, 3440, 5737, 2911, 1581, 4488, 1928, 4720,
         6159, 4701, 3307, 2129,  929, 5865, 3916, 4920, 4469, 3701, 6069, 1754,
         5283, 3599, 3919,  476, 2193, 4896, 6425, 1402, 4689, 2366, 2746, 4090,
         2409, 3677,   79,   87,  944, 6537, 5214,  436, 3850, 4401,  838, 2496,
         5673, 6240, 2558, 5283, 1140,  810,   60,  749,  505,  738,  758, 2339,
         2931, 4715, 1750, 3368,  152, 2575, 6340, 3204, 2912, 4666, 2525, 3284,
         2820, 5644, 6258, 1311,  745, 1201, 2886, 1553,  108, 6619, 2554, 6149,
         2315, 4987,  197, 2480, 4845, 4221, 3034, 3503]),
 'User-Age': tensor([-1.0714, -0.5114, -0.4314, -0.9114, -0.4314,  1.4883, -0.1115,  0.2085,
         -0.2715,  0.2085, 

## Two Tower Model for Recommendations

In [88]:
class UserTower(nn.Module):

    # User Tower -- User-ID, Age

    def __init__(self, num_users, embedding_dim=16):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim, padding_idx=0)

        self.user_mlp = nn.Sequential(
            nn.Linear(embedding_dim + 1, 512), # 1 embedding + 1 numerical
            nn.ReLU(),
            # nn.LayerNorm(512),
            nn.Dropout(0.2),
            nn.Linear(512, embedding_dim),
        )

    def forward(self, user_id, age):
        """
        user_id: (batch,) int64
        review_mean: (batch,) float32
        """
        user_emb = self.user_embedding(user_id)
        age = age.unsqueeze(1)
        x = torch.cat([user_emb, age], dim=1)
        return self.user_mlp(x)

    def get_embedding(self, data):
        return self.forward(data['User-ID'], data['User-Age'])


In [89]:
class ItemTower(nn.Module):
    def __init__(self, num_isbn, num_titles, num_authors, num_publishers, embedding_dim=16):
        super().__init__()

        # Item Tower -- ISBN, Book-Title, Book-Author, Publisher, Year-Of-Publication

        self.book_isbn_embedding = nn.Embedding(num_isbn, embedding_dim, padding_idx=0)
        self.book_title_embedding = nn.Embedding(num_titles, embedding_dim, padding_idx=0)
        self.book_author_embedding = nn.Embedding(num_authors, embedding_dim, padding_idx=0)
        self.book_publisher_embedding = nn.Embedding(num_publishers, embedding_dim, padding_idx=0)

        self.item_mlp = nn.Sequential(
            nn.Linear(embedding_dim * 4 + 1, 512),  # 4 embeddings + 1 numerical
            nn.ReLU(),
            # nn.LayerNorm(512),
            nn.Dropout(0.2),
            nn.Linear(512, embedding_dim),
        )

    def forward(self, isbn, book_title, book_author, book_publisher, book_year_of_publication):
        book_isbn_emb = self.book_isbn_embedding(isbn)
        book_title_emb = self.book_title_embedding(book_title)
        book_author_emb = self.book_author_embedding(book_author)
        book_publisher_emb = self.book_publisher_embedding(book_publisher)
        book_year = book_year_of_publication.unsqueeze(1)

        x = torch.cat([
            book_isbn_emb,
            book_title_emb,
            book_author_emb,
            book_publisher_emb,
            book_year
        ], dim=1)

        return self.item_mlp(x)

    def get_embedding(self, data):
        return self.forward(
            data['Book-ISBN'],
            data['Book-Title'],
            data['Book-Author'],
            data['Book-Publisher'],
            data['Book-Year-Of-Publication'],
        )


In [90]:
class TwoTowers(nn.Module):
    def __init__(self, user_tower: UserTower, item_tower: ItemTower):
        super().__init__()
        self.user_tower = user_tower
        self.item_tower = item_tower

    def forward(self, data):
        user_vector = self.user_tower.get_embedding(data)
        item_vector = self.item_tower.get_embedding(data['pos_item'])
        return (user_vector * item_vector).sum(dim=1)

In [91]:
# example_data = next(iter(train_loader))

NUM_USERS = len(dataset.encoders['User-ID']) + 1
NUM_ISBN = len(dataset.encoders['ISBN']) + 1
NUM_TITLES = len(dataset.encoders['Book-Title']) + 1
NUM_AUTHORS = len(dataset.encoders['Book-Author']) + 1
NUM_PUBLISHERS = len(dataset.encoders['Publisher']) + 1

EMBEDDING_SIZE = 128

user_tower = UserTower(num_users=NUM_USERS,  embedding_dim=EMBEDDING_SIZE)

item_tower = ItemTower(
    num_isbn=NUM_ISBN,
    num_titles=NUM_TITLES,
    num_authors=NUM_AUTHORS,
    num_publishers=NUM_PUBLISHERS,
    embedding_dim=EMBEDDING_SIZE
)

two_towers = TwoTowers(
    user_tower,
    item_tower
).to(device)


In [99]:
def calculate_recall_at_k(two_towers, epoch, EPOCHS):
    """
    Gathers the current item embeddings, 
    calculates similarity between each user and the items. 

    Calculates and returns recall@k metric of the recommendations 
    made to the user.
    """
    entire_dataset = DataLoader(dataset, batch_size=1, shuffle=False)
    all_item_embeddings = []
    for batch in entire_dataset:
        item_embedding = two_towers.item_tower.get_embedding(batch)
        all_item_embeddings.append(item_embedding)
    all_item_embeddings = torch.cat(all_item_embeddings, dim=0)

    total_recall = 0.0
    num_users = 0
    k = 20  

    for idx, batch in enumerate(test_loader):
        user_embedding = two_towers.user_tower.get_embedding(batch) 
        similarity_scores = user_embedding @ all_item_embeddings.T  # [batch_size, num_items]

        top_scores, top_indices = torch.topk(similarity_scores, k=k, dim=1)

        for user_id, items, scores in zip(batch['User-ID'], top_indices, top_scores):
            user_rows = dataset.data[dataset.data['User-ID'] == user_id.item()]

            # Recommended books (Book-Title IDs)
            recommended_book_ids_set = set([dataset.data.iloc[idx.item()]['Book-Title'] for idx in items])
            actual_book_ids_set = set(user_rows['Book-Title'].tolist())

            hits = len(recommended_book_ids_set & actual_book_ids_set)  # intersection
            recall_at_k = hits / len(actual_book_ids_set)

            total_recall += recall_at_k
            num_users += 1

    average_recall_at_k = total_recall / num_users
    print(f"Epoch {epoch}/{EPOCHS}, Average Recall@{k}: {average_recall_at_k:.4f}\n")
    return average_recall_at_k


## Training

#### Main Training Loop

In [93]:
%rm -rf ./logs/

In [94]:
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
import datetime

EPOCHS = 100
LEARNING_RATE = 2e-4
EARLY_STOPPING_PATIENCE = 15
TEMPERATURE = 0.1
MODEL_SAVE_PATH = "/models"

# loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(two_towers.parameters(), lr=LEARNING_RATE)
writer = SummaryWriter('./logs/')

best_test_loss = float('inf')
early_stopping_counter = 0
global_step = 0

In [106]:

for epoch in range(1, EPOCHS + 1):

    # -- Main Loop --
    running_train_loss = 0.0
    two_towers.train()

    for batch_idx, batch in enumerate(train_loader):
        optimizer.zero_grad()

        user_embedding = two_towers.user_tower.get_embedding(batch) 
        item_embedding = two_towers.item_tower.get_embedding(batch)

        logits = (user_embedding @ item_embedding.T) / TEMPERATURE
        labels = torch.arange(user_embedding.size(0)).to(device) 
        loss = F.cross_entropy(logits, labels)

        loss.backward()
        optimizer.step()

        running_train_loss += loss.item()

    # -- Test Loop -- 
    two_towers.eval()
    running_test_loss = 0.0
    with torch.no_grad():
        for batch in test_loader:
            user_embedding = two_towers.user_tower.get_embedding(batch) 
            item_embedding = two_towers.item_tower.get_embedding(batch)

            logits = (user_embedding @ item_embedding.T) / TEMPERATURE
            labels = torch.arange(user_embedding.size(0)).to(device)
            loss = F.cross_entropy(logits, labels)

            running_test_loss += loss.item()

    avg_train_loss = running_train_loss / len(train_loader)
    avg_test_loss = running_test_loss / len(test_loader)
    print(f"Epoch {epoch}/{EPOCHS}, Average Training Loss: {avg_train_loss:.4f}, ")
    print(f"Epoch {epoch}/{EPOCHS}, Average Test Loss: {avg_test_loss:.4f}, ")
    calculate_recall_at_k(two_towers, epoch, EPOCHS)
    
    if epoch % 10 == 0:
        torch.save(two_towers.state_dict(), f"./{MODEL_SAVE_PATH}/two_towers_epoch{epoch}_test{avg_test_loss:.2}_train{avg_train_loss:.2f}.pt")


writer.close()
print("Training complete.")


Epoch 1/100, Average Training Loss: 0.8589, 
Epoch 1/100, Average Test Loss: 8.9229, 
Epoch 1/100, Average Recall@20: 0.2367

Epoch 2/100, Average Training Loss: 0.8617, 
Epoch 2/100, Average Test Loss: 8.7882, 
Epoch 2/100, Average Recall@20: 0.2401

Epoch 3/100, Average Training Loss: 0.8595, 
Epoch 3/100, Average Test Loss: 8.9298, 
Epoch 3/100, Average Recall@20: 0.2387

Epoch 4/100, Average Training Loss: 0.8648, 
Epoch 4/100, Average Test Loss: 8.8396, 
Epoch 4/100, Average Recall@20: 0.2385

Epoch 5/100, Average Training Loss: 0.8651, 
Epoch 5/100, Average Test Loss: 8.8612, 
Epoch 5/100, Average Recall@20: 0.2402

Epoch 6/100, Average Training Loss: 0.8733, 
Epoch 6/100, Average Test Loss: 8.8952, 
Epoch 6/100, Average Recall@20: 0.2394

Epoch 7/100, Average Training Loss: 0.8480, 
Epoch 7/100, Average Test Loss: 8.9586, 
Epoch 7/100, Average Recall@20: 0.2381

Epoch 8/100, Average Training Loss: 0.8704, 
Epoch 8/100, Average Test Loss: 8.9418, 
Epoch 8/100, Average Recall@20: 


Average Recall@20 for this batch: 0.2373


### Seeing what the model recommends to me after training

- It should have seen me somewhere in the training data and should have learned enough information from the other data to generalize over what I might like.
- I will pass my username and age into the User Tower. And then conduct a dot product between my vector and the matrix of learned item embeddings to get relevance scores.
- I will then conduct some semi-manual ranking based on removing what I have already read and other info.
- Then I will make the final 50 recommendations for me.

In [None]:

# Pretrained model:
# two_towers.load_state_dict(torch.load("models/two_towers_epoch20_test6.4_train3.18.pt", map_location=device))

FileNotFoundError: [Errno 2] No such file or directory: 'models/two_towers_epoch20_test6.4_train3.18.pt'

In [107]:
# Getting all item embeddings
entire_dataset = DataLoader(dataset, batch_size=1, shuffle=False)
all_item_embeddings = []
for batch in entire_dataset:
    item_embedding = two_towers.item_tower.get_embedding(batch)
    all_item_embeddings.append(item_embedding)
all_item_embeddings = torch.cat(all_item_embeddings, dim=0)

# Getting a single embedding for my learned user
paul_user_id = dataset.encoders['User-ID']['1234567890']
paul_age = dataset.scalers['Age'].transform([[24]])[0][0]
paul_batch = {
    'User-ID': torch.tensor([paul_user_id], dtype=torch.long, device=device),
    'User-Age': torch.tensor([paul_age], dtype=torch.float32, device=device)
}
paul_user_embedding = two_towers.user_tower.get_embedding(paul_batch) # [1 batch, 128 dimensions]



In [103]:
print(all_item_embeddings.shape)
print(paul_user_embedding.shape)

torch.Size([13197, 128])
torch.Size([1, 128])


In [None]:
similarity_scores = (paul_user_embedding @ all_item_embeddings.T).squeeze()
top_k = 1000
top_scores, top_indices = torch.topk(similarity_scores, top_k)

unique_recommendations = []
seen_titles = set()
read_isbns = personal_df['ISBN'].astype(str).to_list()

for score, idx in zip(top_scores.detach().cpu().numpy(), top_indices.detach().cpu().numpy()):
    row = dataset.data.iloc[idx]  # pandas row

    title_idx = int(row['Book-Title'])
    author_idx = int(row['Book-Author'])
    isbn_idx = int(row['ISBN'])

    title = dataset.reverse_encoders['Book-Title'][title_idx]
    author = dataset.reverse_encoders['Book-Author'][author_idx]
    isbn = dataset.reverse_encoders['ISBN'][isbn_idx]

    # skip duplicates or already read books
    if title in seen_titles or isbn in read_isbns:
        continue

    seen_titles.add(title)
    unique_recommendations.append({
        'title': title,
        'author': author,
        'score': score
    })


In [113]:
for rec in unique_recommendations[:100]:
    print(f"Title: {rec['title']}, Author: {rec['author']}, Score: {rec['score']:.4f}")

Title: Howards End, Author: E. M. Forster, Score: 0.5656
Title: Selected Poems (Dover Thrift Editions), Author: Walt Whitman, Score: 0.3865
Title: The Secret Garden, Author: Frances H. Burnett, Score: 0.3571
Title: Spellbound, Author: Helen Glisic, Score: 0.3057
Title: Murder on the Orient Express, Author: Agatha Christie, Score: 0.3050
Title: Tale of Two Cities, Author: Charles Dickens, Score: 0.2960
Title: Collected Stories, Author: Lily Brett, Score: 0.2941
Title: Masquerade, Author: Kit Williams, Score: 0.2805
Title: Light a Penny Candle, Author: Maeve Binchy, Score: 0.2759
Title: David Copperfield, Author: Charles Dickens, Score: 0.2700
Title: Lolita, Author: Vladimir Nabokov, Score: 0.2588
Title: Promises, Author: Belva Plain, Score: 0.2537
Title: And Then There Were None, Author: Agatha Christie, Score: 0.2283
Title: Anthem, Author: Ayn Rand, Score: 0.2275
Title: Homecoming, Author: Belva Plain, Score: 0.2269
Title: Outlander, Author: DIANA GABALDON, Score: 0.2261
Title: The Win