## Importing Dataset

In [None]:
%pip install kagglehub[pandas-datasets]



In [None]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from collections import Counter
from google.colab import drive
import kagglehub
from kagglehub import KaggleDatasetAdapter
drive.mount('/content/drive/')

# Load the latest version
books_df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "arashnic/book-recommendation-dataset",
  "Books.csv",
)

ratings_df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "arashnic/book-recommendation-dataset",
  "Ratings.csv",
)

users_df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "arashnic/book-recommendation-dataset",
  "Users.csv",
)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


  books_df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'book-recommendation-dataset' dataset.


  result = read_function(
  ratings_df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'book-recommendation-dataset' dataset.


  users_df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'book-recommendation-dataset' dataset.


In [None]:
df_ratings_books = pd.merge(ratings_df, books_df, on="ISBN", how='inner')
df = pd.merge(df_ratings_books, users_df, on='User-ID')
df['User-ID'] = df['User-ID'].astype(str)
df['Year-Of-Publication'] = pd.to_numeric(df['Year-Of-Publication'], errors='coerce')
df = df.dropna(subset=['Year-Of-Publication'])
df = df.dropna(subset=['Age'])
df = df[df['Book-Rating'] > 0]
df = df[df['Age'] <= 100]
df = df[df['Year-Of-Publication'] > 0]
df['Book-Rating'].describe()

Unnamed: 0,Book-Rating
count,264742.0
mean,7.738848
std,1.813809
min,1.0
25%,7.0
50%,8.0
75%,9.0
max,10.0


In [None]:
print(df['Book-Rating'].value_counts().sort_index())

Book-Rating
1       862
2      1525
3      3260
4      4991
5     27203
6     21047
7     44663
8     63605
9     44496
10    53090
Name: count, dtype: int64


In [None]:
print(df['Age'].value_counts().sort_index())

Age
0.0      203
1.0      198
2.0      191
3.0       82
4.0      143
        ... 
96.0       2
97.0      46
98.0       1
99.0       5
100.0     51
Name: count, Length: 96, dtype: int64


In [None]:
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder, StandardScaler, MultiLabelBinarizer
import ast
from sentence_transformers import SentenceTransformer

# User Tower -- User-ID, Age
# Item Tower -- ISBN, Book-Title, Book-Author, Publisher, Year-Of-Publication

class BookRecommenderDataset(Dataset):
    def __init__(self, data):
        self.data = data.sample(frac=0.05, random_state=42)
        self.data = self.preprocess(self.data)

    def preprocess(self, data):
        self.encoders = {} # {'Column name': {'value': idx, ...}, ...}
        self.reverse_encoders = {} # {'Column name': {idx: 'value', ...}, ...}
        self.scalers = {}

        label_encoders = ['User-ID', 'ISBN', 'Book-Title', 'Book-Author', 'Publisher']
        standard_scalers = ['Age', 'Year-Of-Publication']

        for col in label_encoders:
            unique_vals = data[col].astype(str).unique()
            self.encoders[col] = {val: idx + 1 for idx, val in enumerate(unique_vals)}
            self.reverse_encoders[col] = {idx + 1: val for idx, val in enumerate(unique_vals)}
            data[col] = data[col].astype(str).map(self.encoders[col]).fillna(0).astype(int)

        for col in standard_scalers:
            self.scalers[col] = StandardScaler()
            data[[col]] = self.scalers[col].fit_transform(data[[col]])

        # Manually adding my own User-ID so I don't need to adjust nn.Embedding later
        max_user_idx = max(self.encoders['User-ID'].values())
        self.encoders['User-ID']["1234567890"] = max_user_idx + 1
        self.reverse_encoders['User-ID'][max_user_idx + 1] = "1234567890"
        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        return {
            'User-ID': torch.tensor(item['User-ID'], dtype=torch.long),
            'User-Age': torch.tensor(item['Age'], dtype=torch.float32),
            'Book-ISBN': torch.tensor(item['ISBN'], dtype=torch.long),
            'Book-Title': torch.tensor(item['Book-Title'], dtype=torch.long),
            'Book-Author': torch.tensor(item['Book-Author'], dtype=torch.long),
            'Book-Publisher': torch.tensor(item['Publisher'], dtype=torch.long),
            'Book-Year-Of-Publication': torch.tensor(item['Year-Of-Publication'], dtype=torch.float32),
            'Rating': torch.tensor(item['Book-Rating'], dtype=torch.float32)
        }

dataset = BookRecommenderDataset(df)


In [None]:
train_size = int(0.5 * len(dataset))
test_size = len(dataset) - train_size
train_data, test_data = random_split(dataset, [train_size, test_size])

# Dataloaders
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

In [None]:
next(iter(train_loader))

{'User-ID': tensor([5850, 1709,  321,   14, 6571, 4281,  700, 1000, 5001, 1509, 1226, 1081,
         5866,  795, 1707, 1433,  846, 2575, 3539, 1913, 1160, 2416, 5471, 1678,
         1284, 2506,   38,  798,  442, 1520, 2290, 1754, 1434,   38,  177, 5631,
          504, 3926,    2,   42,  598, 4829, 1229, 5179, 2263,   97, 2571, 5156,
          738, 3586,  752,  625, 4284,   14, 1289, 2505, 1635, 1678,  895, 4039,
         4492, 2173, 2566, 5424]),
 'User-Age': tensor([-1.5119, -1.1064,  0.8398, -0.2955,  0.3533, -0.1333,  0.3533,  1.1642,
         -0.1333, -0.5388,  0.5965,  0.7587, -0.6199, -0.2955,  2.1373,  1.0020,
          0.3533,  1.3264, -0.7010, -0.7821,  0.3533,  2.9482, -0.6199,  0.4343,
          0.0289,  1.3264,  1.2453, -0.9442, -2.0796, -2.2417,  0.1100, -1.1064,
         -1.5930,  1.2453, -0.3766, -1.0253, -1.5119, -1.9985,  0.8398, -0.4577,
          0.8398,  1.7318,  0.0289,  0.0289, -0.7821, -0.0522, -0.3766, -0.2955,
         -0.1333,  0.4343, -0.6199,  0.1911,  2.380

## Two Tower Model for Recommendations

In [None]:
class UserTower(nn.Module):

    # User Tower -- User-ID, Age

    def __init__(self, num_users, embedding_dim=32):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim, padding_idx=0)

        self.user_mlp = nn.Sequential(
            nn.Linear(embedding_dim + 1, 128), # 1 embedding + 1 numerical
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, embedding_dim)
        )

    def forward(self, user_id, age):
        """
        user_id: (batch,) int64
        review_mean: (batch,) float32
        """
        user_emb = self.user_embedding(user_id)
        age = age.unsqueeze(1)
        x = torch.cat([user_emb, age], dim=1)
        return self.user_mlp(x)

    def get_embedding(self, data):
        return self.forward(data['User-ID'], data['User-Age'])


In [None]:
class ItemTower(nn.Module):
    def __init__(self, num_isbn, num_titles, num_authors, num_publishers, embedding_dim=32):
        super().__init__()

        # Item Tower -- ISBN, Book-Title, Book-Author, Publisher, Year-Of-Publication

        self.book_isbn_embedding = nn.Embedding(num_isbn, embedding_dim, padding_idx=0)
        self.book_title_embedding = nn.Embedding(num_titles, embedding_dim, padding_idx=0)
        self.book_author_embedding = nn.Embedding(num_authors, embedding_dim, padding_idx=0)
        self.book_publisher_embedding = nn.Embedding(num_publishers, embedding_dim, padding_idx=0)

        self.item_mlp = nn.Sequential(
            nn.Linear(embedding_dim * 4 + 1, 128),  # 4 embeddings + 1 numerical
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, embedding_dim)
        )

    def forward(self, isbn, book_title, book_author, book_publisher, book_year_of_publication):
        book_isbn_emb = self.book_isbn_embedding(isbn)
        book_title_emb = self.book_title_embedding(book_title)
        book_author_emb = self.book_author_embedding(book_author)
        book_publisher_emb = self.book_publisher_embedding(book_publisher)
        book_year = book_year_of_publication.unsqueeze(1)

        x = torch.cat([
            book_isbn_emb,
            book_title_emb,
            book_author_emb,
            book_publisher_emb,
            book_year
        ], dim=1)

        return self.item_mlp(x)

    def get_embedding(self, data):
        return self.forward(
            data['Book-ISBN'],
            data['Book-Title'],
            data['Book-Author'],
            data['Book-Publisher'],
            data['Book-Year-Of-Publication'],
        )


In [None]:
class TwoTowers(nn.Module):
    def __init__(self, user_tower: UserTower, item_tower: ItemTower):
        super().__init__()
        self.user_tower = user_tower
        self.item_tower = item_tower

    def forward(self, data):
        user_vector = self.user_tower.get_embedding(data)
        item_vector = self.item_tower.get_embedding(data)
        return (user_vector * item_vector).sum(dim=1)

In [None]:
example_data = next(iter(train_loader))

NUM_USERS = len(dataset.encoders['User-ID']) + 1
NUM_ISBN = len(dataset.encoders['ISBN']) + 1
NUM_TITLES = len(dataset.encoders['Book-Title']) + 1
NUM_AUTHORS = len(dataset.encoders['Book-Author']) + 1
NUM_PUBLISHERS = len(dataset.encoders['Publisher']) + 1

user_tower = UserTower(num_users=NUM_USERS)

item_tower = ItemTower(
    num_isbn=NUM_ISBN,
    num_titles=NUM_TITLES,
    num_authors=NUM_AUTHORS,
    num_publishers=NUM_PUBLISHERS,
)

two_towers = TwoTowers(
    user_tower,
    item_tower
)

## Training

#### Helper Functions

In [None]:
# --- Training Helpers ---
def train_one_epoch(model, loader, optimizer, loss_fn):
    model.train()
    running_loss = 0.0

    for batch_idx, data in enumerate(loader):
        optimizer.zero_grad()
        preds = model(data)
        targets = data['Rating']
        loss = loss_fn(preds, targets)

        # Sanity checks
        assert not torch.isnan(preds).any(), "NaN in predictions"
        assert not torch.isnan(targets).any(), "NaN in targets"
        assert not torch.isinf(preds).any(), "Inf in predictions"
        assert not torch.isinf(targets).any(), "Inf in targets"

        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=CLIP_GRAD_NORM)
        optimizer.step()

        yield batch_idx, loss.item()

@torch.no_grad()
def calculate_test_loss(model, test_loader, loss_fn):
    model.eval()
    total_loss = 0.0
    num_batches = 0

    for data in test_loader:
        preds = model(data)
        targets = data['Rating']
        loss = loss_fn(preds, targets)
        total_loss += loss.item()
        num_batches += 1

    return total_loss / num_batches if num_batches > 0 else float('inf')

def evaluate_and_checkpoint(model, epoch, global_step, best_loss, counter, test_loader, loss_fn):
    test_loss = calculate_test_loss(model, test_loader, loss_fn)

    if test_loss < best_loss:
        best_loss = test_loss
        counter = 0
        timestamp = datetime.datetime.now().strftime('%Y%m%d')
        save_path = f"{MODEL_SAVE_PATH}/two_towers_best_model_{timestamp}.pt"
        torch.save(model.state_dict(), save_path)
        print(f"[Epoch {epoch}] ✅ Improved! Test Loss: {test_loss:.4f}. Model saved.")
    else:
        counter += 1
        print(f"[Epoch {epoch}] No improvement. Test Loss: {test_loss:.4f} ({counter}/{EARLY_STOPPING_PATIENCE})")

    return best_loss, counter

#### Main Training Loop

In [None]:
%rm -rf ./logs/

In [None]:
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
import datetime

EPOCHS = 200
LOG_INTERVAL = 100
CLIP_GRAD_NORM = 1.0
LEARNING_RATE = 0.001
EARLY_STOPPING_PATIENCE = 15
MODEL_SAVE_PATH = "/content/drive/MyDrive/models"

loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(two_towers.parameters(), lr=LEARNING_RATE)
writer = SummaryWriter('./logs/')

best_test_loss = float('inf')
early_stopping_counter = 0
global_step = 0

all_items_loader = DataLoader(dataset, batch_size=len(dataset), shuffle=False)
items = next(iter(all_items_loader))
item_vector = two_towers.item_tower.get_embedding(items)

In [None]:

for epoch in range(1, EPOCHS + 1):

    # -- Main Loop --
    running_loss = 0.0
    two_towers.train()

    for batch_idx, batch in enumerate(train_loader):
        optimizer.zero_grad()

        preds = two_towers(batch)
        targets = batch['Rating']
        loss = loss_fn(preds, targets)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    avg_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch}, Average Training Loss (based on rating): {avg_loss:.4f}")


    # -----------------------
    # -- Recall@K Metric --
    #
    # Work in progress... Implement the recall@k metric to see out of the user's total ratings, how many of these appeared in the top-k.
    #

    # user_row = next(iter(train_loader))
    # user_vector = two_towers.user_tower.get_embedding(user_row)

    # scores = torch.matmul(item_vector, user_vector.T)
    # scores = scores.T
    #
    # top_k = 20
    # for i, user_id_encoded in enumerate(user_row['User-ID']):
    #     top_scores, top_indices = torch.topk(scores[i], top_k)
    #     user_id_decoded = dataset.reverse_encoders['User-ID'][user_id_encoded.item()]
    #     ground_truth_books = df[df['User-ID'] == user_id_decoded]['ISBN']

    #     print(f"Top k books predicted for user {user_id_decoded}")
    #     print(top_indices)
    #     print(f"Book ratings for user {user_id_decoded}")
    #     print(ground_truth_books)
    #
    # -----------------------

    print(f"Epoch {epoch}/{EPOCHS} completed.")

writer.close()
print("Training complete.")


Epoch 1, Average Training Loss (based on rating): 11.6375
Epoch 1/200 completed.
Epoch 2, Average Training Loss (based on rating): 2.9920
Epoch 2/200 completed.
Epoch 3, Average Training Loss (based on rating): 2.0109
Epoch 3/200 completed.
Epoch 4, Average Training Loss (based on rating): 1.4614
Epoch 4/200 completed.
Epoch 5, Average Training Loss (based on rating): 1.0928
Epoch 5/200 completed.
Epoch 6, Average Training Loss (based on rating): 0.8664
Epoch 6/200 completed.
Epoch 7, Average Training Loss (based on rating): 0.6933
Epoch 7/200 completed.
Epoch 8, Average Training Loss (based on rating): 0.5886
Epoch 8/200 completed.
Epoch 9, Average Training Loss (based on rating): 0.5255
Epoch 9/200 completed.
Epoch 10, Average Training Loss (based on rating): 0.4998
Epoch 10/200 completed.
Epoch 11, Average Training Loss (based on rating): 0.4643
Epoch 11/200 completed.
Epoch 12, Average Training Loss (based on rating): 0.4495
Epoch 12/200 completed.
Epoch 13, Average Training Loss (

### Fine-tuning on my own data

- The item embeddings were learned during training. I will freeze the item tower.
- I will use my own dataset of books I like to train the user embeddings.

In [None]:
class FineTuningDataset(Dataset):
    def __init__(self, data, encoders, scalers):
        self.data = data.copy()
        self.encoders = encoders
        self.scalers = scalers
        self.preprocess()

    def preprocess(self):
        categorical_cols = ['User-ID', 'ISBN', 'Book-Title', 'Book-Author', 'Publisher']
        for col in categorical_cols:
            self.data[col] = self.data[col].astype(str).map(self.encoders[col]).fillna(0).astype(int)

        numeric_cols = ['Age', 'Year-Of-Publication']
        for col in numeric_cols:
            scaled_vals = self.scalers[col].transform(self.data[[col]])
            self.data[col] = scaled_vals.flatten()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        return {
            'User-ID': torch.tensor(item['User-ID'], dtype=torch.long),
            'User-Age': torch.tensor(item['Age'], dtype=torch.float32),
            'Book-ISBN': torch.tensor(item['ISBN'], dtype=torch.long),
            'Book-Title': torch.tensor(item['Book-Title'], dtype=torch.long),
            'Book-Author': torch.tensor(item['Book-Author'], dtype=torch.long),
            'Book-Publisher': torch.tensor(item['Publisher'], dtype=torch.long),
            'Book-Year-Of-Publication': torch.tensor(item['Year-Of-Publication'], dtype=torch.float32),
            'Rating': torch.tensor(item['Book-Rating'], dtype=torch.float32)
        }

fine_tuning_df = pd.read_csv("./drive/MyDrive/fine-tuning-book-set.txt")
fine_tuning_dataset = FineTuningDataset(fine_tuning_df, dataset.encoders, dataset.scalers)
fine_tuning_loader = DataLoader(fine_tuning_dataset, batch_size=len(fine_tuning_dataset), shuffle=False)


In [None]:
next(iter(fine_tuning_loader))

{'User-ID': tensor([6656, 6656, 6656, 6656, 6656, 6656, 6656, 6656, 6656, 6656, 6656, 6656,
         6656, 6656, 6656, 6656]),
 'User-Age': tensor([-1.1064, -1.1064, -1.1064, -1.1064, -1.1064, -1.1064, -1.1064, -1.1064,
         -1.1064, -1.1064, -1.1064, -1.1064, -1.1064, -1.1064, -1.1064, -1.1064]),
 'Book-ISBN': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'Book-Title': tensor([3120,    0,    0, 3187,    0, 6151, 5646,    0,    0,    0,    0,    0,
            0,    0,    0,    0]),
 'Book-Author': tensor([ 793,    0, 4880, 1884, 1884, 1884, 2624,  787,  787,    0,  118,    0,
         5533,    4,    4, 1884]),
 'Book-Publisher': tensor([  12,    0,   62,   95,   95,    1,   25,  206,  299, 1816, 1054,   62,
            0,  149,  149,   95]),
 'Book-Year-Of-Publication': tensor([-20.8847,   1.2564,  -7.4075,  -5.6197,  -5.7572,  -8.0951,  -5.0696,
         -54.5776, -54.0275,   0.8439,  -0.2563,   0.4313,  -8.2326,  -3.6944,
          -7.8200,  -5.7572]),
 'Rating': te

In [None]:
for param in two_towers.item_tower.parameters():
    param.requires_grad = False

FINE_TUNING_EPOCHS = 10
for epoch in range(1, FINE_TUNING_EPOCHS + 1):

    # -- Main Loop --
    running_loss = 0.0
    two_towers.train()

    for batch_idx, batch in enumerate(fine_tuning_loader):
        optimizer.zero_grad()

        preds = two_towers(batch)
        targets = batch['Rating']
        loss = loss_fn(preds, targets)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    avg_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch}, Average Training Loss (based on rating): {avg_loss:.4f}")

avg_loss = running_loss / len(train_loader)
print(f"Fine-Tuned Set {epoch}, Average Training Loss (based on rating): {avg_loss:.4f}")

Epoch 1, Average Training Loss (based on rating): 0.0564
Epoch 2, Average Training Loss (based on rating): 0.0534
Epoch 3, Average Training Loss (based on rating): 0.0496
Epoch 4, Average Training Loss (based on rating): 0.0457
Epoch 5, Average Training Loss (based on rating): 0.0426
Epoch 6, Average Training Loss (based on rating): 0.0400
Epoch 7, Average Training Loss (based on rating): 0.0375
Epoch 8, Average Training Loss (based on rating): 0.0350
Epoch 9, Average Training Loss (based on rating): 0.0327
Epoch 10, Average Training Loss (based on rating): 0.0304
Fine-Tuned Set 10, Average Training Loss (based on rating): 0.0304


In [None]:
user_row = next(iter(fine_tuning_loader))
user_vector = two_towers.user_tower.get_embedding(user_row)

scores = torch.matmul(item_vector, user_vector.T)
scores = scores.T

topk_scores, topk_indices = torch.topk(scores, k=50, dim=1)
print(topk_scores[0])
print(topk_indices[0])

tensor([2.0188, 1.9747, 1.9732, 1.9061, 1.8053, 1.7696, 1.7078, 1.6479, 1.6479,
        1.6479, 1.6479, 1.6479, 1.6479, 1.6287, 1.5782, 1.5779, 1.5756, 1.5756,
        1.5756, 1.5473, 1.5473, 1.5418, 1.5412, 1.5389, 1.5158, 1.5127, 1.4987,
        1.4983, 1.4848, 1.4848, 1.4820, 1.4672, 1.4652, 1.4563, 1.4563, 1.4563,
        1.4563, 1.4558, 1.4495, 1.4467, 1.4446, 1.4422, 1.4418, 1.4393, 1.4391,
        1.4295, 1.4260, 1.4235, 1.4161, 1.4109], grad_fn=<SelectBackward0>)
tensor([10515,  4893,  2018, 11214,  8978, 11918,  6423,  8944,  2326,  1300,
         3220,  7048,  2893,  2961,  1890,  9754,  8463,  6034,   580,   620,
         8080,  8191,  7015,  4680,   859,  9614,   564,  4231,  8339,  4528,
         1347,  6032,  1943,   997,  1071,  7111, 13145,  6824,  8677,  5447,
         1529, 13033,  5250,  6303,  3933,  6708,  8716,  4290,  8739,  1653])


In [None]:
recommended_books = df.iloc[topk_indices[0].tolist()]
print(recommended_books[['Book-Title']])

                                              Book-Title
37229                                         The Palace
18732  Cowboy With A Secret (Harlequin American Roman...
10439  The New Rabbi : A Congregation Searches for It...
52068                                 Cyrano De Bergerac
32322          The Far Shore of Time (Eschaton Sequence)
54838                          Orlando. Eine Biographie.
23081  All Change: Project Manager's Secret Handbook ...
32180  The Exploits of the Incomparable Mulla Nasrudi...
11289                             The Kitchen God's Wife
8010   Michael KÃ¶hlmeiers Sagen des klassischen Alte...
13448                      By the Sword (Kerowyn's Tale)
25581               Flatland (Shambhala Pocket Classics)
12490             Child of Thunder (Daw Book Collectors)
12628  Bold Science: Seven Scientists Who Are Changin...
10099             Tollivers Reisen. Stadtgeschichten IV.
34729                                       White Plague
30295    Divine Secrets of the 