## Importing Dataset

In [1]:
%pip install kagglehub[pandas-datasets]



In [49]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from collections import Counter
from google.colab import drive
import kagglehub
from kagglehub import KaggleDatasetAdapter
drive.mount('/content/drive/')

# Load the latest version
books_df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "arashnic/book-recommendation-dataset",
  "Books.csv",
)

ratings_df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "arashnic/book-recommendation-dataset",
  "Ratings.csv",
)

users_df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "arashnic/book-recommendation-dataset",
  "Users.csv",
)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


  books_df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'book-recommendation-dataset' dataset.


  result = read_function(
  ratings_df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'book-recommendation-dataset' dataset.


  users_df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'book-recommendation-dataset' dataset.


In [3]:
df_ratings_books = pd.merge(ratings_df, books_df, on="ISBN", how='inner')
df = pd.merge(df_ratings_books, users_df, on='User-ID')
df['User-ID'] = df['User-ID'].astype(str)
df['Year-Of-Publication'] = pd.to_numeric(df['Year-Of-Publication'], errors='coerce')
df = df.dropna(subset=['Year-Of-Publication'])
df = df.dropna(subset=['Age'])
df = df[df['Book-Rating'] > 0]
df = df[df['Age'] <= 100]
df = df[df['Year-Of-Publication'] > 0]
df['Book-Rating'].describe()

Unnamed: 0,Book-Rating
count,264742.0
mean,7.738848
std,1.813809
min,1.0
25%,7.0
50%,8.0
75%,9.0
max,10.0


In [4]:
print(df['Book-Rating'].value_counts().sort_index())

Book-Rating
1       862
2      1525
3      3260
4      4991
5     27203
6     21047
7     44663
8     63605
9     44496
10    53090
Name: count, dtype: int64


In [5]:
print(df['Age'].value_counts().sort_index())

Age
0.0      203
1.0      198
2.0      191
3.0       82
4.0      143
        ... 
96.0       2
97.0      46
98.0       1
99.0       5
100.0     51
Name: count, Length: 96, dtype: int64


In [6]:
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder, StandardScaler, MultiLabelBinarizer
import ast
from sentence_transformers import SentenceTransformer

# User Tower -- User-ID, Age
# Item Tower -- ISBN, Book-Title, Book-Author, Publisher, Year-Of-Publication

class BookRecommenderDataset(Dataset):
    """
    A PyTorch Dataset class for book recommendation tasks.

    Parameters
    ----------
    dataframe : pd.DataFrame
        The input data containing user, item, and possibly interaction features.

    Attributes
    ----------
    data : pd.DataFrame
        The processed version of the input dataframe.
    encoders : dict
        A dictionary mapping column names to fitted label encoders.
    reverse_encoders : dict
        A dictionary mapping column names to reverse label encoders (index to label).
    scalers : dict
        A dictionary mapping column names to fitted scalers for numerical features.
    user_item_interaction : dict
        A dictionary mapping encoded User-IDs to a list of positive example encoded ISBNs
    negative_examples : int
        An integer hyperparameter for the number of negative examples to use for contrastive learning
    """

    def __init__(self, data, negative_examples=1):
        self.encoders = {} # {'Column name': {'value': idx, ...}, ...}
        self.reverse_encoders = {} # {'Column name': {idx: 'value', ...}, ...}
        self.scalers = {}
        self.user_item_interactions = {} # {encoded userid: [encoded ISBN]}
        self.negative_examples = negative_examples
        self.data = data.sample(frac=0.05, random_state=42).reset_index(drop=True)
        self.preprocess(self.data)

    def preprocess(self, data):
        self.encode_information()
        self.generate_positives()

    def generate_positives(self):
        self.user_item_interaction = (
            self.data
            .groupby('User-ID')['ISBN']
            .apply(list)
            .to_dict()
        )

    def encode_information(self):
        """
        Maps {key: index} pairs and StandardScaler for real valued numbers
        """
        label_encoders = ['User-ID', 'ISBN', 'Book-Title', 'Book-Author', 'Publisher']
        standard_scalers = ['Age', 'Year-Of-Publication']

        for col in label_encoders:
            unique_vals = self.data[col].astype(str).unique()
            self.encoders[col] = {val: idx + 1 for idx, val in enumerate(unique_vals)}
            self.reverse_encoders[col] = {idx + 1: val for idx, val in enumerate(unique_vals)}
            self.data[col] = self.data[col].astype(str).map(self.encoders[col]).fillna(0).astype(int)

        for col in standard_scalers:
            self.scalers[col] = StandardScaler()
            self.data[[col]] = self.scalers[col].fit_transform(self.data[[col]])

        # Manually adding my own User-ID so I don't need to adjust nn.Embedding later
        max_user_idx = max(self.encoders['User-ID'].values())
        self.encoders['User-ID']["1234567890"] = max_user_idx + 1
        self.reverse_encoders['User-ID'][max_user_idx + 1] = "1234567890"

    def __len__(self):
        return len(self.data)


    def __getitem__(self, idx):
        """
        Returns
            - user-tower data (User-ID and User-Age)
            - positive item data (pos_item)
            - negative item data (neg_items)
            - Target rating
        """
        positive_item = self.data.iloc[idx]
        positive_user_id = positive_item['User-ID']
        positive_isbns = self.user_item_interaction[positive_user_id]

        negative_examples = []
        while len(negative_examples) < self.negative_examples:
            candidate = self.data.sample(n=1).iloc[0]
            candidate_isbn = candidate['ISBN']
            if candidate_isbn not in positive_isbns:
                negative_examples.append(candidate)

        output = {
            'User-ID': torch.tensor(positive_item['User-ID'], dtype=torch.long),
            'User-Age': torch.tensor(positive_item['Age'], dtype=torch.float32),
            'Rating': torch.tensor(positive_item['Book-Rating'], dtype=torch.float32),

            'pos_item': {
                'Book-ISBN': torch.tensor(positive_item['ISBN'], dtype=torch.long),
                'Book-Title': torch.tensor(positive_item['Book-Title'], dtype=torch.long),
                'Book-Author': torch.tensor(positive_item['Book-Author'], dtype=torch.long),
                'Book-Publisher': torch.tensor(positive_item['Publisher'], dtype=torch.long),
                'Book-Year-Of-Publication': torch.tensor(positive_item['Year-Of-Publication'], dtype=torch.float32),
            },

            'neg_items': [
                {
                    'Book-ISBN': torch.tensor(neg['ISBN'], dtype=torch.long),
                    'Book-Title': torch.tensor(neg['Book-Title'], dtype=torch.long),
                    'Book-Author': torch.tensor(neg['Book-Author'], dtype=torch.long),
                    'Book-Publisher': torch.tensor(neg['Publisher'], dtype=torch.long),
                    'Book-Year-Of-Publication': torch.tensor(neg['Year-Of-Publication'], dtype=torch.float32),
                }
                for neg in negative_examples
            ]
        }
        return output


dataset = BookRecommenderDataset(df)


In [7]:
dataset[0]

{'User-ID': tensor(1),
 'User-Age': tensor(-0.2955),
 'Rating': tensor(7.),
 'pos_item': {'Book-ISBN': tensor(1),
  'Book-Title': tensor(1),
  'Book-Author': tensor(1),
  'Book-Publisher': tensor(1),
  'Book-Year-Of-Publication': tensor(0.4313)},
 'neg_items': [{'Book-ISBN': tensor(432),
   'Book-Title': tensor(429),
   'Book-Author': tensor(332),
   'Book-Publisher': tensor(6),
   'Book-Year-Of-Publication': tensor(0.8439)}]}

In [8]:
train_size = int(0.5 * len(dataset))
test_size = len(dataset) - train_size
train_data, test_data = random_split(dataset, [train_size, test_size])

# Dataloaders
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

In [9]:
next(iter(train_loader))

{'User-ID': tensor([2087, 2239, 4866, 3085, 1548, 2328, 3379, 1420, 2097,  631, 1762,   44,
          607,  323, 2940, 1000, 2238, 1387, 3731,   38,  607,   38, 1564, 3064,
         5677, 3786,  426, 2307,  156,  124,  121, 1500, 4743,   38, 4404, 2018,
         6449,  802,  536, 4578, 1013, 1463,  201,  207, 3603, 2705,  545, 1046,
          542,  545, 5467, 1953, 3396, 5200,  545, 6586,  900,  575, 2861,   38,
         1342,  297, 5510,   38]),
 'User-Age': tensor([ 1.3264,  0.7587, -0.8631,  0.8398,  0.1100, -1.1064, -0.9442, -0.3766,
          0.9209, -1.5930, -0.7010,  0.5965, -0.2144, -0.6199, -0.3766,  1.1642,
          0.8398, -0.4577, -0.4577,  1.2453, -0.2144,  1.2453, -0.6199, -0.5388,
          0.7587, -0.1333,  0.4343, -1.5930,  1.2453, -1.1875, -0.2144, -0.2955,
         -0.0522,  1.2453, -0.6199,  1.6507, -1.0253, -1.0253,  0.8398, -1.1064,
          0.4343,  1.6507, -1.1875, -1.3497, -0.0522, -0.4577,  1.2453, -0.0522,
         -0.7010,  1.2453,  0.4343, -1.5119,  1.164

## Two Tower Model for Recommendations

In [10]:
class UserTower(nn.Module):

    # User Tower -- User-ID, Age

    def __init__(self, num_users, embedding_dim=32):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim, padding_idx=0)

        self.user_mlp = nn.Sequential(
            nn.Linear(embedding_dim + 1, 128), # 1 embedding + 1 numerical
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, embedding_dim)
        )

    def forward(self, user_id, age):
        """
        user_id: (batch,) int64
        review_mean: (batch,) float32
        """
        user_emb = self.user_embedding(user_id)
        age = age.unsqueeze(1)
        x = torch.cat([user_emb, age], dim=1)
        return self.user_mlp(x)

    def get_embedding(self, data):
        return self.forward(data['User-ID'], data['User-Age'])


In [11]:
class ItemTower(nn.Module):
    def __init__(self, num_isbn, num_titles, num_authors, num_publishers, embedding_dim=32):
        super().__init__()

        # Item Tower -- ISBN, Book-Title, Book-Author, Publisher, Year-Of-Publication

        self.book_isbn_embedding = nn.Embedding(num_isbn, embedding_dim, padding_idx=0)
        self.book_title_embedding = nn.Embedding(num_titles, embedding_dim, padding_idx=0)
        self.book_author_embedding = nn.Embedding(num_authors, embedding_dim, padding_idx=0)
        self.book_publisher_embedding = nn.Embedding(num_publishers, embedding_dim, padding_idx=0)

        self.item_mlp = nn.Sequential(
            nn.Linear(embedding_dim * 4 + 1, 128),  # 4 embeddings + 1 numerical
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, embedding_dim)
        )

    def forward(self, isbn, book_title, book_author, book_publisher, book_year_of_publication):
        book_isbn_emb = self.book_isbn_embedding(isbn)
        book_title_emb = self.book_title_embedding(book_title)
        book_author_emb = self.book_author_embedding(book_author)
        book_publisher_emb = self.book_publisher_embedding(book_publisher)
        book_year = book_year_of_publication.unsqueeze(1)

        x = torch.cat([
            book_isbn_emb,
            book_title_emb,
            book_author_emb,
            book_publisher_emb,
            book_year
        ], dim=1)

        return self.item_mlp(x)

    def get_embedding(self, data):
        return self.forward(
            data['Book-ISBN'],
            data['Book-Title'],
            data['Book-Author'],
            data['Book-Publisher'],
            data['Book-Year-Of-Publication'],
        )


In [19]:
class TwoTowers(nn.Module):
    def __init__(self, user_tower: UserTower, item_tower: ItemTower):
        super().__init__()
        self.user_tower = user_tower
        self.item_tower = item_tower

    def forward(self, data):
        user_vector = self.user_tower.get_embedding(data)
        item_vector = self.item_tower.get_embedding(data['pos_item'])
        return (user_vector * item_vector).sum(dim=1)

In [20]:
example_data = next(iter(train_loader))

NUM_USERS = len(dataset.encoders['User-ID']) + 1
NUM_ISBN = len(dataset.encoders['ISBN']) + 1
NUM_TITLES = len(dataset.encoders['Book-Title']) + 1
NUM_AUTHORS = len(dataset.encoders['Book-Author']) + 1
NUM_PUBLISHERS = len(dataset.encoders['Publisher']) + 1

user_tower = UserTower(num_users=NUM_USERS)

item_tower = ItemTower(
    num_isbn=NUM_ISBN,
    num_titles=NUM_TITLES,
    num_authors=NUM_AUTHORS,
    num_publishers=NUM_PUBLISHERS,
)

two_towers = TwoTowers(
    user_tower,
    item_tower
)

## Training

#### Helper Functions

In [21]:
# --- Training Helpers ---
def train_one_epoch(model, loader, optimizer, loss_fn):
    model.train()
    running_loss = 0.0

    for batch_idx, data in enumerate(loader):
        optimizer.zero_grad()
        preds = model(data)
        targets = data['Rating']
        loss = loss_fn(preds, targets)

        # Sanity checks
        assert not torch.isnan(preds).any(), "NaN in predictions"
        assert not torch.isnan(targets).any(), "NaN in targets"
        assert not torch.isinf(preds).any(), "Inf in predictions"
        assert not torch.isinf(targets).any(), "Inf in targets"

        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=CLIP_GRAD_NORM)
        optimizer.step()

        yield batch_idx, loss.item()

@torch.no_grad()
def calculate_test_loss(model, test_loader, loss_fn):
    model.eval()
    total_loss = 0.0
    num_batches = 0

    for data in test_loader:
        preds = model(data)
        targets = data['Rating']
        loss = loss_fn(preds, targets)
        total_loss += loss.item()
        num_batches += 1

    return total_loss / num_batches if num_batches > 0 else float('inf')

def evaluate_and_checkpoint(model, epoch, global_step, best_loss, counter, test_loader, loss_fn):
    test_loss = calculate_test_loss(model, test_loader, loss_fn)

    if test_loss < best_loss:
        best_loss = test_loss
        counter = 0
        timestamp = datetime.datetime.now().strftime('%Y%m%d')
        save_path = f"{MODEL_SAVE_PATH}/two_towers_best_model_{timestamp}.pt"
        torch.save(model.state_dict(), save_path)
        print(f"[Epoch {epoch}] ✅ Improved! Test Loss: {test_loss:.4f}. Model saved.")
    else:
        counter += 1
        print(f"[Epoch {epoch}] No improvement. Test Loss: {test_loss:.4f} ({counter}/{EARLY_STOPPING_PATIENCE})")

    return best_loss, counter

#### Main Training Loop

In [22]:
%rm -rf ./logs/

In [23]:
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
import datetime

EPOCHS = 200
LOG_INTERVAL = 100
CLIP_GRAD_NORM = 1.0
LEARNING_RATE = 0.001
EARLY_STOPPING_PATIENCE = 15
MODEL_SAVE_PATH = "/content/drive/MyDrive/models"

loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(two_towers.parameters(), lr=LEARNING_RATE)
writer = SummaryWriter('./logs/')

best_test_loss = float('inf')
early_stopping_counter = 0
global_step = 0

In [51]:

for epoch in range(1, EPOCHS + 1):

    # -- Main Loop --
    running_loss = 0.0
    two_towers.train()

    for batch_idx, batch in enumerate(train_loader):
        optimizer.zero_grad()

        # Calculate similarity between user and positive examples
        pos_sim = two_towers(batch)  # shape: [batch_size]

        # Calculate similarity between user and negative examples
        user_embedding = two_towers.user_tower.get_embedding(batch)          # [batch_size, d]
        neg_embedding = two_towers.item_tower.get_embedding(batch['neg_items'][0])  # [batch_size, d]
        neg_sim = torch.sum(user_embedding * neg_embedding, dim=1)

        #
        # Logits (first position will be positive that we want to maximize, and everything else is negative)
        # For example, if we have 1 positive example and 1 negative example our logits are:
        #   [[pos, neg], ..batch_size..]
        #
        # Now the label we want cross entropy to maximize is in the 0th position (positive)
        #   labels = [0, ..batch_size..]
        #
        logits = torch.stack([pos_sim, neg_sim], dim=1)
        labels = torch.zeros(logits.size(0), dtype=torch.long)
        loss = F.cross_entropy(logits, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    avg_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch}/{EPOCHS}, Average Training Loss (based on rating): {avg_loss:.4f}, ")

writer.close()
print("Training complete.")


Epoch 1/200, Average Training Loss (based on rating): 0.7072, 
Epoch 2/200, Average Training Loss (based on rating): 0.6546, 
Epoch 3/200, Average Training Loss (based on rating): 0.5976, 
Epoch 4/200, Average Training Loss (based on rating): 0.5282, 


KeyboardInterrupt: 

### Fine-tuning on my own data

- The item embeddings were learned during training. I will freeze the item tower.
- I will use my own dataset of books I like to train the user embeddings.

In [None]:

    # -----------------------
    # -- Recall@K Metric --
    #
    # Work in progress... Implement the recall@k metric to see out of the user's total ratings, how many of these appeared in the top-k.
    #

    # user_row = next(iter(train_loader))
    # user_vector = two_towers.user_tower.get_embedding(user_row)

    # scores = torch.matmul(item_vector, user_vector.T)
    # scores = scores.T
    #
    # top_k = 20
    # for i, user_id_encoded in enumerate(user_row['User-ID']):
    #     top_scores, top_indices = torch.topk(scores[i], top_k)
    #     user_id_decoded = dataset.reverse_encoders['User-ID'][user_id_encoded.item()]
    #     ground_truth_books = df[df['User-ID'] == user_id_decoded]['ISBN']

    #     print(f"Top k books predicted for user {user_id_decoded}")
    #     print(top_indices)
    #     print(f"Book ratings for user {user_id_decoded}")
    #     print(ground_truth_books)
    #
    # -----------------------

In [18]:
all_items_loader = DataLoader(dataset, batch_size=len(dataset), shuffle=False)
items = next(iter(all_items_loader))
item_vector = two_towers.item_tower.get_embedding(items)

KeyError: 'Book-ISBN'

In [None]:
class FineTuningDataset(Dataset):
    def __init__(self, data, encoders, scalers):
        self.data = data.copy()
        self.encoders = encoders
        self.scalers = scalers
        self.preprocess()

    def preprocess(self):
        categorical_cols = ['User-ID', 'ISBN', 'Book-Title', 'Book-Author', 'Publisher']
        for col in categorical_cols:
            self.data[col] = self.data[col].astype(str).map(self.encoders[col]).fillna(0).astype(int)

        numeric_cols = ['Age', 'Year-Of-Publication']
        for col in numeric_cols:
            scaled_vals = self.scalers[col].transform(self.data[[col]])
            self.data[col] = scaled_vals.flatten()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        return {
            'User-ID': torch.tensor(item['User-ID'], dtype=torch.long),
            'User-Age': torch.tensor(item['Age'], dtype=torch.float32),
            'Book-ISBN': torch.tensor(item['ISBN'], dtype=torch.long),
            'Book-Title': torch.tensor(item['Book-Title'], dtype=torch.long),
            'Book-Author': torch.tensor(item['Book-Author'], dtype=torch.long),
            'Book-Publisher': torch.tensor(item['Publisher'], dtype=torch.long),
            'Book-Year-Of-Publication': torch.tensor(item['Year-Of-Publication'], dtype=torch.float32),
            'Rating': torch.tensor(item['Book-Rating'], dtype=torch.float32)
        }

fine_tuning_df = pd.read_csv("./drive/MyDrive/fine-tuning-book-set.txt")
fine_tuning_dataset = FineTuningDataset(fine_tuning_df, dataset.encoders, dataset.scalers)
fine_tuning_loader = DataLoader(fine_tuning_dataset, batch_size=len(fine_tuning_dataset), shuffle=False)


In [None]:
next(iter(fine_tuning_loader))

In [None]:
for param in two_towers.item_tower.parameters():
    param.requires_grad = False

FINE_TUNING_EPOCHS = 10
for epoch in range(1, FINE_TUNING_EPOCHS + 1):

    # -- Main Loop --
    running_loss = 0.0
    two_towers.train()

    for batch_idx, batch in enumerate(fine_tuning_loader):
        optimizer.zero_grad()

        preds = two_towers(batch)
        targets = batch['Rating']
        loss = loss_fn(preds, targets)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    avg_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch}, Average Training Loss (based on rating): {avg_loss:.4f}")

avg_loss = running_loss / len(train_loader)
print(f"Fine-Tuned Set {epoch}, Average Training Loss (based on rating): {avg_loss:.4f}")

In [None]:
user_row = next(iter(fine_tuning_loader))
user_vector = two_towers.user_tower.get_embedding(user_row)

scores = torch.matmul(item_vector, user_vector.T)
scores = scores.T

topk_scores, topk_indices = torch.topk(scores, k=50, dim=1)
print(topk_scores[0])
print(topk_indices[0])

In [None]:
recommended_books = df.iloc[topk_indices[0].tolist()]
print(recommended_books[['Book-Title']])