## Importing Dataset

In [174]:
%pip install kagglehub[pandas-datasets]



In [175]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from collections import Counter
from google.colab import drive
import kagglehub
from kagglehub import KaggleDatasetAdapter
drive.mount('/content/drive/')

# Load the latest version
books_df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "arashnic/book-recommendation-dataset",
  "Books.csv",
)

ratings_df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "arashnic/book-recommendation-dataset",
  "Ratings.csv",
)

users_df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "arashnic/book-recommendation-dataset",
  "Users.csv",
)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


  books_df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'book-recommendation-dataset' dataset.


  result = read_function(
  ratings_df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'book-recommendation-dataset' dataset.


  users_df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'book-recommendation-dataset' dataset.


In [176]:
df_ratings_books = pd.merge(ratings_df, books_df, on="ISBN", how='inner')
df = pd.merge(df_ratings_books, users_df, on='User-ID')
df['User-ID'] = df['User-ID'].astype(str)
df['Year-Of-Publication'] = pd.to_numeric(df['Year-Of-Publication'], errors='coerce')
df = df.dropna(subset=['Year-Of-Publication'])
df = df.dropna(subset=['Age'])
df = df[df['Book-Rating'] > 0]
df = df[df['Age'] <= 100]
df = df[df['Year-Of-Publication'] > 0]
df['Book-Rating'].describe()

Unnamed: 0,Book-Rating
count,264742.0
mean,7.738848
std,1.813809
min,1.0
25%,7.0
50%,8.0
75%,9.0
max,10.0


In [177]:
personal_df = pd.read_csv("./drive/MyDrive/fine-tuning-book-set.txt")
end_index = len(df)
df = pd.concat([df, personal_df], ignore_index=True, sort=False)

In [178]:
# print(df['Book-Rating'].value_counts().sort_index())

In [179]:
# print(df['Age'].value_counts().sort_index())

In [180]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [181]:
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder, StandardScaler, MultiLabelBinarizer
import ast
from sentence_transformers import SentenceTransformer

# User Tower -- User-ID, Age
# Item Tower -- ISBN, Book-Title, Book-Author, Publisher, Year-Of-Publication

class BookRecommenderDataset(Dataset):
    """
    A PyTorch Dataset class for book recommendation tasks.

    Parameters
    ----------
    dataframe : pd.DataFrame
        The input data containing user, item, and possibly interaction features.

    Attributes
    ----------
    data : pd.DataFrame
        The processed version of the input dataframe.
    encoders : dict
        A dictionary mapping column names to fitted label encoders.
    reverse_encoders : dict
        A dictionary mapping column names to reverse label encoders (index to label).
    scalers : dict
        A dictionary mapping column names to fitted scalers for numerical features.
    user_item_interaction : dict
        A dictionary mapping encoded User-IDs to a list of positive example encoded ISBNs
    negative_examples : int
        An integer hyperparameter for the number of negative examples to use for contrastive learning
    """

    def __init__(self, data, negative_examples=1):
        self.encoders = {} # {'Column name': {'value': idx, ...}, ...}
        self.reverse_encoders = {} # {'Column name': {idx: 'value', ...}, ...}
        self.scalers = {}
        self.user_item_interactions = {} # {encoded userid: [encoded ISBN]}
        self.negative_examples = negative_examples
        self.data = data.sample(frac=0.01, random_state=42).reset_index(drop=True)
        self.preprocess(self.data)

    def preprocess(self, data):
        self.encode_information()
        self.generate_positives()

    def generate_positives(self):
        self.user_item_interaction = (
            self.data
            .groupby('User-ID')['ISBN']
            .apply(list)
            .to_dict()
        )

    def encode_information(self):
        """
        Maps {key: index} pairs and StandardScaler for real valued numbers
        """
        label_encoders = ['User-ID', 'ISBN', 'Book-Title', 'Book-Author', 'Publisher']
        standard_scalers = ['Age', 'Year-Of-Publication']

        for col in label_encoders:
            unique_vals = self.data[col].astype(str).unique()
            self.encoders[col] = {val: idx + 1 for idx, val in enumerate(unique_vals)}
            self.reverse_encoders[col] = {idx + 1: val for idx, val in enumerate(unique_vals)}
            self.data[col] = self.data[col].astype(str).map(self.encoders[col]).fillna(0).astype(int)

        for col in standard_scalers:
            self.scalers[col] = StandardScaler()
            self.data[[col]] = self.scalers[col].fit_transform(self.data[[col]])

        # Manually adding my own User-ID so I don't need to adjust nn.Embedding later
        # max_user_idx = max(self.encoders['User-ID'].values())
        # self.encoders['User-ID']["1234567890"] = max_user_idx + 1
        # self.reverse_encoders['User-ID'][max_user_idx + 1] = "1234567890"

    def __len__(self):
        return len(self.data)


    def __getitem__(self, idx):
        """
        Returns
            - user-tower data (User-ID and User-Age)
            - positive item data (pos_item)
            - negative item data (neg_items)
            - Target rating
        """
        positive_item = self.data.iloc[idx]
        positive_user_id = positive_item['User-ID']
        positive_isbns = self.user_item_interaction[positive_user_id]

        negative_examples = []
        while len(negative_examples) < self.negative_examples:
            candidate = self.data.sample(n=1).iloc[0]
            candidate_isbn = candidate['ISBN']
            if candidate_isbn not in positive_isbns:
                negative_examples.append(candidate)

        output = {
            'User-ID': torch.tensor(positive_item['User-ID'], dtype=torch.long),
            'User-Age': torch.tensor(positive_item['Age'], dtype=torch.float32),
            'Rating': torch.tensor(positive_item['Book-Rating'], dtype=torch.float32),

            'pos_item': {
                'Book-ISBN': torch.tensor(positive_item['ISBN'], dtype=torch.long),
                'Book-Title': torch.tensor(positive_item['Book-Title'], dtype=torch.long),
                'Book-Author': torch.tensor(positive_item['Book-Author'], dtype=torch.long),
                'Book-Publisher': torch.tensor(positive_item['Publisher'], dtype=torch.long),
                'Book-Year-Of-Publication': torch.tensor(positive_item['Year-Of-Publication'], dtype=torch.float32),
            },

            'neg_items': [
                {
                    'Book-ISBN': torch.tensor(neg['ISBN'], dtype=torch.long),
                    'Book-Title': torch.tensor(neg['Book-Title'], dtype=torch.long),
                    'Book-Author': torch.tensor(neg['Book-Author'], dtype=torch.long),
                    'Book-Publisher': torch.tensor(neg['Publisher'], dtype=torch.long),
                    'Book-Year-Of-Publication': torch.tensor(neg['Year-Of-Publication'], dtype=torch.float32),
                }
                for neg in negative_examples
            ]
        }
        return output


dataset = BookRecommenderDataset(df, negative_examples=3)


In [182]:
dataset[0]

{'User-ID': tensor(1),
 'User-Age': tensor(-0.8395),
 'Rating': tensor(8.),
 'pos_item': {'Book-ISBN': tensor(1),
  'Book-Title': tensor(1),
  'Book-Author': tensor(1),
  'Book-Publisher': tensor(1),
  'Book-Year-Of-Publication': tensor(0.2982)},
 'neg_items': [{'Book-ISBN': tensor(334),
   'Book-Title': tensor(334),
   'Book-Author': tensor(307),
   'Book-Publisher': tensor(42),
   'Book-Year-Of-Publication': tensor(-1.1344)},
  {'Book-ISBN': tensor(1347),
   'Book-Title': tensor(1335),
   'Book-Author': tensor(1105),
   'Book-Publisher': tensor(490),
   'Book-Year-Of-Publication': tensor(0.4415)},
  {'Book-ISBN': tensor(646),
   'Book-Title': tensor(644),
   'Book-Author': tensor(562),
   'Book-Publisher': tensor(244),
   'Book-Year-Of-Publication': tensor(1.0145)}]}

In [183]:
train_size = int(0.7 * len(dataset))
test_size = len(dataset) - train_size
train_data, test_data = random_split(dataset, [train_size, test_size])

# Dataloaders
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

In [184]:
next(iter(train_loader))

{'User-ID': tensor([ 266,   10, 1932,  841,  120,  317, 1247, 1516,  694,  471, 1414, 1183,
          609,  493,  128,  844, 1459,   50,  734,  148, 1871,  939,  392,  342,
          236,  323,  198,  676, 1786,  432,  223,  618,  825,  391, 1708,  972,
           56,  509,  737, 1386,  752, 1619,  122,  607, 1236, 1469, 1032, 1744,
         1536, 1202,  536,  128,  832,  651, 1191,  896, 1361,  778,  584,  231,
          203,  253, 1233, 1667]),
 'User-Age': tensor([-0.5174, -0.3564,  0.7707, -1.0005,  0.5292, -0.6784, -0.1149,  0.9317,
          0.1266, -0.0344, -0.5174,  1.6563,  0.2876, -1.1615,  0.6097, -0.8395,
          1.4147,  1.2537, -0.4369,  1.0122,  0.7707,  0.0461, -0.1149, -1.0005,
          1.5758,  3.1859, -0.5979,  0.2876, -0.1149,  1.6563, -0.9200, -0.5174,
         -0.8395, -0.6784,  1.2537, -0.4369,  0.5292, -1.0810, -1.0005,  0.6902,
          0.8512, -0.9200, -0.5174,  0.9317, -0.1149,  2.3003, -1.0005,  1.1732,
         -0.6784,  0.0461, -1.1615,  0.6097, -1.161

## Two Tower Model for Recommendations

In [185]:
class UserTower(nn.Module):

    # User Tower -- User-ID, Age

    def __init__(self, num_users, embedding_dim=32):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim, padding_idx=0)

        self.user_mlp = nn.Sequential(
            nn.Linear(embedding_dim + 1, 128), # 1 embedding + 1 numerical
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, embedding_dim)
        )

    def forward(self, user_id, age):
        """
        user_id: (batch,) int64
        review_mean: (batch,) float32
        """
        user_emb = self.user_embedding(user_id)
        age = age.unsqueeze(1)
        x = torch.cat([user_emb, age], dim=1)
        return self.user_mlp(x)

    def get_embedding(self, data):
        return self.forward(data['User-ID'], data['User-Age'])


In [186]:
class ItemTower(nn.Module):
    def __init__(self, num_isbn, num_titles, num_authors, num_publishers, embedding_dim=32):
        super().__init__()

        # Item Tower -- ISBN, Book-Title, Book-Author, Publisher, Year-Of-Publication

        self.book_isbn_embedding = nn.Embedding(num_isbn, embedding_dim, padding_idx=0)
        self.book_title_embedding = nn.Embedding(num_titles, embedding_dim, padding_idx=0)
        self.book_author_embedding = nn.Embedding(num_authors, embedding_dim, padding_idx=0)
        self.book_publisher_embedding = nn.Embedding(num_publishers, embedding_dim, padding_idx=0)

        self.item_mlp = nn.Sequential(
            nn.Linear(embedding_dim * 4 + 1, 128),  # 4 embeddings + 1 numerical
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, embedding_dim)
        )

    def forward(self, isbn, book_title, book_author, book_publisher, book_year_of_publication):
        book_isbn_emb = self.book_isbn_embedding(isbn)
        book_title_emb = self.book_title_embedding(book_title)
        book_author_emb = self.book_author_embedding(book_author)
        book_publisher_emb = self.book_publisher_embedding(book_publisher)
        book_year = book_year_of_publication.unsqueeze(1)

        x = torch.cat([
            book_isbn_emb,
            book_title_emb,
            book_author_emb,
            book_publisher_emb,
            book_year
        ], dim=1)

        return self.item_mlp(x)

    def get_embedding(self, data):
        return self.forward(
            data['Book-ISBN'],
            data['Book-Title'],
            data['Book-Author'],
            data['Book-Publisher'],
            data['Book-Year-Of-Publication'],
        )


In [187]:
class TwoTowers(nn.Module):
    def __init__(self, user_tower: UserTower, item_tower: ItemTower):
        super().__init__()
        self.user_tower = user_tower
        self.item_tower = item_tower

    def forward(self, data):
        user_vector = self.user_tower.get_embedding(data)
        item_vector = self.item_tower.get_embedding(data['pos_item'])
        return (user_vector * item_vector).sum(dim=1)

In [188]:
example_data = next(iter(train_loader))

NUM_USERS = len(dataset.encoders['User-ID']) + 1
NUM_ISBN = len(dataset.encoders['ISBN']) + 1
NUM_TITLES = len(dataset.encoders['Book-Title']) + 1
NUM_AUTHORS = len(dataset.encoders['Book-Author']) + 1
NUM_PUBLISHERS = len(dataset.encoders['Publisher']) + 1

user_tower = UserTower(num_users=NUM_USERS)

item_tower = ItemTower(
    num_isbn=NUM_ISBN,
    num_titles=NUM_TITLES,
    num_authors=NUM_AUTHORS,
    num_publishers=NUM_PUBLISHERS,
)

two_towers = TwoTowers(
    user_tower,
    item_tower
).to(device)

## Training

#### Helper Functions

In [189]:
# --- Training Helpers ---
def train_one_epoch(model, loader, optimizer, loss_fn):
    model.train()
    running_loss = 0.0

    for batch_idx, data in enumerate(loader):
        optimizer.zero_grad()
        preds = model(data)
        targets = data['Rating']
        loss = loss_fn(preds, targets)

        # Sanity checks
        assert not torch.isnan(preds).any(), "NaN in predictions"
        assert not torch.isnan(targets).any(), "NaN in targets"
        assert not torch.isinf(preds).any(), "Inf in predictions"
        assert not torch.isinf(targets).any(), "Inf in targets"

        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=CLIP_GRAD_NORM)
        optimizer.step()

        yield batch_idx, loss.item()

@torch.no_grad()
def calculate_test_loss(model, test_loader, loss_fn):
    model.eval()
    total_loss = 0.0
    num_batches = 0

    for data in test_loader:
        preds = model(data)
        targets = data['Rating']
        loss = loss_fn(preds, targets)
        total_loss += loss.item()
        num_batches += 1

    return total_loss / num_batches if num_batches > 0 else float('inf')

def evaluate_and_checkpoint(model, epoch, global_step, best_loss, counter, test_loader, loss_fn):
    test_loss = calculate_test_loss(model, test_loader, loss_fn)

    if test_loss < best_loss:
        best_loss = test_loss
        counter = 0
        timestamp = datetime.datetime.now().strftime('%Y%m%d')
        save_path = f"{MODEL_SAVE_PATH}/two_towers_best_model_{timestamp}.pt"
        torch.save(model.state_dict(), save_path)
        print(f"[Epoch {epoch}] ✅ Improved! Test Loss: {test_loss:.4f}. Model saved.")
    else:
        counter += 1
        print(f"[Epoch {epoch}] No improvement. Test Loss: {test_loss:.4f} ({counter}/{EARLY_STOPPING_PATIENCE})")

    return best_loss, counter

def move_batch_to_device(batch, device):
    batch['User-ID'] = batch['User-ID'].to(device)
    batch['User-Age'] = batch['User-Age'].to(device)
    batch['Rating'] = batch['Rating'].to(device)

    for key in batch['pos_item']:
        batch['pos_item'][key] = batch['pos_item'][key].to(device)

    for neg_item in batch['neg_items']:
        for key in neg_item:
            neg_item[key] = neg_item[key].to(device)

    return batch

#### Main Training Loop

In [190]:
%rm -rf ./logs/

In [191]:
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
import datetime

EPOCHS = 30
LOG_INTERVAL = 100
CLIP_GRAD_NORM = 1.0
LEARNING_RATE = 0.001
EARLY_STOPPING_PATIENCE = 15
MODEL_SAVE_PATH = "/content/drive/MyDrive/models"

# loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(two_towers.parameters(), lr=LEARNING_RATE)
writer = SummaryWriter('./logs/')

best_test_loss = float('inf')
early_stopping_counter = 0
global_step = 0

In [193]:

for epoch in range(1, EPOCHS + 1):

    # -- Main Loop --
    running_train_loss = 0.0
    two_towers.train()

    for batch in train_loader:
        batch = move_batch_to_device(batch, device)
        optimizer.zero_grad()

        user_embedding = two_towers.user_tower.get_embedding(batch)  # [B, D]

        # Calculate similarity between user and positive example
        pos_sim = two_towers(batch)

        # Calculate similarity between user and negative examples
        #
        # Logits (first position will be positive that we want to maximize, and everything else is negative)
        # For example, if we have 1 positive example and 1 negative example our logits are:
        #   [[pos, neg], ..batch_size..]
        #
        # Now the label we want cross entropy to maximize is in the 0th position (positive)
        #   labels = [0, ..batch_size..]
        #
        #
        neg_sims = []
        for neg_item_dict in batch['neg_items']:
            neg_emb = two_towers.item_tower.get_embedding(neg_item_dict)  # [B, D]
            neg_sim = torch.sum(user_embedding * neg_emb, dim=1)  # [B]
            neg_sims.append(neg_sim)

        # Contrastive Loss
        logits = torch.stack([pos_sim] + neg_sims, dim=1)
        labels = torch.zeros(logits.size(0), dtype=torch.long, device=device)
        loss = F.cross_entropy(logits, labels)

        loss.backward()
        optimizer.step()

        running_train_loss += loss.item()

    two_towers.eval()
    running_test_loss = 0.0
    for batch in test_loader:
        batch = move_batch_to_device(batch, device)

        pos_sim = two_towers(batch)
        user_embedding = two_towers.user_tower.get_embedding(batch)
        neg_embedding = two_towers.item_tower.get_embedding(batch['neg_items'][0])
        neg_sim = torch.sum(user_embedding * neg_embedding, dim=1)

        logits = torch.stack([pos_sim, neg_sim], dim=1)
        labels = torch.zeros(logits.size(0), dtype=torch.long)
        loss = F.cross_entropy(logits, labels)

        running_test_loss += loss.item()

    if epoch % 5 == 0:
        avg_train_loss = running_train_loss / len(train_loader)
        avg_test_loss = running_test_loss / len(test_loader)
        print(f"Epoch {epoch}/{EPOCHS}, Average Training Loss: {avg_train_loss:.4f}, ")
        print(f"Epoch {epoch}/{EPOCHS}, Average Test Loss: {avg_test_loss:.4f}, ")

writer.close()
print("Training complete.")


Epoch 1/30, Average Training Loss: 0.7528, 
Epoch 1/30, Average Test Loss: 0.9950, 
Epoch 2/30, Average Training Loss: 0.5444, 
Epoch 2/30, Average Test Loss: 1.1698, 
Epoch 3/30, Average Training Loss: 0.4222, 
Epoch 3/30, Average Test Loss: 1.3075, 


KeyboardInterrupt: 

In [None]:
    # -----------------------
    # -- Recall@K Metric --
    #
    # Work in progress... Implement the recall@k metric to see out of the user's total ratings, how many of these appeared in the top-k.
    #

    # user_row = next(iter(train_loader))
    # user_vector = two_towers.user_tower.get_embedding(user_row)

    # scores = torch.matmul(item_vector, user_vector.T)
    # scores = scores.T
    #
    # top_k = 20
    # for i, user_id_encoded in enumerate(user_row['User-ID']):
    #     top_scores, top_indices = torch.topk(scores[i], top_k)
    #     user_id_decoded = dataset.reverse_encoders['User-ID'][user_id_encoded.item()]
    #     ground_truth_books = df[df['User-ID'] == user_id_decoded]['ISBN']

    #     print(f"Top k books predicted for user {user_id_decoded}")
    #     print(top_indices)
    #     print(f"Book ratings for user {user_id_decoded}")
    #     print(ground_truth_books)
    #
    # -----------------------