## Importing Dataset

In [None]:
# Colab setup: install kagglehub with its pandas-datasets extra. The loading
# cell below reads the CSV files through KaggleDatasetAdapter.PANDAS.
%pip install kagglehub[pandas-datasets]



In [None]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from collections import Counter
from google.colab import drive
import kagglehub
from kagglehub import KaggleDatasetAdapter
# Mount Google Drive; the training section later saves checkpoints under it.
drive.mount('/content/drive/')

# No version is pinned in the handle, so kagglehub resolves the latest version.
KAGGLE_DATASET_HANDLE = "arashnic/book-recommendation-dataset"


def _load_kaggle_table(file_name):
    """Load one CSV file of the Kaggle dataset as a pandas DataFrame.

    Uses ``kagglehub.dataset_load``; the older ``kagglehub.load_dataset``
    spelling is deprecated and emits a warning on every call.

    Args:
        file_name: Name of the file inside the dataset, e.g. ``"Books.csv"``.

    Returns:
        The DataFrame produced by the pandas adapter.
    """
    return kagglehub.dataset_load(
        KaggleDatasetAdapter.PANDAS,
        KAGGLE_DATASET_HANDLE,
        file_name,
    )


books_df = _load_kaggle_table("Books.csv")
ratings_df = _load_kaggle_table("Ratings.csv")
users_df = _load_kaggle_table("Users.csv")

Mounted at /content/drive/


  books_df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/arashnic/book-recommendation-dataset?dataset_version_number=3&file_name=Books.csv...


100%|██████████| 14.8M/14.8M [00:00<00:00, 46.0MB/s]

Extracting zip of Books.csv...



  result = read_function(
  ratings_df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'book-recommendation-dataset' dataset.


  users_df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'book-recommendation-dataset' dataset.


In [None]:
# Ratings joined with book metadata (inner join: ratings for unknown ISBNs are dropped).
df_ratings_books = ratings_df.merge(books_df, on="ISBN", how='inner')

# Add the user columns, normalise dtypes, then keep only usable rows:
#   * User-ID becomes a string,
#   * Year-Of-Publication becomes numeric (unparseable values turn into NaN),
#   * rows without a publication year or an age are dropped,
#   * only ratings strictly greater than 0 are kept.
df = (
    df_ratings_books
    .merge(users_df, on='User-ID')
    .assign(**{
        'User-ID': lambda frame: frame['User-ID'].astype(str),
        'Year-Of-Publication': lambda frame: pd.to_numeric(
            frame['Year-Of-Publication'], errors='coerce'
        ),
    })
    .dropna(subset=['Year-Of-Publication', 'Age'])
    .loc[lambda frame: frame['Book-Rating'] > 0]
)
df['Book-Rating'].describe()

Unnamed: 0,Book-Rating
count,269620.0
mean,7.736162
std,1.81453
min,1.0
25%,7.0
50%,8.0
75%,9.0
max,10.0


In [None]:
# Number of rows per rating value, ordered by rating.
rating_counts = df['Book-Rating'].value_counts().sort_index()
print(rating_counts)

Book-Rating
1       886
2      1562
3      3331
4      5096
5     27744
6     21445
7     45538
8     64824
9     45251
10    53943
Name: count, dtype: int64


In [None]:
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder, StandardScaler, MultiLabelBinarizer
import ast
from sentence_transformers import SentenceTransformer

# User Tower -- User-ID, Age
# Item Tower -- ISBN, Book-Title, Book-Author, Publisher, Year-Of-Publication

class BookRecommenderDataset(Dataset):
    """Torch dataset over a sampled, encoded ratings/books/users frame.

    On construction the input frame is down-sampled, categorical columns are
    label-encoded, numeric columns are standardised, and every field returned
    by ``__getitem__`` is converted to a tensor once. Indexing is then a cheap
    tensor lookup instead of building a pandas Series for every sample.

    Attributes set during construction:
        data: The sampled and preprocessed DataFrame.
        encoders: Column name -> fitted ``LabelEncoder``.
        scalers: Column name -> fitted ``StandardScaler``.

    Args:
        data: DataFrame holding the columns named in ``preprocess`` plus
            ``Book-Rating``.
        frac: Fraction of rows to sample (default 0.05, the previously
            hard-coded value).
        random_state: Seed for the row sample (default 42, the previously
            hard-coded value).
    """

    # (output key, source column, tensor dtype) for every field of a sample.
    _FIELDS = (
        ('User-ID', 'User-ID', torch.long),
        ('User-Age', 'Age', torch.float32),
        ('Book-ISBN', 'ISBN', torch.long),
        ('Book-Title', 'Book-Title', torch.long),
        ('Book-Author', 'Book-Author', torch.long),
        ('Book-Publisher', 'Publisher', torch.long),
        ('Book-Year-Of-Publication', 'Year-Of-Publication', torch.float32),
        ('Rating', 'Book-Rating', torch.float32),
    )

    def __init__(self, data, frac=0.05, random_state=42):
        self.data = data.sample(frac=frac, random_state=random_state)
        self.data = self.preprocess(self.data)
        self._tensors = self._to_tensors(self.data)

    def preprocess(self, data):
        """Label-encode the categorical columns and standardise the numeric ones.

        Fits a fresh ``LabelEncoder`` / ``StandardScaler`` per column and stores
        them in ``self.encoders`` / ``self.scalers``. Works on a copy, so the
        frame passed in is left untouched.

        Args:
            data: DataFrame containing the columns listed below.

        Returns:
            A new DataFrame with the encoded / scaled columns.
        """
        data = data.copy()
        self.encoders = {}
        self.scalers = {}

        label_encoders = ['User-ID', 'ISBN', 'Book-Title', 'Book-Author', 'Publisher']
        standard_scalers = ['Age', 'Year-Of-Publication']

        for key in label_encoders:
            self.encoders[key] = LabelEncoder()
            # Cast to str so columns with mixed value types sort and encode consistently.
            data[key] = self.encoders[key].fit_transform(data[key].astype(str))

        for key in standard_scalers:
            self.scalers[key] = StandardScaler()
            data[[key]] = self.scalers[key].fit_transform(data[[key]])
        return data

    def _to_tensors(self, data):
        """Convert every field in ``_FIELDS`` to one 1-D tensor with a row per sample."""
        return {
            out_key: torch.tensor(data[column].to_numpy(), dtype=dtype)
            for out_key, column, dtype in self._FIELDS
        }

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict mapping field name to a 0-dim tensor."""
        return {key: column[idx] for key, column in self._tensors.items()}

# Build the dataset from the cleaned ratings/books/users frame; the
# constructor samples the rows and fits the encoders and scalers.
dataset = BookRecommenderDataset(df)


In [None]:
# Seed the split so the train/test partition is the same on every run
# (same seed value the dataset uses for its row sample).
SPLIT_SEED = 42

# 50/50 split of the sampled dataset.
train_size = int(0.5 * len(dataset))
test_size = len(dataset) - train_size
train_data, test_data = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Dataloaders
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

In [None]:
# Peek at one collated training batch to check its keys, dtypes and shapes.
next(iter(train_loader))

{'User-ID': tensor([2487, 5638,  453, 1151, 5914, 4037, 6156, 2890, 5895,  758, 1145, 1618,
         4024, 2685, 4771, 3671, 4861, 6729, 1738, 5705, 4769, 1996, 1463, 5160,
         5887,  566, 1075, 3619, 4024, 4610,  363, 1619, 4523, 4798, 2585, 2580,
         6718, 3958, 5002, 1463,  848, 2571,   93, 4642, 1754, 3593, 1153,  563,
          838, 1968, 5435, 2516, 2004, 5944, 6171, 2298, 3087, 2763,    9, 6189,
         4257, 6718, 1128,  748]),
 'User-Age': tensor([ 0.6693,  0.8166,  1.8474, -0.0670,  0.3012,  0.5957, -1.7605,  0.3012,
          0.3748,  1.1111, -1.0242,  2.5838,  0.4485, -1.3187,  0.8166, -1.4660,
         -0.9505, -1.0978, -0.8033, -0.5088,  0.5221,  1.1848,  0.5221,  0.3748,
          1.5529,  0.2276,  0.8902,  0.6693,  0.4485,  0.9639, -0.0670,  0.0067,
          0.4485, -0.2142, -0.8033,  2.2892,  1.1111, -0.4351, -0.1406,  0.5221,
          0.5221,  2.3629, -0.9505,  0.0067, -0.5824,  0.5957,  1.6266, -2.1286,
         -1.3923, -0.5088, -0.4351, -1.0242, -0.582

## Two Tower Model for Recommendations

In [None]:
class UserTower(nn.Module):
    """User tower: maps (user id, age) to a dense user vector.

    The user id is looked up in an embedding table, concatenated with the
    scalar age feature and passed through a small MLP whose output has
    ``embedding_dim`` features.

    Args:
        num_users: Number of rows in the user-id embedding table.
        embedding_dim: Width of the id embedding and of the output vector.
        padding_idx: Optional index whose embedding is pinned to zeros and
            excluded from gradient updates. Defaults to ``None`` because the
            ids produced by the notebook's ``LabelEncoder`` are 0-based, so
            index 0 is a real user and must stay trainable. Pass ``0`` only
            if the id space reserves 0 for padding.
    """

    # User Tower -- User-ID, Age

    def __init__(self, num_users, embedding_dim=32, padding_idx=None):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim, padding_idx=padding_idx)

        self.user_mlp = nn.Sequential(
            nn.Linear(embedding_dim + 1, 128), # 1 embedding + 1 numerical
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, embedding_dim)
        )

    def forward(self, user_id, age):
        """
        user_id: (batch,) int64 indices into the user embedding table
        age: (batch,) float32 scalar age feature

        Returns a (batch, embedding_dim) tensor.
        """
        user_emb = self.user_embedding(user_id)
        # (batch,) -> (batch, 1) so it can be concatenated along the feature axis.
        age = age.unsqueeze(1)
        x = torch.cat([user_emb, age], dim=1)
        return self.user_mlp(x)

    def get_embedding(self, data):
        """Compute user vectors from a batch dict with 'User-ID' and 'User-Age'."""
        return self.forward(data['User-ID'], data['User-Age'])


In [None]:
class ItemTower(nn.Module):
    """Item tower: maps book features to a dense item vector.

    ISBN, title, author and publisher ids each go through their own embedding
    table. The four embeddings are concatenated with the scalar publication
    year and passed through a small MLP whose output has ``embedding_dim``
    features.

    Args:
        num_isbn: Number of rows in the ISBN embedding table.
        num_titles: Number of rows in the title embedding table.
        num_authors: Number of rows in the author embedding table.
        num_publishers: Number of rows in the publisher embedding table.
        embedding_dim: Width of every id embedding and of the output vector.
        padding_idx: Optional index whose embedding is pinned to zeros and
            excluded from gradient updates in all four tables. Defaults to
            ``None`` because the ids produced by the notebook's
            ``LabelEncoder`` are 0-based, so index 0 is a real
            book/title/author/publisher and must stay trainable.
    """

    def __init__(self, num_isbn, num_titles, num_authors, num_publishers, embedding_dim=32,
                 padding_idx=None):
        super().__init__()

        # Item Tower -- ISBN, Book-Title, Book-Author, Publisher, Year-Of-Publication

        self.book_isbn_embedding = nn.Embedding(num_isbn, embedding_dim, padding_idx=padding_idx)
        self.book_title_embedding = nn.Embedding(num_titles, embedding_dim, padding_idx=padding_idx)
        self.book_author_embedding = nn.Embedding(num_authors, embedding_dim, padding_idx=padding_idx)
        self.book_publisher_embedding = nn.Embedding(num_publishers, embedding_dim, padding_idx=padding_idx)

        self.item_mlp = nn.Sequential(
            nn.Linear(embedding_dim * 4 + 1, 128),  # 4 embeddings + 1 numerical
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, embedding_dim)
        )

    def forward(self, isbn, book_title, book_author, book_publisher, book_year_of_publication):
        """
        isbn, book_title, book_author, book_publisher: (batch,) int64 indices
        book_year_of_publication: (batch,) float32 scalar feature

        Returns a (batch, embedding_dim) tensor.
        """
        book_isbn_emb = self.book_isbn_embedding(isbn)
        book_title_emb = self.book_title_embedding(book_title)
        book_author_emb = self.book_author_embedding(book_author)
        book_publisher_emb = self.book_publisher_embedding(book_publisher)
        # (batch,) -> (batch, 1) so it can be concatenated along the feature axis.
        book_year = book_year_of_publication.unsqueeze(1)

        x = torch.cat([
            book_isbn_emb,
            book_title_emb,
            book_author_emb,
            book_publisher_emb,
            book_year
        ], dim=1)

        return self.item_mlp(x)

    def get_embedding(self, data):
        """Compute item vectors from a batch dict holding the 'Book-*' fields."""
        return self.forward(
            data['Book-ISBN'],
            data['Book-Title'],
            data['Book-Author'],
            data['Book-Publisher'],
            data['Book-Year-Of-Publication'],
        )


In [None]:
class TwoTowers(nn.Module):
    """Two-tower scorer: dot product of the user-tower and item-tower outputs."""

    def __init__(self, user_tower: UserTower, item_tower: ItemTower):
        super().__init__()
        self.user_tower, self.item_tower = user_tower, item_tower

    def forward(self, data):
        """Score each row of a batch dict.

        Both towers read their own fields from ``data`` through
        ``get_embedding``; the two outputs are multiplied element-wise and
        summed over dim 1, giving one score per row.
        """
        user_side = self.user_tower.get_embedding(data)
        item_side = self.item_tower.get_embedding(data)
        return torch.sum(user_side * item_side, dim=1)

In [None]:
# One collated training batch, kept around for interactive inspection.
example_data = next(iter(train_loader))

# Vocabulary size of each categorical feature = number of classes seen by the
# corresponding fitted LabelEncoder.
NUM_USERS, NUM_ISBN, NUM_TITLES, NUM_AUTHORS, NUM_PUBLISHERS = (
    len(dataset.encoders[column].classes_)
    for column in ('User-ID', 'ISBN', 'Book-Title', 'Book-Author', 'Publisher')
)

# Build the towers (user first, then item) and combine them into the scorer.
user_tower = UserTower(num_users=NUM_USERS)
item_tower = ItemTower(
    num_isbn=NUM_ISBN,
    num_titles=NUM_TITLES,
    num_authors=NUM_AUTHORS,
    num_publishers=NUM_PUBLISHERS,
)
two_towers = TwoTowers(user_tower, item_tower)

## Training

#### Helper Functions

In [None]:
# --- Training Helpers ---
def train_one_epoch(model, loader, optimizer, loss_fn, max_grad_norm=None):
    """Run one optimisation pass over ``loader``, yielding per-batch losses.

    This is a generator: nothing runs (not even ``model.train()``) until it is
    iterated, and each optimiser step happens as the next item is requested.

    Args:
        model: Module called as ``model(batch)`` that returns predictions.
        loader: Iterable of batch dicts; each must contain a 'Rating' tensor.
        optimizer: Optimiser over ``model``'s parameters.
        loss_fn: Callable ``loss_fn(preds, targets)`` returning a scalar tensor.
        max_grad_norm: If given, gradients are clipped to this total norm
            before the optimiser step. ``None`` (default) disables clipping.

    Yields:
        ``(batch_idx, loss)`` where ``loss`` is the Python float batch loss.

    Raises:
        AssertionError: If predictions or targets contain NaN or Inf.
    """
    model.train()

    for batch_idx, data in enumerate(loader):
        optimizer.zero_grad()
        preds = model(data)
        targets = data['Rating']

        # Sanity checks, done before the loss so bad values are reported at their source.
        assert not torch.isnan(preds).any(), "NaN in predictions"
        assert not torch.isnan(targets).any(), "NaN in targets"
        assert not torch.isinf(preds).any(), "Inf in predictions"
        assert not torch.isinf(targets).any(), "Inf in targets"

        loss = loss_fn(preds, targets)
        loss.backward()
        if max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_grad_norm)
        optimizer.step()

        yield batch_idx, loss.item()

@torch.no_grad()
def calculate_test_loss(model, test_loader, loss_fn):
    """Return the mean of the per-batch losses over ``test_loader``.

    Switches ``model`` to eval mode (and leaves it there) and runs without
    gradient tracking. Each batch dict must contain a 'Rating' tensor. Every
    batch counts equally in the average. Returns ``float('inf')`` when the
    loader yields no batches.
    """
    model.eval()

    loss_sum = 0.0
    batches_seen = 0
    for batches_seen, batch in enumerate(test_loader, start=1):
        batch_loss = loss_fn(model(batch), batch['Rating'])
        loss_sum += batch_loss.item()

    if batches_seen == 0:
        return float('inf')
    return loss_sum / batches_seen

def evaluate_and_checkpoint(model, epoch, global_step, best_loss, counter, test_loader, loss_fn):
    """Evaluate on the test loader and save the model when the loss improves.

    Reads the module-level ``MODEL_SAVE_PATH`` and ``EARLY_STOPPING_PATIENCE``.

    Args:
        model: Model to evaluate; its ``state_dict`` is saved on improvement.
        epoch: Epoch number, used only in the printed messages.
        global_step: Accepted for the caller's convenience; not used here.
        best_loss: Best test loss seen so far.
        counter: Number of consecutive evaluations without improvement.
        test_loader: Loader passed to ``calculate_test_loss``.
        loss_fn: Loss passed to ``calculate_test_loss``.

    Returns:
        ``(best_loss, counter)`` updated for this evaluation: on improvement
        the new loss and 0, otherwise the old best and ``counter + 1``.
    """
    test_loss = calculate_test_loss(model, test_loader, loss_fn)

    if test_loss < best_loss:
        best_loss = test_loss
        counter = 0
        # Date-only stamp: improvements on the same day overwrite one file.
        timestamp = datetime.datetime.now().strftime('%Y%m%d')
        # torch.save does not create missing parent directories.
        os.makedirs(MODEL_SAVE_PATH, exist_ok=True)
        save_path = f"{MODEL_SAVE_PATH}/two_towers_best_model_{timestamp}.pt"
        torch.save(model.state_dict(), save_path)
        print(f"[Epoch {epoch}] ✅ Improved! Test Loss: {test_loss:.4f}. Model saved.")
    else:
        counter += 1
        print(f"[Epoch {epoch}] No improvement. Test Loss: {test_loss:.4f} ({counter}/{EARLY_STOPPING_PATIENCE})")

    return best_loss, counter

#### Main Training Loop

In [None]:
# Delete ./logs/ (the directory the SummaryWriter below writes to) so that
# event files from earlier runs do not mix with this one.
%rm -rf ./logs/

In [None]:
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
import datetime

# --- Training configuration ---
EPOCHS = 200
LOG_INTERVAL = 100
CLIP_GRAD_NORM = 1.0
LEARNING_RATE = 0.001
EARLY_STOPPING_PATIENCE = 15
MODEL_SAVE_PATH = "/content/drive/MyDrive/models"

loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(two_towers.parameters(), lr=LEARNING_RATE)
writer = SummaryWriter('./logs/')

# Early-stopping / logging state.
best_test_loss = float('inf')
early_stopping_counter = 0
global_step = 0

# One batch holding every row of `dataset`, in dataset order, used to embed
# all candidate rows for the retrieval preview in the training loop.
all_items_loader = DataLoader(dataset, batch_size=len(dataset), shuffle=False)
items = next(iter(all_items_loader))

# Embed the candidates in eval mode without autograd: in train mode this
# full-dataset pass would update the BatchNorm running statistics and keep a
# computation graph alive. The training loop switches back to train mode at
# the start of every epoch.
two_towers.eval()
with torch.no_grad():
    item_vector = two_towers.item_tower.get_embedding(items)

In [None]:

TOP_K = 50  # number of highest-scoring candidate rows shown in the per-epoch preview

try:
    for epoch in range(1, EPOCHS + 1):

        # -- Main Loop --
        running_loss = 0.0
        two_towers.train()

        for batch in train_loader:
            optimizer.zero_grad()

            preds = two_towers(batch)
            targets = batch['Rating']
            loss = loss_fn(preds, targets)

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        avg_loss = running_loss / len(train_loader)
        writer.add_scalar('Loss/train', avg_loss, epoch)
        print(f"Epoch {epoch}, Average Training Loss (based on rating): {avg_loss:.4f}")

        # -- Held-out loss --
        # calculate_test_loss puts the model in eval mode; train mode is
        # restored at the top of the next epoch.
        test_loss = calculate_test_loss(two_towers, test_loader, loss_fn)
        writer.add_scalar('Loss/test', test_loss, epoch)
        print(f"Epoch {epoch}, Average Test Loss (based on rating): {test_loss:.4f}")

        # -- Top-K retrieval preview --
        # Evaluation only: run in eval mode without autograd so BatchNorm uses
        # its running statistics and no graph is built. The candidate
        # embeddings are recomputed every epoch; reusing the vectors computed
        # before training would score against the untrained item tower.
        two_towers.eval()
        with torch.no_grad():
            item_vector = two_towers.item_tower.get_embedding(items)
            user_row = next(iter(train_loader))
            user_vector = two_towers.user_tower.get_embedding(user_row)

            # (users in batch, candidate rows) score matrix.
            scores = torch.matmul(user_vector, item_vector.T)

            # topk raises if k exceeds the number of candidates.
            top_k = min(TOP_K, scores.shape[1])
            # Preview for the first user of the batch only. The indices are
            # row positions in `dataset` (the order of `items`), so the same
            # book can appear more than once.
            top_scores, top_indices = torch.topk(scores[0], top_k)

        print(top_indices)

        print(f"Epoch {epoch}/{EPOCHS} completed.")
finally:
    # Flush and close the event file even if training is interrupted.
    writer.close()

print("Training complete.")


Epoch 1, Average Training Loss (based on rating): 0.0811
tensor([ 2515,  6097,  1115,  8632,  7310,  7222,  4206,  8718,   553,   676,
         3775,   896, 13282,   163, 10866,  5071, 12279, 11630,  5724,  2833,
           80,   256,  4951, 11940,   193,  5153, 13167, 12182,  8040,  6120,
           32,  5892,  9656, 12569,  8843, 12365,   121,  1407,  5878,  4781,
        10196,  7990, 10715,  7028,  2315,   429,  2427, 13313,  4203,  7947])
Epoch 1/200 completed.


KeyboardInterrupt: 