## Importing Dataset

In [259]:
%pip install kagglehub[pandas-datasets]



In [260]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from collections import Counter
from google.colab import drive
import kagglehub
from kagglehub import KaggleDatasetAdapter
drive.mount('/content/drive/')

# Load the latest version
books_df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "arashnic/book-recommendation-dataset",
  "Books.csv",
)

ratings_df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "arashnic/book-recommendation-dataset",
  "Ratings.csv",
)

users_df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "arashnic/book-recommendation-dataset",
  "Users.csv",
)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


  books_df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'book-recommendation-dataset' dataset.


  result = read_function(
  ratings_df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'book-recommendation-dataset' dataset.


  users_df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'book-recommendation-dataset' dataset.


### Only keeping books with more than three ratings

In [261]:
print(books_df.shape)
books_df = books_df[books_df.groupby('Book-Title')['Book-Title'].transform('count') > 4]
print(books_df.shape)

(271360, 8)
(6138, 8)


### Joining Books, Ratings, and Users tables together

In [262]:
df_ratings_books = pd.merge(ratings_df, books_df, on="ISBN", how='inner')
df = pd.merge(df_ratings_books, users_df, on='User-ID')
df['User-ID'] = df['User-ID'].astype(str)
df['Year-Of-Publication'] = pd.to_numeric(df['Year-Of-Publication'], errors='coerce')
df = df.dropna(subset=['Year-Of-Publication'])
df = df.dropna(subset=['Age'])
df = df[df['Book-Rating'] > 0]
df = df[df['Age'] <= 100]
df = df[df['Year-Of-Publication'] > 0]
df['Book-Rating'].describe()

Unnamed: 0,Book-Rating
count,20017.0
mean,7.919369
std,1.767708
min,1.0
25%,7.0
50%,8.0
75%,9.0
max,10.0


### Combining my own data into the training set

In [263]:
personal_df = pd.read_csv("./drive/MyDrive/fine-tuning-book-set.txt")
end_index = len(df)
df = pd.concat([df, personal_df], ignore_index=True, sort=False)

In [264]:
personal_df.head(2)

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Age
0,1234567890,451521951,5,The Count of Monte Cristo,Alexandre Dumas,1844,Signet Book,23
1,1234567890,684813637,5,1776,David McCullough,2005,Simon & Schuster,23


In [265]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [266]:
import pandas as pd
import numpy as np
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder, StandardScaler, MultiLabelBinarizer
import ast
from sentence_transformers import SentenceTransformer

# User Tower -- User-ID, Age
# Item Tower -- ISBN, Book-Title, Book-Author, Publisher, Year-Of-Publication

class BookRecommenderDataset(Dataset):
    """
    A PyTorch Dataset class for book recommendation tasks.

    Parameters
    ----------
    dataframe : pd.DataFrame
        The input data containing user, item, and possibly interaction features.

    Attributes
    ----------
    data : pd.DataFrame
        The processed version of the input dataframe.
    encoders : dict
        A dictionary mapping column names to fitted label encoders.
    reverse_encoders : dict
        A dictionary mapping column names to reverse label encoders (index to label).
    scalers : dict
        A dictionary mapping column names to fitted scalers for numerical features.
    user_item_interaction : dict
        A dictionary mapping encoded User-IDs to a list of positive example encoded ISBNs
    negative_examples : int
        An integer hyperparameter for the number of negative examples to use for contrastive learning
    """

    def __init__(self, data, negative_examples=1):
        self.encoders = {} # {'Column name': {'value': idx, ...}, ...}
        self.reverse_encoders = {} # {'Column name': {idx: 'value', ...}, ...}
        self.scalers = {}
        self.user_item_interactions = {} # {encoded userid: [encoded ISBN]}
        self.negative_examples = negative_examples
        self.data = data.sample(frac=0.20, random_state=42).reset_index(drop=True)
        self.preprocess(self.data)

    def preprocess(self, data):
        self.encode_information()
        self.generate_positives()

    def generate_positives(self):
        self.user_item_interaction = (
            self.data
            .groupby('User-ID')['ISBN']
            .apply(list)
            .to_dict()
        )

    def encode_information(self):
        """
        Maps {key: index} pairs and StandardScaler for real valued numbers
        """
        label_encoders = ['User-ID', 'ISBN', 'Book-Title', 'Book-Author', 'Publisher']
        standard_scalers = ['Age', 'Year-Of-Publication']

        for col in label_encoders:
            unique_vals = self.data[col].astype(str).unique()
            self.encoders[col] = {val: idx + 1 for idx, val in enumerate(unique_vals)}
            self.reverse_encoders[col] = {idx + 1: val for idx, val in enumerate(unique_vals)}
            self.data[col] = self.data[col].astype(str).map(self.encoders[col]).fillna(0).astype(int)

        for col in standard_scalers:
            self.scalers[col] = StandardScaler()
            self.data[[col]] = self.scalers[col].fit_transform(self.data[[col]])

        # Manually adding my own User-ID so I don't need to adjust nn.Embedding later
        # max_user_idx = max(self.encoders['User-ID'].values())
        # self.encoders['User-ID']["1234567890"] = max_user_idx + 1
        # self.reverse_encoders['User-ID'][max_user_idx + 1] = "1234567890"

    def __len__(self):
        return len(self.data)


    def __getitem__(self, idx):
        """
        Returns
            - user-tower data (User-ID and User-Age)
            - positive item data (pos_item)
            - negative item data (neg_items)
            - Target rating
        """
        positive_item = self.data.iloc[idx]
        positive_user_id = positive_item['User-ID']
        positive_isbns = self.user_item_interaction[positive_user_id]

        negative_examples = []
        while len(negative_examples) < self.negative_examples:
            candidate = self.data.sample(n=1).iloc[0]
            candidate_isbn = candidate['ISBN']
            if candidate_isbn not in positive_isbns:
                negative_examples.append(candidate)

        output = {
            'User-ID': torch.tensor(positive_item['User-ID'], dtype=torch.long),
            'User-Age': torch.tensor(positive_item['Age'], dtype=torch.float32),
            'Rating': torch.tensor(positive_item['Book-Rating'], dtype=torch.float32),

            'pos_item': {
                'Book-ISBN': torch.tensor(positive_item['ISBN'], dtype=torch.long),
                'Book-Title': torch.tensor(positive_item['Book-Title'], dtype=torch.long),
                'Book-Author': torch.tensor(positive_item['Book-Author'], dtype=torch.long),
                'Book-Publisher': torch.tensor(positive_item['Publisher'], dtype=torch.long),
                'Book-Year-Of-Publication': torch.tensor(positive_item['Year-Of-Publication'], dtype=torch.float32),
            },

            'neg_items': [
                {
                    'Book-ISBN': torch.tensor(neg['ISBN'], dtype=torch.long),
                    'Book-Title': torch.tensor(neg['Book-Title'], dtype=torch.long),
                    'Book-Author': torch.tensor(neg['Book-Author'], dtype=torch.long),
                    'Book-Publisher': torch.tensor(neg['Publisher'], dtype=torch.long),
                    'Book-Year-Of-Publication': torch.tensor(neg['Year-Of-Publication'], dtype=torch.float32),
                }
                for neg in negative_examples
            ]
        }
        return output


dataset = BookRecommenderDataset(df, negative_examples=10)


In [267]:
dataset[0]

{'User-ID': tensor(1),
 'User-Age': tensor(0.1068),
 'Rating': tensor(6.),
 'pos_item': {'Book-ISBN': tensor(1),
  'Book-Title': tensor(1),
  'Book-Author': tensor(1),
  'Book-Publisher': tensor(1),
  'Book-Year-Of-Publication': tensor(0.6288)},
 'neg_items': [{'Book-ISBN': tensor(297),
   'Book-Title': tensor(88),
   'Book-Author': tensor(29),
   'Book-Publisher': tensor(104),
   'Book-Year-Of-Publication': tensor(0.7535)},
  {'Book-ISBN': tensor(675),
   'Book-Title': tensor(443),
   'Book-Author': tensor(91),
   'Book-Publisher': tensor(2),
   'Book-Year-Of-Publication': tensor(0.1297)},
  {'Book-ISBN': tensor(774),
   'Book-Title': tensor(179),
   'Book-Author': tensor(127),
   'Book-Publisher': tensor(12),
   'Book-Year-Of-Publication': tensor(0.6288)},
  {'Book-ISBN': tensor(688),
   'Book-Title': tensor(391),
   'Book-Author': tensor(346),
   'Book-Publisher': tensor(7),
   'Book-Year-Of-Publication': tensor(1.0031)},
  {'Book-ISBN': tensor(228),
   'Book-Title': tensor(190),
  

In [268]:
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_data, test_data = random_split(dataset, [train_size, test_size])

# Dataloaders
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

In [269]:
next(iter(train_loader))

{'User-ID': tensor([ 225, 1750, 1293,  807,  609,  946,  880, 2272, 1257,  444, 1839, 1052,
         1804, 1578, 2745,  233, 1753,  214, 1932,  348,  576,  378, 1836,  822,
         1343,  169, 2262,  180,  438, 1188, 1009,  639,  743, 1559, 2101,  556,
         2307,  483,  680,   94,  729, 2687, 2430, 1262,  181,  396, 2159, 2411,
         2576, 2338,   81,  739,  464, 1094, 1406, 2711, 2020,  972,  455,  901,
          748, 1711,  562,  991]),
 'User-Age': tensor([ 0.6807, -1.2870,  0.0248, -0.6311, -0.3031, -0.3031,  1.5006,  0.2708,
         -0.7951,  1.3366, -0.7951,  0.0248, -0.7131,  0.3528,  2.8124,  0.0248,
         -0.2211, -0.9590, -0.9590,  0.7627,  0.8447, -0.2211, -0.3031, -0.3031,
         -0.3851,  0.1888, -0.8771,  0.8447,  0.4348,  0.3528,  1.2546, -0.3031,
          0.7627,  1.1727,  0.1888, -0.7131, -0.6311, -0.3851, -1.0410, -0.1392,
         -1.0410, -0.5491,  2.4845, -0.9590, -0.4671,  0.4348,  0.6807, -0.3851,
         -1.0410,  0.8447,  2.3205, -1.1230, -0.713

## Two Tower Model for Recommendations

In [270]:
class UserTower(nn.Module):

    # User Tower -- User-ID, Age

    def __init__(self, num_users, embedding_dim=16):
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim, padding_idx=0)

        self.user_mlp = nn.Sequential(
            nn.Linear(embedding_dim + 1, 64), # 1 embedding + 1 numerical
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, embedding_dim),
        )

    def forward(self, user_id, age):
        """
        user_id: (batch,) int64
        review_mean: (batch,) float32
        """
        user_emb = self.user_embedding(user_id)
        age = age.unsqueeze(1)
        x = torch.cat([user_emb, age], dim=1)
        return self.user_mlp(x)

    def get_embedding(self, data):
        return self.forward(data['User-ID'], data['User-Age'])


In [271]:
class ItemTower(nn.Module):
    def __init__(self, num_isbn, num_titles, num_authors, num_publishers, embedding_dim=16):
        super().__init__()

        # Item Tower -- ISBN, Book-Title, Book-Author, Publisher, Year-Of-Publication

        self.book_isbn_embedding = nn.Embedding(num_isbn, embedding_dim, padding_idx=0)
        self.book_title_embedding = nn.Embedding(num_titles, embedding_dim, padding_idx=0)
        self.book_author_embedding = nn.Embedding(num_authors, embedding_dim, padding_idx=0)
        self.book_publisher_embedding = nn.Embedding(num_publishers, embedding_dim, padding_idx=0)

        self.item_mlp = nn.Sequential(
            nn.Linear(embedding_dim * 4 + 1, 64),  # 4 embeddings + 1 numerical
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, embedding_dim),
        )

    def forward(self, isbn, book_title, book_author, book_publisher, book_year_of_publication):
        book_isbn_emb = self.book_isbn_embedding(isbn)
        book_title_emb = self.book_title_embedding(book_title)
        book_author_emb = self.book_author_embedding(book_author)
        book_publisher_emb = self.book_publisher_embedding(book_publisher)
        book_year = book_year_of_publication.unsqueeze(1)

        x = torch.cat([
            book_isbn_emb,
            book_title_emb,
            book_author_emb,
            book_publisher_emb,
            book_year
        ], dim=1)

        return self.item_mlp(x)

    def get_embedding(self, data):
        return self.forward(
            data['Book-ISBN'],
            data['Book-Title'],
            data['Book-Author'],
            data['Book-Publisher'],
            data['Book-Year-Of-Publication'],
        )


In [272]:
class TwoTowers(nn.Module):
    def __init__(self, user_tower: UserTower, item_tower: ItemTower):
        super().__init__()
        self.user_tower = user_tower
        self.item_tower = item_tower

    def forward(self, data):
        user_vector = self.user_tower.get_embedding(data)
        item_vector = self.item_tower.get_embedding(data['pos_item'])
        return (user_vector * item_vector).sum(dim=1)

In [273]:
# example_data = next(iter(train_loader))

NUM_USERS = len(dataset.encoders['User-ID']) + 1
NUM_ISBN = len(dataset.encoders['ISBN']) + 1
NUM_TITLES = len(dataset.encoders['Book-Title']) + 1
NUM_AUTHORS = len(dataset.encoders['Book-Author']) + 1
NUM_PUBLISHERS = len(dataset.encoders['Publisher']) + 1

user_tower = UserTower(num_users=NUM_USERS)

item_tower = ItemTower(
    num_isbn=NUM_ISBN,
    num_titles=NUM_TITLES,
    num_authors=NUM_AUTHORS,
    num_publishers=NUM_PUBLISHERS,
)

two_towers = TwoTowers(
    user_tower,
    item_tower
).to(device)

## Training

#### Helper Functions

In [274]:

def move_batch_to_device(batch, device):
    batch['User-ID'] = batch['User-ID'].to(device)
    batch['User-Age'] = batch['User-Age'].to(device)
    batch['Rating'] = batch['Rating'].to(device)

    for key in batch['pos_item']:
        batch['pos_item'][key] = batch['pos_item'][key].to(device)

    for neg_item in batch['neg_items']:
        for key in neg_item:
            neg_item[key] = neg_item[key].to(device)

    return batch

#### Main Training Loop

In [275]:
%rm -rf ./logs/

In [276]:
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
import datetime

EPOCHS = 100
LOG_INTERVAL = 100
CLIP_GRAD_NORM = 1.0
LEARNING_RATE = 0.001
EARLY_STOPPING_PATIENCE = 15
TEMPERATURE = 0.05
MODEL_SAVE_PATH = "/content/drive/MyDrive/models"

# loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(two_towers.parameters(), lr=LEARNING_RATE)
writer = SummaryWriter('./logs/')

best_test_loss = float('inf')
early_stopping_counter = 0
global_step = 0

In [None]:

for epoch in range(1, EPOCHS + 1):

    # -- Main Loop --
    running_train_loss = 0.0
    two_towers.train()

    for batch_idx, batch in enumerate(train_loader):
        batch = move_batch_to_device(batch, device)
        optimizer.zero_grad()

        user_embedding = two_towers.user_tower.get_embedding(batch)  # [B, D]

        # Calculate similarity between user and positive example
        pos_sim = two_towers(batch)

        # Calculate similarity between user and negative examples
        #
        # Logits (first position will be positive that we want to maximize, and everything else is negative)
        # For example, if we have 1 positive example and 1 negative example our logits are:
        #   [[pos, neg], ..batch_size..]
        #
        # Now the label we want cross entropy to maximize is in the 0th position (positive)
        #   labels = [0, ..batch_size..]
        #
        #
        neg_sims = []
        for neg_item_dict in batch['neg_items']:
            neg_emb = two_towers.item_tower.get_embedding(neg_item_dict)  # [B, D]
            neg_sim = torch.sum(user_embedding * neg_emb, dim=1)  # [B]
            neg_sims.append(neg_sim)

        # Contrastive Loss
        logits = torch.stack([pos_sim] + neg_sims, dim=1) / TEMPERATURE
        labels = torch.zeros(logits.size(0), dtype=torch.long, device=device)
        loss = F.cross_entropy(logits, labels)

        loss.backward()
        optimizer.step()

        running_train_loss += loss.item()

    two_towers.eval()
    running_test_loss = 0.0
    for batch in test_loader:
        batch = move_batch_to_device(batch, device)

        pos_sim = two_towers(batch)
        user_embedding = two_towers.user_tower.get_embedding(batch)
        neg_embedding = two_towers.item_tower.get_embedding(batch['neg_items'][0])
        neg_sim = torch.sum(user_embedding * neg_embedding, dim=1)

        logits = torch.stack([pos_sim, neg_sim], dim=1) / TEMPERATURE
        labels = torch.zeros(logits.size(0), dtype=torch.long, device=device)
        loss = F.cross_entropy(logits, labels)

        running_test_loss += loss.item()

    avg_train_loss = running_train_loss / len(train_loader)
    avg_test_loss = running_test_loss / len(test_loader)
    print(f"Epoch {epoch}/{EPOCHS}, Average Training Loss: {avg_train_loss:.4f}, ")
    print(f"Epoch {epoch}/{EPOCHS}, Average Test Loss: {avg_test_loss:.4f}, ")

    if batch_idx % 10 == 0:
        torch.save(two_towers.state_dict(), f"{MODEL_SAVE_PATH}/two_towers_good_reads_{datetime.datetime.now().strftime("%Y%m%d%S")}.pt")


writer.close()
print("Training complete.")


Epoch 1/100, Average Training Loss: 6.1923, 
Epoch 1/100, Average Test Loss: 0.9723, 
Epoch 2/100, Average Training Loss: 3.5858, 
Epoch 2/100, Average Test Loss: 0.7826, 
Epoch 3/100, Average Training Loss: 2.9101, 
Epoch 3/100, Average Test Loss: 0.7363, 
Epoch 4/100, Average Training Loss: 2.6658, 
Epoch 4/100, Average Test Loss: 0.7360, 
Epoch 5/100, Average Training Loss: 2.5447, 
Epoch 5/100, Average Test Loss: 0.7345, 
Epoch 6/100, Average Training Loss: 2.4933, 
Epoch 6/100, Average Test Loss: 0.7210, 
Epoch 7/100, Average Training Loss: 2.4567, 
Epoch 7/100, Average Test Loss: 0.7081, 
Epoch 8/100, Average Training Loss: 2.3840, 
Epoch 8/100, Average Test Loss: 0.7093, 
Epoch 9/100, Average Training Loss: 2.3901, 
Epoch 9/100, Average Test Loss: 0.7125, 
Epoch 10/100, Average Training Loss: 2.3816, 
Epoch 10/100, Average Test Loss: 0.7129, 
Epoch 11/100, Average Training Loss: 2.3632, 
Epoch 11/100, Average Test Loss: 0.7158, 
Epoch 12/100, Average Training Loss: 2.3445, 
Epoc

In [None]:
torch.save(two_towers.state_dict(), f"{MODEL_SAVE_PATH}/two_towers_good_reads_1022.pt")



In [None]:
    # -----------------------
    # -- Recall@K Metric --
    #
    # Work in progress... Implement the recall@k metric to see out of the user's total ratings, how many of these appeared in the top-k.
    #

    # user_row = next(iter(train_loader))
    # user_vector = two_towers.user_tower.get_embedding(user_row)

    # scores = torch.matmul(item_vector, user_vector.T)
    # scores = scores.T
    #
    # top_k = 20
    # for i, user_id_encoded in enumerate(user_row['User-ID']):
    #     top_scores, top_indices = torch.topk(scores[i], top_k)
    #     user_id_decoded = dataset.reverse_encoders['User-ID'][user_id_encoded.item()]
    #     ground_truth_books = df[df['User-ID'] == user_id_decoded]['ISBN']

    #     print(f"Top k books predicted for user {user_id_decoded}")
    #     print(top_indices)
    #     print(f"Book ratings for user {user_id_decoded}")
    #     print(ground_truth_books)
    #
    # -----------------------

### Seeing what the model recommends to me after training

- It should have seen me somewhere in the training data and should have learned enough information from the other data to generalize over what I might like.
- I will pass my username and age into the User Tower. And then conduct a dot product between my vector and the matrix of learned item embeddings to get relevance scores.
- I will then conduct some semi-manual ranking based on removing what I have already read and other info.
- Then I will make the final 50 recommendations for me.

In [None]:
# Getting all item embeddings
entire_dataset = DataLoader(dataset, batch_size=1, shuffle=False)
all_item_embeddings = []
for batch in entire_dataset:
    pos_item = batch['pos_item']
    pos_item = {k: v.to(device) for k, v in pos_item.items()}
    item_embedding = two_towers.item_tower.get_embedding(pos_item)
    all_item_embeddings.append(item_embedding)
all_item_embeddings = torch.cat(all_item_embeddings, dim=0)

# Getting a single embedding for my learned user
paul_user_id = dataset.encoders['User-ID']['1234567890']
paul_age = dataset.scalers['Age'].transform([[23]])[0][0]
paul_batch = {
    'User-ID': torch.tensor([paul_user_id], dtype=torch.long, device=device),
    'User-Age': torch.tensor([paul_age], dtype=torch.float32, device=device)
}
paul_user_embedding = two_towers.user_tower.get_embedding(paul_batch) # [1 batch, 32 dimensions]

In [None]:
# Calculating similarity scores
similarity_scores = (paul_user_embedding * all_item_embeddings).sum(dim=1)
top_k = 100
top_scores, top_indices = torch.topk(similarity_scores, top_k)

read_isbns = personal_df['ISBN'].astype(str).to_list()
seen_titles = set()
unique_recommendations = []

for score, idx in zip(top_scores.detach().cpu().numpy(), top_indices.detach().cpu().numpy()):
    associated_item = dataset[idx]['pos_item']
    title_idx = int(associated_item['Book-Title'])
    author_idx = int(associated_item['Book-Author'])
    isbn_idx = int(associated_item['Book-ISBN'])

    title = dataset.reverse_encoders['Book-Title'][title_idx]
    author = dataset.reverse_encoders['Book-Author'][author_idx]
    isbn = dataset.reverse_encoders['ISBN'][isbn_idx]

    if title in seen_titles:
        continue

    if isbn in read_isbns:
        continue

    seen_titles.add(title)
    unique_recommendations.append({
        'title': title,
        'author': author,
        'score': score
    })

In [None]:
for rec in unique_recommendations[:50]:
    print(f"Title: {rec['title']}, Author: {rec['author']}, Score: {rec['score']:.4f}")