# Download Data

In [None]:
import kagglehub as kh

# Fetch the Amazon books reviews dataset through kagglehub. The returned path is used by the
# following cells as the root directory that contains the dataset's CSV files.
mohamedbakhet_amazon_books_reviews_path = kh.dataset_download('mohamedbakhet/amazon-books-reviews')
print(mohamedbakhet_amazon_books_reviews_path)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the downloaded dataset directory and print the full path of every file found in it.
dataset_file_paths = (
    os.path.join(root_dir, file_name)
    for root_dir, _subdirs, file_names in os.walk(mohamedbakhet_amazon_books_reviews_path)
    for file_name in file_names
)
for dataset_file_path in dataset_file_paths:
    print(dataset_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from time import time
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Exploratory Data Analysis

In [None]:
# Locations of the two CSV files inside the downloaded dataset directory:
# the per-review ratings table and the per-book metadata table.
books_rating_path = os.path.join(mohamedbakhet_amazon_books_reviews_path, "books_rating.csv")
books_data_path = os.path.join(mohamedbakhet_amazon_books_reviews_path, "books_data.csv")

The `Id` in the `books_rating` dataset is different from the ID used in the `books_data` dataset.

In [None]:
# Load the raw ratings table (first CSV row is the header) and keep it untouched as `*_raw`;
# the trailing bare expression renders the frame as the cell output.
books_rating_raw = pd.read_csv(books_rating_path, header=0)
books_rating_raw

The title is unique and can be used as item ID.

In [None]:
# Load the raw book metadata table (first CSV row is the header) and keep it untouched as `*_raw`;
# the trailing bare expression renders the frame as the cell output.
books_data_raw = pd.read_csv(books_data_path, header=0)
books_data_raw

# Preprocessing

Convert the original datasets into clean datasets: handle missing values and build the global lookup tables.



In [None]:
# Rename columns to remove special characters
# Rename the ratings columns to camelCase identifiers without the "/" separator,
# so they can be accessed as attributes (e.g. `books_rating.userId`).
# `rename` returns a new frame, so `books_rating_raw` is left unchanged.
books_rating = books_rating_raw.rename(columns={
    "review/helpfulness": "reviewHelpfulness", 
    "review/score": "reviewScore", 
    "review/time": "reviewTime", 
    "review/summary": "reviewSummary", 
    "review/text": "reviewText",
    "User_id": "userId",
    "Id": "id",
    "Title": "title",
    "Price": "price",
})

# Only `Title` is renamed here (to `title`, matching the ratings table);
# every other books_data column keeps its original name.
books_data = books_data_raw.rename(columns={
    "Title": "title"})

In [None]:
# Preprocessing
# Book metadata: pull the value of the `id=` query parameter out of infoLink (everything up to
# the next "&") and replace missing titles with the empty string.
books_data["bookId"] = books_data.infoLink.str.extract(r"id=([^&]+)", expand=False)
books_data["title"] = books_data.title.fillna("")

# Ratings: substitute placeholder IDs for missing values, one column at a time.
# Note that every review with a missing user ID ends up under the same placeholder user.
books_rating["userId"] = books_rating["userId"].fillna("00000000000000")  # unknown User_id
books_rating["id"] = books_rating["id"].fillna("0000000000")  # unknown book ID

# Train / Test Split

The train / test split needs to happen before negative sampling. Use leave-last-out for the test set.
* For users with 1 review, the only review is used as training data.
* For users with 2 reviews, the 2nd review is used for validation, and no test data is taken from this user.
* For users with 3 or more reviews, the second-to-last and last reviews are used for validation and testing respectively.

In [None]:
# Group by userId and add rownumber per row based on reviewTime on ascending order
# Number each user's reviews chronologically (1 = earliest). The sorted result keeps the original
# row index, so the assignment aligns every sequence number back to its own row.
books_rating['userRatingSeqNum'] = (
    books_rating
    .sort_values(['userId', 'reviewTime'])
    .groupby('userId')
    .cumcount()
    + 1
)

# Total number of reviews per user. 'size' counts every row, exactly like cumcount() above;
# 'count' would skip rows with a missing reviewTime and could leave the total smaller than the
# largest sequence number, so the user's last review would never be recognised as "last".
books_rating['userTotalRatings'] = books_rating.groupby('userId')['reviewTime'].transform('size')

# Invariant relied on by the train/validation/test split below.
assert (books_rating['userRatingSeqNum'] <= books_rating['userTotalRatings']).all(), \
    "sequence number exceeds the user's total number of ratings"

books_rating.head()

In [None]:
# Based on the train/validation/test split strategy decided, create a new column 'dataSplit' to indicate which split the row belongs to
def assign_data_split(row):
    """Return the split name ('train', 'validation' or 'test') for one rating row.

    Args:
    row - mapping with 'userTotalRatings' (number of ratings of the row's user) and
          'userRatingSeqNum' (1-based position of this rating among the user's ratings)

    Rules:
    * a user with a single rating contributes it to 'train'
    * a user with two ratings contributes the first to 'train' and the other to 'validation'
    * otherwise the last rating is 'test', the one before it 'validation', the rest 'train'
    """
    total = row['userTotalRatings']
    position = row['userRatingSeqNum']

    if total == 1:
        return 'train'
    if total == 2:
        return 'train' if position == 1 else 'validation'
    if position == total:
        return 'test'
    return 'validation' if position == total - 1 else 'train'

# Split the book_rating dataframe into train, validation, and test sets
# Assign every rating to a split. This is the vectorised equivalent of applying
# `assign_data_split` row by row (which is very slow on millions of rows); np.select takes the
# first matching condition, so the order below mirrors the if/elif chain of that function.
total_ratings = books_rating['userTotalRatings']
rating_seq_num = books_rating['userRatingSeqNum']

books_rating['dataSplit'] = np.select(
    [
        total_ratings == 1,                               # single review -> train
        (total_ratings == 2) & (rating_seq_num == 1),     # two reviews: first -> train
        total_ratings == 2,                               # two reviews: other -> validation
        rating_seq_num == total_ratings,                  # 3+ reviews: last -> test
        rating_seq_num == total_ratings - 1,              # 3+ reviews: second to last -> validation
    ],
    ['train', 'train', 'validation', 'test', 'validation'],
    default='train',
)

# Cross-check the vectorised rules against the row-wise reference implementation on a small sample.
split_check_sample = books_rating.head(1000)
if len(split_check_sample) > 0:
    assert (
        split_check_sample.apply(assign_data_split, axis=1) == split_check_sample['dataSplit']
    ).all(), "vectorised split disagrees with assign_data_split"

# Split the books_rating dataframe into train, validation, and test sets
books_rating_train = books_rating[books_rating['dataSplit'] == 'train']
books_rating_validation = books_rating[books_rating['dataSplit'] == 'validation']
books_rating_test = books_rating[books_rating['dataSplit'] == 'test']

# The three splits must partition the ratings exactly.
assert (
    len(books_rating_train) + len(books_rating_validation) + len(books_rating_test)
    == len(books_rating)
), "train/validation/test splits do not cover every rating exactly once"

# Print the sizes of each split
print(f"Training set size: {len(books_rating_train)}")
print(f"Validation set size: {len(books_rating_validation)}")
print(f"Test set size: {len(books_rating_test)}")

# Dataset

In [None]:
# Generate user2idx and item2idx mappings using the books_rating dataset
# Global lookup tables from raw IDs to contiguous integer indices.
# Raw IDs are de-duplicated and sorted first so that the indices are reproducible across runs.

# Users come from the ratings table.
raw_user_ids = sorted(set(books_rating.userId))
user2idx = dict(zip(raw_user_ids, range(len(raw_user_ids))))

# Items are identified by their title and come from the book metadata table.
raw_book_ids = sorted(set(books_data.title))
item2idx = dict(zip(raw_book_ids, range(len(raw_book_ids))))

In [None]:
# Create a dataset, a dataloader
class TwoTowersDataset(Dataset):
    """Interaction dataset for a two-towers model with random negative sampling.

    Each element corresponds to one row of the interaction table (one positive user/item pair)
    and is returned together with `random_negative_samples` items drawn uniformly, with
    replacement, from the catalog items the user has not interacted with in this table.

    The interaction table must provide the columns `userId` and `title`; the catalog table
    (only needed when `item2idx` is not given) must provide the column `title`.
    """

    def __init__(self, 
                 df_interaction=None, df_catalog=None,
                 user2idx=None, item2idx=None,
                 interaction_csv_path=None, interaction_header=0,
                 catalog_csv_path=None, catalog_header=0, 
                 random_negative_samples=0
    ):
        """
        Args:
        df_interaction - interaction dataframe; takes precedence over interaction_csv_path
        df_catalog - catalog dataframe; takes precedence over catalog_csv_path
        user2idx - optional mapping raw user ID -> index; built from the interactions if omitted
        item2idx - optional mapping raw item ID (title) -> index; built from the catalog if omitted
        interaction_csv_path - CSV file to read the interactions from
        interaction_header - `header` argument passed to pd.read_csv for the interactions
        catalog_csv_path - CSV file to read the catalog from
        catalog_header - `header` argument passed to pd.read_csv for the catalog
        random_negative_samples - number of negative items returned with every positive

        Raises:
        ValueError - if no interaction source is given, or if neither item2idx nor a catalog
                     source is given
        """
        super().__init__()
        self.random_negative_samples = random_negative_samples
        
        # Load interaction
        if df_interaction is not None:
            self.df_interaction = df_interaction.reset_index(drop=True)
        elif interaction_csv_path is not None:
            self.df_interaction = pd.read_csv(interaction_csv_path, header=interaction_header)
        else:
            raise ValueError("Either df_interaction or interaction_csv_path must be provided.")
        
        # Load catalog (optional when item2idx already defines the item universe)
        self.df_catalog = None
        if df_catalog is not None:
            self.df_catalog = df_catalog.reset_index(drop=True)
        elif catalog_csv_path is not None:
            self.df_catalog = pd.read_csv(catalog_csv_path, header=catalog_header)
        elif item2idx is None:
            raise ValueError("Either item2idx, df_catalog or catalog_csv_path must be provided.")
        
        # Item ID lookup
        if item2idx is not None:
            self.book2idx = item2idx
            self.idx2book = {idx: raw_book_id for raw_book_id, idx in item2idx.items()}
            self.books = list(item2idx)
        else:
            raw_book_ids = sorted(set(self.df_catalog["title"])) # sort raw ID for reproducibility
            self.books = list(raw_book_ids)
            self.book2idx = {raw_book_id: idx for idx, raw_book_id in enumerate(raw_book_ids)}
            self.idx2book = dict(enumerate(raw_book_ids))
        
        # User ID lookup
        if user2idx is not None:
            self.user2idx = user2idx
            self.idx2user = {idx: raw_user_id for raw_user_id, idx in user2idx.items()}
            self.users = list(user2idx)
        else:
            raw_user_ids = sorted(set(self.df_interaction.userId)) # sort raw ID for reproducibility
            # Plain list of raw IDs (not (index, ID) pairs), consistent with the user2idx branch.
            self.users = list(raw_user_ids)
            self.user2idx = {raw_user_id: idx for idx, raw_user_id in enumerate(raw_user_ids)}
            self.idx2user = dict(enumerate(raw_user_ids))

        # Precompute what every __getitem__ call needs, so that a single lookup does not have to
        # scan the whole interaction table or rebuild the catalog set:
        # the set of all candidate items and each user's positive items in this table.
        self._book_set = set(self.books)
        self._user_pos_items = (
            self.df_interaction.groupby("userId")["title"].apply(set).to_dict()
        )
        
    def __len__(self):
        """Number of positive interactions (rows of the interaction table)."""
        return len(self.df_interaction)

    def __getitem__(self, idx):
        """Return one positive interaction together with randomly sampled negatives.

        Args
        idx: row position of the positive interaction in the interaction table

        Returns
        dict with
        "user_ids" - scalar long tensor, index of the user
        "item_ids" - long tensor [K+1], the positive item followed by K sampled negatives
        "binary_scores" - float tensor [K+1], 1 for the positive item and 0 for the negatives

        Raw IDs missing from the lookup tables are mapped to index 0.

        Raises
        ValueError: if negatives are requested but the user has interacted with every item
        """
        # Get positive interaction
        pos_interaction = self.df_interaction.iloc[idx]
        user_id = pos_interaction["userId"]
        pos_item_id = pos_interaction["title"]
        all_pos_items = self._user_pos_items.get(user_id, set())

        # Random sample for negatives: uniform with replacement over the user's unseen items.
        neg_sample_items = []
        if self.random_negative_samples > 0:
            all_neg_items = list(self._book_set - all_pos_items)
            if not all_neg_items:
                raise ValueError(f"No negative items available for user {user_id!r}.")
            sampled_positions = np.random.randint(0, len(all_neg_items), self.random_negative_samples)
            neg_sample_items = [all_neg_items[position] for position in sampled_positions]

        # Return data item
        return {
            "user_ids": torch.tensor(self.user2idx.get(user_id, 0), dtype=torch.long), # 0 is unknown user ID
            "item_ids": torch.tensor(
                [self.book2idx.get(pos_item_id, 0)]
                + [self.book2idx.get(neg_item_id, 0) for neg_item_id in neg_sample_items],
                dtype=torch.long,
            ),
            "binary_scores": torch.tensor([1] + [0] * len(neg_sample_items), dtype=torch.float),
        }

In [None]:
# Every split shares the same global lookup tables so that indices agree across datasets.
shared_lookups = dict(user2idx=user2idx, item2idx=item2idx)

# Train and validation draw 10 random negatives per positive, the test split draws 100.
train_ds = TwoTowersDataset(df_interaction=books_rating_train, random_negative_samples=10, **shared_lookups)
validation_ds = TwoTowersDataset(df_interaction=books_rating_validation, random_negative_samples=10, **shared_lookups)
test_ds = TwoTowersDataset(df_interaction=books_rating_test, random_negative_samples=100, **shared_lookups)

# Report the size and the first sample of each dataset.
for split_label, split_ds in (("Train", train_ds), ("Validation", validation_ds), ("Test", test_ds)):
    print(f"{split_label} dataset size: {len(split_ds)}")
    print(f"{split_label} dataset sample:")
    print(split_ds[0])

# Two Towers Model

## User Tower

In [None]:
# Create user tower
class UserTower(nn.Module):
    def __init__(self, num_users, id_emb_dim, tower_emb_dim):
        """
        User tower that converts user features into a user embedding for dot product.

        Args:
        num_users - total users
        id_emb_dim - Dimension of user ID embedding
        tower_emb_dim - Dimension of user tower embedding
        """
        super().__init__()
        self.id_emb = nn.Embedding(num_users, id_emb_dim)
        self.mlp = nn.Sequential(
            nn.Linear(id_emb_dim, tower_emb_dim),
            nn.ReLU(),
            nn.Linear(tower_emb_dim, tower_emb_dim)
        )

    def forward(self, user_id):
        """Calculate the L2-normalised user tower embedding.

        Args:
        user_id - long tensor of user indices, shape [B]

        Returns:
        Float tensor [B, tower_emb_dim] whose rows have unit L2 norm (an all-zero row stays zero).
        """
        x = self.id_emb(user_id) # [B, id_emb_dim]
        x = self.mlp(x) # [B, tower_emb_dim]
        # L2-normalise so that a dot product between towers is a cosine similarity.
        # F.normalize clamps the norm from below, so an all-zero vector yields zeros
        # instead of the NaN produced by a plain division by the norm.
        return nn.functional.normalize(x, p=2, dim=-1)


# Instantiate a stand-alone user tower with one embedding row per known user;
# the trailing bare expression displays the module structure as the cell output.
user_id_emb_dim = 32
user_tower_emb_dim = 32
num_users = len(user2idx)

user_tower = UserTower(num_users, user_id_emb_dim, user_tower_emb_dim)
user_tower

## Item Tower

In [None]:
# Create item tower
class ItemTower(nn.Module):
    def __init__(self, num_items, id_emb_dim, tower_emb_dim):
        """
        Item tower that converts item features into an item embedding for dot product.

        Args:
        num_items - total items
        id_emb_dim - Dimension of item ID embedding
        tower_emb_dim - Dimension of item tower embedding
        """
        super().__init__()
        self.id_emb = nn.Embedding(num_items, id_emb_dim) # [num_items, id_emb_dim]
        self.mlp = nn.Sequential(
            nn.Linear(id_emb_dim, tower_emb_dim),
            nn.ReLU(),
            nn.Linear(tower_emb_dim, tower_emb_dim)
        )

    def forward(self, item_id):
        """Calculate the L2-normalised item tower embedding.

        Args:
        item_id - long tensor of item indices, shape [B]

        Returns:
        Float tensor [B, tower_emb_dim] whose rows have unit L2 norm (an all-zero row stays zero).
        """
        x = self.id_emb(item_id)  # [B, id_emb_dim]
        x = self.mlp(x) # [B, tower_emb_dim]
        # L2-normalise so that a dot product between towers is a cosine similarity.
        # F.normalize clamps the norm from below, so an all-zero vector yields zeros
        # instead of the NaN produced by a plain division by the norm.
        return nn.functional.normalize(x, p=2, dim=-1) # [B, tower_emb_dim]

# Instantiate a stand-alone item tower with one embedding row per known item;
# the trailing bare expression displays the module structure as the cell output.
num_items = len(item2idx)
id_emb_dim = 32
tower_emb_dim = 32

item_tower = ItemTower(num_items, id_emb_dim, tower_emb_dim)
item_tower

## Two-towers model

In [None]:
# Create two-towers model

class TwoTowersModel(nn.Module):
    """Two-towers model scoring user/item pairs with the sigmoid of their cosine similarity."""

    def __init__(self, **kwargs):
        """
        Keyword Args:
        num_items - number of rows of the item ID embedding (default 0)
        item_id_emb_dim - dimension of the item ID embedding (default 32)
        item_tower_emb_dim - dimension of the item tower output (default 32)
        num_users - number of rows of the user ID embedding (default 0)
        user_id_emb_dim - dimension of the user ID embedding (default 32)
        user_tower_emb_dim - dimension of the user tower output (default 32)
        temperature - positive divisor applied to the cosine similarity before the sigmoid
                      (default 1.0, i.e. no scaling). With 1.0 the output is confined to
                      roughly [0.27, 0.73] because the cosine lies in [-1, 1]; a value
                      below 1 widens that range.

        Raises:
        ValueError - if the two tower output dimensions differ or temperature is not positive
        """
        super().__init__()
        item_tower_emb_dim = kwargs.get("item_tower_emb_dim", 32)
        user_tower_emb_dim = kwargs.get("user_tower_emb_dim", 32)
        if item_tower_emb_dim != user_tower_emb_dim:
            # The towers are combined by an element-wise product, so their outputs must match.
            raise ValueError(
                "item_tower_emb_dim and user_tower_emb_dim must be equal, "
                f"got {item_tower_emb_dim} and {user_tower_emb_dim}."
            )
        temperature = kwargs.get("temperature", 1.0)
        if temperature <= 0:
            raise ValueError(f"temperature must be positive, got {temperature}.")
        self.temperature = temperature

        # Item tower dimension [item_id_emb_dim, item_tower_emb_dim]
        self.item_tower = ItemTower(
            num_items = kwargs.get("num_items", 0),
            id_emb_dim = kwargs.get("item_id_emb_dim", 32),
            tower_emb_dim = item_tower_emb_dim,
        )
        # User tower dimension [user_id_emb_dim, user_tower_emb_dim]
        self.user_tower = UserTower(
            num_users = kwargs.get("num_users", 0),
            id_emb_dim = kwargs.get("user_id_emb_dim", 32),
            tower_emb_dim = user_tower_emb_dim,
        )

    def forward(self, user_id, item_id):
        """Because item_emb and user_emb are normalised in the tower. The dot product here
        is then the cosine similarity, and its value is between [-1, 1].

        Args
        user_id - User IDs, [B,]
        item_id - Item IDs, [B,]

        Returns
        Scores in (0, 1), shape [B,] (also for B == 1), suitable for BCE loss.
        """
        item_emb = self.item_tower(item_id) # [B, item_tower_dim]. Normalised into Cosine similarity 
        user_emb = self.user_tower(user_id) # [B, user_tower_dim]. Normalised into Cosine similarity
        # Reduce over the embedding axis only. A bare squeeze() would also drop the batch axis
        # when B == 1 and produce a 0-dim tensor that no longer matches [B]-shaped labels.
        cosine = (item_emb * user_emb).sum(dim=-1) # [B]. Elements are between -1 and 1.
        return torch.sigmoid(cosine / self.temperature) # Output in (0, 1) for BCE

In [None]:
# Smoke test: score a single (user index 0, item index 0) pair with a freshly initialised,
# untrained model to check that the forward pass runs end to end.
test_item_ids = torch.tensor([0], dtype=torch.long)
test_user_ids = torch.tensor([0], dtype=torch.long)

num_items = len(item2idx)
num_users = len(user2idx)

# All embedding/tower dimensions are left at the model's defaults.
test_model = TwoTowersModel(
    num_items=num_items, 
    num_users=num_users
)
test_model(user_id=test_user_ids, item_id=test_item_ids)

# Model Train

Train the two-towers model and monitor the training and validation loss over the epochs.
* Start with Binary Cross Entropy loss
* Switch to Negative Contrastive Ex loss

In [None]:
# Training configuration
EPOCH = 1            # number of passes over the training set
BATCH_SIZE = 256     # positive interactions per batch (each one comes with its sampled negatives)
MAX_BATCHES = None   # cap on training batches per epoch for quick experiments; None = no cap
LOG_INTERVAL = 1     # log the training loss every LOG_INTERVAL batches
SEED = 42            # seed for negative sampling, shuffling and weight initialisation

# Make the run reproducible: the datasets draw negatives from NumPy's global generator,
# while DataLoader shuffling and model initialisation use torch's global generator.
np.random.seed(SEED)
torch.manual_seed(SEED)

# Create dataloader
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
validation_dl = DataLoader(validation_ds, batch_size=BATCH_SIZE, shuffle=False)

# Build model
num_items = len(item2idx)
num_users = len(user2idx)
model = TwoTowersModel(
    num_items=num_items,
    num_users=num_users
)

# Define loss function (the model already outputs probabilities, hence BCELoss and not BCEWithLogitsLoss)
loss_fn = nn.BCELoss()

# Define optimizer
optimizer = torch.optim.Adam(model.parameters())

In [None]:
# Train with binary cross entropy on (user, item) pairs: every positive interaction of a batch
# is scored together with the random negatives sampled for it by the dataset.

def _flatten_batch(batch):
    """Expand a batch into aligned 1-D tensors with one entry per (user, item) pair.

    Args:
    batch - dict with "user_ids" [B], "item_ids" [B, K+1] and "binary_scores" [B, K+1],
            where K is the number of negative samples and 1 is the positive interaction

    Returns:
    (user_ids_flat, item_ids_flat, labels_flat), each of shape [B*(K+1)]
    """
    item_ids = batch["item_ids"] # [B, K+1]
    labels = batch["binary_scores"] # [B, K+1]
    # Repeat every user index once per candidate item of its row.
    user_ids_exp = batch["user_ids"].unsqueeze(1).expand(-1, item_ids.shape[1]) # [B, K+1]
    return user_ids_exp.reshape(-1), item_ids.reshape(-1), labels.reshape(-1)


for epoch in range(EPOCH):

    # -------------------------------------------------------
    # Train
    # -------------------------------------------------------
    model.train()
    train_loss = 0.0
    num_train_batches = 0 # batches actually processed; differs from len(train_dl) when MAX_BATCHES is set
    for i, batch in enumerate(train_dl):
        if MAX_BATCHES is not None and i >= MAX_BATCHES:
            break
        batch_start_time = time()
        user_ids_flat, item_ids_flat, labels_flat = _flatten_batch(batch)
        # Forward Pass (the model outputs probabilities, not logits)
        probs = model(user_ids_flat, item_ids_flat)
        batch_loss = loss_fn(probs, labels_flat)
        # Backprop
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        train_loss += batch_loss.item()
        num_train_batches += 1
        elapsed_time = time() - batch_start_time

        # -------------------------------------------------------
        # Monitoring
        # -------------------------------------------------------
        if (i+1) % LOG_INTERVAL == 0:
            print(f"epoch {epoch+1}, batch {i+1}, "
                  f"train loss: {batch_loss.item():.4f}, "
                  f"user_ids: {batch['user_ids'].shape}, "
                  f"item_ids: {batch['item_ids'].shape}, "
                  f"labels: {batch['binary_scores'].shape}, "
                  f"elapsed time: {elapsed_time:.1f}s"
            )

    # -------------------------------------------------------
    # Validation
    # -------------------------------------------------------
    model.eval()
    val_loss = 0.0
    num_val_batches = 0
    with torch.no_grad():
        for val_batch in validation_dl:
            val_user_ids_flat, val_item_ids_flat, val_labels_flat = _flatten_batch(val_batch)
            val_probs = model(val_user_ids_flat, val_item_ids_flat)
            val_loss += loss_fn(val_probs, val_labels_flat).item()
            num_val_batches += 1

    # Average over the batches that were actually processed; max(..., 1) avoids a division by
    # zero when a loader is empty or MAX_BATCHES is 0.
    print(f"epoch {epoch+1} completed. "
          f"Average train loss: {train_loss / max(num_train_batches, 1):.4f}, "
          f"Average validation loss: {val_loss / max(num_val_batches, 1):.4f}"
    )
        

# Evaluation
Calculate offline performance metrics on the train, validation and test sets:
* Hit@K
* Recall@K
* Normalised Discounted Cumulative Gain (NDCG@K)
* Mean Reciprocal Rank@K