In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Exploratory Data Analysis

In [None]:
# Both CSV files live in the same Kaggle dataset folder, so build their paths from one root.
_books_dataset_dir = "/kaggle/input/amazon-books-reviews"
books_rating_path = _books_dataset_dir + "/Books_rating.csv"
books_data_path = _books_dataset_dir + "/books_data.csv"

In [None]:
# Load the raw interaction table; the first CSV row holds the column names.
book_rating = pd.read_csv(books_rating_path, header=0)
# Preview a handful of rows instead of rendering the whole frame in the notebook output.
book_rating.head()

In [None]:
# Load the raw book catalog; the first CSV row holds the column names.
books_data = pd.read_csv(books_data_path, header=0)
# Preview a handful of rows instead of rendering the whole frame in the notebook output.
books_data.head()

# Preprocessing

Convert the original dataset into a clean dataset: handle missing values and build the global ID lookup tables.



In [None]:
# TODO: build the cleaned catalog / global lookup tables here, e.g. all_books = books_data

# Train / Test Split

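The train / test split needs to happen before negative sampling, so that negatives are generated separately for each split. Use a leave-last-out split: hold out each user's last interaction as the test set.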

# Dataset

In [None]:
# Create a dataset, a dataloader
class TwoTowersDataset(Dataset):
    """Interaction dataset for a two-tower recommender with random negative sampling.

    Every sample is one interaction row (the positive) plus
    ``random_negative_samples`` books drawn uniformly, with replacement, from the
    catalog titles the same user never interacted with.

    Columns read from the inputs:
        interactions - ``User_id``, ``Id`` and ``Title``
        catalog      - ``infoLink`` and ``Title``

    Books are keyed by ``Title`` in ``book2idx`` / ``idx2book``. Index 0 is also used
    as the fallback for interaction titles that are missing from the catalog.

    The frames passed in are never modified; cleaned copies are stored on the
    instance as ``df_interaction`` and ``df_catalog`` (the latter indexed by ``bookId``).
    """

    # Placeholders substituted for missing raw identifiers in the interaction table.
    UNKNOWN_USER_ID = "00000000000000"
    UNKNOWN_BOOK_ID = "0000000000"
    # Maximum number of rejection-sampling rounds before switching to exact sampling.
    MAX_SAMPLING_ATTEMPTS = 1000

    def __init__(self,
                 df_interaction=None, df_catalog=None,
                 interaction_csv_path=None, interaction_header=0,
                 catalog_csv_path=None, catalog_header=0,
                 random_negative_samples=0
    ):
        """Load, clean and index the interaction and catalog tables.

        Args:
        df_interaction - Interaction DataFrame; takes precedence over interaction_csv_path
        df_catalog - Catalog DataFrame; takes precedence over catalog_csv_path
        interaction_csv_path - CSV path (or file-like object) read when df_interaction is None
        interaction_header - ``header`` argument forwarded to pd.read_csv for the interactions
        catalog_csv_path - CSV path (or file-like object) read when df_catalog is None
        catalog_header - ``header`` argument forwarded to pd.read_csv for the catalog
        random_negative_samples - Number of random negatives attached to every positive

        Raises:
        ValueError - if a table has no source, or random_negative_samples is negative
        """
        super().__init__()
        if random_negative_samples < 0:
            raise ValueError("random_negative_samples must be >= 0")
        self.random_negative_samples = random_negative_samples

        # Load interaction
        if df_interaction is not None:
            interactions = df_interaction
        elif interaction_csv_path is not None:
            interactions = pd.read_csv(interaction_csv_path, header=interaction_header)
        else:
            raise ValueError("Provide either df_interaction or interaction_csv_path")

        # Load catalog
        if df_catalog is not None:
            catalog = df_catalog
        elif catalog_csv_path is not None:
            catalog = pd.read_csv(catalog_csv_path, header=catalog_header)
        else:
            raise ValueError("Provide either df_catalog or catalog_csv_path")

        # Preprocessing: derive bookId from the "id=" query parameter of infoLink and
        # blank out missing titles. assign() returns a new frame, so the caller's
        # catalog is left untouched and re-running the cell stays idempotent.
        catalog = catalog.assign(
            bookId=catalog["infoLink"].str.extract(r"id=([^&]+)", expand=False),
            Title=catalog["Title"].fillna(""),
        )

        # Fillna: placeholders for unknown user / book IDs (returns a new frame).
        self.df_interaction = interactions.fillna(
            {"User_id": self.UNKNOWN_USER_ID, "Id": self.UNKNOWN_BOOK_ID}
        )

        # Item ID lookup
        raw_book_ids = sorted(set(catalog["Title"]))  # sort raw ID for reproducibility
        self.books = list(raw_book_ids)
        self.book2idx = {raw_book_id: idx for idx, raw_book_id in enumerate(raw_book_ids)}
        self.idx2book = {idx: raw_book_id for idx, raw_book_id in enumerate(raw_book_ids)}

        # User ID lookup
        raw_user_ids = sorted(set(self.df_interaction["User_id"]))  # sort raw ID for reproducibility
        self.users = list(raw_user_ids)
        self.user2idx = {raw_user_id: idx for idx, raw_user_id in enumerate(raw_user_ids)}
        self.idx2user = {idx: raw_user_id for idx, raw_user_id in enumerate(raw_user_ids)}

        # Set bookId as index of catalog
        self.df_catalog = catalog.set_index("bookId")

        # Encode every interaction once so __getitem__ never has to scan the frame,
        # and collect each user's positive catalog items for negative sampling.
        user_indices = []
        item_indices = []
        positives_by_user = {}
        for raw_user_id, raw_title in zip(self.df_interaction["User_id"], self.df_interaction["Title"]):
            user_idx = self.user2idx.get(raw_user_id, 0)  # 0 is unknown user ID
            item_idx = self.book2idx.get(raw_title)
            if item_idx is None:
                # Title absent from the catalog: fall back to index 0 and do not
                # exclude anything from this user's negative pool because of it.
                item_idx = 0
            else:
                positives_by_user.setdefault(user_idx, set()).add(item_idx)
            user_indices.append(user_idx)
            item_indices.append(item_idx)
        self._user_indices = np.asarray(user_indices, dtype=np.int64)
        self._item_indices = np.asarray(item_indices, dtype=np.int64)
        self._positives_by_user = positives_by_user

    def __len__(self):
        """Number of positive interactions (rows of the interaction table)."""
        return len(self.df_interaction)

    def _sample_negative_items(self, positive_item_idxs):
        """Draw ``random_negative_samples`` item indices outside ``positive_item_idxs``.

        Sampling is uniform with replacement over the catalog items the user has not
        interacted with. Rejection sampling is tried first; if it does not fill the
        quota within MAX_SAMPLING_ATTEMPTS rounds, the complement is built explicitly.

        Raises:
        ValueError - if the user has interacted with every catalog item
        """
        wanted = self.random_negative_samples
        negatives = []
        if wanted == 0:
            return negatives

        num_books = len(self.books)
        if len(positive_item_idxs) >= num_books:
            raise ValueError("No negative items available for this user")

        for _ in range(self.MAX_SAMPLING_ATTEMPTS):
            missing = wanted - len(negatives)
            if missing == 0:
                break
            candidates = np.random.randint(0, num_books, size=missing).tolist()
            negatives.extend(c for c in candidates if c not in positive_item_idxs)

        missing = wanted - len(negatives)
        if missing > 0:
            # Extremely dense user: sample exactly from the explicit complement.
            pool = np.setdiff1d(
                np.arange(num_books),
                np.fromiter(positive_item_idxs, dtype=np.int64),
            )
            negatives.extend(pool[np.random.randint(0, pool.size, size=missing)].tolist())
        return negatives

    def __getitem__(self, idx):
        """Return one positive interaction together with its random negatives.

        Args
        idx: position of the interaction row (0 <= idx < len(self))

        Returns
        dict with
            "user_ids" - scalar long tensor, the user index
            "item_ids" - long tensor [K+1]; the positive item first, then K negatives
            "binary_scores" - float tensor [K+1]; 1 for the positive, 0 for negatives
        """
        user_idx = int(self._user_indices[idx])
        pos_item_idx = int(self._item_indices[idx])
        neg_item_idxs = self._sample_negative_items(
            self._positives_by_user.get(user_idx, frozenset())
        )
        # Return data item
        return {
            "user_ids": torch.tensor(user_idx, dtype=torch.long),
            "item_ids": torch.tensor([pos_item_idx] + neg_item_idxs, dtype=torch.long),
            "binary_scores": torch.tensor([1.0] + [0.0] * len(neg_item_idxs), dtype=torch.float),
        }

In [None]:
# Build the dataset straight from the in-memory frames, attaching ten random
# negatives to every positive interaction.
books_rating_ds = TwoTowersDataset(df_interaction=book_rating, df_catalog=books_data, random_negative_samples=10)

# Sanity check: fetch the first sample.
books_rating_ds[0]

# Two Towers Model

## User Tower

In [None]:
# Create user tower
class UserTower(nn.Module):
    """User tower: maps user indices to unit-length embeddings."""

    def __init__(self, num_users, id_emb_dim, tower_emb_dim):
        """
        User tower that converts user features into a user embedding for dot product.

        Args:
        num_users - total users
        id_emb_dim - Dimension of user ID embedding
        tower_emb_dim - Dimension of user tower embedding
        """
        super().__init__()
        self.id_emb = nn.Embedding(num_users, id_emb_dim)
        self.mlp = nn.Sequential(
            nn.Linear(id_emb_dim, tower_emb_dim),
            nn.ReLU(),
            nn.Linear(tower_emb_dim, tower_emb_dim)
        )

    def forward(self, user_id):
        """Compute the L2-normalised user embedding.

        Args:
        user_id - long tensor of user indices, [B]

        Returns:
        float tensor [B, tower_emb_dim]; every non-zero row has unit L2 norm
        """
        x = self.id_emb(user_id) # [B, id_emb_dim]
        x = self.mlp(x) # [B, tower_emb_dim]
        # L2-normalise so that a dot product with another unit vector is a cosine
        # similarity. normalize() clamps the norm with a small eps, so an all-zero
        # MLP output yields zeros instead of the NaNs produced by a plain x / x.norm().
        x = nn.functional.normalize(x, p=2.0, dim=-1)
        return x


# Size the ID-embedding table from the dataset's user lookup; both widths are 32.
num_users = len(books_rating_ds.user2idx)
user_id_emb_dim = user_tower_emb_dim = 32

user_tower = UserTower(
    num_users=num_users,
    id_emb_dim=user_id_emb_dim,
    tower_emb_dim=user_tower_emb_dim,
)
user_tower

## Item Tower

In [None]:
# Create item tower
class ItemTower(nn.Module):
    """Item tower: maps item indices to unit-length embeddings."""

    def __init__(self, num_items, id_emb_dim, tower_emb_dim):
        """
        Item tower that converts item features into an item embedding for dot product.

        Args:
        num_items - total items
        id_emb_dim - Dimension of item ID embedding
        tower_emb_dim - Dimension of item tower embedding
        """
        super().__init__()
        self.id_emb = nn.Embedding(num_items, id_emb_dim) # [num_items, id_emb_dim]
        self.mlp = nn.Sequential(
            nn.Linear(id_emb_dim, tower_emb_dim),
            nn.ReLU(),
            nn.Linear(tower_emb_dim, tower_emb_dim)
        )

    def forward(self, item_id):
        """Calculate item tower embedding.

        Args:
        item_id - long tensor of item indices, [B]

        Returns:
        float tensor [B, tower_emb_dim]; every non-zero row has unit L2 norm
        """
        x = self.id_emb(item_id)  # [B, id_emb_dim]
        x = self.mlp(x) # [B, tower_emb_dim]
        # normalize() clamps the norm with a small eps, so an all-zero MLP output
        # yields zeros instead of the NaNs produced by a plain x / x.norm().
        x = nn.functional.normalize(x, p=2.0, dim=-1) # [B, tower_emb_dim]
        return x

# Size the ID-embedding table from the dataset's book lookup; both widths are 32.
num_items = len(books_rating_ds.book2idx)
id_emb_dim = tower_emb_dim = 32

item_tower = ItemTower(
    num_items=num_items,
    id_emb_dim=id_emb_dim,
    tower_emb_dim=tower_emb_dim,
)
item_tower

## Two-towers model

In [None]:
# Create two-towers model

class TwoTowersModel(nn.Module):
    """Two-tower model scoring (user, item) pairs by the cosine of their embeddings.

    Keyword Args:
    num_items - size of the item vocabulary (default 0)
    item_id_emb_dim - item ID embedding width (default 32)
    item_tower_emb_dim - item tower output width (default 32)
    num_users - size of the user vocabulary (default 0)
    user_id_emb_dim - user ID embedding width (default 32)
    user_tower_emb_dim - user tower output width (default 32)
    temperature - positive divisor applied to the cosine before the sigmoid
                  (default 1.0, i.e. the plain sigmoid of the cosine)

    Raises:
    ValueError - if the two tower output widths differ or temperature is not positive
    """

    def __init__(self, **kwargs):
        super().__init__()
        item_tower_emb_dim = kwargs.get("item_tower_emb_dim", 32)
        user_tower_emb_dim = kwargs.get("user_tower_emb_dim", 32)
        # The towers are combined with an element-wise product, so their widths must
        # agree; a width of 1 on one side would otherwise broadcast silently.
        if item_tower_emb_dim != user_tower_emb_dim:
            raise ValueError("item_tower_emb_dim and user_tower_emb_dim must be equal")

        temperature = float(kwargs.get("temperature", 1.0))
        if temperature <= 0:
            raise ValueError("temperature must be > 0")
        self.temperature = temperature

        # Item tower dimension [item_id_emb_dim, item_tower_emb_dim]
        self.item_tower = ItemTower(
            num_items = kwargs.get("num_items", 0),
            id_emb_dim = kwargs.get("item_id_emb_dim", 32),
            tower_emb_dim = item_tower_emb_dim,
        )
        # User tower dimension [user_id_emb_dim, user_tower_emb_dim]
        self.user_tower = UserTower(
            num_users = kwargs.get("num_users", 0),
            id_emb_dim = kwargs.get("user_id_emb_dim", 32),
            tower_emb_dim = user_tower_emb_dim,
        )

    def forward(self, user_id, item_id):
        """Because item_emb and user_emb are normalised in the tower. The dot product here
        is then the cosine similarity, and its value is between [-1, 1].

        Args
        user_id - User IDs, [B,]
        item_id - Item IDs, [B,]

        Returns
        Probabilities in (0, 1), [B,]. With temperature 1.0 the sigmoid of a value in
        [-1, 1] stays within roughly [0.27, 0.73]; a smaller temperature widens the range.
        """
        item_emb = self.item_tower(item_id) # [B, item_tower_dim]. Normalised into Cosine similarity
        user_emb = self.user_tower(user_id) # [B, user_tower_dim]. Normalised into Cosine similarity
        # Reduce only the embedding axis. A bare squeeze() would also drop the batch
        # axis when B == 1 and return a 0-dim tensor that no longer matches the labels.
        x = (item_emb * user_emb).sum(dim=-1) # [B]. x elements are between -1 and 1.
        x = torch.sigmoid(x / self.temperature) # Output in [0, 1] for BCE
        return x

In [None]:
# Smoke test: score user 0 against item 0 with a freshly initialised model.
test_user_ids = torch.zeros(1, dtype=torch.long)
test_item_ids = torch.zeros(1, dtype=torch.long)

test_model = TwoTowersModel(num_users=len(books_rating_ds.users), num_items=len(books_rating_ds.books))
test_model(user_id=test_user_ids, item_id=test_item_ids)

# Model Train

Train the two-towers model and monitor the loss over the training epochs.
* Start with Binary Cross Entropy loss
* Switch to a Noise Contrastive Estimation (NCE) loss

In [None]:
from time import time

In [None]:
EPOCH = 1           # number of passes over the dataset
BATCH_SIZE = 32     # positive interactions per batch
MAX_BATCHES = None  # set to an int to stop each epoch early; None runs the full epoch
LOG_INTERVAL = 10   # print the training loss every LOG_INTERVAL batches
SEED = 42

# Seed NumPy (random negative sampling in the dataset) and PyTorch (weight
# initialisation and DataLoader shuffling) so that a re-run reproduces the same loss curve.
np.random.seed(SEED)
torch.manual_seed(SEED)

dataloader = DataLoader(books_rating_ds, batch_size=BATCH_SIZE, shuffle=True)
model = TwoTowersModel(    
    num_items=len(books_rating_ds.books),
    num_users=len(books_rating_ds.users)
)
# BCELoss expects probabilities, which is what TwoTowersModel.forward returns.
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters())

In [None]:
# TODO: create an in-batch negative dataset that generates negative training samples from other users' positive items.
# The negative sampling dataset needs to be created separately for the training set and the test set.

# Training loop: one optimizer step per batch, with a loss printout every LOG_INTERVAL batches.
for epoch in range(EPOCH):
    for i, batch in enumerate(dataloader):
        batch_start_time = time()
        # Optional early stop so a quick run does not have to finish the whole epoch.
        if MAX_BATCHES is not None and i >= MAX_BATCHES:
            break
        
        # -------------------------------------------------------
        # Training
        # -------------------------------------------------------
        # Prepare Two-towers input
        item_ids = batch["item_ids"] # [B, K+1], K is number of negative samples, 1 is positive interaction
        labels = batch["binary_scores"] # [B, K+1]
        user_ids = batch["user_ids"] # [B, ]
        # Repeat each user ID once per candidate item so users and items pair up element-wise.
        user_ids_exp = user_ids.unsqueeze(1).expand(-1, item_ids.shape[1]) # [B, K+1]
        # Flatten 
        user_ids_flat = user_ids_exp.reshape(-1) # [B*(K+1),]
        item_ids_flat = item_ids.reshape(-1) # [B*(K+1),]
        labels_flat = labels.reshape(-1) # [B*(K+1),]
        # Forward Pass
        # NOTE: despite its name, `logit` holds the model's sigmoid outputs (probabilities),
        # which is the input nn.BCELoss expects.
        logit = model(user_ids_flat, item_ids_flat) # forward pass
        loss = loss_fn(logit, labels_flat)
        # Backprop
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        batch_finish_time = time()
        # Wall-clock time of this batch only (forward + backward + optimizer step).
        elapsed_time = batch_finish_time - batch_start_time
        
        # -------------------------------------------------------
        # Monitoring
        # -------------------------------------------------------
        # The loss and elapsed time reported are those of the current batch, not running averages.
        if (i+1) % LOG_INTERVAL == 0:
            print(f"epoch {epoch+1}, batch {i+1}, "
                  f"train loss: {loss.item()}, "
                  f"user_ids: {user_ids.shape}, "
                  f"item_ids: {item_ids.shape}, "
                  f"labels: {labels.shape}, "
                  f"elapsed time: {elapsed_time:.1f}s"
            )
        

In [None]:
# Calculate offline performance metrics on the train and test set
# Hit@K
# Recall@K
# Normalised Discounted Cumulative Gain@K (NDCG@K)
# Mean Reciprocal Rank@K