In [None]:
#!pip install catalyst recbole
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from typing import List
from tqdm import tqdm
from zipfile import ZipFile

from typing import Dict, List, Tuple

import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.nn.init import constant_, xavier_normal_
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader, Dataset

from catalyst import dl, metrics
from catalyst.contrib.datasets import MovieLens
from catalyst.utils import get_device, set_global_seed
from torch.nn.utils.rnn import pad_sequence

from torch import nn
from recbole.model.abstract_recommender import SequentialRecommender
from recbole.model.layers import *
import catalyst
import random

# Fix the global random seed through catalyst's helper so that the random user
# sampling and the stochastic item masking further down are repeatable.
# NOTE(review): presumably this seeds `random`, NumPy and PyTorch — confirm in the catalyst docs.
set_global_seed(42)
# Device detected by catalyst. In the visible cells this value is only printed;
# training builds its own engine explicitly.
device = get_device()
print(device)


%matplotlib inline

cuda


In [None]:
#!pip install -U catalyst

In [None]:
# Display the installed catalyst version for reproducibility.
catalyst.__version__

'22.04'

In [None]:
# Location of the zipped dataset (Colab path).
PATH_TO_DATA = "/content/data_kion.zip"

# Read the interactions table straight out of the archive without extracting it to disk.
with ZipFile(PATH_TO_DATA) as archive, archive.open("data_kion/interactions_df.csv") as csv_file:
    df = pd.read_csv(csv_file)



In [None]:
# Preview the first rows of the raw interactions table.
df.head()

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [None]:
# Day of week of each interaction as an integer (pandas convention: Monday=0 ... Sunday=6).
df['weekday'] = pd.to_datetime(df['last_watch_dt']).dt.dayofweek

In [None]:
# Chronological split on the date strings (they are compared as plain strings):
# before 2021-08-08 -> train, the following week -> validation, the rest -> test.
train_df = df[df['last_watch_dt'] < '2021-08-08'].copy()
valid_df = df[(df['last_watch_dt'] >= '2021-08-08') & (df['last_watch_dt'] < '2021-08-15')].copy()
test_df = df[df['last_watch_dt'] >= '2021-08-15'].copy()

train_users, valid_users, test_users = (
    part['user_id'].unique() for part in (train_df, valid_df, test_df)
)

# Keep only users that appear in all three periods.
all_included = np.intersect1d(train_users, np.intersect1d(valid_users, test_users))

print('number of users which are included both in train/test data: ', all_included.shape[0])

number of users which are included both in train/test data:  48728


In [None]:
n_users = 5000

# Draw a random subset of the eligible users (without replacement).
# Note: this rebinds `all_included`, so re-running the cell samples from the previous sample.
all_included = np.random.choice(all_included, size=n_users, replace=False)

train_df = train_df[train_df['user_id'].isin(all_included)].copy()
valid_df = valid_df[valid_df['user_id'].isin(all_included)].copy()
test_df = test_df[test_df['user_id'].isin(all_included)].copy()

# Sanity checks: the periods must not overlap in time ...
assert train_df.last_watch_dt.max() < valid_df.last_watch_dt.min()
assert valid_df.last_watch_dt.max() < test_df.last_watch_dt.min()
# ... and every sampled user must be present in each period.
assert train_df.user_id.nunique() == n_users
assert valid_df.user_id.nunique() == n_users
assert test_df.user_id.nunique() == n_users

In [None]:
def group_interactions(frame, column_name):
    """Collapse an interactions frame into one row per user.

    Args:
        frame: DataFrame with ``user_id``, ``item_id``, ``last_watch_dt`` and ``weekday`` columns.
        column_name: name of the output column holding each user's history.

    Returns:
        DataFrame with columns ``user_id`` and ``column_name``; each history is a list of
        ``(item_id, last_watch_dt, weekday)`` tuples sorted by date (stable for equal dates).
    """
    def to_sorted_triples(user_rows):
        triples = zip(user_rows.item_id, user_rows.last_watch_dt, user_rows.weekday)
        return sorted(triples, key=lambda triple: triple[1])

    grouped = frame.groupby('user_id').apply(to_sorted_triples).reset_index()
    # The unnamed Series produced by `apply` becomes column 0 after reset_index.
    return grouped.rename({0: column_name}, axis=1)


# The same transformation is applied to every period instead of three copy-pasted cells.
train_grouped = group_interactions(train_df, 'train_interactions')
valid_grouped = group_interactions(valid_df, 'valid_interactions')
test_grouped = group_interactions(test_df, 'test_interactions')


train_grouped.head()

Unnamed: 0,user_id,train_interactions
0,98,"[(2424, 2021-03-28, 6), (8314, 2021-04-19, 0),..."
1,1052,"[(10440, 2021-07-31, 5), (15297, 2021-08-01, 6..."
2,1309,"[(7793, 2021-07-31, 5), (3784, 2021-08-03, 1),..."
3,1330,"[(10440, 2021-07-29, 3), (11863, 2021-07-30, 4..."
4,1489,"[(8522, 2021-03-16, 1), (172, 2021-03-17, 2), ..."


In [None]:
# One row per user with the train / valid / test histories side by side
# (inner join on the only shared column, `user_id`).
joined = (
    train_grouped
    .merge(valid_grouped, on='user_id')
    .merge(test_grouped, on='user_id')
)
joined.head()

Unnamed: 0,user_id,train_interactions,valid_interactions,test_interactions
0,98,"[(2424, 2021-03-28, 6), (8314, 2021-04-19, 0),...","[(5051, 2021-08-09, 0), (15600, 2021-08-10, 1)]","[(12360, 2021-08-15, 6), (89, 2021-08-19, 3), ..."
1,1052,"[(10440, 2021-07-31, 5), (15297, 2021-08-01, 6...","[(7726, 2021-08-08, 6), (391, 2021-08-08, 6)]","[(16228, 2021-08-18, 2), (14097, 2021-08-18, 2..."
2,1309,"[(7793, 2021-07-31, 5), (3784, 2021-08-03, 1),...","[(12192, 2021-08-14, 5)]","[(3734, 2021-08-15, 6), (1055, 2021-08-16, 0),..."
3,1330,"[(10440, 2021-07-29, 3), (11863, 2021-07-30, 4...","[(9996, 2021-08-09, 0)]","[(512, 2021-08-16, 0)]"
4,1489,"[(8522, 2021-03-16, 1), (172, 2021-03-17, 2), ...","[(9728, 2021-08-10, 1)]","[(14901, 2021-08-17, 1)]"


In [None]:
# Vocabulary of item ids seen in the training histories only; items that appear
# solely in valid/test are unknown to the model.
our_items = {
    interaction[0]
    for _, user_row in tqdm(joined.iterrows())
    for interaction in user_row.train_interactions
}

len(our_items)

5000it [00:00, 18568.53it/s]


6633

In [None]:
# Dense 0-based index <-> raw item id, both built from the same pass order over `our_items`.
idx2item = dict(enumerate(our_items))
item2idx = {item: position for position, item in idx2item.items()}

In [None]:
# Spot-check the reverse mapping at a hard-coded index (6632 is the last index when the
# vocabulary holds 6633 items, as in the recorded run).
idx2item[6632]

16516

In [None]:
class CustomDataset(Dataset):
    """Per-user samples built from the joined train/valid/test interaction table.

    Every interaction is a tuple whose element 0 is the raw item id and element 2 the
    day of week. Items missing from ``item2idx`` are skipped. Item indices are shifted
    by +1 so that 0 stays free for padding.

    Args:
        ds: DataFrame with ``train_interactions``, ``valid_interactions`` and
            ``test_interactions`` columns (lists of interaction tuples).
        num_items: size of the item vocabulary (without the padding slot).
        item2idx: mapping raw item id -> 0-based index.
        phase: ``'train'``, ``'valid'``; any other value is treated as the test phase.
        max_len: number of most recent training interactions kept (must be positive).

    ``__getitem__`` returns:
        * train: ``(seq_input, days_of_weeks, dow_valid)``
        * valid: ``(targets, seq_input, days_of_weeks, dow_valid)`` where ``targets`` is a
          multi-hot vector of length ``num_items + 1`` over the validation items
        * otherwise: ``(seq_input, days_of_weeks, dow_test)``
    """

    def __init__(self, ds, num_items, item2idx, phase='valid', max_len=99):
        super().__init__()
        self.ds = ds
        self.phase = phase
        self.num_items = num_items
        self.item2idx = item2idx
        self.max_len = max_len

    def __len__(self):
        return len(self.ds)

    def __getitem__(self, idx):
        row = self.ds.iloc[idx]

        # Filter unknown items once and keep only the most recent `max_len` interactions.
        history = [x for x in row['train_interactions'] if x[0] in self.item2idx][-self.max_len:]
        seq_input = [self.item2idx[x[0]] + 1 for x in history]
        days_of_weeks = [x[2] for x in history]

        if self.phase == 'train':
            # Day of week of the first validation interaction accompanies the sequence.
            return (seq_input, days_of_weeks, row['valid_interactions'][0][2])

        if self.phase == 'valid':
            targets = np.zeros(self.num_items + 1)
            targets[[self.item2idx[x[0]] + 1
                     for x in row['valid_interactions'] if x[0] in self.item2idx]] = 1
            return (targets, seq_input, days_of_weeks, row['valid_interactions'][0][2])

        # Test phase: same layout as train but with the first test day of week.
        return (seq_input, days_of_weeks, row['test_interactions'][0][2])

In [None]:
n_items = len(item2idx)

# Both datasets read the same joined table; only the phase (and thus the returned
# tuple layout) differs.
train, valid = (
    CustomDataset(ds=joined, num_items=n_items, item2idx=item2idx, phase=phase)
    for phase in ('train', 'valid')
)

print(len(train),len(valid))

5000 5000


In [None]:
def collate_fn_train(batch: List[Tuple[torch.Tensor]]) -> Dict[str, torch.Tensor]:
    """Collate ``(seq_input, days_of_weeks, dow)`` samples into a right-padded batch.

    The index-like tensors are returned as ``torch.long`` so that the output matches
    ``collate_fn_valid`` and can be fed to embedding layers / used for indexing
    without an extra cast.

    Args:
        batch: list of ``(item sequence, day-of-week sequence, next day of week)`` tuples.

    Returns:
        Dict with ``seq_i`` and ``dow`` (long, ``[B, max_len]``, zero-padded on the right),
        ``seq_len`` (long, ``[B]``) and ``dow_valid`` (float, ``[B]``).
    """
    seq_i, days_of_weeks, dow_valid = zip(*batch)

    seq_len = torch.tensor([len(x) for x in seq_i], dtype=torch.long)
    seq_i = pad_sequence(
        [torch.tensor(t, dtype=torch.long) for t in seq_i], batch_first=True
    )
    days_of_weeks = pad_sequence(
        [torch.tensor(t, dtype=torch.long) for t in days_of_weeks], batch_first=True
    )
    # Kept as float to mirror collate_fn_valid.
    dow_valid = torch.tensor(list(dow_valid), dtype=torch.float32)

    return {'seq_i': seq_i,
            'seq_len': seq_len,
            'dow': days_of_weeks,
            'dow_valid': dow_valid}


def collate_fn_valid(batch: List[Tuple[torch.Tensor]]) -> Dict[str, torch.Tensor]:
    """Collate ``(targets, seq_input, days_of_weeks, dow)`` samples into a padded batch.

    Sequences are zero-padded on the right to the longest one in the batch; the
    index-like tensors are long, ``targets`` and ``dow_valid`` are float.
    """
    y, seq_i, days_of_weeks, dow_valid = zip(*batch)

    def pad(rows):
        # [B, max_len] float tensor, zero-padded on the right.
        return pad_sequence([torch.Tensor(row) for row in rows], batch_first=True)

    lengths = [len(sequence) for sequence in seq_i]

    return {"targets": pad(y),
            'seq_i': pad(seq_i).long(),
            'seq_len': torch.Tensor(lengths).long(),
            'dow': pad(days_of_weeks).long(),
            'dow_valid': torch.Tensor(list(dow_valid))}

In [None]:
# Loader names are the keys catalyst reports metrics under ("train" / "valid").
# Neither loader shuffles; each phase uses the collate function matching its tuple layout.
loaders = dict(
    train=DataLoader(train, batch_size=256, collate_fn=collate_fn_train),
    valid=DataLoader(valid, batch_size=256, collate_fn=collate_fn_valid),
)

In [None]:
class MultiHeadAttention(nn.Module):
    """
    Multi-head self-attention sublayer with dropout on the attention probabilities,
    followed by an output projection, dropout, a residual connection and LayerNorm.

    Args:
        input_tensor (torch.Tensor): input of the sublayer, ``[batch, seq_len, hidden_size]``
        attention_mask (torch.Tensor): additive mask, broadcast onto the
            ``[batch, heads, seq_len, seq_len]`` score tensor
        return_explanations (bool): also return the (post-dropout) attention probabilities
    Returns:
        hidden_states (torch.Tensor): output of the sublayer, same shape as the input;
        when ``return_explanations`` is set, a ``(hidden_states, attention_probs)`` tuple
    """

    def __init__(self, n_heads, hidden_size, hidden_dropout_prob, attn_dropout_prob, layer_norm_eps):
        super().__init__()
        if hidden_size % n_heads:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (hidden_size, n_heads)
            )

        head_size = hidden_size // n_heads
        self.num_attention_heads = n_heads
        self.attention_head_size = head_size
        self.all_head_size = n_heads * head_size
        self.sqrt_attention_head_size = math.sqrt(head_size)

        # Input projections.
        self.query = nn.Linear(hidden_size, self.all_head_size)
        self.key = nn.Linear(hidden_size, self.all_head_size)
        self.value = nn.Linear(hidden_size, self.all_head_size)

        self.softmax = nn.Softmax(dim=-1)
        self.attn_dropout = nn.Dropout(attn_dropout_prob)

        # Output projection + residual normalisation.
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
        self.out_dropout = nn.Dropout(hidden_dropout_prob)

    def transpose_for_scores(self, x):
        """Split the last dimension into ``(heads, head_size)`` without moving any axis."""
        return x.view(*x.shape[:-1], self.num_attention_heads, self.attention_head_size)

    def forward(self, input_tensor, attention_mask, return_explanations=False):
        # [B, heads, L, d] for queries/values and [B, heads, d, L] for the transposed keys.
        queries = self.transpose_for_scores(self.query(input_tensor)).permute(0, 2, 1, 3)
        keys_t = self.transpose_for_scores(self.key(input_tensor)).permute(0, 2, 3, 1)
        values = self.transpose_for_scores(self.value(input_tensor)).permute(0, 2, 1, 3)

        # Scaled dot-product scores; the additive mask pushes hidden positions towards -inf.
        scores = torch.matmul(queries, keys_t) / self.sqrt_attention_head_size
        scores = scores + attention_mask

        # Dropout on the probabilities drops whole attended tokens.
        attention_probs = self.attn_dropout(self.softmax(scores))

        # Merge the heads back: [B, heads, L, d] -> [B, L, heads * d].
        context = torch.matmul(attention_probs, values).permute(0, 2, 1, 3).contiguous()
        context = context.view(*context.shape[:-2], self.all_head_size)

        projected = self.out_dropout(self.dense(context))
        hidden_states = self.LayerNorm(projected + input_tensor)

        if not return_explanations:
            return hidden_states
        return hidden_states, attention_probs


In [None]:
class TransformerLayer(nn.Module):
    """
    A single transformer block: multi-head self-attention followed by a point-wise
    feed-forward sublayer.

    Args:
        hidden_states (torch.Tensor): input of the self-attention sublayer
        attention_mask (torch.Tensor): additive attention mask for the self-attention sublayer
        return_explanations (bool): also return the attention probabilities
    Returns:
        feedforward_output (torch.Tensor): output of the feed-forward sublayer, i.e. of the
            whole block; with ``return_explanations`` a ``(output, attention_probs)`` tuple.
    """

    def __init__(
        self,
        n_heads,
        hidden_size,
        intermediate_size,
        hidden_dropout_prob,
        attn_dropout_prob,
        hidden_act,
        layer_norm_eps,
    ):
        super().__init__()
        self.multi_head_attention = MultiHeadAttention(
            n_heads, hidden_size, hidden_dropout_prob, attn_dropout_prob, layer_norm_eps
        )
        self.feed_forward = FeedForward(
            hidden_size, intermediate_size, hidden_dropout_prob, hidden_act, layer_norm_eps
        )

    def forward(self, hidden_states, attention_mask,return_explanations=False):
        attention_result = self.multi_head_attention(
            hidden_states, attention_mask, return_explanations=return_explanations
        )

        if not return_explanations:
            return self.feed_forward(attention_result)

        # The attention sublayer returned (output, probabilities); pass the latter through.
        attention_output, expl = attention_result
        return self.feed_forward(attention_output), expl



In [None]:
class TransformerEncoder(nn.Module):
    r"""A stack of ``n_layers`` TransformerLayer blocks applied one after another.

    Args:
        n_layers(num): number of transformer layers in the encoder. Default: 2
        n_heads(num): number of attention heads per layer. Default: 2
        hidden_size(num): the input and output hidden size. Default: 64
        inner_size(num): the dimensionality of the feed-forward sublayer. Default: 256
        hidden_dropout_prob(float): probability of an element to be zeroed. Default: 0.5
        attn_dropout_prob(float): probability of an attention score to be zeroed. Default: 0.5
        hidden_act(str): activation function in the feed-forward sublayer. Default: 'gelu'
                      candidates: 'gelu', 'relu', 'swish', 'tanh', 'sigmoid'
        layer_norm_eps(float): a value added to the denominator for numerical stability. Default: 1e-12
    """

    def __init__(
        self,
        n_layers=2,
        n_heads=2,
        hidden_size=64,
        inner_size=256,
        hidden_dropout_prob=0.5,
        attn_dropout_prob=0.5,
        hidden_act="gelu",
        layer_norm_eps=1e-12,
    ):
        super().__init__()
        # One prototype block is built and deep-copied, so all layers start from
        # identical weights but own independent parameters.
        prototype = TransformerLayer(
            n_heads=n_heads,
            hidden_size=hidden_size,
            intermediate_size=inner_size,
            hidden_dropout_prob=hidden_dropout_prob,
            attn_dropout_prob=attn_dropout_prob,
            hidden_act=hidden_act,
            layer_norm_eps=layer_norm_eps,
        )
        self.layer = nn.ModuleList([copy.deepcopy(prototype) for _ in range(n_layers)])

    def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
        """
        Args:
            hidden_states (torch.Tensor): the input of the TransformerEncoder
            attention_mask (torch.Tensor): the attention mask for the input hidden_states
            output_all_encoded_layers (Bool): whether to return the output of every layer
        Returns:
            list: the outputs of all layers if ``output_all_encoded_layers`` is True,
            otherwise a one-element list holding the output of the last layer.
        """
        per_layer_outputs = []
        for block in self.layer:
            hidden_states = block(hidden_states, attention_mask)
            if output_all_encoded_layers:
                per_layer_outputs.append(hidden_states)

        if output_all_encoded_layers:
            return per_layer_outputs
        return [hidden_states]




In [None]:
class BERT4Rec(torch.nn.Module):
    """Bidirectional-transformer sequential recommender trained with masked-item prediction.

    Id conventions: ``0`` is padding, ``1 .. n_items - 1`` are real items and ``n_items``
    is the mask token, hence the ``n_items + 1`` rows of the item embedding table.
    Input sequences are expected to be right-padded with zeros.

    Args:
        n_items: number of item ids including the padding id 0.
    """

    def __init__(self, n_items):
        super(BERT4Rec, self).__init__()

        # model hyper-parameters
        self.n_layers = 2
        self.n_heads = 2
        self.hidden_size = 64  # same as embedding_size
        self.inner_size = 128  # the dimensionality in feed-forward layer
        self.hidden_dropout_prob = 0.2
        self.attn_dropout_prob = 0.2
        self.hidden_act = 'sigmoid'
        self.layer_norm_eps = 1e-5
        self.ITEM_SEQ = 'seq_i'
        self.ITEM_SEQ_LEN = 'seq_len'
        self.max_seq_length = 100

        self.mask_ratio = 0.2

        self.loss_type = 'CE'
        self.initializer_range = 1e-2

        # dataset info
        self.n_items = n_items
        self.mask_token = self.n_items
        # maximum number of masked slots kept per sequence
        self.mask_item_length = int(self.mask_ratio * self.max_seq_length)

        # layers
        self.item_embedding = nn.Embedding(self.n_items + 1, self.hidden_size, padding_idx=0)  # mask token add 1
        self.position_embedding = nn.Embedding(self.max_seq_length + 1, self.hidden_size)  # add mask_token at the last
        self.trm_encoder = TransformerEncoder(
            n_layers=self.n_layers,
            n_heads=self.n_heads,
            hidden_size=self.hidden_size,
            inner_size=self.inner_size,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attn_dropout_prob=self.attn_dropout_prob,
            hidden_act=self.hidden_act,
            layer_norm_eps=self.layer_norm_eps
        )

        self.LayerNorm = nn.LayerNorm(self.hidden_size, eps=self.layer_norm_eps)
        self.dropout = nn.Dropout(self.hidden_dropout_prob)

        # Explicit raise (same exception type and message as before) so the check
        # also survives `python -O`, which strips assert statements.
        if self.loss_type not in ['BPR', 'CE']:
            raise AssertionError("Make sure 'loss_type' in ['BPR', 'CE']!")

        # parameters initialization
        self.apply(self._init_weights)

    def gather_indexes(self, output, gather_index):
        """Pick, for every batch row, the vector at position ``gather_index[row]``.

        ``output`` is ``[B, L, H]`` and ``gather_index`` is ``[B]``; the result is ``[B, H]``.
        """
        gather_index = gather_index.view(-1, 1, 1).expand(-1, -1, output.shape[-1])
        output_tensor = output.gather(dim=1, index=gather_index)
        return output_tensor.squeeze(1)

    def _init_weights(self, module):
        """Normal(0, initializer_range) for Linear/Embedding weights, identity LayerNorm, zero biases."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.initializer_range)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    def get_attention_mask(self, item_seq):
        """Additive bidirectional mask: 0 for real tokens, -10000 for padding (id 0).

        Returns a ``[B, 1, 1, L]`` tensor in the dtype of the model parameters.
        """
        attention_mask = (item_seq > 0).long()
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)  # torch.int64
        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        return extended_attention_mask

    def _neg_sample(self, item_set):
        """Draw a random item id in ``[1, n_items - 1]`` that is not contained in ``item_set``."""
        item = random.randint(1, self.n_items - 1)
        while item in item_set:
            item = random.randint(1, self.n_items - 1)
        return item

    def _padding_sequence(self, sequence, max_length):
        """Left-pad ``sequence`` with zeros to ``max_length``, keeping only its last ``max_length`` entries."""
        pad_len = max_length - len(sequence)
        sequence = [0] * pad_len + sequence
        sequence = sequence[-max_length:]  # truncate according to the max_length
        return sequence

    def reconstruct_train_data(self, item_seq):
        """Randomly mask items of every sequence for masked-item training.

        Each non-padding item is replaced by the mask token with probability ``mask_ratio``.

        Returns:
            ``(masked_item_sequence [B, L], pos_items, neg_items, masked_index)`` where the
            last three are ``[B, mask_item_length]``, left-padded with zeros, and hold the
            original items, sampled negatives and positions of the masked slots.
        """
        device = item_seq.device
        batch_size = item_seq.size(0)

        sequence_instances = item_seq.cpu().numpy().tolist()

        masked_item_sequence = []
        pos_items = []
        neg_items = []
        masked_index = []
        for instance in sequence_instances:
            masked_sequence = instance.copy()
            # Set lookup keeps the rejection test in negative sampling O(1).
            seen_items = set(instance)
            pos_item = []
            neg_item = []
            index_ids = []
            for index_id, item in enumerate(instance):
                # padding is 0, the sequence is end
                if item == 0:
                    break
                if random.random() < self.mask_ratio:
                    pos_item.append(item)
                    neg_item.append(self._neg_sample(seen_items))
                    masked_sequence[index_id] = self.mask_token
                    index_ids.append(index_id)

            masked_item_sequence.append(masked_sequence)
            pos_items.append(self._padding_sequence(pos_item, self.mask_item_length))
            neg_items.append(self._padding_sequence(neg_item, self.mask_item_length))
            masked_index.append(self._padding_sequence(index_ids, self.mask_item_length))

        # [B Len]
        masked_item_sequence = torch.tensor(masked_item_sequence, dtype=torch.long, device=device).view(batch_size, -1)
        # [B mask_len]
        pos_items = torch.tensor(pos_items, dtype=torch.long, device=device).view(batch_size, -1)
        # [B mask_len]
        neg_items = torch.tensor(neg_items, dtype=torch.long, device=device).view(batch_size, -1)
        # [B mask_len]
        masked_index = torch.tensor(masked_index, dtype=torch.long, device=device).view(batch_size, -1)
        return masked_item_sequence, pos_items, neg_items, masked_index

    def reconstruct_test_data(self, item_seq, item_seq_len):
        """Append one column and place the mask token right after each sequence's last item.

        ``item_seq`` is ``[B, L]`` (right-padded), ``item_seq_len`` is ``[B]``; the result is
        ``[B, L + 1]`` with the mask token at position ``item_seq_len[row]`` of every row.
        """
        padding = torch.zeros(item_seq.size(0), 1, dtype=torch.long, device=item_seq.device)
        item_seq = torch.cat((item_seq, padding), dim=-1)  # [B max_len+1]
        rows = torch.arange(item_seq.size(0), device=item_seq.device)
        item_seq[rows, item_seq_len] = self.mask_token
        return item_seq

    def forward(self, item_seq):
        """Encode a ``[B, L]`` batch of item ids and return the last layer's ``[B, L, H]`` states."""
        position_ids = torch.arange(item_seq.size(1), dtype=torch.long, device=item_seq.device)
        position_ids = position_ids.unsqueeze(0).expand_as(item_seq)
        position_embedding = self.position_embedding(position_ids)

        item_emb = self.item_embedding(item_seq)
        input_emb = item_emb + position_embedding
        input_emb = self.LayerNorm(input_emb)
        input_emb = self.dropout(input_emb)
        extended_attention_mask = self.get_attention_mask(item_seq)

        trm_output = self.trm_encoder(input_emb, extended_attention_mask, output_all_encoded_layers=True)

        return trm_output[-1]

    def multi_hot_embed(self, masked_index, max_length):
        """
        For memory, we only need calculate loss for masked position.
        Generate a one-hot row per masked slot that selects the masked position; it is used
        for gathering the hidden representation at that position.
        Examples:
            sequence: [1 2 3 4 5]
            masked_sequence: [1 mask 3 mask 5]
            masked_index: [1, 3]
            max_length: 5
            multi_hot_embed: [[0 1 0 0 0], [0 0 0 1 0]]
        """
        masked_index = masked_index.view(-1)
        multi_hot = torch.zeros(masked_index.size(0), max_length, device=masked_index.device)
        multi_hot[torch.arange(masked_index.size(0)), masked_index] = 1
        return multi_hot

    def calculate_loss(self, interaction):
        """Masked-item loss (cross-entropy or BPR, per ``loss_type``) for one batch.

        Args:
            interaction: mapping holding the item sequences under ``self.ITEM_SEQ``.

        Returns:
            Scalar loss averaged over the masked slots (0 if the batch has none).
        """
        item_seq = interaction[self.ITEM_SEQ].long()
        masked_item_seq, pos_items, neg_items, masked_index = self.reconstruct_train_data(item_seq)

        seq_output = self.forward(masked_item_seq)
        pred_index_map = self.multi_hot_embed(masked_index, masked_item_seq.size(-1))  # [B*mask_len max_len]
        # [B mask_len] -> [B mask_len max_len] multi hot
        pred_index_map = pred_index_map.view(masked_index.size(0), masked_index.size(1), -1)  # [B mask_len max_len]
        # [B mask_len max_len] * [B max_len H] -> [B mask_len H]
        # only calculate loss for masked position
        seq_output = torch.bmm(pred_index_map, seq_output)  # [B mask_len H]

        # A slot is real iff it holds a positive item id. Testing `masked_index > 0`
        # instead would also discard a genuine mask at sequence position 0.
        valid_slots = (pos_items > 0).float()  # [B mask_len]
        # Guard against a batch in which nothing was masked (would give 0/0 = NaN).
        n_valid = torch.sum(valid_slots).clamp(min=1.0)

        if self.loss_type == 'BPR':
            pos_items_emb = self.item_embedding(pos_items)  # [B mask_len H]
            neg_items_emb = self.item_embedding(neg_items)  # [B mask_len H]
            pos_score = torch.sum(seq_output * pos_items_emb, dim=-1)  # [B mask_len]
            neg_score = torch.sum(seq_output * neg_items_emb, dim=-1)  # [B mask_len]
            loss = - torch.sum(torch.log(1e-14 + torch.sigmoid(pos_score - neg_score)) * valid_slots) \
                   / n_valid
            return loss

        elif self.loss_type == 'CE':
            loss_fct = nn.CrossEntropyLoss(reduction='none')
            test_item_emb = self.item_embedding.weight[:self.n_items]  # [item_num H]
            logits = torch.matmul(seq_output, test_item_emb.transpose(0, 1))  # [B mask_len item_num]

            per_slot = loss_fct(logits.view(-1, test_item_emb.size(0)), pos_items.view(-1))
            loss = torch.sum(per_slot * valid_slots.view(-1)) / n_valid
            return loss
        else:
            raise NotImplementedError("Make sure 'loss_type' in ['BPR', 'CE']!")

    def full_sort_predict(self, interaction):
        """Score every item as the next item of each sequence.

        Args:
            interaction: mapping with the right-padded item sequences (``self.ITEM_SEQ``)
                and their true lengths (``self.ITEM_SEQ_LEN``).

        Returns:
            ``[B, n_items]`` scores; the padding id and every item already present in the
            input sequence are set to -1000 so they are never recommended.
        """
        item_seq = interaction[self.ITEM_SEQ].long()
        item_seq_len = interaction[self.ITEM_SEQ_LEN].long()
        item_seq = self.reconstruct_test_data(item_seq, item_seq_len)

        seq_output = self.forward(item_seq)

        # The mask token was written at index `item_seq_len`, so the prediction must be read
        # from that position (index `item_seq_len - 1` is the last *observed* item).
        seq_output = self.gather_indexes(seq_output, item_seq_len)  # [B H]
        test_items_emb = self.item_embedding.weight[:self.n_items]  # delete masked token
        scores = torch.matmul(seq_output, test_items_emb.transpose(0, 1))  # [B, item_num]

        # Suppress seen items; the mask token is mapped to 0 so that it (like padding)
        # only hits the padding column.
        seen = item_seq.masked_fill(item_seq == self.mask_token, 0)
        scores.scatter_(1, seen, -1000.0)

        return scores

In [None]:
# Show which class `dl.Runner` resolves to in the installed catalyst version.
dl.Runner

catalyst.runners.runner.Runner

In [None]:
class RecSysRunner(dl.Runner):
    """Catalyst runner that computes the model's masked-item loss on every batch and, for
    batches that carry ``targets``, full-catalogue scores for the ranking-metric callbacks."""

    def on_loader_start(self, runner):
        """Create fresh running-metric accumulators at the start of every loader."""
        super().on_loader_start(runner)
        self.meters = {}
        for key in ["loss_ae", "loss_kld", "loss"]:
            self.meters[key] = metrics.AdditiveMetric(compute_on_call=False)

    def handle_batch(self, batch):
        """Compute the loss and publish ``inputs`` / ``targets`` / ``logits`` on ``self.batch``."""
        loss = self.model.calculate_loss(batch)

        if 'targets' not in batch:
            # No ground truth in this batch (training collate): publish zero placeholders
            # under the keys the metric callbacks read.
            self.batch.update(
                {key: torch.zeros((30,30)) for key in ("inputs", "targets", "logits")}
            )
        else:
            scores = self.model.full_sort_predict(batch)
            self.batch.update({'targets': batch['targets'], 'logits':scores, 'inputs':scores})

        self.batch_metrics.update({"loss": loss})
        self.meters["loss"].update(self.batch_metrics["loss"].item(), self.batch_size)

    def on_loader_end(self, runner):
        """Expose the accumulated loss as a loader-level metric."""
        self.loader_metrics["loss"] = self.meters["loss"].compute()[0]
        super().on_loader_end(runner)

    def predict_batch(self, batch):
        """Return the full-catalogue scores for one inference batch."""
        return self.model.full_sort_predict(batch)

In [None]:
# Ids 0..len(item2idx): 0 is padding, real items are `item2idx[...] + 1`.
model = BERT4Rec(n_items=len(item2idx)+1)

optimizer = optim.Adam(model.parameters(), lr=0.001)
lr_scheduler = StepLR(optimizer, step_size=20, gamma=0.1)
# NOTE(review): training is pinned to the CPU engine even when a GPU is available.
engine = catalyst.engines.torch.CPUEngine()
hparams = {
    "anneal_cap": 0.2,
    "total_anneal_steps": 6000,
}


callbacks = [
    dl.NDCGCallback("logits", "targets", [20]),
    dl.MAPCallback("logits", "targets", [10]),
    #dl.MRRCallback("logits", "targets", [20, 50, 100]),
    #dl.HitrateCallback("logits", "targets", [20, 50, 100]),
    # Since catalyst 22.02 `loss.backward()` lives in its own callback; OptimizerCallback
    # only performs the optimizer step. Without BackwardCallback no gradients are ever
    # computed and the weights never change (flat loss, identical validation metrics
    # on every epoch).
    dl.BackwardCallback("loss"),
    dl.OptimizerCallback("loss", accumulation_steps=1),
    dl.EarlyStoppingCallback(
        patience=5, loader_key="valid", metric_key="map10", minimize=False
    )
]


runner = RecSysRunner()
runner.train(
    fp16 = False,
    model=model,
    optimizer=optimizer,
    hparams=hparams,
    scheduler=lr_scheduler,
    loaders=loaders,
    num_epochs=3,
    verbose=True,
    timeit=True,
    callbacks=callbacks,
    #logdir="./logs",
    engine=engine,
)



1/3 * Epoch (train):   0%|          | 0/20 [00:00<?, ?it/s]

train (1/3) loss: 8.806958959960937 | lr: 0.001 | map10: 0.0 | map10/std: 0.0 | momentum: 0.9 | ndcg20: 0.0 | ndcg20/std: 0.0


1/3 * Epoch (valid):   0%|          | 0/20 [00:00<?, ?it/s]

valid (1/3) loss: 8.807097203063966 | lr: 0.001 | map10: 0.00254119052439928 | map10/std: 0.00196709554865624 | momentum: 0.9 | ndcg20: 0.0019607310000341388 | ndcg20/std: 0.00113349196522522
* Epoch (1/3) 


2/3 * Epoch (train):   0%|          | 0/20 [00:00<?, ?it/s]

train (2/3) loss: 8.806808589172363 | lr: 0.001 | map10: 0.0 | map10/std: 0.0 | momentum: 0.9 | ndcg20: 0.0 | ndcg20/std: 0.0


2/3 * Epoch (valid):   0%|          | 0/20 [00:00<?, ?it/s]

valid (2/3) loss: 8.807536502075196 | lr: 0.001 | map10: 0.00254119052439928 | map10/std: 0.00196709554865624 | momentum: 0.9 | ndcg20: 0.0019607310000341388 | ndcg20/std: 0.00113349196522522
* Epoch (2/3) 


3/3 * Epoch (train):   0%|          | 0/20 [00:00<?, ?it/s]

train (3/3) loss: 8.807118504333499 | lr: 0.001 | map10: 0.0 | map10/std: 0.0 | momentum: 0.9 | ndcg20: 0.0 | ndcg20/std: 0.0


3/3 * Epoch (valid):   0%|          | 0/20 [00:00<?, ?it/s]

valid (3/3) loss: 8.807296226501464 | lr: 0.001 | map10: 0.00254119052439928 | map10/std: 0.00196709554865624 | momentum: 0.9 | ndcg20: 0.0019607310000341388 | ndcg20/std: 0.00113349196522522
* Epoch (3/3) 


In [None]:
# Second runner wrapping the trained model.
# NOTE(review): the inference cell that follows calls `runner.predict_loader`, not
# `test_runner` — this object looks unused in the visible cells.
test_runner = RecSysRunner(model=model)

In [None]:
test_dataset = CustomDataset(ds=joined, num_items=n_items, phase='test',item2idx=item2idx)


# `joined.shape[0]//100` alone would be 0 (an invalid batch size) for fewer than 100 users.
inference_loader = DataLoader(test_dataset,
                              batch_size=max(1, joined.shape[0]//100),
                              collate_fn=collate_fn_train,)

preds = []

for prediction in tqdm(runner.predict_loader(loader=inference_loader, engine=engine)):
    preds.extend(prediction.detach().cpu().numpy().tolist())

print(len(preds))
assert len(preds) == joined.shape[0]


def scores_to_item_ids(scores, topn=30):
    """Turn one user's score vector into the ``topn`` best raw item ids.

    Column ``i`` of the score vector is model index ``i``: 0 is padding (never a valid
    recommendation) and ``i >= 1`` corresponds to ``idx2item[i - 1]``. The looked-up value
    already is the raw item id and must not be shifted again.
    """
    ranked = np.argsort(-np.asarray(scores))
    ranked = ranked[ranked > 0][:topn]  # drop the padding slot, keep the best `topn`
    return [idx2item[int(model_idx) - 1] for model_idx in ranked]


joined['recs_bert4rec'] = pd.Series(
    [scores_to_item_ids(user_scores) for user_scores in preds], index=joined.index
)
joined.head()


0it [00:00, ?it/s][A
2it [00:00,  3.88it/s][A
4it [00:00,  7.30it/s][A
7it [00:00, 11.36it/s][A
9it [00:00, 13.16it/s][A
12it [00:00, 16.76it/s][A
15it [00:01, 16.38it/s][A
18it [00:01, 17.89it/s][A
21it [00:01, 19.68it/s][A
24it [00:01, 19.53it/s][A
27it [00:01, 18.00it/s][A
29it [00:01, 17.56it/s][A
31it [00:02, 17.95it/s][A
34it [00:02, 19.06it/s][A
36it [00:02, 18.09it/s][A
38it [00:02, 17.92it/s][A
41it [00:02, 19.18it/s][A
43it [00:02, 19.27it/s][A
45it [00:02, 18.82it/s][A
48it [00:02, 19.32it/s][A
50it [00:03, 16.99it/s][A
53it [00:03, 17.78it/s][A
55it [00:03, 17.23it/s][A
58it [00:03, 18.73it/s][A
60it [00:03, 18.70it/s][A
62it [00:03, 17.09it/s][A
64it [00:03, 17.54it/s][A
67it [00:03, 19.01it/s][A
70it [00:04, 19.74it/s][A
73it [00:04, 20.84it/s][A
76it [00:04, 19.15it/s][A
78it [00:04, 18.47it/s][A
80it [00:04, 18.05it/s][A
82it [00:04, 17.80it/s][A
85it [00:04, 17.91it/s][A
87it [00:05, 18.12it/s][A
89it [00:05, 18.57it/s][A
92it [00:

5000


KeyError: ignored

In [None]:
def ndcg_metric(gt_items, predicted):
    """Binary-relevance NDCG of a ranked recommendation list.

    Relevance is evaluated along the *ranking* (``predicted``), so a hit near the top is
    worth more than a hit near the bottom. The ideal DCG assumes that as many ground-truth
    items as fit into the list occupy its first positions.

    Args:
        gt_items: ground-truth item ids (order irrelevant, duplicates ignored).
        predicted: recommended item ids, best first.

    Returns:
        NDCG in ``[0, 1]``; ``0.0`` when there is no hit or either input is empty.
    """
    at = len(predicted)
    relevant = set(gt_items)
    relevance = np.array([1 if item in relevant else 0 for item in predicted])
    # DCG uses the relevance of the recommended items, in recommendation order
    rank_dcg = dcg(relevance)

    if rank_dcg == 0.0:
        return 0.0

    # IDCG has all relevances set to 1, up to the number of ground-truth items that can
    # fit in the list length
    ideal_dcg = dcg(np.ones(min(len(relevant), at)))

    if ideal_dcg == 0.0:
        return 0.0

    return rank_dcg / ideal_dcg


def dcg(scores):
    """Discounted cumulative gain of a relevance vector: ``sum((2**rel - 1) / log2(rank + 1))``
    with 1-based ranks. ``scores`` must be a NumPy array; an empty array yields 0.0."""
    gains = np.power(2, scores) - 1
    discounts = np.log2(np.arange(scores.shape[0], dtype=np.float64) + 2)
    return np.sum(np.divide(gains, discounts), dtype=np.float64)


def recall_metric(gt_items, predicted):
    """Share of ground-truth interactions that were recommended.

    Args:
        gt_items: ground-truth item ids.
        predicted: recommended item ids.

    Returns:
        Number of distinct ground-truth items present in ``predicted`` divided by
        ``len(gt_items)``; ``0.0`` when there is no ground truth (instead of raising
        ZeroDivisionError).
    """
    n_gt = len(gt_items)
    if n_gt == 0:
        return 0.0

    intersection = len(set(gt_items).intersection(set(predicted)))
    return intersection / n_gt



def evaluate_recommender(df, model_preds, gt_col='test_interactions', topn=10):
    """Average NDCG and recall of one recommendation column over all users.

    Args:
        df: DataFrame with one row per user.
        model_preds: name of the column holding each user's ranked recommendations.
        gt_col: name of the column holding the ground-truth interactions (tuples whose
            first element is the item id).
        topn: evaluate only the first ``topn`` recommendations so that models producing
            lists of different lengths are compared at the same cut-off; ``None`` keeps
            the full list.

    Returns:
        ``{'ndcg': mean NDCG, 'recall': mean recall}``.
    """
    metric_values = []

    for _, row in df.iterrows():
        gt_items = [x[0] for x in row[gt_col]]
        # Apply the cut-off (a None slice bound keeps everything).
        predicted = list(row[model_preds])[:topn]
        metric_values.append((ndcg_metric(gt_items, predicted),
                              recall_metric(gt_items, predicted)))

    return {'ndcg':np.mean([x[0] for x in metric_values]),
            'recall':np.mean([x[1] for x in metric_values])}


In [None]:
# Score the BERT4Rec recommendations against the held-out test interactions.
evaluate_recommender(joined, model_preds='recs_bert4rec')

In [None]:
class TopPopular:
    """Non-personalised baseline that recommends the globally most frequent items."""

    def __init__(self):
        # Flipped to True by `fit`; `predict` refuses to run before that.
        self.trained = False

    def fit(self, df, col='train_interactions'):
        """Count how often every item occurs in ``df[col]`` and rank items by that count.

        Each entry of ``df[col]`` is a list of 3-tuples whose first element is the item id.
        Ties keep the order in which the items were first seen.
        """
        popularity = {}
        for _, user_row in df.iterrows():
            for item, _, _ in user_row[col]:
                popularity[item] = popularity.get(item, 0) + 1

        ranked = sorted(popularity.items(), key=lambda pair: pair[1], reverse=True)

        self.recommenations = [item for item, _ in ranked]
        self.trained = True

    def predict(self, df, topn=10)  -> List[np.ndarray]:
        """Return the same top-``topn`` list for every row of ``df`` (one shared list object)."""
        assert self.trained
        top_items = self.recommenations[:topn]
        return [top_items] * len(df)


# Popularity baseline: fit on the training histories (default column) and attach the
# same top-10 list (default `topn`) to every user.
toppop = TopPopular()
toppop.fit(joined)
joined['toppopular_recs'] = toppop.predict(joined)
joined.head()

In [None]:
# Score the popularity baseline against the held-out test interactions.
evaluate_recommender(joined, model_preds='toppopular_recs')