In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import sys
sys.path.append('../')

In [2]:
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, ModelSummary
from torch.utils.data import DataLoader, Dataset

from src.datasets import (LMDataset, CausalLMDataset, CausalLMPredictionDataset, MaskedLMDataset,
                          MaskedLMPredictionDataset, PaddingCollateFn)
from src.metrics import compute_metrics
from src.models import RNN, BERT4Rec, SASRec
from src.modules import SeqRec, SeqRecWithSampling
from src.postprocess import preds2recs
from src.preprocess import add_time_idx
from src.unbiased_metrics import get_metrics, hr, mrr, ndcg


libgomp: Invalid value for environment variable OMP_NUM_THREADS

libgomp: Invalid value for environment variable OMP_NUM_THREADS


## Load data

In [3]:
ITEM_COL = 'item_id'
RELEVANCE_THRESHOLD = 3.5
RELEVANCE_COL = 'rating'

In [4]:
train = pd.read_csv('../data/ml-1m/train.csv')
test = pd.read_csv('../data/ml-1m/test.csv')
val_1 = pd.read_csv('../data/ml-1m/val_1.csv')
val_2 = pd.read_csv('../data/ml-1m/val_2.csv')
test_users_history = pd.read_csv('../data/ml-1m/test_users_history.csv')
val_users_history_1 = pd.read_csv('../data/ml-1m/val_users_history_1.csv')
val_users_history_2 = pd.read_csv('../data/ml-1m/val_users_history_2.csv')

In [5]:
train.item_id = train.item_id * 2
val_users_history_1.item_id = val_users_history_1.item_id * 2
val_users_history_2.item_id = val_users_history_2.item_id * 2
test_users_history.item_id = test_users_history.item_id * 2

In [6]:
train.loc[train[RELEVANCE_COL] < RELEVANCE_THRESHOLD, 'item_id'] -= 1
val_users_history_1.loc[val_users_history_1[RELEVANCE_COL] < RELEVANCE_THRESHOLD, 'item_id'] -= 1
val_users_history_2.loc[val_users_history_2[RELEVANCE_COL] < RELEVANCE_THRESHOLD, 'item_id'] -= 1
test_users_history.loc[test_users_history[RELEVANCE_COL] < RELEVANCE_THRESHOLD, 'item_id'] -= 1

## Dataloaders

In [7]:
MAX_LENGTH = 200

VALIDATION_SIZE = 10000
# VALIDATION_SIZE = None

BATCH_SIZE = 32
TEST_BATCH_SIZE = 32
NUM_WORKERS = 8

In [8]:
'''class NegFeedbackDataset(Dataset):

    def __init__(self, df, max_length=128,
                 user_col='user_id', item_col=ITEM_COL, time_col='time_idx',
                 duration_col=RELEVANCE_COL, positive_labels=True,
                 duration_threshold=3.5):

        self.max_length = max_length
        self.user_col = user_col
        self.item_col = item_col
        self.time_col = time_col
        self.duration_col = duration_col
        self.positive_labels = positive_labels
        self.duration_threshold = duration_threshold

        df = df.sort_values(time_col)
        self.items = df.groupby(user_col)[item_col].agg(list).to_dict()
        self.duration = df.groupby(user_col)[duration_col].agg(list).to_dict()
        self.user_ids = list(self.items.keys())

    def __len__(self):

        return len(self.user_ids)
    
    def __getitem__(self, idx):

        item_sequence = self.items[self.user_ids[idx]]
        duration_sequence = self.duration[self.user_ids[idx]]

        if len(item_sequence) > self.max_length + 1:
            item_sequence = item_sequence[-self.max_length - 1:]
            duration_sequence = duration_sequence[-self.max_length - 1:]

        input_ids = np.array(item_sequence[:-1])
        duration = np.array(duration_sequence[:-1])
        labels = np.array(item_sequence[1:])
        
        if self.positive_labels:
            labels = labels.astype(float)
            labels[np.array(duration_sequence[1:]) < self.duration_threshold] = np.nan
            labels = pd.Series(labels).fillna(method='bfill').values

            input_ids = input_ids[~np.isnan(labels)]
            duration = duration[~np.isnan(labels)]
            labels = labels[~np.isnan(labels)]
            labels = labels.astype(int)
            
            # convert to binary
            #feedback = (duration >= self.duration_threshold).astype(int) + 1

        return {'input_ids': input_ids, 'labels': labels}

class NegFeedbackPredictionDataset(NegFeedbackDataset):

    def __init__(self, df, max_length=128, validation_mode=False,
                 user_col='test_user_idx', item_col=ITEM_COL,
                 time_col='time_idx', duration_col=RELEVANCE_COL,
                 positive_labels=True, duration_threshold=30):

        super().__init__(df, max_length=max_length,
                         user_col=user_col, item_col=item_col, time_col=time_col,
                         duration_col=duration_col, positive_labels=positive_labels,
                         duration_threshold=duration_threshold)

        self.validation_mode = validation_mode

    def __getitem__(self, idx):

        user_id = self.user_ids[idx]
        item_sequence = self.items[user_id]
        duration_sequence = self.duration[self.user_ids[idx]]

        if self.validation_mode:
            target = item_sequence[-1]
            input_ids = np.array(item_sequence[-self.max_length-1:-1])
            duration = np.array(duration_sequence[-self.max_length-1:-1])
            item_sequence = item_sequence[:-1]
            # convert to binary
            #feedback = (duration >= self.duration_threshold).astype(int) + 1

            return {'input_ids': input_ids, 'user_id': user_id,
                    'full_history': item_sequence, 'target': target,
                    }
        else:
            input_ids = np.array(item_sequence[-self.max_length:])
            duration = np.array(duration_sequence[-self.max_length:])
            # convert to binary
            #feedback = (duration >= self.duration_threshold).astype(int) + 1

            return {'input_ids': input_ids, 'user_id': user_id,
                    'full_history': item_sequence}

In [8]:
class NegFeedbackDataset(LMDataset):

    def __init__(self, df, max_length=200, num_negatives=None, full_negative_sampling=True,
                 user_col='user_id', item_col=ITEM_COL, time_col='time_idx',
                 relevance_col=RELEVANCE_COL, positive_labels=True,
                 relevance_threshold=3.5):

        super().__init__(df, max_length, num_negatives, full_negative_sampling,
                         user_col, item_col, time_col)
        #self.num_negatives = num_negatives
        #self.max_length = max_length
        #self.user_col = user_col
        #self.item_col = item_col
        #self.time_col = time_col
        self.relevance_col = relevance_col
        self.positive_labels = positive_labels
        self.relevance_threshold = relevance_threshold

        df = df.sort_values(time_col)
        self.items = df.groupby(user_col)[item_col].agg(list).to_dict()
        self.relevance = df.groupby(user_col)[relevance_col].agg(list).to_dict()
        self.user_ids = list(self.items.keys())

    def __len__(self):

        return len(self.user_ids)
    
    def __getitem__(self, idx):

        item_sequence = self.items[self.user_ids[idx]]
        relevance_sequence = self.relevance[self.user_ids[idx]]

        if len(item_sequence) > self.max_length + 1:
            item_sequence = item_sequence[-self.max_length - 1:]
            relevance_sequence = relevance_sequence[-self.max_length - 1:]

        input_ids = np.array(item_sequence[:-1])
        relevance = np.array(relevance_sequence[:-1])
        labels = np.array(item_sequence[1:])
        
        if self.positive_labels:
            labels = labels.astype(float)
            labels[np.array(relevance_sequence[1:]) < self.relevance_threshold] = np.nan
            labels = pd.Series(labels).fillna(method='bfill').values

            input_ids = input_ids[~np.isnan(labels)]
            relevance = relevance[~np.isnan(labels)]
            labels = labels[~np.isnan(labels)]
            labels = labels.astype(int)
            
            # convert to binary
            #feedback = (relevance >= self.relevance_threshold).astype(int) + 1
        if self.num_negatives:
            negatives = self.sample_negatives(item_sequence)
            return {'input_ids': input_ids, 'labels': labels, 'negatives': negatives}
        

        return {'input_ids': input_ids, 'labels': labels}

class NegFeedbackPredictionDataset(NegFeedbackDataset):

    def __init__(self, df, max_length=200, validation_mode=False,
                 user_col='test_user_idx', item_col=ITEM_COL,
                 time_col='time_idx', relevance_col=RELEVANCE_COL,
                 positive_labels=True, relevance_threshold=3.5):

        super().__init__(df, max_length=max_length,
                         user_col=user_col, item_col=item_col, time_col=time_col,
                         relevance_col=relevance_col, positive_labels=positive_labels,
                         relevance_threshold=relevance_threshold)

        self.validation_mode = validation_mode

    def __getitem__(self, idx):

        user_id = self.user_ids[idx]
        item_sequence = self.items[user_id]
        relevance_sequence = self.relevance[self.user_ids[idx]]

        if self.validation_mode:
            target = item_sequence[-1]
            input_ids = np.array(item_sequence[-self.max_length-1:-1])
            relevance = np.array(relevance_sequence[-self.max_length-1:-1])
            item_sequence = item_sequence[:-1]
            # convert to binary
            #feedback = (relevance >= self.relevance_threshold).astype(int) + 1

            return {'input_ids': input_ids, 'user_id': user_id,
                    'full_history': item_sequence, 'target': target,
                    }
        else:
            input_ids = np.array(item_sequence[-self.max_length:])
            relevance = np.array(relevance_sequence[-self.max_length:])
            # convert to binary
            #feedback = (relevance >= self.relevance_threshold).astype(int) + 1

            return {'input_ids': input_ids, 'user_id': user_id,
                    'full_history': item_sequence}

In [9]:
class SeqRecNegFeedback(SeqRec):
    def training_step(self, batch, batch_idx):
        outputs = self.model(batch['input_ids'], batch['attention_mask'])
        loss = self.compute_loss(outputs, batch)

        return loss

    def prediction_output(self, batch):
        return self.model(batch['input_ids'], batch['attention_mask'])


def get_eval_dataset_negative_feedback(validation_full, max_length, validation_size, relevance_col, relevance_threshold):
    validation_users = validation_full.user_id.unique()

    if validation_size and (validation_size < len(validation_users)):
        validation_users = np.random.choice(validation_users, size=validation_size, replace=False)

    eval_dataset = NegFeedbackPredictionDataset(
        validation_full[validation_full.user_id.isin(validation_users)],
        max_length=max_length,
        user_col='test_user_idx',
        validation_mode=True,
        relevance_col=relevance_col,
        relevance_threshold=relevance_threshold,
    )
    return eval_dataset

In [10]:
'''class SeqRecNegFeedback(SeqRec):
    def training_step(self, batch, batch_idx):
        outputs = self.model(batch['input_ids'], batch['attention_mask'])
        loss = self.compute_loss(outputs, batch)

        return loss

    def prediction_output(self, batch):
        return self.model(batch['input_ids'], batch['attention_mask'])


def get_eval_dataset_negative_feedback(validation_full, max_length, validation_size, duration_col, duration_threshold):
    validation_users = validation_full.user_id.unique()

    if validation_size and (validation_size < len(validation_users)):
        validation_users = np.random.choice(validation_users, size=validation_size, replace=False)

    eval_dataset = NegFeedbackPredictionDataset(
        validation_full[validation_full.user_id.isin(validation_users)],
        max_length=max_length,
        user_col='test_user_idx',
        validation_mode=True,
        duration_col=duration_col,
        duration_threshold=duration_threshold,
    )
    return eval_dataset

SyntaxError: EOF while scanning triple-quoted string literal (3651170672.py, line 26)

In [11]:
%%time
train_dataset = NegFeedbackDataset(train,
                                   relevance_col=RELEVANCE_COL, 
                                   relevance_threshold=RELEVANCE_THRESHOLD, 
                                   max_length=MAX_LENGTH, 
                                   num_negatives=3000)
val_1_dataset = get_eval_dataset_negative_feedback(val_users_history_1,
                                                  max_length=MAX_LENGTH, 
                                                  validation_size=VALIDATION_SIZE,
                                                  relevance_col=RELEVANCE_COL,
                                                  relevance_threshold=RELEVANCE_THRESHOLD)

val_2_dataset = get_eval_dataset_negative_feedback(val_users_history_2,
                                                  max_length=MAX_LENGTH, 
                                                  validation_size=VALIDATION_SIZE,
                                                  relevance_col=RELEVANCE_COL,
                                                  relevance_threshold=RELEVANCE_THRESHOLD)
test_dataset = get_eval_dataset_negative_feedback(test_users_history,
                                                  max_length=MAX_LENGTH, 
                                                  validation_size=VALIDATION_SIZE,
                                                  relevance_col=RELEVANCE_COL,
                                                  relevance_threshold=RELEVANCE_THRESHOLD)

train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE,
    shuffle=True, num_workers=NUM_WORKERS,
    collate_fn=PaddingCollateFn()
)
val_1_loader = DataLoader(
    val_1_dataset, batch_size=TEST_BATCH_SIZE,
    shuffle=False, num_workers=NUM_WORKERS,
    collate_fn=PaddingCollateFn()
)
val_2_loader = DataLoader(
    val_2_dataset, batch_size=TEST_BATCH_SIZE,
    shuffle=False, num_workers=NUM_WORKERS,
    collate_fn=PaddingCollateFn()
)
test_loader = DataLoader(
    test_dataset, batch_size=TEST_BATCH_SIZE,
    shuffle=False, num_workers=NUM_WORKERS,
    collate_fn=PaddingCollateFn()
)

CPU times: user 26.5 s, sys: 2.88 s, total: 29.4 s
Wall time: 29.4 s


In [12]:
batch = next(iter(val_1_loader))
print(batch['input_ids'].shape)

torch.Size([32, 200])


## Model

In [13]:
SASREC_CONFIG = {
    'maxlen': 200,
    'hidden_units': 64,
    'num_blocks': 2,
    'num_heads': 1,
    'dropout_rate': 0.1,
}

In [14]:
item_count = train.item_id.max()
add_head = True

model = SASRec(item_num=item_count, add_head=add_head, **SASREC_CONFIG)

In [15]:
out = model(batch['input_ids'], batch['attention_mask'])
out.shape

torch.Size([32, 200, 7905])

## Train

In [16]:
seqrec_module = SeqRecNegFeedback(model, lr=0.001, predict_top_k=200, filter_seen=True)

early_stopping = EarlyStopping(monitor="val_ndcg", mode="max", patience=10, verbose=False)

model_summary = ModelSummary(max_depth=2)
checkpoint = ModelCheckpoint(save_top_k=1, monitor="val_ndcg",
                             mode="max", save_weights_only=True)
callbacks=[early_stopping, model_summary, checkpoint]

trainer = pl.Trainer(callbacks=callbacks, enable_checkpointing=True,
                     gpus=1, max_epochs=100)

trainer.fit(model=seqrec_module,
            train_dataloaders=train_loader,
            val_dataloaders=val_2_loader)

Trainer already configured with model summary callbacks: [<class 'pytorch_lightning.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name                       | Type       | Params
----------------------------------------------------------
0 | model                      | SASRec     | 569 K 
1 | model.item_emb             | Embedding  | 505 K 
2 | model.pos_emb              | Embedding  | 12.8 K
3 | model.emb_dropout          | Dropout    | 0     
4 | model.attention_layernorms | ModuleList | 256   
5 | model.attention_layers     | ModuleList | 33.3 K
6 | model.forward_layernorms   | ModuleList | 256   
7 | model.forward_layers       | ModuleList | 16.6 K
8 | model.last_layernorm       | LayerNorm  | 128   
----------------------------------------------

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [17]:
seqrec_module.load_state_dict(torch.load(checkpoint.best_model_path)['state_dict'])

<All keys matched successfully>

## Evaluation

In [18]:
seqrec_module.predict_top_k = test.item_id.nunique()
preds = trainer.predict(model=seqrec_module, dataloaders=val_2_loader)

preds_val = preds2recs(preds)
print(preds_val.shape)
preds_val.head()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 188it [00:00, ?it/s]

(76387752, 3)


Unnamed: 0,user_id,item_id,prediction
0,0,2180,6.803426
1,0,220,6.694895
2,0,2466,6.442806
3,0,2484,6.222824
4,0,7248,6.159751


In [19]:
seqrec_module.predict_top_k = test.item_id.nunique()
preds = trainer.predict(model=seqrec_module, dataloaders=test_loader)

preds_test = preds2recs(preds)
print(preds_test.shape)
preds_test.head()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 188it [00:00, ?it/s]

(152775504, 3)


Unnamed: 0,user_id,item_id,prediction
0,0,7502,6.958642
1,0,7156,6.831086
2,0,7728,6.234657
3,0,6656,6.072198
4,0,6962,5.754722


In [20]:
preds_test = preds_test[preds_test.item_id % 2 == 0]
preds_test.loc[:, 'item_id'] /= 2
preds_test = preds_test.groupby('user_id').head(10)

  preds_test.loc[:, 'item_id'] /= 2


In [21]:
preds_val = preds_val[preds_val.item_id % 2 == 0]
preds_val.loc[:, 'item_id'] /= 2
preds_val = preds_val.groupby('user_id').head(10)

  preds_val.loc[:, 'item_id'] /= 2


In [22]:
sum(preds_test.groupby('user_id').count().item_id < 10)

0

In [23]:
preds_val = (preds_val
              .rename(columns={'user_id': 'test_user_idx', 'item_id': 'pred_items'})
              .groupby('test_user_idx')['pred_items']
              .apply(list).reset_index()
              .merge(val_2, on='test_user_idx', how='left'))

In [24]:
preds_test = (preds_test
              .rename(columns={'user_id': 'test_user_idx', 'item_id': 'pred_items'})
              .groupby('test_user_idx')['pred_items']
              .apply(list).reset_index()
              .merge(test, on='test_user_idx', how='left'))

In [27]:
metrics_df, beta = get_metrics(preds_test, preds_val)

In [28]:
metrics_df

Unnamed: 0,type,HR,MRR,nDCG
0,Biased,0.139302,0.049017,0.069879
1,Unbiased,0.219345,0.12906,0.127188
2,Unbiased_feedback_sampling,0.675571,0.253271,0.087945


In [29]:
beta

0.09299743475619497

In [37]:
from src.unbiased_metrics import confusion_matrix_metrics

In [33]:
relevance_col = 'rating'
relevance_threshold=3.5

In [34]:
preds_test_pos = preds_test[preds_test[relevance_col] >= relevance_threshold]
preds_val_neg = preds_val[preds_val[relevance_col] < relevance_threshold]
preds_val_pos = preds_val[preds_val[relevance_col] >= relevance_threshold]

tp, fn = confusion_matrix_metrics(preds_val_pos, user_col, item_col)
fp, tn = confusion_matrix_metrics(preds_val_neg, user_col, item_col)

In [46]:
tp / (tp + fp)

0.6799846331156358

In [36]:
tp / (tp + fn)

0.13676402410755678

In [40]:
preds_test_pos = preds_test[preds_test['rating'] >= 3.5]

In [41]:
hr(preds_test_pos, beta=beta, sample_feedback=True, return_confidence_interval=True)

(0.675571, 0.0014082339740911776)

In [42]:
mrr(preds_test_pos, beta=beta, sample_feedback=True, return_confidence_interval=True)

(0.253271, 0.0010478673038056274)

In [43]:
ndcg(preds_test_pos, beta=beta, sample_feedback=True, return_confidence_interval=True)

(0.087945, 6.648713827063502e-05)