# Module Imports

In [2]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from transformers import BertModel, BertConfig
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tqdm import tqdm
import random

random.seed(42)
np.random.seed(42)

# Load and Process Data

In [3]:
# Load MovieLens dataset
data_path = 'ml-100k/u.data'
column_names = ['user_id', 'item_id', 'rating', 'timestamp']
data = pd.read_csv(data_path, sep='\t', names=column_names)

In [4]:
movies = pd.read_csv('ml-100k/movies.csv', sep=',')

In [5]:
# Preprocess the dataset
user_ids = data['user_id'].unique()
item_ids = data['item_id'].unique()
uid2idx = {uid: idx for idx, uid in enumerate(user_ids)}
iid2idx = {iid: idx for idx, iid in enumerate(item_ids)}
idx2uid = {idx: uid for uid, idx in uid2idx.items()}
idx2iid = {idx: iid for iid, idx in iid2idx.items()}
data['user_id'] = data['user_id'].map(uid2idx)
data['item_id'] = data['item_id'].map(iid2idx)
movies['item_id'] = movies['movieId'].map(iid2idx)

In [6]:
# Train-test split
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [7]:
# Sort data by timestamp
train_data = train_data.sort_values(by='timestamp')
test_data = test_data.sort_values(by='timestamp')

In [16]:
def generate_sequences_and_labels(data, user_ids, item_ids, seq_length, num_items):
    sequences, labels = [], []

    for user_id in user_ids:
        user_data = data[data['user_id'] == user_id]
        user_items = user_data['item_id'].values.tolist()
        
        # Create sequences of fixed length for each user
        for i in range(len(user_items) - seq_length):
            sequences.append(user_items[i:i + seq_length])
            
            label = np.zeros(num_items)
            if user_items[i + seq_length] in user_items[i:i + seq_length]:
                label[user_items[i + seq_length]] = 1  # Positive feedback
            else:
                label[user_items[i + seq_length]] = 0  # Negative feedback
            labels.append(label)

    sequences = np.array(sequences)
    labels = np.array(labels)
    return sequences, labels

In [17]:
seq_length = 5  # Adjust this value based on your requirements
train_sequences, train_labels = generate_sequences_and_labels(train_data, user_ids, item_ids, seq_length, len(item_ids))
test_sequences, test_labels = generate_sequences_and_labels(test_data, user_ids, item_ids, seq_length, len(item_ids))

# Visualize Data

In [13]:
data.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,0,3,881250949
1,1,1,3,891717742
2,2,2,1,878887116
3,3,3,2,880606923
4,4,4,1,886397596


In [11]:
num_users = len(user_ids)
num_items = len(item_ids)
print('Number of users: ', num_users)
print('Number of items: ', num_items)
print('Number of interactions: ', len(data))
print('Average rating: ', data['rating'].mean())
print('Start Date: ', pd.to_datetime(data['timestamp'].min(), unit='s'))
print('End Date: ', pd.to_datetime(data['timestamp'].max(), unit='s'))

Number of users:  943
Number of items:  1682
Number of interactions:  100000
Average rating:  3.52986
Start Date:  1997-09-20 03:05:10
End Date:  1998-04-22 23:10:38


# Torch Dataset

In [18]:
class MovieLensDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.labels[idx]

In [19]:
batch_size = 128

train_dataset = MovieLensDataset(train_sequences, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = MovieLensDataset(test_sequences, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# NextItNet Model

In [20]:
class BERT4Rec(nn.Module):
    def __init__(self, num_items, hidden_size=128, num_layers=2, num_heads=2, dropout_rate=0.1):
        super().__init__()
        self.item_embedding = nn.Embedding(num_items, hidden_size)
        bert_config = BertConfig(
            vocab_size=num_items,
            hidden_size=hidden_size,
            num_hidden_layers=num_layers,
            num_attention_heads=num_heads,
            intermediate_size=hidden_size,
            hidden_dropout_prob=dropout_rate,
            attention_probs_dropout_prob=dropout_rate,
        )
        self.bert = BertModel(bert_config)
        self.output_layer = nn.Linear(hidden_size, num_items)

    def forward(self, x):
        x = self.item_embedding(x)
        x = self.bert(inputs_embeds=x).last_hidden_state
        x = self.output_layer(x)
        return x

In [21]:
# Hyperparameters
hidden_size = 128
num_layers = 2
num_heads = 2
dropout_rate = 0.1

# Create the model
bert4rec = BERT4Rec(num_items, hidden_size, num_layers, num_heads, dropout_rate)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(bert4rec.parameters())

In [28]:
num_epochs = 10

bert4rec.train()

for epoch in tqdm(range(num_epochs)):
    for batch_idx, (seqs, labels) in enumerate(train_dataloader):
        optimizer.zero_grad()
        logits = bert4rec(seqs)
        logits = logits[:, -1, :]
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item()}")

Epoch [1/10], Loss: -0.0
Epoch [2/10], Loss: -0.0
Epoch [3/10], Loss: -0.0
Epoch [4/10], Loss: -0.0
Epoch [5/10], Loss: -0.0


KeyboardInterrupt: 

# Uncertainty Score

In [34]:
def estimate_uncertainty(model, user_sequence, num_samples=100):
    user_sequence = torch.tensor([user_sequence], dtype=torch.long)
    with torch.no_grad():
        predictions = torch.stack([model(user_sequence)[:, -1, :] for _ in range(num_samples)], dim=-1)
    means = torch.mean(predictions, dim=-1).numpy()
    variances = torch.var(predictions, dim=-1).numpy()
    return means, variances

def get_movie_titles(recommendations, movies_df):
    movie_titles = []
    for index in recommendations:
        movie_title = movies_df.loc[index, 'title']
        movie_titles.append(movie_title)
    return movie_titles

In [40]:
# Grab X random item ids
test_sequence_length = 16
user_sequence = list(np.random.choice(item_ids, test_sequence_length))  # A sample user sequence
means, variances = estimate_uncertainty(bert4rec, user_sequence)

k = 5
top_k_recommendations = np.argsort(means[0])[-k:][::-1]  # Get top 5 recommended items
top_k_uncertainties = variances[0][top_k_recommendations]  # Get corresponding uncertainty scores

In [41]:
top_k_movie_titles = get_movie_titles(top_k_recommendations, movies)
for i in range(len(top_k_movie_titles)):
    print(top_k_movie_titles[i], 'Uncertainty Score: ', top_k_uncertainties[i])

Last Days of Disco, The (1998) Uncertainty Score:  0.040344868
Ciao, Professore! (Io speriamo che me la cavo) (1992) Uncertainty Score:  0.028077228
Everyone Says I Love You (1996) Uncertainty Score:  0.028014032
Mad Love (1995) Uncertainty Score:  0.024429861
Jerk, The (1979) Uncertainty Score:  0.03341728
