# Import Libraries:

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel, BertConfig, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm.notebook import tqdm
from torch.cuda.amp import GradScaler, autocast

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Load, Preprocess, and Split:

In [3]:
# Load dataset
ratings = pd.read_csv('/content/drive/MyDrive/recco/ratings.csv')

# Preprocess the dataset
ratings = ratings[['userId', 'movieId', 'rating']]
ratings = ratings.dropna()
ratings['userId'] = ratings['userId'].astype('str')
ratings['movieId'] = ratings['movieId'].astype('str')
ratings['rating'] = ratings['rating'].astype('float32')

# Create train and test sets
train_df, test_df = train_test_split(ratings, test_size=0.2, random_state=42)

# Model Define:

In [4]:
class BERT4Rec(nn.Module):
    def __init__(self):
        super(BERT4Rec, self).__init__()
        config = BertConfig.from_pretrained('bert-base-uncased')
        self.bert = BertModel.from_pretrained('bert-base-uncased', config=config)
        self.fc = nn.Linear(config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]
        output = self.fc(pooled_output)
        return output

# Dataset Class:

In [5]:
class MovieLensDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        user_id = row['userId']
        movie_id = row['movieId']
        rating = row['rating']

        # Encode user_id and movie_id
        encoded = self.tokenizer.encode_plus(
            user_id,
            movie_id,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            'rating': torch.tensor(rating, dtype=torch.float32)
        }

# hyperparameters, Tokenizer, DataLoader, Intilize Model, Optimizer, and Loss Function

In [6]:
# Hyperparameters
batch_size = 256
max_length = 16
epochs = 3
accumulation_steps = 2
learning_rate = 3e-5

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# DataLoader
train_dataset = MovieLensDataset(train_df, tokenizer, max_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=16, pin_memory=True)

# Initialize model
model = BERT4Rec()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model.to(device)

# Optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.MSELoss()

# Learning rate scheduler
total_steps = len(train_loader) * epochs // accumulation_steps
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Mixed precision training
scaler = GradScaler()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


cuda


# Training Loop:

In [7]:
for epoch in range(epochs):
    model.train()
    total_loss = 0
    optimizer.zero_grad()
    for i, batch in enumerate(tqdm(train_loader, desc=f'Epoch {epoch + 1}', leave=False)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        rating = batch['rating'].to(device)

        with autocast():
            output = model(input_ids, attention_mask)
            loss = loss_fn(output.flatten(), rating)
            loss = loss / accumulation_steps

        scaler.scale(loss).backward()

        if (i + 1) % accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        total_loss += loss.item() * accumulation_steps

    print(f'Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}')

Epoch 1:   0%|          | 0/58594 [00:00<?, ?it/s]

KeyboardInterrupt: 

# Evaluation:

In [10]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Evaluation mode
model.eval()
test_dataset = MovieLensDataset(test_df, tokenizer, max_length)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

total_loss = 0
all_predictions = []
all_ratings = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc='Testing', leave=False):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        rating = batch['rating'].to(device)

        output = model(input_ids, attention_mask)
        loss = loss_fn(output.flatten(), rating)
        total_loss += loss.item()

        all_predictions.extend(output.flatten().cpu().numpy())
        all_ratings.extend(rating.cpu().numpy())

# Calculate average loss
average_loss = total_loss / len(test_loader)
print(f'Average Test Loss: {average_loss}')

# Calculate RMSE and MAE
rmse = np.sqrt(mean_squared_error(all_ratings, all_predictions))
mae = mean_absolute_error(all_ratings, all_predictions)
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')

Testing:   0%|          | 0/14649 [00:00<?, ?it/s]

Average Test Loss: 1.0763259845362954
RMSE: 1.0374630689620972
MAE: 0.7891854643821716


In [None]:
torch.save(model.state_dict(), '/content/drive/MyDrive/recco/bert4rec_model.pth')

# Deployment:

In [9]:
# Load dataset and get unique movie IDs
ratings = pd.read_csv('/content/drive/MyDrive/recco/ratings.csv')
movie_ids = ratings['movieId'].unique()

# Load movie data
movies = pd.read_csv('/content/drive/MyDrive/recco/movies.csv')

# Create a dictionary to map movie IDs to titles
movie_id_to_title = dict(zip(movies['movieId'], movies['title']))

# Initialize the tokenizer and load the model
model.load_state_dict(torch.load('/content/drive/MyDrive/recco/bert4rec_model.pth'))
model.to(device)

BERT4Rec(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affi

In [11]:
# Function to predict ratings for all movies for a given user
def predict_ratings_for_user(user_id, movie_ids, model, tokenizer, device, max_length=128):
    model.eval()
    predictions = []

    for movie_id in movie_ids:
        encoded = tokenizer.encode_plus(
            str(user_id),
            str(movie_id),
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        input_ids = encoded['input_ids'].to(device)
        attention_mask = encoded['attention_mask'].to(device)

        with torch.no_grad():
            output = model(input_ids, attention_mask)

        rating = output.flatten().cpu().numpy()[0]
        predictions.append((movie_id, rating))

    return predictions

# Top 5 Movies for Specific User

In [12]:
# Function to get top N movies for a given user
def get_top_n_movies_for_user_with_titles(user_id, movie_ids, model, tokenizer, device, n=5, max_length=128):
    predictions = predict_ratings_for_user(user_id, movie_ids, model, tokenizer, device, max_length)
    predictions.sort(key=lambda x: x[1], reverse=True)
    top_n_movies = predictions[:n]

    # Map movie IDs to titles
    top_n_movies_with_titles = [(movie_id, movie_id_to_title.get(movie_id, "Unknown"), rating) for movie_id, rating in top_n_movies]
    return top_n_movies_with_titles

# Example usage
user_id = "1"
top_n_movies_with_titles = get_top_n_movies_for_user_with_titles(user_id, movie_ids, model, tokenizer, device, n=5)
print("Top 5 movies for user", user_id, ":", top_n_movies_with_titles)

Top 5 movies for user 1 : [(318, 'Shawshank Redemption, The (1994)', 4.825977), (858, 'Godfather, The (1972)', 4.7682385), (1221, 'Godfather: Part II, The (1974)', 4.7254777), (50, 'Usual Suspects, The (1995)', 4.706906), (8650, "Long Day's Journey Into Night (1962)", 4.705018)]


# rows_to_remove = len(ratings) // 4
# rows_to_drop = np.random.choice(ratings.index, size=rows_to_remove, replace=False)
# ratings = ratings.drop(rows_to_drop)