In [1]:
import pandas as pd
import numpy as np
import os

# Define paths to the MIND-small dataset
news_path = 'MINDsmall_train/news.tsv'
behaviors_path = 'MINDsmall_train/behaviors.tsv'

# Load data
news_df = pd.read_csv(news_path, sep='\t', header=None,
                      names=['news_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities'])

behaviors_df = pd.read_csv(behaviors_path, sep='\t', header=None,
                           names=['impression_id', 'user_id', 'time', 'history', 'impressions'])

# Explore data
print(news_df.head())
print(behaviors_df.head())


  news_id   category      subcategory  \
0  N55528  lifestyle  lifestyleroyals   
1  N19639     health       weightloss   
2  N61837       news        newsworld   
3  N53526     health           voices   
4  N38324     health          medical   

                                               title  \
0  The Brands Queen Elizabeth, Prince Charles, an...   
1                      50 Worst Habits For Belly Fat   
2  The Cost of Trump's Aid Freeze in the Trenches...   
3  I Was An NBA Wife. Here's How It Affected My M...   
4  How to Get Rid of Skin Tags, According to a De...   

                                            abstract  \
0  Shop the notebooks, jackets, and more that the...   
1  These seemingly harmless habits are holding yo...   
2  Lt. Ivan Molchanets peeked over a parapet of s...   
3  I felt like I was a fraud, and being an NBA wi...   
4  They seem harmless, but there's a very good re...   

                                             url  \
0  https://assets.msn.com/l

In [2]:
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Clean function for text
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = [word for word in text.split() if word not in stop_words]
    return ' '.join(tokens)

news_df['clean_title'] = news_df['title'].apply(clean_text)


[nltk_data] Downloading package stopwords to /home/smitha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
from torch.utils.data import Dataset
import random

# Create mapping from news_id to integer index
news_list = news_df['news_id'].tolist()
news_id2idx = {nid: idx+1 for idx, nid in enumerate(news_list)}  # +1 for padding=0

# Convert user histories into index sequences
def encode_history(history_string):
    return [news_id2idx[nid] for nid in history_string.split() if nid in news_id2idx]

sequences = behaviors_df['history'].dropna().apply(encode_history)
sequences = [s for s in sequences if len(s) >= 5]  # keep only long enough histories


In [4]:
class BERT4RecDataset(Dataset):
    def __init__(self, sequences, max_len=50, mask_prob=0.15):
        self.sequences = sequences
        self.max_len = max_len
        self.mask_prob = mask_prob
        self.mask_token = len(news_id2idx) + 1  # use last index for [MASK]

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq = self.sequences[idx][-self.max_len:]
        seq = [0] * (self.max_len - len(seq)) + seq  # pad on the left

        input_ids = seq.copy()
        labels = [-100] * self.max_len

        for i in range(self.max_len):
            if input_ids[i] != 0 and random.random() < self.mask_prob:
                labels[i] = input_ids[i]
                input_ids[i] = self.mask_token

        return torch.tensor(input_ids), torch.tensor(labels)


In [None]:
import math
def get_sinusoidal_encoding(seq_len, dim, device):
    pe = torch.zeros(seq_len, dim, device=device)
    position = torch.arange(0, seq_len, dtype=torch.float, device=device).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, dim, 2, device=device).float() * (-math.log(10000.0) / dim))
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    return pe.unsqueeze(0)

In [5]:
import torch
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer

class BERT4Rec(nn.Module):
    def __init__(self, vocab_size, embed_dim=64, num_heads=4, num_layers=2, max_len=50):
        super(BERT4Rec, self).__init__()
        self.embed_dim = embed_dim
        self.token_embed = nn.Embedding(vocab_size + 2, embed_dim, padding_idx=0)  # +2 for mask and pad
        encoder_layer = TransformerEncoderLayer(embed_dim, num_heads, batch_first=True)
        self.transformer = TransformerEncoder(encoder_layer, num_layers)
        self.output_layer = nn.Linear(embed_dim, vocab_size + 1)  # output logits for each token

    def forward(self, input_ids):
        device = input_ids.device
        seq_len = input_ids.size(1)
        x = self.token_embed(input_ids)
        pos_enc = get_sinusoidal_encoding(seq_len, self.embed_dim, device)
        x = x + pos_enc
        x = self.transformer(x)
        return self.output_layer(x)

In [6]:
from torch.utils.data import DataLoader

dataset = BERT4RecDataset(sequences)
dataloader = DataLoader(dataset, batch_size=128, shuffle=True)

model = BERT4Rec(vocab_size=len(news_id2idx))
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

model.train()
for epoch in range(10):
    for input_ids, labels in dataloader:
        logits = model(input_ids)
        loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}: Loss = {loss.item():.4f}")


Epoch 1: Loss = 8.7381
Epoch 2: Loss = 8.5564
Epoch 3: Loss = 8.3818
Epoch 4: Loss = 8.4325
Epoch 5: Loss = 8.4036
Epoch 6: Loss = 8.3967
Epoch 7: Loss = 8.2226
Epoch 8: Loss = 8.2932
Epoch 9: Loss = 8.5936
Epoch 10: Loss = 8.3075


In [7]:
def recommend_bert4rec(user_seq, top_k=10):
    model.eval()
    encoded = encode_history(user_seq)[-50:]
    input_ids = [0] * (50 - len(encoded)) + encoded
    input_tensor = torch.tensor([input_ids])
    with torch.no_grad():
        logits = model(input_tensor)[0, -1]  # focus on last position
        topk = torch.topk(logits, k=top_k).indices.tolist()
    idx2news = {v: k for k, v in news_id2idx.items()}
    return [idx2news[i] for i in topk if i in idx2news]

# Example: generate recommendation
user_history = behaviors_df.iloc[0]['history']
print("Recommendations:", recommend_bert4rec(user_history))


Recommendations: ['N47020', 'N31801', 'N61864', 'N41375', 'N59704', 'N43142', 'N19347', 'N13429', 'N56753', 'N4607']


In [8]:
val_behaviors_df = pd.read_csv(f"./MINDsmall_dev/behaviors.tsv", sep='\t', header=None,
                               names=['impression_id', 'user_id', 'time', 'history', 'impressions'])
val_behaviors_df['history'] = val_behaviors_df['history'].fillna('')

from sklearn.metrics import ndcg_score, label_ranking_average_precision_score

def evaluate_model(model, val_df, news_id2idx):
    model.eval()
    device = next(model.parameters()).device
    all_ndcgs = []
    all_hits = []

    for _, row in val_df.iterrows():
        history = [news_id2idx[nid] for nid in row['history'].split() if nid in news_id2idx]
        if len(history) == 0:
            continue
        history = history[-50:]
        input_tensor = torch.tensor([history + [0]*(50 - len(history))]).to(device)

        with torch.no_grad():
            logits = model(input_tensor)
            scores = logits[0, -1].cpu().numpy()

        impressions = row['impressions'].split()
        news_ids = [imp.split('-')[0] for imp in impressions if imp.split('-')[0] in news_id2idx]
        labels = [int(imp.split('-')[1]) for imp in impressions if imp.split('-')[0] in news_id2idx]
        candidates = [news_id2idx[nid] for nid in news_ids]
        candidate_scores = [scores[cid] for cid in candidates]

        if len(candidate_scores) > 1 and sum(labels) > 0:
            all_ndcgs.append(ndcg_score([labels], [candidate_scores]))
            all_hits.append(label_ranking_average_precision_score([labels], [candidate_scores]))

    if all_ndcgs:
        print(f"Avg NDCG: {sum(all_ndcgs)/len(all_ndcgs):.4f}")
    if all_hits:
        print(f"Avg MRR: {sum(all_hits)/len(all_hits):.4f}")



In [9]:
evaluate_model(model, val_behaviors_df, news_id2idx)

Avg NDCG: 0.4162
Avg MRR: 0.2469
