In [58]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import roc_auc_score
import tensorflow as tf
from tqdm import tqdm
import warnings
import gc
warnings.filterwarnings("ignore")

In [59]:
!pip3 install torch_geometric



In [60]:
news_df = pd.read_table("Documents/CMPE256/256-Project/MINDsmall_train/news.tsv", header=None, names=[
    "news_id", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"
])

behaviors_df = pd.read_table("Documents/CMPE256/256-Project/MINDsmall_train/behaviors.tsv", header=None, names=[
    "impression_id", "user_id", "time", "history", "impressions"
])


Preprocessing: Cleaning the dataset

In [61]:
news_df.dropna(subset=['title'], inplace=True)
behaviors_df.dropna(subset=['impressions'], inplace=True)
print("News\n")
print(news_df.head())
print("Click behaviors")
print(behaviors_df.head())

News

  news_id   category      subcategory  \
0  N55528  lifestyle  lifestyleroyals   
1  N19639     health       weightloss   
2  N61837       news        newsworld   
3  N53526     health           voices   
4  N38324     health          medical   

                                               title  \
0  The Brands Queen Elizabeth, Prince Charles, an...   
1                      50 Worst Habits For Belly Fat   
2  The Cost of Trump's Aid Freeze in the Trenches...   
3  I Was An NBA Wife. Here's How It Affected My M...   
4  How to Get Rid of Skin Tags, According to a De...   

                                            abstract  \
0  Shop the notebooks, jackets, and more that the...   
1  These seemingly harmless habits are holding yo...   
2  Lt. Ivan Molchanets peeked over a parapet of s...   
3  I felt like I was a fraud, and being an NBA wi...   
4  They seem harmless, but there's a very good re...   

                                             url  \
0  https://assets.msn

Extracting the clicked news from behaviors dataset with news ids marked with 1(representing clicked news) and thus adding a new column of clicked_news

In [62]:
def extract_clicked_news(imp):
    return [i.split('-')[0] for i in imp.split() if i.endswith('-1')]

behaviors_df['clicked_news'] = behaviors_df['impressions'].apply(extract_clicked_news)

In [63]:
user_clicks = behaviors_df.explode('clicked_news')[['user_id', 'clicked_news']]
user_clicks.columns = ['user_id', 'news_id']


In [64]:
from sklearn.preprocessing import LabelEncoder

user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

user_clicks['user_idx'] = user_encoder.fit_transform(user_clicks['user_id'])
user_clicks['item_idx'] = item_encoder.fit_transform(user_clicks['news_id'])

num_users = user_clicks['user_idx'].nunique()
num_items = user_clicks['item_idx'].nunique()
print(f"Unique users: {num_users}, Unique news items: {num_items}")

Unique users: 50000, Unique news items: 7713


In [65]:
import torch
from torch_geometric.data import Data

# Convert user and item indices to torch tensors
edge_index = torch.tensor([
    user_clicks['user_idx'].values,
    user_clicks['item_idx'].values + num_users  # shift item indices to avoid overlap
], dtype=torch.long)

# Create PyG Data object for bipartite graph
data = Data(edge_index=edge_index)

# Save useful attributes for later use
data.num_nodes = num_users + num_items
data.num_users = num_users
data.num_items = num_items

print(data)

Data(edge_index=[2, 236344], num_nodes=57713, num_users=50000, num_items=7713)


In [66]:
import torch.nn as nn
import torch.nn.functional as F

class LightGCN(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=64, num_layers=3):
        super(LightGCN, self).__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.num_nodes = num_users + num_items
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        # Initialize user and item embeddings
        self.embedding = nn.Embedding(self.num_nodes, embedding_dim)
        nn.init.xavier_uniform_(self.embedding.weight)

    def forward(self, edge_index):
        # Initial embeddings
        x = self.embedding.weight

        # To accumulate layer-wise embeddings
        all_embeddings = [x]

        for _ in range(self.num_layers):
            # LightGCN propagation: simple mean aggregation from neighbors
            row, col = edge_index
            deg = torch.bincount(row, minlength=self.num_nodes).float().clamp(min=1)
            norm = 1.0 / deg[row].sqrt() / deg[col].sqrt()
            x = torch.zeros_like(x).scatter_add_(0, row.unsqueeze(-1).expand(-1, x.size(1)), x[col] * norm.unsqueeze(1))
            all_embeddings.append(x)

        # Final embedding is the sum of embeddings from all layers
        out = torch.stack(all_embeddings, dim=0).mean(dim=0)
        return out

    def get_user_item_embeddings(self):
        out = self.forward(edge_index)
        user_emb = out[:self.num_users]
        item_emb = out[self.num_users:]
        return user_emb, item_emb

In [67]:
import random

def bpr_loss(user_emb, pos_emb, neg_emb):
    pos_scores = torch.sum(user_emb * pos_emb, dim=1)
    neg_scores = torch.sum(user_emb * neg_emb, dim=1)
    loss = -F.logsigmoid(pos_scores - neg_scores).mean()
    return loss

def sample_mini_batch(edge_index, num_users, num_items, batch_size):
    user_indices = torch.randint(0, num_users, (batch_size,))
    pos_items = []
    neg_items = []

    for u in user_indices:
        user_edges = edge_index[1][edge_index[0] == u]
        if len(user_edges) == 0:
            continue
        pos = user_edges[random.randint(0, len(user_edges) - 1)]
        while True:
            neg = torch.randint(0, num_items, (1,)).item()
            if neg + num_users not in user_edges:
                break
        pos_items.append(pos - num_users)
        neg_items.append(neg)

    return user_indices, torch.tensor(pos_items), torch.tensor(neg_items)

# Set training params
embedding_dim = 256
num_layers = 3
batch_size = 1024
epochs = 15
learning_rate = 1e-3

model = LightGCN(num_users, num_items, embedding_dim, num_layers)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [68]:
#Loading the models and embeddings
user_emb = torch.load("Documents/CMPE256/256-Project/user_embeddings.pt")
item_emb = torch.load("Documents/CMPE256/256-Project/item_embeddings.pt")

In [69]:
# Rebuild the model architecture (must match the one used during training)
model = LightGCN(num_users, num_items, embedding_dim=256, num_layers=3)

# Load the trained weights
model.load_state_dict(torch.load("Documents/CMPE256/256-Project/lightgcn_model.pth", map_location=torch.device('cpu')))

# Set the model to evaluation mode
model.eval()

LightGCN(
  (embedding): Embedding(57713, 256)
)

In [71]:
from sklearn.metrics import ndcg_score

def evaluate_ndcg(model, val_samples, k=10):
    model.eval()
    user_emb, item_emb = model.get_user_item_embeddings()

    ndcgs = []
    for _, row in val_samples.iterrows():
        user_idx = row['user_idx']
        item_idxs = row['news_indices']
        labels = row['labels']

        # Skip samples with unknown news
        if any(idx == -1 for idx in item_idxs):
            continue

        scores = torch.matmul(user_emb[user_idx], item_emb[item_idxs].T).detach().numpy()
        ndcg = ndcg_score([labels], [scores], k=k)
        ndcgs.append(ndcg)

    return sum(ndcgs) / len(ndcgs) if ndcgs else 0.0
    
ndcg10_test = evaluate_ndcg(model, test_samples)
print(f"nDCG@10 on MINDsmall_dev test set: {ndcg10_test:.4f}")

nDCG@10 on MINDsmall_dev test set: 0.7940


In [72]:
import pickle
with open('Documents/CMPE256/256-Project/tfidf_vectorizer.pt', 'rb') as f:
    tfidf_vectorizer = pickle.load(f)

tfidf_embeddings = torch.load('Documents/CMPE256/256-Project/tfidf_embeddings.pt')  # TF-IDF representation of news articles

# Load BERT Embeddings (semantic content embeddings)
bert_embeddings = torch.load('Documents/CMPE256/256-Project/bert_embeddings.pt')  # Precomputed BERT embeddings for news articles

print(f"TF-IDF Embeddings Shape: {tfidf_embeddings.shape}")
print(f"BERT Embeddings Shape: {bert_embeddings.shape}")

TF-IDF Embeddings Shape: torch.Size([51282, 5000])
BERT Embeddings Shape: torch.Size([51282, 384])


In [73]:
import torch
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer

class BERT4Rec(nn.Module):
    def __init__(self, vocab_size, embed_dim=64, num_heads=4, num_layers=2, max_len=50):
        super(BERT4Rec, self).__init__()
        self.token_embed = nn.Embedding(vocab_size + 2, embed_dim, padding_idx=0)  # +2 for mask and pad
        encoder_layer = TransformerEncoderLayer(embed_dim, num_heads, batch_first=True)
        self.transformer = TransformerEncoder(encoder_layer, num_layers)
        self.output_layer = nn.Linear(embed_dim, vocab_size + 1)  # output logits for each token

    def forward(self, input_ids):
        x = self.token_embed(input_ids)
        x = self.transformer(x)
        return self.output_layer(x)

In [74]:
# Load BERT4Rec user embeddings
bert4rec_user_embeddings = np.load('Documents/CMPE256/256-Project/bert4rec_embeddings.npy')  # shape: (num_users, embedding_dim)
bert4rec_user_embeddings = torch.tensor(bert4rec_user_embeddings)

print(f"BERT4Rec User Embeddings Shape: {bert4rec_user_embeddings.shape}")

# Create mapping from news_id to integer index
news_list = news_df['news_id'].tolist()
news_id2idx = {nid: idx+1 for idx, nid in enumerate(news_list)}  # +1 for padding=0
# Load the full BERT4Rec model directly (NOT state_dict)
bert4rec_model = torch.load('Documents/CMPE256/256-Project/bert4rec_full_model.pth', map_location=torch.device('cpu'), weights_only=False)

# Set to evaluation mode
bert4rec_model.eval()

print("BERT4Rec full model loaded successfully!")

BERT4Rec User Embeddings Shape: torch.Size([51284, 64])
BERT4Rec full model loaded successfully!


Calculation of scores from each model to be combined using weighted ensemble

In [75]:
def score_gnn(user_id, user_encoder, item_encoder, model, edge_index=None):
    """
    Computes GNN scores for a user.
    """
    if user_id not in user_encoder.classes_:
        return None

    user_idx = user_encoder.transform([user_id])[0]
    user_emb, item_emb = model.get_user_item_embeddings()

    scores = torch.matmul(user_emb[user_idx], item_emb.T).detach().numpy()
    return scores


def score_tfidf(user_id, interactions_df, tfidf_embeddings, tfidf_vectorizer, user_encoder, news_id_to_idx):
    """
    Computes TF-IDF scores for a user.
    """
    if user_id not in user_encoder.classes_:
        return None

    user_clicked = interactions_df[interactions_df['user_id'] == user_id]['news_id'].tolist()
    clicked_idx = [news_id_to_idx.get(nid, None) for nid in user_clicked]
    clicked_idx = [idx for idx in clicked_idx if idx is not None]

    if not clicked_idx:
        return np.zeros(tfidf_embeddings.shape[0])

    user_vector = tfidf_embeddings[clicked_idx].mean(axis=0, keepdim=True)
    similarities = cosine_similarity(user_vector, tfidf_embeddings).flatten()

    return similarities

def score_bert(user_id, interactions_df, bert_embeddings, user_encoder, news_id_to_idx):
    """
    Computes BERT content similarity scores for a user.
    """
    if user_id not in user_encoder.classes_:
        return None

    user_clicked = interactions_df[interactions_df['user_id'] == user_id]['news_id'].tolist()
    clicked_idx = [news_id_to_idx.get(nid, None) for nid in user_clicked]
    clicked_idx = [idx for idx in clicked_idx if idx is not None]

    if not clicked_idx:
        return np.zeros(bert_embeddings.shape[0])

    user_vector = bert_embeddings[clicked_idx].mean(dim=0)
    scores = torch.matmul(user_vector, bert_embeddings.T).detach().numpy()

    return scores

def score_bert4rec(user_id, user_encoder, bert4rec_user_embeddings):
    """
    Computes BERT4Rec user embedding scores.
    """
    if user_id not in user_encoder.classes_:
        return None

    user_idx = user_encoder.transform([user_id])[0]

    if user_idx >= bert4rec_user_embeddings.shape[0]:
        return np.zeros(bert4rec_user_embeddings.shape[1])

    user_emb = bert4rec_user_embeddings[user_idx]
    scores = torch.matmul(user_emb, bert4rec_user_embeddings.T).detach().numpy()

    return scores

In [76]:
# interactions_df: user_id ↔ news_id clicked
try:
    interactions_df
except NameError:
    interactions_df = user_clicks[['user_id', 'news_id']]

# news_id_to_idx and idx_to_news_id mappings
try:
    news_id_to_idx
    idx_to_news_id
except NameError:
    news_id_to_idx = {nid: idx for idx, nid in enumerate(news_df['news_id'].tolist())}
    idx_to_news_id = {idx: nid for nid, idx in news_id_to_idx.items()}

# news_id_to_title mapping
try:
    news_id_to_title
except NameError:
    news_id_to_title = dict(zip(news_df['news_id'], news_df['title']))

print("Created all missing mappings: interactions_df, news_id_to_idx, idx_to_news_id, news_id_to_title.")

Created all missing mappings: interactions_df, news_id_to_idx, idx_to_news_id, news_id_to_title.


Function to blend all the models with weightage defined

In [77]:
import numpy as np

def blended_recommendation(
    user_id,
    user_encoder,
    item_encoder,
    model,
    edge_index,
    interactions_df,
    tfidf_embeddings,
    tfidf_vectorizer,
    bert_embeddings,
    bert4rec_user_embeddings,
    news_id_to_idx,
    alpha=0.4,
    beta=0.2,
    gamma=0.2,
    delta=0.2,
    top_k=10,
    candidate_news_ids=None
):
    """
    Generate blended recommendations by scoring only on candidate news articles.
    """
    model.eval()

    # --- Candidate news list ---
    if candidate_news_ids is not None:
        candidate_indices = [news_id_to_idx.get(nid, -1) for nid in candidate_news_ids if nid in news_id_to_idx]
    else:
        candidate_indices = list(range(tfidf_embeddings.shape[0]))

    # --- GNN Scores ---
    gnn_user_emb, gnn_item_emb = model.get_user_item_embeddings()
    
    if user_id not in user_encoder.classes_:
        return []

    user_idx = user_encoder.transform([user_id])[0]

    gnn_raw_scores = torch.matmul(gnn_user_emb[user_idx], gnn_item_emb.T).detach().cpu().numpy()

    # Fill missing gnn scores
    full_size = tfidf_embeddings.shape[0]
    expanded_gnn_scores = np.full(full_size, -np.inf)
    expanded_gnn_scores[:len(gnn_raw_scores)] = gnn_raw_scores

    gnn_scores = expanded_gnn_scores[candidate_indices]

    # --- TF-IDF Scores ---
    tfidf_scores_full = score_tfidf(user_id, interactions_df, tfidf_embeddings, tfidf_vectorizer, user_encoder, news_id_to_idx)
    tfidf_scores = tfidf_scores_full[candidate_indices]

    # --- BERT Scores ---
    bert_scores_full = score_bert(user_id, interactions_df, bert_embeddings, user_encoder, news_id_to_idx)
    bert_scores = bert_scores_full[candidate_indices]

    # --- BERT4Rec Scores ---
    bert4rec_scores_full = score_bert4rec(user_id, user_encoder, bert4rec_user_embeddings)
    bert4rec_scores = bert4rec_scores_full[candidate_indices]

    def normalize(x):
        x = np.nan_to_num(x)
        if x.size == 0:
            return x  # return empty safely
        min_x = np.min(x)
        max_x = np.max(x)
        if max_x - min_x == 0:
            return np.zeros_like(x)
        return (x - min_x) / (max_x - min_x)

    gnn_scores = normalize(gnn_scores)
    tfidf_scores = normalize(tfidf_scores)
    bert_scores = normalize(bert_scores)
    bert4rec_scores = normalize(bert4rec_scores)

    # --- Blend all normalized scores ---
    final_scores = (
        alpha * gnn_scores +
        beta * tfidf_scores +
        gamma * bert_scores +
        delta * bert4rec_scores
    )

    # --- Pick top-k
    top_indices = np.argsort(final_scores)[::-1][:top_k]

    if candidate_news_ids is not None:
        top_news_ids = [candidate_news_ids[i] for i in top_indices]
    else:
        top_news_ids = item_encoder.inverse_transform(top_indices)

    return top_news_ids

Prediction of top 10 news for any given user id from the dataset

In [78]:
def predict_for_user(
    user_id,
    user_encoder,
    item_encoder,
    model,
    edge_index,
    interactions_df,
    tfidf_embeddings,
    tfidf_vectorizer,
    bert_embeddings,
    bert4rec_user_embeddings,
    news_id_to_idx,
    top_k=10,
    alpha=0.4,
    beta=0.2,
    gamma=0.2,
    delta=0.2
):
    """
    Predict Top-K news articles for a given user using the blended model.
    """

    # Call blended_recommendation (predict across all news if no candidate list)
    top_news_ids = blended_recommendation(
        user_id=user_id,
        user_encoder=user_encoder,
        item_encoder=item_encoder,
        model=model,
        edge_index=edge_index,
        interactions_df=interactions_df,
        tfidf_embeddings=tfidf_embeddings,
        tfidf_vectorizer=tfidf_vectorizer,
        bert_embeddings=bert_embeddings,
        bert4rec_user_embeddings=bert4rec_user_embeddings,
        news_id_to_idx=news_id_to_idx,
        alpha=alpha,
        beta=beta,
        gamma=gamma,
        delta=delta,
        top_k=top_k,
        candidate_news_ids=None  # Predict across all news if not specified
    )

    return top_news_ids

In [79]:
target_user_id = "U12345"  # Replace with a valid user ID
predicted_top_news = predict_for_user(
    user_id=target_user_id,
    user_encoder=user_encoder,
    item_encoder=item_encoder,
    model=model,
    edge_index=edge_index,
    interactions_df=interactions_df,
    tfidf_embeddings=tfidf_embeddings,
    tfidf_vectorizer=tfidf_vectorizer,
    bert_embeddings=bert_embeddings,
    bert4rec_user_embeddings=bert4rec_user_embeddings,
    news_id_to_idx=news_id_to_idx,
    top_k=10
)

print(f"Top-10 Recommended News for User {target_user_id}:")
for idx, news_id in enumerate(predicted_top_news, 1):
    title = news_id_to_title.get(news_id, "Unknown Title")
    print(f"{idx}. {title}")


Top-10 Recommended News for User U12345:
1. Texas Plane Crashed After Dropping Water For Gender Reveal: NTSB
2. 7 Rules for How to Call in Sick to Work
3. Some Burning Steelers Questions about Mike Tomlin
4. Rudy Giuliani's globetrotting complicates US foreign policy
5. No 'Ralphie' for a second straight game leaves CU fans concerned
6. Coast Guard suspends search for missing airman
7. Watch: Fans go crazy when Burrow arrives back in Baton Rouge after win over Tide
8. Fear spreads among Iraqi protesters as government cracks down, keeps death toll secret
9. Fastest American Muscle Cars EVER
10. Walmart sales hit all-time high


Load data for validation from MIND dev dataset

In [80]:
# --- Load behaviors.tsv if not already ---
behaviors_df = pd.read_csv('Documents/CMPE256/256-Project/MINDsmall_dev/behaviors.tsv', sep='\t', header=None,
                           names=['impression_id', 'user_id', 'time', 'history', 'impressions'])

# --- Drop rows with no impressions ---
behaviors_df.dropna(subset=['history'], inplace=True)

# --- Extract clicked news from impressions (correct way) ---
def extract_clicked_news(impressions):
    return [imp.split('-')[0] for imp in impressions.split() if imp.endswith('-1')]

behaviors_df['clicked_news'] = behaviors_df['impressions'].apply(extract_clicked_news)

# --- Filter sessions with at least one click ---
test_behaviors = behaviors_df[behaviors_df['clicked_news'].map(len) > 0].copy()

# --- Prepare test_samples exactly like GNN evaluation ---
def prepare_test_sample(row):
    clicked = row['clicked_news']
    all_news = [imp.split('-')[0] for imp in row['impressions'].split()]
    labels = [1 if n in clicked else 0 for n in all_news]
    return pd.Series([row['user_id'], all_news, labels], index=['user_id', 'candidate_news', 'labels'])

test_samples = test_behaviors.apply(prepare_test_sample, axis=1)

# --- Filter only seen users ---
test_samples = test_samples[test_samples['user_id'].isin(user_encoder.classes_)].copy()

# --- Map to user indices and news indices ---
test_samples['user_idx'] = user_encoder.transform(test_samples['user_id'])

test_samples['news_indices'] = test_samples['candidate_news'].apply(
    lambda news_list: [item_encoder.transform([nid])[0] if nid in item_encoder.classes_ else -1 for nid in news_list]
)

# --- Drop samples with -1 indices (unknown news) ---
test_samples = test_samples[(test_samples['news_indices'].apply(lambda x: all(idx != -1 for idx in x)))]

Evaluation function

In [81]:
from sklearn.metrics import ndcg_score

def evaluate_blended_model(
    model,
    val_samples,
    user_encoder,
    item_encoder,
    edge_index,
    interactions_df,
    tfidf_embeddings,
    tfidf_vectorizer,
    bert_embeddings,
    bert4rec_user_embeddings,
    news_id_to_idx,
    k=10,
    alpha=0.4,
    beta=0.2,
    gamma=0.2,
    delta=0.2
):
    """
    Evaluate the blended model using only the candidate impressions shown to user.
    """
    model.eval()
    ndcgs = []

    for _, row in val_samples.iterrows():
        user_id = row['user_id']
        candidate_news_ids = row['candidate_news']
        labels = row['labels']

        # --- Pass candidate_news_ids correctly ---
        predicted_top_news = blended_recommendation(
            user_id=user_id,
            user_encoder=user_encoder,
            item_encoder=item_encoder,
            model=model,
            edge_index=edge_index,
            interactions_df=interactions_df,
            tfidf_embeddings=tfidf_embeddings,
            tfidf_vectorizer=tfidf_vectorizer,
            bert_embeddings=bert_embeddings,
            bert4rec_user_embeddings=bert4rec_user_embeddings,
            news_id_to_idx=news_id_to_idx,
            top_k=len(candidate_news_ids),
            candidate_news_ids=candidate_news_ids,  # IMPORTANT!!
            alpha=alpha,
            beta=beta,
            gamma=gamma,
            delta=delta
        )

        # Create binary relevance: 1 if predicted in top news, else 0
        predicted_binary = [1 if nid in predicted_top_news else 0 for nid in candidate_news_ids]

        if len(predicted_binary) != len(labels):
            continue  # skip if mismatch

        ndcg = ndcg_score([labels], [predicted_binary], k=k)
        ndcgs.append(ndcg)

    return sum(ndcgs) / len(ndcgs) if ndcgs else 0.0
    
# --- Now evaluate ---
final_ndcg10 = evaluate_blended_model(
    model=model,
    val_samples=test_samples,
    user_encoder=user_encoder,
    item_encoder=item_encoder,
    edge_index=edge_index,
    interactions_df=interactions_df,
    tfidf_embeddings=tfidf_embeddings,
    tfidf_vectorizer=tfidf_vectorizer,
    bert_embeddings=bert_embeddings,
    bert4rec_user_embeddings=bert4rec_user_embeddings,
    news_id_to_idx=news_id_to_idx,
    k=10
)

print(f"nDCG@10 for Blended Model: {final_ndcg10:.4f}")

nDCG@10 for Blended Model: 0.7401
