In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import roc_auc_score
import tensorflow as tf
from tqdm import tqdm
import warnings
import gc
warnings.filterwarnings("ignore")

In [3]:
!pip3 install torch_geometric



In [4]:
news_df = pd.read_table("./MINDsmall_train/news.tsv", header=None, names=[
    "news_id", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"
])

behaviors_df = pd.read_table("./MINDsmall_train/behaviors.tsv", header=None, names=[
    "impression_id", "user_id", "time", "history", "impressions"
])


In [5]:
news_df.dropna(subset=['title'], inplace=True)
behaviors_df.dropna(subset=['impressions'], inplace=True)
print("News\n")
print(news_df.head())
print("Click behaviors")
print(behaviors_df.head())

News

  news_id   category      subcategory  \
0  N55528  lifestyle  lifestyleroyals   
1  N19639     health       weightloss   
2  N61837       news        newsworld   
3  N53526     health           voices   
4  N38324     health          medical   

                                               title  \
0  The Brands Queen Elizabeth, Prince Charles, an...   
1                      50 Worst Habits For Belly Fat   
2  The Cost of Trump's Aid Freeze in the Trenches...   
3  I Was An NBA Wife. Here's How It Affected My M...   
4  How to Get Rid of Skin Tags, According to a De...   

                                            abstract  \
0  Shop the notebooks, jackets, and more that the...   
1  These seemingly harmless habits are holding yo...   
2  Lt. Ivan Molchanets peeked over a parapet of s...   
3  I felt like I was a fraud, and being an NBA wi...   
4  They seem harmless, but there's a very good re...   

                                             url  \
0  https://assets.msn

In [16]:
def extract_clicked_news(imp):
    return [i.split('-')[0] for i in imp.split() if i.endswith('-1')]

behaviors_df['clicked_news'] = behaviors_df['impressions'].apply(extract_clicked_news)

In [18]:
user_clicks = behaviors_df.explode('clicked_news')[['user_id', 'clicked_news']]
user_clicks.columns = ['user_id', 'news_id']


In [22]:
from sklearn.preprocessing import LabelEncoder

user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

user_clicks['user_idx'] = user_encoder.fit_transform(user_clicks['user_id'])
user_clicks['item_idx'] = item_encoder.fit_transform(user_clicks['news_id'])

num_users = user_clicks['user_idx'].nunique()
num_items = user_clicks['item_idx'].nunique()
print(f"Unique users: {num_users}, Unique news items: {num_items}")

Unique users: 50000, Unique news items: 7713


In [24]:
import torch
from torch_geometric.data import Data

# Convert user and item indices to torch tensors
edge_index = torch.tensor([
    user_clicks['user_idx'].values,
    user_clicks['item_idx'].values + num_users  # shift item indices to avoid overlap
], dtype=torch.long)

# Create PyG Data object for bipartite graph
data = Data(edge_index=edge_index)

# Save useful attributes for later use
data.num_nodes = num_users + num_items
data.num_users = num_users
data.num_items = num_items

print(data)

Data(edge_index=[2, 236344], num_nodes=57713, num_users=50000, num_items=7713)


In [30]:
import torch.nn as nn
import torch.nn.functional as F

class LightGCN(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim=64, num_layers=3):
        super(LightGCN, self).__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.num_nodes = num_users + num_items
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        # Initialize user and item embeddings
        self.embedding = nn.Embedding(self.num_nodes, embedding_dim)
        nn.init.xavier_uniform_(self.embedding.weight)

    def forward(self, edge_index):
        # Initial embeddings
        x = self.embedding.weight

        # To accumulate layer-wise embeddings
        all_embeddings = [x]

        for _ in range(self.num_layers):
            # LightGCN propagation: simple mean aggregation from neighbors
            row, col = edge_index
            deg = torch.bincount(row, minlength=self.num_nodes).float().clamp(min=1)
            norm = 1.0 / deg[row].sqrt() / deg[col].sqrt()
            x = torch.zeros_like(x).scatter_add_(0, row.unsqueeze(-1).expand(-1, x.size(1)), x[col] * norm.unsqueeze(1))
            all_embeddings.append(x)

        # Final embedding is the sum of embeddings from all layers
        out = torch.stack(all_embeddings, dim=0).mean(dim=0)
        return out

    def get_user_item_embeddings(self):
        out = self.forward(edge_index)
        user_emb = out[:self.num_users]
        item_emb = out[self.num_users:]
        return user_emb, item_emb

In [32]:
import random

def bpr_loss(user_emb, pos_emb, neg_emb):
    pos_scores = torch.sum(user_emb * pos_emb, dim=1)
    neg_scores = torch.sum(user_emb * neg_emb, dim=1)
    loss = -F.logsigmoid(pos_scores - neg_scores).mean()
    return loss

def sample_mini_batch(edge_index, num_users, num_items, batch_size):
    user_indices = torch.randint(0, num_users, (batch_size,))
    pos_items = []
    neg_items = []

    for u in user_indices:
        user_edges = edge_index[1][edge_index[0] == u]
        if len(user_edges) == 0:
            continue
        pos = user_edges[random.randint(0, len(user_edges) - 1)]
        while True:
            neg = torch.randint(0, num_items, (1,)).item()
            if neg + num_users not in user_edges:
                break
        pos_items.append(pos - num_users)
        neg_items.append(neg)

    return user_indices, torch.tensor(pos_items), torch.tensor(neg_items)

# Set training params
embedding_dim = 256
num_layers = 3
batch_size = 1024
epochs = 15
learning_rate = 1e-3

model = LightGCN(num_users, num_items, embedding_dim, num_layers)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [192]:
from tqdm import trange
from sklearn.metrics import ndcg_score

model.train()
for epoch in range(epochs):
    total_loss = 0
    num_batches = data.num_users // batch_size + 1

    for _ in trange(num_batches, desc=f"Epoch {epoch+1}"):
        user_idx, pos_idx, neg_idx = sample_mini_batch(edge_index, num_users, num_items, batch_size)

        user_emb, item_emb = model.get_user_item_embeddings()
        u_emb = user_emb[user_idx]
        pos_emb = item_emb[pos_idx]
        neg_emb = item_emb[neg_idx]

        loss = bpr_loss(u_emb, pos_emb, neg_emb)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / num_batches
    print(f"Epoch {epoch+1} - Loss: {avg_loss:.4f}")
    
    

Epoch 1: 100%|██████████████████████████████████| 49/49 [00:31<00:00,  1.57it/s]


Epoch 1 - Loss: 0.6896


Epoch 2: 100%|██████████████████████████████████| 49/49 [00:31<00:00,  1.56it/s]


Epoch 2 - Loss: 0.6631


Epoch 3: 100%|██████████████████████████████████| 49/49 [00:32<00:00,  1.51it/s]


Epoch 3 - Loss: 0.5936


Epoch 4: 100%|██████████████████████████████████| 49/49 [00:33<00:00,  1.47it/s]


Epoch 4 - Loss: 0.4904


Epoch 5: 100%|██████████████████████████████████| 49/49 [00:35<00:00,  1.37it/s]


Epoch 5 - Loss: 0.3852


Epoch 6: 100%|██████████████████████████████████| 49/49 [00:32<00:00,  1.53it/s]


Epoch 6 - Loss: 0.3106


Epoch 7: 100%|██████████████████████████████████| 49/49 [00:32<00:00,  1.51it/s]


Epoch 7 - Loss: 0.2578


Epoch 8: 100%|██████████████████████████████████| 49/49 [00:37<00:00,  1.31it/s]


Epoch 8 - Loss: 0.2249


Epoch 9: 100%|██████████████████████████████████| 49/49 [00:39<00:00,  1.24it/s]


Epoch 9 - Loss: 0.2042


Epoch 10: 100%|█████████████████████████████████| 49/49 [00:39<00:00,  1.25it/s]


Epoch 10 - Loss: 0.1873


Epoch 11: 100%|█████████████████████████████████| 49/49 [00:38<00:00,  1.26it/s]


Epoch 11 - Loss: 0.1712


Epoch 12: 100%|█████████████████████████████████| 49/49 [00:35<00:00,  1.37it/s]


Epoch 12 - Loss: 0.1643


Epoch 13: 100%|█████████████████████████████████| 49/49 [00:38<00:00,  1.29it/s]


Epoch 13 - Loss: 0.1539


Epoch 14: 100%|█████████████████████████████████| 49/49 [00:36<00:00,  1.34it/s]


Epoch 14 - Loss: 0.1485


Epoch 15: 100%|█████████████████████████████████| 49/49 [00:36<00:00,  1.34it/s]

Epoch 15 - Loss: 0.1419





In [193]:
from sklearn.metrics import ndcg_score

# Prepare a sample validation set (say, first 1000 impressions)
val_behaviors = behaviors_df.head(1000).copy()
val_behaviors = val_behaviors[val_behaviors['clicked_news'].map(len) > 0]

def prepare_validation_samples(row):
    clicked = [i for i in row['clicked_news']]
    all_impressions = [i.split('-')[0] for i in row['impressions'].split()]
    labels = [1 if news in clicked else 0 for news in all_impressions]
    return pd.Series([row['user_id'], all_impressions, labels], index=['user_id', 'candidate_news', 'labels'])

val_samples = val_behaviors.apply(prepare_validation_samples, axis=1)

# Map news IDs to indices
val_samples['user_idx'] = user_encoder.transform(val_samples['user_id'])
val_samples['news_indices'] = val_samples['candidate_news'].apply(
    lambda news_list: [item_encoder.transform([nid])[0] if nid in item_encoder.classes_ else -1 for nid in news_list]
)

# Validation function
def evaluate_ndcg(model, val_samples, k=10):
    model.eval()
    user_emb, item_emb = model.get_user_item_embeddings()

    ndcgs = []
    for _, row in val_samples.iterrows():
        user_idx = row['user_idx']
        item_idxs = row['news_indices']
        labels = row['labels']

        # Skip samples with unknown news
        if any(idx == -1 for idx in item_idxs):
            continue

        scores = torch.matmul(user_emb[user_idx], item_emb[item_idxs].T).detach().numpy()
        ndcg = ndcg_score([labels], [scores], k=k)
        ndcgs.append(ndcg)

    return sum(ndcgs) / len(ndcgs) if ndcgs else 0.0

# Example usage after training:
ndcg10 = evaluate_ndcg(model, val_samples)
print(f"nDCG@10 on validation set: {ndcg10:.4f}")

nDCG@10 on validation set: 0.6784


In [198]:
# Save model weights
torch.save(model.state_dict(), "lightgcn_model.pth")

# Save embeddings
torch.save(user_emb, "user_embeddings.pt")
torch.save(item_emb, "item_embeddings.pt")

In [200]:
recommendations = []

for uid in user_clicks['user_id'].unique():
    uidx = user_encoder.transform([uid])[0]
    scores = torch.matmul(user_emb[uidx], item_emb.T).detach().numpy()
    topk_indices = scores.argsort()[::-1][:10]
    topk_news_ids = item_encoder.inverse_transform(topk_indices)
    recommendations.append({'user_id': uid, 'top_10_news': topk_news_ids.tolist()})

import pandas as pd
rec_df = pd.DataFrame(recommendations)
rec_df.to_csv("user_recommendations.csv", index=False)

In [202]:
# 🔍 Set your target user_id here
target_user_id = "U1234"  # replace with any valid user_id from behaviors_df or user_clicks

# Convert to internal user_idx
if target_user_id not in user_encoder.classes_:
    print("User ID not found in training data.")
else:
    user_idx = user_encoder.transform([target_user_id])[0]

    # Ensure model is in eval mode and get final embeddings
    model.eval()
    user_emb, item_emb = model.get_user_item_embeddings()

    # Compute scores: dot product between user vector and all item vectors
    scores = torch.matmul(user_emb[user_idx], item_emb.T).detach().numpy()

    # Get top-k recommended item indices (e.g., top 10)
    topk_indices = scores.argsort()[::-1][:10]

    # Map back to news_id and titles
    topk_news_ids = item_encoder.inverse_transform(topk_indices)
    topk_titles = news_df.set_index('news_id').loc[topk_news_ids, 'title'].tolist()

    # 📢 Print Recommendations
    print(f"\nTop 10 Recommendations for User ID: {target_user_id}\n")
    for rank, title in enumerate(topk_titles, 1):
        print(f"{rank}. {title}")


Top 10 Recommendations for User ID: U1234

1. Lamar Odom Is Engaged to Sabrina Parr: See Her Ring!
2. Hannah Brown on Being Surrounded By Exes Tyler Cameron and Colton Underwood
3. Celebrity plastic surgery transformations
4. Stella McCartney Deleted a Meghan Markle Instagram Post After Followers Called Her Out
5. 37 Years After His Wife Is Found Dead with an Ax in Her Skull, Husband Is Arrested
6. College gymnast dies following training accident in Connecticut
7. The son of a Chinese billionaire has been banned from flying first class, playing golf, buying property, or going clubbing
8. Carrie Underwood Praises Miranda Lambert as 'Super Supportive': 'We Lift Each Other Up'
9. Week in celebrity photos for Nov. 11-15, 2019
10. Atlanta college student Alexis Crawford was choked to death, dumped in park, police say


In [26]:
#Loading the models and embeddings
user_emb = torch.load("user_embeddings.pt")
item_emb = torch.load("item_embeddings.pt")

In [34]:
# 🔁 Rebuild the model architecture (must match the one used during training)
model = LightGCN(num_users, num_items, embedding_dim=256, num_layers=3)

# 🔄 Load the trained weights
model.load_state_dict(torch.load("lightgcn_model.pth", map_location=torch.device('cpu')))

# ✅ Set the model to evaluation mode
model.eval()

LightGCN(
  (embedding): Embedding(57713, 256)
)

In [40]:
# Load dev/test behaviors file
dev_behaviors_path = "MINDsmall_dev/behaviors.tsv"

# Load columns
test_behaviors_df = pd.read_table(dev_behaviors_path, header=None, names=[
    "impression_id", "user_id", "time", "history", "impressions"
])

# Drop impressions with no data
test_behaviors_df.dropna(subset=["impressions"], inplace=True)

# Extract clicked news and prepare val-like samples
def extract_clicked_news(impressions):
    return [i.split('-')[0] for i in impressions.split() if i.endswith('-1')]

test_behaviors_df['clicked_news'] = test_behaviors_df['impressions'].apply(extract_clicked_news)

# Keep only valid samples (at least one click)
test_behaviors = test_behaviors_df[test_behaviors_df['clicked_news'].map(len) > 0].copy()

# Convert into format compatible with evaluate_ndcg()
def prepare_test_sample(row):
    clicked = row['clicked_news']
    all_news = [i.split('-')[0] for i in row['impressions'].split()]
    labels = [1 if n in clicked else 0 for n in all_news]
    return pd.Series([row['user_id'], all_news, labels], index=['user_id', 'candidate_news', 'labels'])

test_samples = test_behaviors.apply(prepare_test_sample, axis=1)

# Map user and news IDs to internal indices
test_samples['user_idx'] = test_samples['user_id'].apply(
    lambda uid: user_encoder.transform([uid])[0] if uid in user_encoder.classes_ else -1
)

test_samples['news_indices'] = test_samples['candidate_news'].apply(
    lambda news_list: [item_encoder.transform([nid])[0] if nid in item_encoder.classes_ else -1 for nid in news_list]
)

# Optional: remove samples with unknown users or items
test_samples = test_samples[(test_samples['user_idx'] != -1) & 
                            (test_samples['news_indices'].apply(lambda x: -1 not in x))]

print(f"Test samples retained: {len(test_samples)}")

Test samples retained: 335


In [46]:
from sklearn.metrics import ndcg_score

def evaluate_ndcg(model, val_samples, k=10):
    model.eval()
    user_emb, item_emb = model.get_user_item_embeddings()

    ndcgs = []
    for _, row in val_samples.iterrows():
        user_idx = row['user_idx']
        item_idxs = row['news_indices']
        labels = row['labels']

        # Skip samples with unknown news
        if any(idx == -1 for idx in item_idxs):
            continue

        scores = torch.matmul(user_emb[user_idx], item_emb[item_idxs].T).detach().numpy()
        ndcg = ndcg_score([labels], [scores], k=k)
        ndcgs.append(ndcg)

    return sum(ndcgs) / len(ndcgs) if ndcgs else 0.0
    
ndcg10_test = evaluate_ndcg(model, test_samples)
print(f"nDCG@10 on MINDsmall_dev test set: {ndcg10_test:.4f}")

nDCG@10 on MINDsmall_dev test set: 0.7913
