#Data Loading

In [1]:
import os
import pandas as pd
from collections import Counter
from tqdm import tqdm as tqdm
from sklearn.metrics import roc_auc_score
import numpy as np

#Train Data

In [None]:
!unzip -q MINDsmall_train.zip -d train

In [2]:
train_news_path = os.path.abspath('train/news.tsv')
Train_News_data=pd.read_table(train_news_path,
              header=None,
              names=[
                  'id', 'category', 'subcategory', 'title', 'abstract', 'url',
                  'title_entities', 'abstract_entities'
              ])

print(Train_News_data.shape)

(51282, 8)


In [3]:
train_behaviors_path = os.path.abspath('train/behaviors.tsv')
Train_Behaviors_data=pd.read_table(train_behaviors_path,
              header=None,
              names=[
                  'impression_id', 'user_id', 'time', 'history', 'impressions'
              ])
print(Train_Behaviors_data.shape)

(156965, 5)


#Validation Data

In [None]:
!unzip -q MINDsmall_dev.zip -d dev

In [4]:
val_news_path = os.path.abspath('dev/news.tsv')
Val_News_data=pd.read_table(val_news_path,
              header=None,
              names=[
                  'id', 'category', 'subcategory', 'title', 'abstract', 'url',
                  'title_entities', 'abstract_entities'
              ])
print(Val_News_data.shape)

(42416, 8)


In [5]:
val_behaviors_path = os.path.abspath('dev/behaviors.tsv')
Val_Behaviors_data=pd.read_table(val_behaviors_path,
              header=None,
              names=[
                  'impression_id', 'user_id', 'time', 'history', 'impressions'
              ])
print(Val_Behaviors_data.shape)

(73152, 5)


#Subsampling

In [6]:
def subsample_train_val_same_users_split_by_source(
    Train_News_data, Train_Behaviors_data,
    Val_News_data, Val_Behaviors_data,
    n_users=10000, train_ratio=2.1, seed=42
):
    rng = np.random.default_rng(seed)

    train_users = set(Train_Behaviors_data['user_id'].dropna())
    val_users = set(Val_Behaviors_data['user_id'].dropna())
    common_users = np.array(list(train_users & val_users))
    common_user_ratio = len(train_users) / len(common_users)

    common_user_count = min(int(n_users / common_user_ratio), len(common_users))
    noncommon_user_count = n_users - common_user_count

    sampled_common_users = rng.choice(common_users, size=common_user_count, replace=False)

    train_unique_users = np.setdiff1d(list(train_users), common_users)
    val_unique_users = np.setdiff1d(list(val_users), common_users)

    sampled_train_unique = rng.choice(train_unique_users, size=noncommon_user_count, replace=False)
    sampled_val_unique = rng.choice(val_unique_users, size=noncommon_user_count, replace=False)

    user_train_logs = pd.concat([
        Train_Behaviors_data[Train_Behaviors_data['user_id'].isin(sampled_common_users)],
        Train_Behaviors_data[Train_Behaviors_data['user_id'].isin(sampled_train_unique)]
    ])

    user_val_logs = pd.concat([
        Val_Behaviors_data[Val_Behaviors_data['user_id'].isin(sampled_common_users)],
        Val_Behaviors_data[Val_Behaviors_data['user_id'].isin(sampled_val_unique)]
    ])

    target_val_size = int(len(user_train_logs) / train_ratio)
    if target_val_size < len(user_val_logs):
        user_val_logs = user_val_logs.sample(n=target_val_size, random_state=seed)

    def get_referenced_news(news_df, behaviors_df):
        news_ids = set()
        for _, row in behaviors_df.iterrows():
            history = str(row['history']) if not pd.isna(row['history']) else ''
            news_ids.update(history.split())
            impressions = str(row['impressions']) if not pd.isna(row['impressions']) else ''
            news_ids.update(x.split('-')[0] for x in impressions.split())
        return news_df[news_df['id'].astype(str).isin(news_ids)].copy()

    train_news = get_referenced_news(Train_News_data, user_train_logs)
    val_news = get_referenced_news(Val_News_data, user_val_logs)

    return train_news, user_train_logs, val_news, user_val_logs

In [7]:
# Subsample data
Train_News_data, Train_Behaviors_data, Val_News_data, Val_Behaviors_data = subsample_train_val_same_users_split_by_source(Train_News_data, Train_Behaviors_data, Val_News_data, Val_Behaviors_data, n_users=5000,  train_ratio=2.1)



print(f"Train News Data Shape: {Train_News_data.shape}")
print(f"Train Behaviors Data Shape: {Train_Behaviors_data.shape}")
print(f"Valid News Data Shape: {Val_News_data.shape}")
print(f"Valid Behaviors Data Shape: {Val_Behaviors_data.shape}")



Train News Data Shape: (23705, 8)
Train Behaviors Data Shape: (15667, 5)
Valid News Data Shape: (18666, 8)
Valid Behaviors Data Shape: (7369, 5)


#Data Preprocessing

In [None]:
!wget https://nlp.stanford.edu/data/glove.840B.300d.zip
!unzip glove.840B.300d.zip

--2025-05-03 23:44:43--  https://nlp.stanford.edu/data/glove.840B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip [following]
--2025-05-03 23:44:43--  https://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2176768927 (2.0G) [application/zip]
Saving to: ‘glove.840B.300d.zip’


2025-05-03 23:51:34 (5.06 MB/s) - ‘glove.840B.300d.zip’ saved [2176768927/2176768927]

Archive:  glove.840B.300d.zip
  inflating: glove.840B.300d.txt     


In [38]:
def build_vocab(df, glove=None, min_freq=1):
    counter = Counter()

    for text in df['title'].fillna('').tolist() + df['abstract'].fillna('').tolist():
        tokens = text.lower().split()
        counter.update(tokens)

    vocab = {'<PAD>': 0, '<UNK>': 1}
    oov_words = 0
    in_glove_words = 0

    for word, freq in counter.items():
        if freq >= min_freq:
            if glove is None or word in glove:
                vocab[word] = len(vocab)
                in_glove_words += 1
            else:
                vocab[word] = len(vocab)
                oov_words += 1

    return vocab

In [39]:
def build_category_indices(df):
    cat2idx = {cat: idx for idx, cat in enumerate(df['category'].dropna().unique())}
    subcat2idx = {subcat: idx for idx, subcat in enumerate(df['subcategory'].dropna().unique())}
    return cat2idx, subcat2idx

In [40]:
def tokenize(text, vocab, max_len=20):
    if not isinstance(text, str):
        text = ""
    tokens = text.lower().split()
    token_ids = [vocab.get(w, vocab['<UNK>']) for w in tokens[:max_len]]
    token_ids += [vocab['<PAD>']] * (max_len - len(token_ids))
    return token_ids

In [41]:
def load_glove_embeddings(glove_path, vocab=None, embed_dim=300):
    glove = {}
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            values = line.strip().split()
            if len(values) != embed_dim + 1:
                continue
            word = values[0]

            if vocab is not None and word not in vocab:
                continue

            try:
                vector = torch.tensor([float(v) for v in values[1:]], dtype=torch.float32)
                glove[word] = vector
            except ValueError:
                continue

    return glove

#Utility Functions

In [42]:
def clicked_candidate_news_preparation(behaviors, news):
    history_field = behaviors['history']
    impressions_field = behaviors['impressions']

    history_ids = history_field.split() if isinstance(history_field, str) and history_field.strip() else []
    impression_pairs = impressions_field.split() if isinstance(impressions_field, str) and impressions_field.strip() else []

    candidate_ids = [impr.split('-')[0] for impr in impression_pairs]
    candidate_labels = [int(impr.split('-')[1]) for impr in impression_pairs]

    clicked_news = [nid for nid in history_ids if nid in set(news['id'])]
    candidate_news = [nid for nid in candidate_ids if nid in set(news['id'])]

    return clicked_news, candidate_news, candidate_labels

In [43]:
def encode(title, abstract, category, subcategory, vocab, cat2idx, subcat2idx, max_len=20):
    title = tokenize(title, vocab, max_len)
    abstract = tokenize(abstract, vocab, max_len)
    category = cat2idx.get(category, 0)
    subcategory = subcat2idx.get(subcategory, 0)

    title_tensor = torch.tensor(title, dtype=torch.long)
    abstract_tensor = torch.tensor(abstract, dtype=torch.long)
    cat_tensor = torch.tensor(category, dtype=torch.long)
    subcat_tensor = torch.tensor(subcategory, dtype=torch.long)
    return title_tensor, abstract_tensor, cat_tensor, subcat_tensor

In [44]:
def build_news_vec_dict(News_data, news_encoder, vocab, cat2idx, subcat2idx, device):
    news_vec_dict = {}
    news_encoder.eval()
    with torch.no_grad():
        for i, row in tqdm(News_data.iterrows(), total=len(News_data)):
            title_tensor, abstract_tensor, cat_tensor, subcat_tensor = encode(
                row['title'], row['abstract'], row['category'], row['subcategory'],
                vocab, cat2idx, subcat2idx
            )

            title_tensor = title_tensor.unsqueeze(0).to(device)
            abstract_tensor = abstract_tensor.unsqueeze(0).to(device)
            cat_tensor = cat_tensor.unsqueeze(0).to(device)
            subcat_tensor = subcat_tensor.unsqueeze(0).to(device)

            final_vec, rt, ra = news_encoder(title_tensor, abstract_tensor, cat_tensor, subcat_tensor)

            news_vec_dict[row['id']] = (
                final_vec.squeeze(0).cpu(),
                rt.squeeze(0).cpu(),
                ra.squeeze(0).cpu()
            )
    return news_vec_dict

In [45]:
def move_news_vec_dict_to_device(news_vec_dict, device):
    for nid in news_vec_dict:
        news_vec_dict[nid] = tuple(t.to(device) for t in news_vec_dict[nid])
    return news_vec_dict

#Model Implementation

##Import

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F

##News Encoder

In [16]:
class NewsEncoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, label_size):
        super(NewsEncoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv_groups = nn.ModuleList([
            nn.Conv1d(embed_dim, embed_dim, kernel_size=3, padding=1) for _ in range(3)
        ])

        self.reduce_proj = nn.Linear(embed_dim * 3, embed_dim)
        self.lstm = nn.LSTM(embed_dim, embed_dim // 2, bidirectional=True, batch_first=True)

        self.title_att_W = nn.Linear(embed_dim, embed_dim)
        self.title_att_b = nn.Parameter(torch.zeros(embed_dim))
        self.title_att_v = nn.Linear(embed_dim, 1)

        self.label_embed = nn.Embedding(label_size, embed_dim)
        self.label_fc = nn.Linear(embed_dim, embed_dim)

        self.fusion_q = nn.Parameter(torch.randn(embed_dim))
        self.fusion_Uv = nn.Linear(embed_dim, embed_dim)
        self.fusion_uv = nn.Parameter(torch.zeros(embed_dim))

    def text_encoder(self, text):
        B = self.embedding(text).transpose(1, 2)
        group_outputs = [conv(B) for conv in self.conv_groups]
        B = F.relu(torch.cat(group_outputs, dim=1)).transpose(1, 2)
        B = self.reduce_proj(B)
        B, _ = self.lstm(B)
        att = torch.tanh(self.title_att_W(B) + self.title_att_b)
        alpha = F.softmax(self.title_att_v(att), dim=1)
        return torch.sum(alpha * B, dim=1)

    def label_encoder(self, x):
        ex = self.label_embed(x)
        return self.label_fc(ex)

    def forward(self, title, abstract, category, subcategory):
        rt = self.text_encoder(title)
        ra = self.text_encoder(abstract)

        rc = self.label_encoder(category)
        rsc = self.label_encoder(subcategory)

        at = torch.matmul(torch.tanh(self.fusion_Uv(rt) + self.fusion_uv), self.fusion_q)
        aa = torch.matmul(torch.tanh(self.fusion_Uv(ra) + self.fusion_uv), self.fusion_q)
        ac = torch.matmul(torch.tanh(self.fusion_Uv(rc) + self.fusion_uv), self.fusion_q)
        asc = torch.matmul(torch.tanh(self.fusion_Uv(rsc) + self.fusion_uv), self.fusion_q)

        weights = torch.stack([at, aa, ac, asc], dim=1)
        alphas = F.softmax(weights, dim=1)
        final_rep = (
              alphas[:, 0].unsqueeze(1) * rt +
              alphas[:, 1].unsqueeze(1) * ra +
              alphas[:, 2].unsqueeze(1) * rc +
              alphas[:, 3].unsqueeze(1) * rsc
        )
        return final_rep, rt, ra

##Detection Module

In [17]:
class DetectionModule(nn.Module):
    def __init__(self, embed_dim, num_heads=8, threshold=0.49, lambda_weight=0.8):
        super(DetectionModule, self).__init__()
        self.multihead_attention = nn.MultiheadAttention(embed_dim, num_heads=num_heads, batch_first=True)
        self.threshold = threshold
        self.lambda_weight = lambda_weight


    def forward(self, rt, ra):
        rt = rt.unsqueeze(0)
        ra = ra.unsqueeze(0)
        tilda_rt,_ = self.multihead_attention(query=ra, key=rt, value=rt)

        tilda_rt_norm = F.normalize(tilda_rt, p=2, dim=-1)
        ra_norm = F.normalize(ra, p=2, dim=-1)

        cos_sim = (tilda_rt_norm * ra_norm).sum(dim=-1)
        pi = torch.sigmoid(cos_sim)

        Si =  Si = torch.where(pi > self.threshold,
                     torch.full_like(pi, self.lambda_weight),
                     torch.ones_like(pi))
        return Si.squeeze(0)

##User Encoder

In [18]:
class UserEncoder(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super(UserEncoder, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads

        self.q_linear = nn.ModuleList([nn.Linear(embed_dim, embed_dim) for _ in range(num_heads)])
        self.v_linear = nn.ModuleList([nn.Linear(embed_dim, embed_dim) for _ in range(num_heads)])

    def forward(self, history_vecs, candidate_vec, detection_score=None):
        B, N, D = history_vecs.shape

        head_outputs = []

        for k in range(self.num_heads):
            qk = self.q_linear[k](history_vecs)
            scores = torch.matmul(qk, history_vecs.transpose(1, 2)) / (D ** 0.5)
            beta = F.softmax(scores, dim=-1)

            vk = self.v_linear[k](history_vecs)
            head_output = torch.matmul(beta, vk)
            head_outputs.append(head_output)

        multihead_output = torch.cat(head_outputs, dim=-1)
        multihead_output = multihead_output.view(B, N, self.num_heads, D).mean(dim=2)

        candidate_vec = candidate_vec.unsqueeze(1)
        dot_scores = (candidate_vec * multihead_output).sum(dim=2) / (D ** 0.5)

        if detection_score is not None:
            dot_scores = dot_scores * detection_score

        alpha = F.softmax(dot_scores, dim=1)
        user_vector = torch.bmm(alpha.unsqueeze(1), multihead_output).squeeze(1)
        return user_vector

#Score Calculator

In [19]:
def compute_click_scores(user_vec, candidate_vecs, detection_scores):
    B = candidate_vecs.shape[0]
    user_vec_expand = user_vec.expand(B, -1)
    dot_scores = (user_vec_expand * candidate_vecs).sum(dim=1)
    dd_scores = dot_scores * detection_scores
    click_scores = torch.sigmoid(dd_scores)


    return click_scores

#Loss Function

In [20]:
def ranking_softmax_loss(pos_scores, neg_scores):
    pos_exp = torch.exp(-pos_scores)
    neg_exp = torch.exp(-neg_scores).sum(dim=1)
    denom = pos_exp + neg_exp
    loss = -torch.log(pos_exp / denom)
    return loss.mean()

#Evaluation Metrics

In [21]:
from sklearn.metrics import roc_auc_score

In [22]:
def compute_mrr(y_true_sorted):
    y_true_sorted = np.asarray(y_true_sorted)
    relevant_indices = np.where(y_true_sorted == 1)[0]

    if relevant_indices.size == 0:
        return 0.0

    return 1.0 / (relevant_indices[0] + 1)

In [23]:
def ndcg(y_true_sorted, k):
    y_true_sorted = np.asarray(y_true_sorted)
    k = min(k, len(y_true_sorted))
    if k == 0:
        return 0.0

    gains = 2 ** y_true_sorted[:k] - 1
    discounts = np.log2(np.arange(2, k + 2))
    dcg = np.sum(gains / discounts)

    ideal_sorted = np.sort(y_true_sorted)[::-1]
    ideal_gains = 2 ** ideal_sorted[:k] - 1
    ideal_dcg = np.sum(ideal_gains / discounts)

    return dcg / ideal_dcg if ideal_dcg > 0 else 0.0

In [24]:
def eval_metrics(y_true, y_score):
    aucs, mrrs, ndcg5s, ndcg10s = [], [], [], []

    for labels, scores in zip(y_true, y_score):
        sorted_idx = torch.argsort(scores, descending=True)
        sorted_labels = labels[sorted_idx]

        auc = roc_auc_score(labels.numpy(), scores.numpy()) if len(set(labels.numpy())) > 1 else 0.0
        mrr = compute_mrr(sorted_labels)
        ndcg5 = ndcg(sorted_labels, 5)
        ndcg10 = ndcg(sorted_labels, 10)

        aucs.append(auc)
        mrrs.append(mrr)
        ndcg5s.append(ndcg5)
        ndcg10s.append(ndcg10)

    auc = sum(aucs) / len(aucs)
    mrr = sum(mrrs) / len(mrrs)
    ndcg5 = sum(ndcg5s) / len(ndcg5s)
    ndcg10 = sum(ndcg10s) / len(ndcg10s)

    return auc, mrr, ndcg5, ndcg10

#Model Object Initiation

In [25]:
glove = load_glove_embeddings('glove.840B.300d.txt')

2196017it [01:50, 19791.06it/s]


In [46]:
vocab_train = build_vocab(Train_News_data, glove)
cat2idx_train, subcat2idx_train = build_category_indices(Train_News_data)


vocab_val = build_vocab(Val_News_data, glove)
cat2idx_val, subcat2idx_val = build_category_indices(Val_News_data)

print(f"Train Vocab Size: {len(vocab_train)}")
print(f"Val Vocab Size: {len(vocab_val)}")

Train Vocab Size: 78876
Val Vocab Size: 66231


In [47]:
vocab_size = len(vocab_train)
embed_dim = 300
label_size = max(len(cat2idx_train), len(subcat2idx_train))

In [48]:
news_encoder = NewsEncoder(vocab_size, embed_dim, label_size)
user_encoder = UserEncoder(embed_dim=300, num_heads=6)
detection_module = DetectionModule(embed_dim=300, num_heads=6)

In [49]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [50]:
news_encoder.to(device)
user_encoder.to(device)
detection_module.to(device)

DetectionModule(
  (multihead_attention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=300, out_features=300, bias=True)
  )
)

In [51]:
news_vec_dict_train = build_news_vec_dict(Train_News_data, news_encoder, vocab_train, cat2idx_train, subcat2idx_train, device)
news_vec_dict_val = build_news_vec_dict(Val_News_data, news_encoder, vocab_val, cat2idx_val, subcat2idx_val, device)

news_vec_dict_train = move_news_vec_dict_to_device(news_vec_dict_train, device)
news_vec_dict_val = move_news_vec_dict_to_device(news_vec_dict_val, device)

print({len(news_vec_dict_train)})
print({len(news_vec_dict_val)})

100%|██████████| 23705/23705 [02:14<00:00, 176.33it/s]
100%|██████████| 18666/18666 [01:35<00:00, 195.59it/s]


{23705}
{18666}


#Code Component Testing

In [58]:
behavior_row = Train_Behaviors_data.iloc[1]
clicked_news, candidate_news, candidate_labels = clicked_candidate_news_preparation(behavior_row, Train_News_data)

cached_news_ids = set(news_vec_dict_train.keys())

clicked_news = [nid for nid in clicked_news if nid in cached_news_ids]
candidate_news = [nid for nid in candidate_news if nid in cached_news_ids]

fc_click = torch.stack([news_vec_dict_train[nid][0] for nid in clicked_news])
rt_click = torch.stack([news_vec_dict_train[nid][1] for nid in clicked_news])
ra_click = torch.stack([news_vec_dict_train[nid][2] for nid in clicked_news])

fc_cand = torch.stack([news_vec_dict_train[nid][0] for nid in candidate_news])
rt_cand = torch.stack([news_vec_dict_train[nid][1] for nid in candidate_news])
ra_cand = torch.stack([news_vec_dict_train[nid][2] for nid in candidate_news])

detection_scores_clicked = detection_module(rt_click, ra_click).unsqueeze(0)
detection_scores_cand = detection_module(rt_cand, ra_cand)

user_vec = user_encoder(fc_click.unsqueeze(0), fc_cand[0:1], detection_score=detection_scores_clicked)
click_scores = compute_click_scores(user_vec, fc_cand, detection_scores_cand)


print(click_scores)


tensor([0.2906, 0.9081, 0.4881, 0.0171, 0.5049, 0.0380, 0.9096, 0.5065, 0.7878,
        0.5047, 0.2891, 0.5001], grad_fn=<SigmoidBackward0>)


#Train-test

In [59]:
def train_one_epoch(behaviors_df, news_df, news_vec_dict_train, model_components, optimizer, vocab, cat2idx, subcat2idx):
    user_encoder = model_components['user_encoder']
    detection_module = model_components['detection_module']

    user_encoder.train()
    detection_module.train()

    device = next(user_encoder.parameters()).device
    total_loss, count = 0, 0

    cached_news_ids = set(news_vec_dict_train.keys())

    for _, behavior_row in tqdm(behaviors_df.iterrows()):
        clicked_news, candidate_news, candidate_labels = clicked_candidate_news_preparation(behavior_row, news_df)
        clicked_news = clicked_news[:50]
        
        if not candidate_labels or not clicked_news or not candidate_news:
            continue

        clicked_news = [nid for nid in clicked_news if nid in cached_news_ids]
        candidate_news = [nid for nid in candidate_news if nid in cached_news_ids]
        if not clicked_news or not candidate_news:
            continue

        fc_click = torch.stack([news_vec_dict_train[nid][0] for nid in clicked_news])
        rt_click = torch.stack([news_vec_dict_train[nid][1] for nid in clicked_news])
        ra_click = torch.stack([news_vec_dict_train[nid][2] for nid in clicked_news])

        fc_cand = torch.stack([news_vec_dict_train[nid][0] for nid in candidate_news])
        rt_cand = torch.stack([news_vec_dict_train[nid][1] for nid in candidate_news])
        ra_cand = torch.stack([news_vec_dict_train[nid][2] for nid in candidate_news])

        detection_scores_clicked = detection_module(rt_click, ra_click).unsqueeze(0)
        detection_scores_cand = detection_module(rt_cand, ra_cand)

        user_vec = user_encoder(fc_click.unsqueeze(0), fc_cand[0:1], detection_score=detection_scores_clicked)
        click_scores = compute_click_scores(user_vec, fc_cand, detection_scores_cand)

        labels = torch.tensor(candidate_labels, dtype=torch.float32, device=device)

        pos_idx = (labels == 1).nonzero(as_tuple=True)[0]

        if pos_idx.numel() == 1:
            pos_idx = pos_idx.item()
            pos_score = click_scores[pos_idx].unsqueeze(0)
            neg_scores = torch.cat([click_scores[:pos_idx], click_scores[pos_idx + 1:]]).unsqueeze(0)

            loss = ranking_softmax_loss(pos_score, neg_scores)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            count += 1

    return total_loss / max(1, count)

In [64]:
def test_one_epoch(behaviors_df, news_df, news_vec_dict_val, voab, cat2idx, subcat2idx, model_components):
    user_encoder = model_components['user_encoder']
    detection_module = model_components['detection_module']

    user_encoder.eval()
    detection_module.eval()

    device = next(user_encoder.parameters()).device
    all_scores, all_labels = [], []

    cached_news_ids = set(news_vec_dict_val.keys())

    with torch.no_grad():
        for _, behavior_row in tqdm(behaviors_df.iterrows()):
            clicked_news, candidate_news, candidate_labels = clicked_candidate_news_preparation(behavior_row, news_df)
            clicked_news = clicked_news[:50]
            
            if not candidate_labels or not clicked_news or not candidate_news:
                continue

            clicked_news = [nid for nid in clicked_news if nid in cached_news_ids]
            candidate_news = [nid for nid in candidate_news if nid in cached_news_ids]
            if not clicked_news or not candidate_news:
                continue

            fc_click = torch.stack([news_vec_dict_val[nid][0] for nid in clicked_news])
            rt_click = torch.stack([news_vec_dict_val[nid][1] for nid in clicked_news])
            ra_click = torch.stack([news_vec_dict_val[nid][2] for nid in clicked_news])

            fc_cand = torch.stack([news_vec_dict_val[nid][0] for nid in candidate_news])
            rt_cand = torch.stack([news_vec_dict_val[nid][1] for nid in candidate_news])
            ra_cand = torch.stack([news_vec_dict_val[nid][2] for nid in candidate_news])

            detection_scores_clicked = detection_module(rt_click, ra_click).unsqueeze(0)
            detection_scores_cand = detection_module(rt_cand, ra_cand)

            user_vec = user_encoder(fc_click.unsqueeze(0), fc_cand[0:1], detection_score=detection_scores_clicked)
            click_scores = compute_click_scores(user_vec, fc_cand, detection_scores_cand)

            all_scores.append(click_scores.cpu())
            all_labels.append(torch.tensor(candidate_labels, dtype=torch.float32))

    return eval_metrics(all_labels, all_scores)



In [65]:
model_components = {
    'news_encoder': news_encoder,
    'detection_module': detection_module,
    'user_encoder': user_encoder
}


In [66]:
optimizer = torch.optim.Adam(
    list(news_encoder.parameters()) +
    list(user_encoder.parameters()) +
    list(detection_module.parameters()),
    lr=1e-4
)

#Final Evaluation

In [68]:
for epoch in range(5):
    print(f"Epoch {epoch}")

    train_loss = train_one_epoch(
        Train_Behaviors_data,
        Train_News_data,
        news_vec_dict_train,
        model_components,
        optimizer,
        vocab_train,
        cat2idx_train,
        subcat2idx_train
    )

    auc, mrr, ndcg5, ndcg10 = test_one_epoch(
        Val_Behaviors_data,
        Val_News_data,
        news_vec_dict_val,
        vocab_val,
        cat2idx_val,
        subcat2idx_val,
        model_components
    )

    print(f"Epoch {epoch} | Train Loss: {train_loss:.4f} | "
          f"AUC: {auc:.4f}, MRR: {mrr:.4f}, nDCG@5: {ndcg5:.4f}, nDCG@10: {ndcg10:.4f}")

print("-------------------------------------------Final Eval-------------------------------------------")
print(f"AUC: {auc * 100:.2f}, MRR: {mrr * 100:.2f}, nDCG@5: {ndcg5 * 100:.2f}, nDCG@10: {ndcg10 * 100:.2f}")


Epoch 0


15667it [35:03,  7.45it/s]
7369it [10:22, 11.84it/s]


Epoch 0 | Train Loss: 2.7018 | AUC: 0.4095, MRR: 0.2074, nDCG@5: 0.1809, nDCG@10: 0.2431
Epoch 1


15667it [36:26,  7.17it/s]
7369it [10:33, 11.63it/s]


Epoch 1 | Train Loss: 2.6911 | AUC: 0.4066, MRR: 0.2056, nDCG@5: 0.1789, nDCG@10: 0.2412
Epoch 2


15667it [34:52,  7.49it/s]
7369it [10:39, 11.53it/s]


Epoch 2 | Train Loss: 2.6847 | AUC: 0.4084, MRR: 0.2054, nDCG@5: 0.1801, nDCG@10: 0.2422
Epoch 3


15667it [36:59,  7.06it/s]
7369it [10:45, 11.42it/s]


Epoch 3 | Train Loss: 2.6753 | AUC: 0.4130, MRR: 0.2082, nDCG@5: 0.1819, nDCG@10: 0.2443
Epoch 4


15667it [35:53,  7.27it/s]
7369it [10:29, 11.70it/s]


Epoch 4 | Train Loss: 2.6699 | AUC: 0.4166, MRR: 0.2106, nDCG@5: 0.1850, nDCG@10: 0.2468
-------------------------------------------Final Eval-------------------------------------------
AUC: 41.66, MRR: 21.06, nDCG@5: 18.50, nDCG@10: 24.68


In [69]:
module_name = "NQNR_PCNE"

df = pd.DataFrame([{
    'Module': module_name,
    'AUC': f"{auc * 100:.2f}",
    'MRR': f"{mrr * 100:.2f}",
    'nDCG@5': f"{ndcg5 * 100:.2f}",
    'nDCG@10': f"{ndcg10 * 100:.2f}",
}])

filename = f"{module_name}_results.csv"

df.to_csv(filename, index=False)