In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
from sentence_transformers import SentenceTransformer

# Environment sanity check: report the installed torch build and whether it
# exposes a `compiler` attribute before any model is loaded.
print("Torch version:", torch.__version__)
print("Has torch.compiler?", hasattr(torch, "compiler"))

# Smoke-test that the sentence-transformers checkpoint can be constructed.
# NOTE(review): `model` is not referenced again in the visible cells; the same
# checkpoint is constructed a second time as `bert_model` before encoding.
model = SentenceTransformer("all-MiniLM-L6-v2")
print("✅ BERT model loaded!")


Torch version: 2.1.2+cpu
Has torch.compiler? True


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


✅ BERT model loaded!


In [3]:
# Article metadata: build a "title + abstract" text field (missing parts become
# empty strings) and keep only the columns used downstream.
news_df = (
    pd.read_csv(
        'news.tsv',
        sep='\t',
        header=None,
        names=['news_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities'],
    )
    .assign(content=lambda df: df['title'].fillna('') + ' ' + df['abstract'].fillna(''))
    [['news_id', 'title', 'content']]
)

# Impression logs: discard rows with no click history, then tokenize the
# whitespace-separated history string into a list of news ids.
behaviors_df = pd.read_csv(
    'behaviors.tsv',
    sep='\t',
    header=None,
    names=['impression_id', 'user_id', 'time', 'history', 'impressions'],
).dropna(subset=['history'])
behaviors_df['clicked_news'] = [history.strip().split() for history in behaviors_df['history']]

In [4]:
# Flatten every behaviors row into one (user_id, news_id) pair per history entry.
# Zipping the two columns avoids the per-row Series that DataFrame.iterrows() builds.
user_clicks = [
    (user_id, nid)
    for user_id, clicked in zip(behaviors_df['user_id'], behaviors_df['clicked_news'])
    for nid in clicked
]

interactions_df = pd.DataFrame(user_clicks, columns=['user_id', 'news_id'])
# Left join: clicks whose news_id is absent from news_df are kept (title/content are NaN).
interactions_df = interactions_df.merge(news_df, on='news_id', how='left')

# Lookup tables between news ids and row positions in news_df.
news_id_to_idx = {nid: i for i, nid in enumerate(news_df['news_id'])}
# Built directly from row order rather than by inverting news_id_to_idx, so every
# row position has an entry even if a news_id appears more than once in news_df
# (inverting would leave the earlier positions unmapped and raise KeyError on lookup).
idx_to_news_id = dict(enumerate(news_df['news_id']))
news_id_to_title = dict(zip(news_df['news_id'], news_df['title']))

In [5]:
# Fit TF-IDF on every article's "title + abstract" text (news_df['content']).
# max_features=5000 caps the vocabulary, so tfidf_matrix has at most 5000 columns;
# its rows follow the row order of news_df.
print("🔢 Fitting TF-IDF...")
tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(news_df['content'])

def recommend_tfidf(user_id, top_k=5):
    """Recommend unseen articles closest to the user's TF-IDF profile.

    The profile is the mean TF-IDF row of every article in the user's click
    history (taken from ``interactions_df``). All articles are ranked by
    cosine similarity to that profile and already-clicked ones are dropped.

    Args:
        user_id: Value matched against ``interactions_df['user_id']``.
        top_k: Number of recommendations to keep from the top of the ranking.

    Returns:
        A list of ``(news_id, title)`` tuples, most similar first. Empty when
        none of the user's history ids is present in ``news_id_to_idx``.
    """
    user_history = interactions_df.loc[interactions_df['user_id'] == user_id, 'news_id'].tolist()
    user_idx = [news_id_to_idx[nid] for nid in user_history if nid in news_id_to_idx]
    if not user_idx:
        return []

    # The mean of a scipy sparse matrix comes back as np.matrix, which
    # cosine_similarity does not accept; np.asarray converts it without relying
    # on the matrix-only `.A` attribute, and reshape guarantees a single 2-D row.
    user_vector = np.asarray(tfidf_matrix[user_idx].mean(axis=0)).reshape(1, -1)
    similarities = cosine_similarity(user_vector, tfidf_matrix).flatten()

    # Set membership keeps the "already clicked" filter O(1) per candidate;
    # the history list can be long and every article is a candidate.
    seen = set(user_history)
    ranked_idxs = similarities.argsort()[::-1]
    recommended_ids = [idx_to_news_id[i] for i in ranked_idxs if idx_to_news_id[i] not in seen]
    return [(nid, news_id_to_title[nid]) for nid in recommended_ids[:top_k]]

🔢 Fitting TF-IDF...


In [6]:
# Embed every article's "title + abstract" text with the MiniLM sentence encoder.
# Row i of bert_embeddings corresponds to row i of news_df.
# This is the slow step of the notebook: the recorded run took about 14 minutes
# on a CPU-only torch build.
print("🤖 Encoding BERT embeddings...")
bert_model = SentenceTransformer("all-MiniLM-L6-v2")
bert_embeddings = np.array(
    bert_model.encode(news_df['content'].tolist(), show_progress_bar=True)
)

def recommend_bert(user_id, top_k=5):
    """Recommend unseen articles closest to the user's sentence-embedding profile.

    The profile is the mean of the ``bert_embeddings`` rows for every article
    in the user's click history (taken from ``interactions_df``). All articles
    are ranked by cosine similarity to that profile and already-clicked ones
    are dropped.

    Args:
        user_id: Value matched against ``interactions_df['user_id']``.
        top_k: Number of recommendations to keep from the top of the ranking.

    Returns:
        A list of ``(news_id, title)`` tuples, most similar first. Empty when
        none of the user's history ids is present in ``news_id_to_idx``.
    """
    user_history = interactions_df.loc[interactions_df['user_id'] == user_id, 'news_id'].tolist()
    user_idx = [news_id_to_idx[nid] for nid in user_history if nid in news_id_to_idx]
    if not user_idx:
        return []

    # cosine_similarity expects 2-D inputs, so the averaged vector becomes one row.
    user_vector = bert_embeddings[user_idx].mean(axis=0).reshape(1, -1)
    similarities = cosine_similarity(user_vector, bert_embeddings).flatten()

    # Set membership keeps the "already clicked" filter O(1) per candidate;
    # the history list can be long and every article is a candidate.
    seen = set(user_history)
    ranked_idxs = similarities.argsort()[::-1]
    recommended_ids = [idx_to_news_id[i] for i in ranked_idxs if idx_to_news_id[i] not in seen]
    return [(nid, news_id_to_title[nid]) for nid in recommended_ids[:top_k]]

🤖 Encoding BERT embeddings...


Batches: 100%|█████████████████████████████████████████████████████████████████████| 1603/1603 [14:21<00:00,  1.86it/s]


In [7]:
# Show both recommenders side by side for the first user that gets a
# non-empty list from each of them, then stop.
print("\n🎯 Running TF-IDF + BERT example...")
for uid in interactions_df['user_id'].unique():
    tfidf_recs = recommend_tfidf(uid)
    bert_recs = recommend_bert(uid)
    if not (tfidf_recs and bert_recs):
        continue

    print(f"\nUser: {uid}")
    sections = (
        ("\n--- TF-IDF Recommendations ---", tfidf_recs),
        ("\n--- BERT Recommendations ---", bert_recs),
    )
    for heading, recs in sections:
        print(heading)
        for nid, title in recs:
            print(f"• {title} ({nid})")
    break


🎯 Running TF-IDF + BERT example...

User: U13740

--- TF-IDF Recommendations ---
• Biden on being denied communion: 'I'm a practicing Catholic, I practice my faith' (N34069)
• Fans fume after 'Wheel of Fortune' seemingly makes mistake (N42154)
• Best Response Ever From a 'Wheel of Fortune' Contestant? (N55161)
• Former North Carolina US Sen Kay Hagan dies (N61980)
• Biden refuses to comment on being denied communion, says he's a 'practicing Catholic' (N19522)

--- BERT Recommendations ---
• Exclusive: Hunter Biden on getting married after 6 days and why rehab is 'courageous' (N52589)
• Best Response Ever From a 'Wheel of Fortune' Contestant? (N55161)
• Guy Who 'Doesn't Want Pets' Finally Gives In On His Wedding Day (N18069)
• 'It made it too real that we couldn't be here tomorrow': Couple nearly hit by red-light runner, then a miracle happened​ (N2588)
• Howard Stern and Wife Beth Remarry After 11 Years in Surprise Wedding   Led by Colton Underwood! (N37327)


In [8]:
from sklearn.metrics import ndcg_score

In [9]:
def parse_impressions(imp_str):
    """Parse an impressions string into ``(news_id, label)`` pairs.

    The input is a whitespace-separated list of ``<news_id>-<label>`` tokens,
    e.g. ``"N1-1 N2-0"``. Each token is split once, at its LAST hyphen, so a
    news id that itself contains a hyphen is kept intact.

    Args:
        imp_str: Raw impressions field.

    Returns:
        A list of ``(news_id, label)`` tuples in input order, with ``label``
        converted to ``int``. An empty or whitespace-only string gives ``[]``.

    Raises:
        IndexError: If a token contains no hyphen.
        ValueError: If the text after the last hyphen is not an integer.
    """
    parsed = []
    for token in imp_str.strip().split():
        # Split once from the right: the label is always the final component.
        parts = token.rsplit('-', 1)
        parsed.append((parts[0], int(parts[1])))
    return parsed

def evaluate_ndcg(embedding_matrix, method='bert', k=5, behaviors_path='behaviors_test.tsv'):
    """Compute the mean nDCG@k of a content-based ranker over logged impressions.

    For every impression row, the user profile is the mean embedding of the
    articles in the row's history. The impression's candidates are scored by
    cosine similarity to that profile and compared against the click labels
    with ``ndcg_score``.

    Rows are skipped when they have no clicked candidate, when none of the
    history ids is in ``news_id_to_idx``, or when any candidate id is missing
    from ``news_id_to_idx``.

    Args:
        embedding_matrix: 2-D item matrix whose row ``i`` is the article mapped
            to ``i`` by ``news_id_to_idx``. Must support indexing with a list
            of row positions; a NumPy array works, and the profile averaging
            also handles the ``np.matrix`` that scipy sparse matrices return
            from ``mean``.
        method: Unused by the computation; kept so existing calls keep working.
        k: Rank cut-off passed to ``ndcg_score``.
        behaviors_path: Tab-separated behaviors file to evaluate on.

    Returns:
        The mean nDCG@k over all scored impressions, or NaN when no
        impression could be scored.
    """
    val_df = pd.read_csv(behaviors_path, sep='\t', header=None,
                         names=['impression_id', 'user_id', 'time', 'history', 'impressions'])
    val_df = val_df.dropna(subset=['history', 'impressions'])

    all_ndcg_scores = []

    # Iterate the two needed columns directly; DataFrame.iterrows() would build
    # a Series for every row.
    rows = zip(val_df['history'], val_df['impressions'])
    for history, impressions in tqdm(rows, total=len(val_df)):
        impression = parse_impressions(impressions)
        labels = [label for _, label in impression]

        # Skip impressions with no clicked candidate (also covers an empty impression).
        if 1 not in labels:
            continue

        # Build user profile vector from the history articles we have embeddings for.
        user_idx = [news_id_to_idx[nid] for nid in history.strip().split() if nid in news_id_to_idx]
        if not user_idx:
            continue

        candidate_idxs = [news_id_to_idx[nid] for nid, _ in impression if nid in news_id_to_idx]
        if len(candidate_idxs) != len(impression):
            continue  # Skip if any candidate news_id is missing, so scores align with labels

        # np.asarray turns a sparse-mean np.matrix into an ndarray; it leaves a
        # dense ndarray mean unchanged.
        user_vec = np.asarray(embedding_matrix[user_idx].mean(axis=0)).reshape(1, -1)
        scores = cosine_similarity(user_vec, embedding_matrix[candidate_idxs]).flatten()

        all_ndcg_scores.append(ndcg_score([labels], [scores], k=k))

    if not all_ndcg_scores:
        # np.mean([]) would also give NaN, but with a RuntimeWarning.
        return np.nan
    return np.mean(all_ndcg_scores)

In [10]:
# Score the BERT embeddings at both cut-offs with one loop instead of two
# copy-pasted stanzas; the printed text is the same as before.
bert_ndcg = {}
for k in (5, 10):
    print(f"🔍 Evaluating nDCG@{k} with BERT...")
    bert_ndcg[k] = evaluate_ndcg(bert_embeddings, method='bert', k=k)
    print(f"✅ BERT nDCG@{k}: {bert_ndcg[k]:.4f}")

# Keep the original variable names available for any later cells.
ndcg5, ndcg10 = bert_ndcg[5], bert_ndcg[10]

🔍 Evaluating nDCG@5 with BERT...


100%|██████████████████████████████████████████████████████████████████████████| 70938/70938 [00:25<00:00, 2781.89it/s]


✅ BERT nDCG@5: 0.6379
🔍 Evaluating nDCG@10 with BERT...


100%|██████████████████████████████████████████████████████████████████████████| 70938/70938 [00:24<00:00, 2864.02it/s]


✅ BERT nDCG@10: 0.6627


In [11]:
# TF-IDF baseline at nDCG@5. `.toarray()` converts the sparse TF-IDF matrix to a
# dense NumPy array (one row per article, up to 5000 columns) before evaluation.
# NOTE(review): the dense copy can be large in memory — confirm it fits before re-running.
evaluate_ndcg(tfidf_matrix.toarray(), method='tfidf', k=5)

100%|███████████████████████████████████████████████████████████████████████████| 70938/70938 [01:31<00:00, 778.46it/s]


0.6116738240301081