In [1]:
from google.colab import drive
import pandas as pd
import os

drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import roc_auc_score

# 1. Load Data

In [3]:
def load_tsv(filepath, column_names):
    return pd.read_csv(filepath, sep='\t', header=None, names=column_names)

data_dir = "/content/drive/Shareddrives/CMPE 256 - Group Project/Code/Data/MINDsmall_train"


news_df = load_tsv(os.path.join(data_dir, "news.tsv"),
                   ["NewsID", "Category", "SubCategory", "Title", "Abstract", "URL", "TitleEntities", "AbstractEntities"])

behaviors_df = load_tsv(os.path.join(data_dir, "behaviors.tsv"),
                        ["ImpressionID", "UserID", "Time", "History", "Impressions"])

# 2. Prepare News TF-IDF

In [4]:
news_df.head()

Unnamed: 0,NewsID,Category,SubCategory,Title,Abstract,URL,TitleEntities,AbstractEntities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [5]:
corpus = news_df['Abstract'].fillna('')
vectorizer = TfidfVectorizer(max_features=1000)
news_tfidf = vectorizer.fit_transform(corpus)

news_id_to_idx = {news_id: idx for idx, news_id in enumerate(news_df['NewsID'])}

def get_news_vector(news_id):
    idx = news_id_to_idx.get(news_id)
    if idx is not None:
        return news_tfidf[idx].toarray()
    else:
        return np.zeros((1, news_tfidf.shape[1]))

In [6]:
news_df.head()

Unnamed: 0,NewsID,Category,SubCategory,Title,Abstract,URL,TitleEntities,AbstractEntities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


# 3. Build User Profiles

In [7]:
user_profiles = {}

for _, row in behaviors_df.iterrows():
    user_id = row['UserID']
    history = row['History']

    if pd.isna(history):
        continue

    clicked_news_ids = history.split()

    clicked_vectors = [
        get_news_vector(news_id)
        for news_id in clicked_news_ids
        if news_id in news_id_to_idx
    ]

    if clicked_vectors:
        user_profile = np.mean(clicked_vectors, axis=0)
        user_profiles[user_id] = user_profile


In [8]:
for i in user_profiles['U63233']:
  print(len(i))

1000


# 4. Recommendation Function

In [9]:
def recommend(user_id, top_k=5):
    if user_id not in user_profiles:
        return []

    user_vector = user_profiles[user_id]
    similarities = cosine_similarity(user_vector, news_tfidf).flatten()
    top_indices = similarities.argsort()[-top_k:][::-1]

    return news_df.iloc[top_indices]['NewsID'].tolist()

# 5. Predict Scores

In [10]:
def predict_scores(user_id, candidate_news_ids):
    if user_id not in user_profiles:
        return np.random.rand(len(candidate_news_ids))

    user_vector = user_profiles[user_id]

    candidate_vectors = np.vstack([get_news_vector(news_id) for news_id in candidate_news_ids])

    scores = cosine_similarity(user_vector, candidate_vectors).flatten()

    return scores

# 6. Metrics

In [11]:
def parse_impressions(impression_str):
    pairs = impression_str.strip().split()
    news_ids, labels = [], []
    for p in pairs:
        news_id, label = p.split('-')
        news_ids.append(news_id)
        labels.append(int(label))
    return news_ids, labels

def rank_indices(scores):
    return np.argsort(scores)[::-1]

def compute_mrr(labels, scores):
    for rank, idx in enumerate(rank_indices(scores)):
        if labels[idx] == 1:
            return 1.0 / (rank + 1)
    return 0.0

def compute_dcg(labels, scores, k):
    dcg = 0.0
    for i, idx in enumerate(rank_indices(scores)[:k]):
        rel = labels[idx]
        dcg += (2**rel - 1) / np.log2(i + 2)
    return dcg

def compute_ndcg(labels, scores, k):
    ideal_dcg = compute_dcg(sorted(labels, reverse=True), np.array(labels), k)
    if ideal_dcg == 0:
        return 0.0
    return compute_dcg(labels, scores, k) / ideal_dcg


# 7. Evaluation

In [12]:
def evaluate(behaviors_df, top_k=5):
    all_labels, all_scores, mrrs, ndcgs = [], [], [], []

    for _, row in behaviors_df.iterrows():
        user_id = row['UserID']
        impressions = row['Impressions']
        candidate_news_ids, labels = parse_impressions(impressions)

        if len(set(labels)) <= 1:
            continue

        scores = predict_scores(user_id, candidate_news_ids)
        all_labels.extend(labels)
        all_scores.extend(scores)

        mrrs.append(compute_mrr(labels, scores))
        ndcgs.append(compute_ndcg(labels, scores, top_k))

    return {
        "AUC": roc_auc_score(all_labels, all_scores),
        "MRR": np.mean(mrrs),
        "nDCG@{}".format(top_k): np.mean(ndcgs)
    }

# 8. Run Evaluation

In [13]:
results = evaluate(behaviors_df)
print("Evaluation Results:", results)


Evaluation Results: {'AUC': np.float64(0.5248851446326842), 'MRR': np.float64(0.2767283677956073), 'nDCG@5': np.float64(0.2208094975897005)}
