In [7]:
from google.colab import drive
import pandas as pd
import os

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import roc_auc_score

Sources:

https://www.geeksforgeeks.org/music-recommendation-system-using-machine-learning/

https://medium.com/@prateekgaurav/step-by-step-content-based-recommendation-system-823bbfd0541c



# 1. Load Data

In [9]:
def load_tsv(filepath, column_names):
    return pd.read_csv(filepath, sep='\t', header=None, names=column_names)

data_dir = "/content/drive/Shareddrives/CMPE 256 - Group Project/Code/Data/MINDsmall_train"


columns = ["NewsID", "Category", "SubCategory", "Title", "Abstract", "URL", "TitleEntities", "AbstractEntities", "Sentiment"]

news_df = pd.read_csv("/content/news_with_sentiment.csv", names=columns, header=0)

behaviors_df = load_tsv(os.path.join(data_dir, "behaviors.tsv"),
                        ["ImpressionID", "UserID", "Time", "History", "Impressions"])

In [10]:
len(news_df)

50944

# 2. Prepare News TF-IDF

In [11]:
news_df.head()

Unnamed: 0,NewsID,Category,SubCategory,Title,Abstract,URL,TitleEntities,AbstractEntities,Sentiment
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[],negative
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...",negative
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId...",negative
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ...",negative
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...",neutral


In [12]:
corpus = news_df['Title'].fillna('') + news_df['Abstract'].fillna('') + news_df['Sentiment'].fillna('')
vectorizer = TfidfVectorizer(max_features=1000)
news_tfidf = vectorizer.fit_transform(corpus)

news_id_to_idx = {news_id: idx for idx, news_id in enumerate(news_df['NewsID'])}

def get_news_vector(news_id):
    idx = news_id_to_idx.get(news_id)
    if idx is not None:
        return news_tfidf[idx].toarray()
    else:
        return np.zeros((1, news_tfidf.shape[1]))

In [13]:
corpus

Unnamed: 0,0
0,"The Brands Queen Elizabeth, Prince Charles, an..."
1,50 Worst Habits For Belly FatThese seemingly h...
2,The Cost of Trump's Aid Freeze in the Trenches...
3,I Was An NBA Wife. Here's How It Affected My M...
4,"How to Get Rid of Skin Tags, According to a De..."
...,...
50939,New York Jets' Jamal Adams Scores Defensive TD...
50940,NJ high school football playoffs 2019: Here ar...
50941,Unusual playoff means Tyrrell Hatton made $1.5...
50942,Washington Apple Crop Among Largest In State H...


In [14]:
news_df.head()

Unnamed: 0,NewsID,Category,SubCategory,Title,Abstract,URL,TitleEntities,AbstractEntities,Sentiment
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[],negative
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...",negative
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId...",negative
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ...",negative
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...",neutral


# 3. Build User Profiles

**6 Mins to Run**

In [15]:
user_profiles = {}

for _, row in behaviors_df.iterrows():
    user_id = row['UserID']
    history = row['History']

    if pd.isna(history):
        continue

    clicked_news_ids = history.split()

    clicked_vectors = [
        get_news_vector(news_id)
        for news_id in clicked_news_ids
        if news_id in news_id_to_idx
    ]

    if clicked_vectors:
        user_profile = np.mean(clicked_vectors, axis=0)
        user_profiles[user_id] = user_profile


In [16]:
for i in user_profiles['U63233']:
  print(len(i))

1000


# 4. Recommender



In [17]:
user_id = 'U63233'
top_k = 5

user_vector = user_profiles[user_id]

similarities = cosine_similarity(user_vector, news_tfidf).flatten()

top_indices = similarities.argsort()[-top_k:][::-1]

val = news_df.iloc[top_indices]['NewsID'].tolist()

# 5. Predict Scores

In [18]:
def predict_scores(user_id, candidate_news_ids):
    if user_id not in user_profiles:
        return np.random.rand(len(candidate_news_ids))

    user_vector = user_profiles[user_id]

    candidate_vectors = np.vstack([get_news_vector(news_id) for news_id in candidate_news_ids])

    scores = cosine_similarity(user_vector, candidate_vectors).flatten()

    return scores

# 6. Metrics

In [19]:
def parse_impressions(impression_str):
    pairs = impression_str.strip().split()
    news_ids, labels = [], []
    for p in pairs:
        news_id, label = p.split('-')
        news_ids.append(news_id)
        labels.append(int(label))
    return news_ids, labels

def rank_indices(scores):
    return np.argsort(scores)[::-1]

def compute_mrr(labels, scores):
    for rank, idx in enumerate(rank_indices(scores)):
        if labels[idx] == 1:
            return 1.0 / (rank + 1)
    return 0.0

def hits_at_k(labels, scores, k):
    labels = np.array(labels)
    scores = np.array(scores)
    top_k_indices = np.argsort(scores)[-k:][::-1]
    top_k_labels = labels[top_k_indices]
    return float(np.any(top_k_labels == 1))


In [20]:
from sklearn.metrics import top_k_accuracy_score

# 7. Evaluation

In [21]:
def evaluate(behaviors_df, top_k=10):
    all_labels, all_scores, mrrs, ndcgs, hits = [], [], [], [], []

    for _, row in behaviors_df.iterrows():
        user_id = row['UserID']
        impressions = row['Impressions']
        candidate_news_ids, labels = parse_impressions(impressions)

        if len(set(labels)) <= 1:
            continue

        scores = predict_scores(user_id, candidate_news_ids)
        all_labels.extend(labels)
        all_scores.extend(scores)

        mrrs.append(compute_mrr(labels, scores))
        hits.append(hits_at_k(labels, scores, k = top_k))

    return {
        "AUC": roc_auc_score(all_labels, all_scores),
        "HITS@K": np.mean(hits),
        "MRR": np.mean(mrrs)
    }

# 8. Run Evaluation

**THIS WILL TAKE APPROX 8 MINS TO RUN**

In [22]:
results = evaluate(behaviors_df)
print("Evaluation Results:", results)


Evaluation Results: {'/nAUC': np.float64(0.5330951035778084), '/nHITS@K': np.float64(0.6188831905201797), '/nMRR': np.float64(0.2876778669640404)}
