In [78]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import math
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import coo_matrix
from sklearn.metrics.pairwise import cosine_similarity

 # Data preprocessing

In [2]:
# Load data
behaviors = pd.read_csv('./small_training_data/behaviors.tsv', delimiter='\t', header=None)
news = pd.read_csv('./small_training_data/news.tsv', delimiter='\t', header=None)

# Naming columns
behaviors.columns = ["impression_id", "user_id", "time", "history", "impressions"]
news.columns = ["news_id", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"]

In [3]:
# Remove NaN values in the 'abstract' column
news = news.dropna(subset=['abstract'])

# list of valid news (with some abstract)
valid_news_ids = set(news['news_id'])

In [4]:
# Extracting clicked news from behaviors, this is a column of lists of the clicked news (tagget with 1) for each impression
behaviors['clicked_news'] = behaviors['impressions'].apply(lambda x: [imp.split('-')[0] for imp in x.split() if imp.split('-')[1] == '1'])

In [5]:
#removing unused columns
behaviors = behaviors[["impression_id", "user_id", "clicked_news"]]

In [6]:
# Flattening the clicked news and associating with user_id, that means we divide the lists into one row for each clicked news
clicked_news = behaviors.explode('clicked_news')[['user_id', 'clicked_news']]

In [7]:
#remove non valid news from interactions
clicked_news = clicked_news[clicked_news['clicked_news'].isin(valid_news_ids)]

In [8]:
# Filtering users with more than 4 news clicked since the distribution is 'ultra-skewed'
clicked_news = clicked_news.groupby('user_id').filter(lambda x: len(x) > 4)

In [9]:
# Rename 'clicked_news' column to 'news_id'
clicked_news = clicked_news.rename(columns={'clicked_news': 'news_id'})

In [10]:
print(f"Number of unique clicked news: {clicked_news['news_id'].nunique()}")
print(f"Number of unique users:        {clicked_news['user_id'].nunique()} \n")

Number of unique clicked news: 6522
Number of unique users:        15418 



In [11]:
news = news.reset_index()

## in the cell below we create mappings from real id's to indexes, and create clicked_news_encoded

In [12]:
# Create categorical types without encoding them yet
clicked_news['user_id_cat'] = clicked_news['user_id'].astype("category")
clicked_news['news_id_cat'] = clicked_news['news_id'].astype("category")

# Creating mappings from original IDs to encoded IDs
id_to_user = dict(enumerate(clicked_news['user_id_cat'].cat.categories))
id_to_news = dict(enumerate(clicked_news['news_id_cat'].cat.categories))

# Convert categories to codes (integer encoding)
clicked_news_encoded = pd.DataFrame(columns=['user', 'item'])
clicked_news_encoded['user'] = clicked_news['user_id_cat'].cat.codes
clicked_news_encoded['item'] = clicked_news['news_id_cat'].cat.codes

# Drop the additional categorical columns if they are not needed
clicked_news = clicked_news.drop(columns=['user_id_cat', 'news_id_cat'])

# Creating reverse mappings from original IDs to encoded IDs
user_to_id = {v: k for k, v in id_to_user.items()}
news_to_id = {v: k for k, v in id_to_news.items()}

In [13]:
# Create a sparse user-item interaction matrix
interaction_matrix = coo_matrix((np.ones(clicked_news_encoded.shape[0]),
                                 (clicked_news_encoded['user'], clicked_news_encoded['item'])))

print(f"users: {interaction_matrix.shape[0]} \nitems: {interaction_matrix.shape[1]}")

users: 15418 
items: 6522


In [14]:
#transform into compressed sparse row
interaction_matrix_csr = interaction_matrix.tocsr()

In [15]:
clicked_news_encoded['rating'] = np.ones(len(clicked_news_encoded))

In [16]:
print(clicked_news_encoded)

         user  item  rating
1       14889   756     1.0
5        1803  1190     1.0
5        1803  2565     1.0
9        7888  4810     1.0
10      14512  5700     1.0
...       ...   ...     ...
156963   6291  1261     1.0
156963   6291  4396     1.0
156963   6291  2904     1.0
156963   6291   915     1.0
156963   6291  1137     1.0

[155443 rows x 3 columns]


In [17]:
user_click_counts = clicked_news['user_id'].value_counts().reset_index()
user_click_counts.columns = ['user_id', 'num_of_clicks']
user_click_counts_sorted = user_click_counts.sort_values(by='num_of_clicks', ascending=False)
print(user_click_counts_sorted)

      user_id  num_of_clicks
0      U53220            125
1      U70550            118
2      U63482            109
3      U20833             95
4      U32322             94
...       ...            ...
13230  U91963              5
13231  U65567              5
13232    U417              5
13233  U63788              5
15417   U5480              5

[15418 rows x 2 columns]


# Content based recommendations

In [18]:
# Vectorizing the 'abstract' column of news
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
news_profiles = vectorizer.fit_transform(news['abstract'])

In [19]:
# Reversing the vocabulary dictionary
reverse_vocab = {v: k for k, v in vectorizer.vocabulary_.items()}

print(reverse_vocab.get(3845))

royals


This is a matrix that holds the similarity scores between all pairs of news items. It's a  square matrix where each row and column corresponds to a news item, and the entry [i, j] gives the similarity between news item i and news item j. the diagonal is full of ones for obvious reasons

In [20]:
# Compute the cosine similarity matrix
news_similarity = cosine_similarity(news_profiles, dense_output=False)

In [21]:
all_id_to_news = news['news_id'].to_dict()

In [22]:
def recommend_news(user_id, interaction_matrix, news_similarity, top_n=5):
    """
    Recommend top N news items for a given user.
    
    Parameters:
        user_id (str): The user for whom to make recommendations.
        interaction_matrix (csr_matrix): User-item interaction matrix.
        news_similarity (csr_matrix): Item-item similarity matrix.
        top_n (int): Number of items to recommend.
        
    Returns:
        list: List of recommended item indices.
    """
    user_id = user_to_id[user_id]
    
    # Get the user's past interactions (list of news ids that the user has interacted with)
    user_interactions = interaction_matrix[user_id, :].nonzero()[1]
    
    # Sum the similarities of the interacted items 
    similarity_sum = np.sum(news_similarity[user_interactions, :], axis=0) #shape = (n_interactions, all_news)
    
    # Remove already interacted items
    similarity_sum[0, user_interactions] = 0
    
    # Get top N item indices
    recommended_news_ids = np.argsort(similarity_sum)[0, -top_n:][::-1]
    
    # Convert indices to original news IDs
    recommended_news = [all_id_to_news[idx] for idx in recommended_news_ids.tolist()[0]]
    
    return recommended_news

In [23]:
# Recommend for a user
user_id = 'U53220'
recommended_news_indices = recommend_news(user_id, interaction_matrix_csr, news_similarity, top_n=10)

print(f"Recommended news for user {user_id}:")
print(recommended_news_indices)

Recommended news for user U53220:
['N4830', 'N17496', 'N11545', 'N64885', 'N35290', 'N6405', 'N2445', 'N15320', 'N45022', 'N12262']


## Test set

In [24]:
# Load data
test_behaviors = pd.read_csv('./small_test_data/behaviors.tsv', delimiter='\t', header=None)
test_news = pd.read_csv('./small_test_data/news.tsv', delimiter='\t', header=None)

# Naming columns
test_behaviors.columns = ["impression_id", "user_id", "time", "history", "impressions"]
test_news.columns = ["news_id", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"]

In [25]:
# Remove NaN values in the 'abstract' column
test_news = test_news.dropna(subset=['abstract'])

# list of valid news (with some abstract)
test_valid_news_ids = set(test_news['news_id'])

In [26]:
# Extracting clicked news from behaviors, this is a column of lists of the clicked news (tagget with 1) for each impression
test_behaviors['clicked_news'] = test_behaviors['impressions'].apply(lambda x: [imp.split('-')[0] for imp in x.split() if imp.split('-')[1] == '1'])

#removing unused columns
test_behaviors = test_behaviors[["impression_id", "user_id", "clicked_news"]]

# Flattening the clicked news and associating with user_id, that means we divide the lists into one row for each clicked news
test_clicked_news = test_behaviors.explode('clicked_news')[['user_id', 'clicked_news']]

#remove non valid news from interactions
test_clicked_news = test_clicked_news[test_clicked_news['clicked_news'].isin(test_valid_news_ids)]

# Rename 'clicked_news' column to 'news_id'
test_data = test_clicked_news.rename(columns={'clicked_news': 'news_id'})

In [27]:
print(f"Number of unique clicked news: {test_data['news_id'].nunique()}")
print(f"Number of unique users:        {test_data['user_id'].nunique()} \n")

Number of unique clicked news: 2115
Number of unique users:        48139 



### Since it's a content-based recommender system we ran into the cold start problem from users present in the test data but not in the training data, so we just recommend them items from the most popular news 

In [28]:
news_popularity = clicked_news['news_id'].value_counts()

# Calculate the click threshold for the top percentile
threshold = np.percentile(news_popularity, 98)

# Get the most popular news items
popular_news = news_popularity[news_popularity >= threshold].index.tolist()

## Precision@k and Recall@k and nCDG@k

In [29]:
recommendations = {} #{'user_id': [list of recommended news_ids]}

In [30]:
# Set of user_ids from test_data and training data
test_data_user_ids = set(test_data['user_id'].unique())

train_data_user_ids = set(clicked_news['user_id'].unique())

In [31]:
for user_id in test_data_user_ids:
    if user_id not in train_data_user_ids:
        # New user or user not in test data
        recommendations[user_id] = random.sample(popular_news, 5)  # Recommend random news from popular news items
    else:
        recommended_news_indices = recommend_news(user_id, interaction_matrix_csr, news_similarity, top_n=10)
        recommendations[user_id] = [idx for idx in recommended_news_indices]

In [32]:
def precision_at_k(recommended_list, relevant_list, k):
    """
    Compute Precision at K.
    
    Parameters:
        recommended_list (list): List of recommended items.
        relevant_list (list): List of relevant items.
        k (int): Number of recommendations to consider.
        
    Returns:
        float: Precision at K score.
    """
    return len(set(recommended_list[:k]) & set(relevant_list)) / k

In [33]:
def recall_at_k(recommended_list, relevant_set, k):
    """Return the recall at k."""
    return len(set(recommended_list[:k]) & set(relevant_set)) / len(relevant_set)

In [34]:
def ndcg_at_k(recommended_list, relevant_set, k):
    """Return the NDCG at k."""
    dcg = 0
    idcg = sum([1 / math.log(i + 2, 2) for i in range(min(k, len(relevant_set)))])
    for i, item in enumerate(recommended_list[:k]):
        if item in relevant_set:
            dcg += 1 / math.log(i + 2, 2)
    return dcg / idcg

In [40]:
k = 5 # or any value you choose

precision_values = []
recall_values = []
ndcg_values = []

for user_id, recommended_items in recommendations.items():
    relevant_items = test_data[test_data['user_id'] == user_id]['news_id'].tolist()
    
    if relevant_items:  # if the user has any relevant items
        
        precision = precision_at_k(recommended_items, relevant_items, k)
        recall = recall_at_k(recommended_items, relevant_items, k)
        ndcg = ndcg_at_k(recommended_items, relevant_items, k)
        
        precision_values.append(precision)
        recall_values.append(recall)
        ndcg_values.append(ndcg)

# Averaging across all users to get the final metric value
mean_precision = np.mean(precision_values)
mean_recall = np.mean(recall_values)
mean_ndcg = np.mean(ndcg_values)

print(f"Mean Precision@{k}: {mean_precision:.4f}")
print(f"Mean Recall@{k}: {mean_recall:.4f}")
print(f"Mean NDCG@{k}: {mean_ndcg:.4f}")

Mean Precision@5: 0.0017
Mean Recall@5: 0.0007
Mean NDCG@5: 0.0015


I think it's due to the sparsity and cold start problem, we have many users without interactions

In [41]:
filtered_test_data = test_data.groupby('user_id').filter(lambda x: len(x) > 10)

In [42]:
recommendations = {} #{'user_id': [list of recommended news_ids]}

# Set of user_ids from test_data and training data
test_data_user_ids = set(filtered_test_data['user_id'].unique())

train_data_user_ids = set(clicked_news['user_id'].unique())

for user_id in test_data_user_ids:
    if user_id not in train_data_user_ids:
        # New user or user not in test data
        recommendations[user_id] = random.sample(popular_news, 5)  # Recommend random news from popular news items
    else:
        recommended_news_indices = recommend_news(user_id, interaction_matrix_csr, news_similarity, top_n=10)
        recommendations[user_id] = [idx for idx in recommended_news_indices]

In [43]:
k = 5  # or any value you choose

precision_values = []
recall_values = []
ndcg_values = []

for user_id, recommended_items in recommendations.items():
    relevant_items = test_data[test_data['user_id'] == user_id]['news_id'].tolist()
    
    if relevant_items:  # if the user has any relevant items
        
        precision = precision_at_k(recommended_items, relevant_items, k)
        recall = recall_at_k(recommended_items, relevant_items, k)
        ndcg = ndcg_at_k(recommended_items, relevant_items, k)
        
        precision_values.append(precision)
        recall_values.append(recall)
        ndcg_values.append(ndcg)

# Averaging across all users to get the final metric value
mean_precision = np.mean(precision_values)
mean_recall = np.mean(recall_values)
mean_ndcg = np.mean(ndcg_values)

print(f"Mean Precision@{k}: {mean_precision:.4f}")
print(f"Mean Recall@{k}: {mean_recall:.4f}")
print(f"Mean NDCG@{k}: {mean_ndcg:.4f}")

Mean Precision@5: 0.0021
Mean Recall@5: 0.0009
Mean NDCG@5: 0.0017


### In fact we can see that is precision increases if we consider users with more than n = 10 interactions

# Group recommendations

In [79]:
# Load JSON string from a file
with open("grouped_dict.json", "r") as f:
    grouped_dict_json = f.read()

# Convert the JSON string back to a dictionary
groups_dict = json.loads(grouped_dict_json)

In [81]:
from lenskit import batch, topn, util
from lenskit import crossfold as xf
from lenskit.algorithms import als, Recommender

#  Set up and train the algorithm
algo = als.BiasedMF(100)  # we can change number of factor
train, test = next(xf.partition_users(clicked_news_encoded, 1, xf.SampleFrac(0.2)))
model = algo.fit(train)  # Use fit to train the model

all_recommendations = []

In [82]:
def group_recs(group_users, model = model, train = train):
  train_items = train['item'].unique()
  for user in group_users:
      user_scores = []
      for item in train_items:
          score = model.predict_for_user(user, [item])
          user_scores.append((item, score.iloc[0] if not score.empty else 0))

      user_recs = pd.DataFrame(user_scores, columns=['item', 'score'])
      top_recs = user_recs.sort_values(by='score', ascending=False).head(10)
      all_recommendations.append(top_recs)

  reclist = pd.concat(all_recommendations)

  least_misery_scores = reclist.groupby('item').score.min().reset_index()  # Using min for "least misery"
  group_top_recs = least_misery_scores.sort_values(by='score', ascending=False).head(10) # top 10

  return group_top_recs

In [86]:
group_top_recs = group_recs(groups_dict['Group 3'])
print(group_top_recs)

   item  score
0  1013    NaN
1  1120    NaN
2  1190    NaN
3  1476    NaN
4  2565    NaN
5  2808    NaN
6  5060    NaN
7  5482    NaN
8  5700    NaN
9  5926    NaN
