In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import math
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import coo_matrix
from sklearn.metrics.pairwise import cosine_similarity

 # Data preprocessing

In [2]:
# Load data
behaviors = pd.read_csv('./small_training_data/behaviors.tsv', delimiter='\t', header=None)
news = pd.read_csv('./small_training_data/news.tsv', delimiter='\t', header=None)

# Naming columns
behaviors.columns = ["impression_id", "user_id", "time", "history", "impressions"]
news.columns = ["news_id", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"]

In [3]:
# Remove NaN values in the 'abstract' column
news = news.dropna(subset=['abstract'])

# list of valid news (with some abstract)
valid_news_ids = set(news['news_id'])

In [4]:
# Extracting clicked news from behaviors, this is a column of lists of the clicked news (tagget with 1) for each impression
behaviors['clicked_news'] = behaviors['impressions'].apply(lambda x: [imp.split('-')[0] for imp in x.split() if imp.split('-')[1] == '1'])

In [5]:
#removing unused columns
behaviors = behaviors[["impression_id", "user_id", "clicked_news"]]

In [6]:
# Flattening the clicked news and associating with user_id, that means we divide the lists into one row for each clicked news
clicked_news = behaviors.explode('clicked_news')[['user_id', 'clicked_news']]

In [7]:
#remove non valid news from interactions
clicked_news = clicked_news[clicked_news['clicked_news'].isin(valid_news_ids)]

In [8]:
# Filtering users with more than 4 news clicked since the distribution is 'ultra-skewed'
clicked_news = clicked_news.groupby('user_id').filter(lambda x: len(x) > 4)

In [9]:
# Rename 'clicked_news' column to 'news_id'
clicked_news = clicked_news.rename(columns={'clicked_news': 'news_id'})

In [10]:
print(f"Number of unique clicked news: {clicked_news['news_id'].nunique()}")
print(f"Number of unique users:        {clicked_news['user_id'].nunique()} \n")

Number of unique clicked news: 6522
Number of unique users:        15418 



In [11]:
news = news.reset_index()

## in the cell below we create mappings from real id's to indexes, and create clicked_news_encoded

In [12]:
# Create categorical types without encoding them yet
clicked_news['user_id_cat'] = clicked_news['user_id'].astype("category")
clicked_news['news_id_cat'] = clicked_news['news_id'].astype("category")

# Creating mappings from original IDs to encoded IDs
id_to_user = dict(enumerate(clicked_news['user_id_cat'].cat.categories))
id_to_news = dict(enumerate(clicked_news['news_id_cat'].cat.categories))

# Convert categories to codes (integer encoding)
clicked_news_encoded = pd.DataFrame(columns=['user', 'item'])
clicked_news_encoded['user'] = clicked_news['user_id_cat'].cat.codes
clicked_news_encoded['item'] = clicked_news['news_id_cat'].cat.codes

# Drop the additional categorical columns if they are not needed
clicked_news = clicked_news.drop(columns=['user_id_cat', 'news_id_cat'])

# Creating reverse mappings from original IDs to encoded IDs
user_to_id = {v: k for k, v in id_to_user.items()}
news_to_id = {v: k for k, v in id_to_news.items()}

In [13]:
# Create a sparse user-item interaction matrix
interaction_matrix = coo_matrix((np.ones(clicked_news_encoded.shape[0]),
                                 (clicked_news_encoded['user'], clicked_news_encoded['item'])))

print(f"users: {interaction_matrix.shape[0]} \nitems: {interaction_matrix.shape[1]}")

users: 15418 
items: 6522


In [14]:
#transform into compressed sparse row
interaction_matrix_csr = interaction_matrix.tocsr()

In [15]:
clicked_news_encoded['rating'] = np.ones(len(clicked_news_encoded))

In [16]:
print(clicked_news_encoded)

         user  item  rating
1       14889   756     1.0
5        1803  1190     1.0
5        1803  2565     1.0
9        7888  4810     1.0
10      14512  5700     1.0
...       ...   ...     ...
156963   6291  1261     1.0
156963   6291  4396     1.0
156963   6291  2904     1.0
156963   6291   915     1.0
156963   6291  1137     1.0

[155443 rows x 3 columns]


In [17]:
user_click_counts = clicked_news['user_id'].value_counts().reset_index()
user_click_counts.columns = ['user_id', 'num_of_clicks']
user_click_counts_sorted = user_click_counts.sort_values(by='num_of_clicks', ascending=False)
print(user_click_counts_sorted)

      user_id  num_of_clicks
0      U53220            125
1      U70550            118
2      U63482            109
3      U20833             95
4      U32322             94
...       ...            ...
13230  U91963              5
13231  U65567              5
13232    U417              5
13233  U63788              5
15417   U5480              5

[15418 rows x 2 columns]


# Content based recommendations

In [18]:
# Vectorizing the 'abstract' column of news
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
news_profiles = vectorizer.fit_transform(news['abstract'])

In [19]:
# Reversing the vocabulary dictionary
reverse_vocab = {v: k for k, v in vectorizer.vocabulary_.items()}

print(reverse_vocab.get(3845))

royals


This is a matrix that holds the similarity scores between all pairs of news items. It's a  square matrix where each row and column corresponds to a news item, and the entry [i, j] gives the similarity between news item i and news item j. the diagonal is full of ones for obvious reasons

In [20]:
# Compute the cosine similarity matrix
news_similarity = cosine_similarity(news_profiles, dense_output=False)

In [196]:
all_id_to_news = news['news_id'].to_dict()
all_news_to_id = {v: k for k, v in all_id_to_news.items()}

In [22]:
def recommend_news(user_id, interaction_matrix, news_similarity, top_n=5):
    """
    Recommend top N news items for a given user.
    
    Parameters:
        user_id (str): The user for whom to make recommendations.
        interaction_matrix (csr_matrix): User-item interaction matrix.
        news_similarity (csr_matrix): Item-item similarity matrix.
        top_n (int): Number of items to recommend.
        
    Returns:
        list: List of recommended item indices.
    """
    user_id = user_to_id[user_id]
    
    # Get the user's past interactions (list of news ids that the user has interacted with)
    user_interactions = interaction_matrix[user_id, :].nonzero()[1]
    
    # Sum the similarities of the interacted items 
    similarity_sum = np.sum(news_similarity[user_interactions, :], axis=0) #shape = (n_interactions, all_news)
    
    # Remove already interacted items
    similarity_sum[0, user_interactions] = 0
    
    # Get top N item indices
    recommended_news_ids = np.argsort(similarity_sum)[0, -top_n:][::-1]
    
    # Convert indices to original news IDs
    recommended_news = [all_id_to_news[idx] for idx in recommended_news_ids.tolist()[0]]
    
    return recommended_news

In [23]:
# Recommend for a user
user_id = 'U53220'
recommended_news_indices = recommend_news(user_id, interaction_matrix_csr, news_similarity, top_n=10)

print(f"Recommended news for user {user_id}:")
print(recommended_news_indices)

Recommended news for user U53220:
['N4830', 'N17496', 'N11545', 'N64885', 'N35290', 'N6405', 'N2445', 'N15320', 'N45022', 'N12262']


## Test set

In [24]:
# Load data
test_behaviors = pd.read_csv('./small_test_data/behaviors.tsv', delimiter='\t', header=None)
test_news = pd.read_csv('./small_test_data/news.tsv', delimiter='\t', header=None)

# Naming columns
test_behaviors.columns = ["impression_id", "user_id", "time", "history", "impressions"]
test_news.columns = ["news_id", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"]

In [25]:
# Remove NaN values in the 'abstract' column
test_news = test_news.dropna(subset=['abstract'])

# list of valid news (with some abstract)
test_valid_news_ids = set(test_news['news_id'])

In [26]:
# Extracting clicked news from behaviors, this is a column of lists of the clicked news (tagget with 1) for each impression
test_behaviors['clicked_news'] = test_behaviors['impressions'].apply(lambda x: [imp.split('-')[0] for imp in x.split() if imp.split('-')[1] == '1'])

#removing unused columns
test_behaviors = test_behaviors[["impression_id", "user_id", "clicked_news"]]

# Flattening the clicked news and associating with user_id, that means we divide the lists into one row for each clicked news
test_clicked_news = test_behaviors.explode('clicked_news')[['user_id', 'clicked_news']]

#remove non valid news from interactions
test_clicked_news = test_clicked_news[test_clicked_news['clicked_news'].isin(test_valid_news_ids)]

# Rename 'clicked_news' column to 'news_id'
test_data = test_clicked_news.rename(columns={'clicked_news': 'news_id'})

In [27]:
print(f"Number of unique clicked news: {test_data['news_id'].nunique()}")
print(f"Number of unique users:        {test_data['user_id'].nunique()} \n")

Number of unique clicked news: 2115
Number of unique users:        48139 



### Since it's a content-based recommender system we ran into the cold start problem from users present in the test data but not in the training data, so we just recommend them items from the most popular news 

In [28]:
news_popularity = clicked_news['news_id'].value_counts()

# Calculate the click threshold for the top percentile
threshold = np.percentile(news_popularity, 98)

# Get the most popular news items
popular_news = news_popularity[news_popularity >= threshold].index.tolist()

## Model evaluation - Precision@k and Recall@k and nCDG@k

In [29]:
recommendations = {} #{'user_id': [list of recommended news_ids]}

In [30]:
# Set of user_ids from test_data and training data
test_data_user_ids = set(test_data['user_id'].unique())

train_data_user_ids = set(clicked_news['user_id'].unique())

In [31]:
for user_id in test_data_user_ids:
    if user_id not in train_data_user_ids:
        # New user or user not in test data
        recommendations[user_id] = random.sample(popular_news, 5)  # Recommend random news from popular news items
    else:
        recommended_news_indices = recommend_news(user_id, interaction_matrix_csr, news_similarity, top_n=10)
        recommendations[user_id] = [idx for idx in recommended_news_indices]

In [32]:
def precision_at_k(recommended_list, relevant_list, k):
    """
    Compute Precision at K.
    
    Parameters:
        recommended_list (list): List of recommended items.
        relevant_list (list): List of relevant items.
        k (int): Number of recommendations to consider.
        
    Returns:
        float: Precision at K score.
    """
    return len(set(recommended_list[:k]) & set(relevant_list)) / k

In [33]:
def recall_at_k(recommended_list, relevant_set, k):
    """Return the recall at k."""
    return len(set(recommended_list[:k]) & set(relevant_set)) / len(relevant_set)

In [34]:
def ndcg_at_k(recommended_list, relevant_set, k):
    """Return the NDCG at k."""
    dcg = 0
    idcg = sum([1 / math.log(i + 2, 2) for i in range(min(k, len(relevant_set)))])
    for i, item in enumerate(recommended_list[:k]):
        if item in relevant_set:
            dcg += 1 / math.log(i + 2, 2)
    return dcg / idcg

In [35]:
k = 5 # or any value you choose

precision_values = []
recall_values = []
ndcg_values = []

for user_id, recommended_items in recommendations.items():
    relevant_items = test_data[test_data['user_id'] == user_id]['news_id'].tolist()
    
    if relevant_items:  # if the user has any relevant items
        
        precision = precision_at_k(recommended_items, relevant_items, k)
        recall = recall_at_k(recommended_items, relevant_items, k)
        ndcg = ndcg_at_k(recommended_items, relevant_items, k)
        
        precision_values.append(precision)
        recall_values.append(recall)
        ndcg_values.append(ndcg)

# Averaging across all users to get the final metric value
mean_precision = np.mean(precision_values)
mean_recall = np.mean(recall_values)
mean_ndcg = np.mean(ndcg_values)

print(f"Mean Precision@{k}: {mean_precision:.4f}")
print(f"Mean Recall@{k}: {mean_recall:.4f}")
print(f"Mean NDCG@{k}: {mean_ndcg:.4f}")

Mean Precision@5: 0.0003
Mean Recall@5: 0.0005
Mean NDCG@5: 0.0004


I think it's due to the sparsity and cold start problem, we have many users without interactions

In [36]:
filtered_test_data = test_data.groupby('user_id').filter(lambda x: len(x) > 10)

In [37]:
recommendations = {} #{'user_id': [list of recommended news_ids]}

# Set of user_ids from test_data and training data
test_data_user_ids = set(filtered_test_data['user_id'].unique())

train_data_user_ids = set(clicked_news['user_id'].unique())

for user_id in test_data_user_ids:
    if user_id not in train_data_user_ids:
        # New user or user not in test data
        recommendations[user_id] = random.sample(popular_news, 5)  # Recommend random news from popular news items
    else:
        recommended_news_indices = recommend_news(user_id, interaction_matrix_csr, news_similarity, top_n=10)
        recommendations[user_id] = [idx for idx in recommended_news_indices]

In [38]:
k = 5  # or any value you choose

precision_values = []
recall_values = []
ndcg_values = []

for user_id, recommended_items in recommendations.items():
    relevant_items = test_data[test_data['user_id'] == user_id]['news_id'].tolist()
    
    if relevant_items:  # if the user has any relevant items
        
        precision = precision_at_k(recommended_items, relevant_items, k)
        recall = recall_at_k(recommended_items, relevant_items, k)
        ndcg = ndcg_at_k(recommended_items, relevant_items, k)
        
        precision_values.append(precision)
        recall_values.append(recall)
        ndcg_values.append(ndcg)

# Averaging across all users to get the final metric value
mean_precision = np.mean(precision_values)
mean_recall = np.mean(recall_values)
mean_ndcg = np.mean(ndcg_values)

print(f"Mean Precision@{k}: {mean_precision:.4f}")
print(f"Mean Recall@{k}: {mean_recall:.4f}")
print(f"Mean NDCG@{k}: {mean_ndcg:.4f}")

Mean Precision@5: 0.0009
Mean Recall@5: 0.0003
Mean NDCG@5: 0.0006


### In fact we can see that is precision increases if we consider users with more than n = 10 interactions

# Group recommendations

In [106]:
from lenskit.algorithms.als import ImplicitMF

In [107]:
# Load JSON string from a file
with open("grouped_dict.json", "r") as f:
    grouped_dict_json = f.read()

# Convert the JSON string back to a dictionary
groups_dict = json.loads(grouped_dict_json)

In [108]:
all_clicks = clicked_news_encoded.reset_index()
all_clicks = all_clicks[['user', 'item', 'rating']]

In [109]:
# Create an implicit feedback model with 50 features and 20 iterations
model = ImplicitMF(features=50, iterations=20, reg=0.1)

# Train the model
model.fit(all_clicks)

<lenskit.algorithms.als.ImplicitMF at 0x175f0533640>

In [153]:
def group_recommendations(user_group, model, all_items, user_item_data):
    #getting user_idexes
    user_group = [user_to_id[user] for user in groups if user in user_to_id]
    
    # A dictionary to aggregate scores
    aggregated_scores = {}

    # A set to collect items already read by any user in the group
    read_by_group = set()

    for user in user_group:
        # Get individual user recommendations
        recs_user = model.predict_for_user(user, all_items)
        
        # Exclude items already read by the user
        read_items = user_item_data[user_item_data['user'] == user]['item'].tolist()
        recs_user = recs_user.drop(read_items, errors='ignore')

        # Add these read items to the group set
        read_by_group.update(read_items)

        # Aggregate scores
        for item, score in recs_user.items():
            if item in aggregated_scores:
                aggregated_scores[item] += score
            else:
                aggregated_scores[item] = score

    # Convert to pandas series for easier manipulation
    aggregated_scores = pd.Series(aggregated_scores)

    # Remove items read by any user in the group
    aggregated_scores = aggregated_scores.drop(list(read_by_group), errors='ignore')

    # Sort and get the top items
    top_items = aggregated_scores.sort_values(ascending=False).head(10)

    # Convert the index to news ids using the dictionary
    top_items.index = top_items.index.map(id_to_news)
    
    return top_items

In [158]:
def explain_group_recs(group_users, news, top_items):
    explanations = {}
    
    for item in top_items:
        # Check for topic
        filtered_news = news.loc[news['news_id'] == item, 'subcategory']
        if not filtered_news.empty:
            topic = filtered_news.values[0]
        else:
            topic = "Unknown"  
        
        user_group = [user for user in groups if user in id_to_user]
        
        # Generate explanation string
        explanations[item] = f"Recommended because users {group_users} showed a high average preference score for {topic}."

    return explanations

In [159]:
all_items = clicked_news_encoded.item.unique().tolist()

group = groups_dict['Group 6']
group

['U69084', 'U31631', 'U57214', 'U21331', 'U1331', 'U64554', 'U42643', 'U80596']

In [160]:
top_10_group_recs = group_recommendations(group, model, all_items, all_clicks)

print("Top 10 recommended items for the group:")
print(top_10_group_recs)

Top 10 recommended items for the group:
N41178    4.497051
N24272    3.613691
N18870    3.467665
N7328     3.240432
N41881    3.221370
N55606    3.169339
N23548    2.868267
N14780    2.861550
N3930     2.727446
N62762    2.680939
dtype: float64


In [162]:
explanations = explain_group_recs(group, all_news, top_10_group_recs.index)

In [163]:
explanations

{'N41178': "Recommended because users ['U69084', 'U31631', 'U57214', 'U21331', 'U1331', 'U64554', 'U42643', 'U80596'] showed a high average preference score for newspolitics.",
 'N24272': "Recommended because users ['U69084', 'U31631', 'U57214', 'U21331', 'U1331', 'U64554', 'U42643', 'U80596'] showed a high average preference score for movies-celebrity.",
 'N18870': "Recommended because users ['U69084', 'U31631', 'U57214', 'U21331', 'U1331', 'U64554', 'U42643', 'U80596'] showed a high average preference score for shop-holidays.",
 'N7328': "Recommended because users ['U69084', 'U31631', 'U57214', 'U21331', 'U1331', 'U64554', 'U42643', 'U80596'] showed a high average preference score for restaurantsandnews.",
 'N41881': "Recommended because users ['U69084', 'U31631', 'U57214', 'U21331', 'U1331', 'U64554', 'U42643', 'U80596'] showed a high average preference score for tv-celebrity.",
 'N55606': "Recommended because users ['U69084', 'U31631', 'U57214', 'U21331', 'U1331', 'U64554', 'U42643

# Survey recommendations

In [193]:
survey_news = ["N55468", "N8091", "N61914", "N7857", "N32907", "N33584", "N50566", "N47845", "N20212", "N37038", "N39173", "N54950", "N63975", "N28867", "N62996",
               "N35703", "N9680", "N50299", "N61837", "N59295", "N13113", "N57018", "N48320", "N49981", "N11200", "N10470", "N46481", "N27435", "N39125", "N24808",
               "N1587", "N22605", "N10886", "N52386", "N52620", "N30867", "N47140", "N29552", "N55528", "N55610"]

In [203]:
survey_news_ids = [all_news_to_id[item] for item in survey_news if item in all_news_to_id]

In [207]:
print(survey_news_ids)

[4690, 14338, 154, 1807, 317, 675, 18571, 21184, 1759, 11189, 116, 1023, 3774, 4040, 49, 360, 2, 7, 3455, 393, 431, 13433, 29181, 59, 2720, 7627, 3346, 67, 68, 7872, 20997, 3458, 11150, 0, 15]


In [218]:
# Create a list of letters A to O
users = [chr(i) for i in range(ord('A'), ord('O')+1)]

# Create a DataFrame with 'user' column filled with letters A to O and 'clicked_news' with empty lists
survey_interactions = pd.DataFrame({
    'user': users,
    'item': [[] for _ in users]  # Create an empty list for each user
})

survey_interactions

Unnamed: 0,user,item
0,A,[]
1,B,[]
2,C,[]
3,D,[]
4,E,[]
5,F,[]
6,G,[]
7,H,[]
8,I,[]
9,J,[]


In [221]:
survey_interactions.at[0, 'item'].extend(["N7857", "N33584", "N50566", "N47845", "N39173", "N54950", "N11200", "N22605", "N52620"])
survey_interactions.at[1, 'item'].extend(["N55468", "N61914", "N50566", "N20212", "N37038", "N39173", "N54950", "N35703", "N50299", "N61837", "N11200", "N10470", "N46481", "N27435", "N1587", "N22605", "N10886", "N52386", "N30867", "N55528"])
survey_interactions.at[2, 'item'].extend(["N55468", "N8091", "N7857", "N32907", "N33584", "N50566", "N47845", "N20212", "N37038", "N39173", "N54950", "N22605", "N10886"])
survey_interactions.at[3, 'item'].extend([])
survey_interactions.at[4, 'item'].extend([])
survey_interactions.at[5, 'item'].extend([])
survey_interactions.at[6, 'item'].extend([])
survey_interactions.at[7, 'item'].extend([])
survey_interactions.at[8, 'item'].extend([])
survey_interactions.at[9, 'item'].extend([])
survey_interactions.at[10, 'item'].extend([])
survey_interactions.at[11, 'item'].extend([])
survey_interactions.at[12, 'item'].extend([])
survey_interactions.at[13, 'item'].extend([])



Unnamed: 0,user,item
0,A,"[N7857, N33584, N50566, N47845, N39173, N54950..."
1,B,"[N7857, N33584, N50566, N47845, N39173, N54950..."
2,C,[]
3,D,[]
4,E,[]
5,F,[]
6,G,[]
7,H,[]
8,I,[]
9,J,[]


Unnamed: 0,user,item
0,A,"[News_2, News_3]"
1,B,[]
2,C,[]
3,D,[]
4,E,[]
5,F,[]
6,G,[]
7,H,[]
8,I,[]
9,J,[]
