In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import math
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import coo_matrix
from sklearn.metrics.pairwise import cosine_similarity

 # Data preprocessing

In [2]:
# Load data
behaviors = pd.read_csv('./small_training_data/behaviors.tsv', delimiter='\t', header=None)
news = pd.read_csv('./small_training_data/news.tsv', delimiter='\t', header=None)

# Naming columns
behaviors.columns = ["impression_id", "user_id", "time", "history", "impressions"]
news.columns = ["news_id", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"]

In [3]:
# Remove NaN values in the 'abstract' column
news = news.dropna(subset=['abstract'])

# list of valid news (with some abstract)
valid_news_ids = set(news['news_id'])

In [4]:
# Extracting clicked news from behaviors, this is a column of lists of the clicked news (tagget with 1) for each impression
behaviors['clicked_news'] = behaviors['impressions'].apply(lambda x: [imp.split('-')[0] for imp in x.split() if imp.split('-')[1] == '1'])

In [5]:
#removing unused columns
behaviors = behaviors[["impression_id", "user_id", "clicked_news"]]

In [6]:
# Flattening the clicked news and associating with user_id, that means we divide the lists into one row for each clicked news
clicked_news = behaviors.explode('clicked_news')[['user_id', 'clicked_news']]

In [7]:
#remove non valid news from interactions
clicked_news = clicked_news[clicked_news['clicked_news'].isin(valid_news_ids)]

In [8]:
# Filtering users with more than 4 news clicked since the distribution is 'ultra-skewed'
clicked_news = clicked_news.groupby('user_id').filter(lambda x: len(x) > 4)

In [9]:
# Rename 'clicked_news' column to 'news_id'
clicked_news = clicked_news.rename(columns={'clicked_news': 'news_id'})

In [10]:
print(f"Number of unique clicked news: {clicked_news['news_id'].nunique()}")
print(f"Number of unique users:        {clicked_news['user_id'].nunique()} \n")

Number of unique clicked news: 6522
Number of unique users:        15418 



In [11]:
news = news.reset_index()

## in the cell below we create mappings from real id's to indexes, and create clicked_news_encoded

In [12]:
# Create categorical types without encoding them yet
clicked_news['user_id_cat'] = clicked_news['user_id'].astype("category")
clicked_news['news_id_cat'] = clicked_news['news_id'].astype("category")

# Creating mappings from original IDs to encoded IDs
id_to_user = dict(enumerate(clicked_news['user_id_cat'].cat.categories))
id_to_news = dict(enumerate(clicked_news['news_id_cat'].cat.categories))

# Convert categories to codes (integer encoding)
clicked_news_encoded = pd.DataFrame(columns=['user', 'item'])
clicked_news_encoded['user'] = clicked_news['user_id_cat'].cat.codes
clicked_news_encoded['item'] = clicked_news['news_id_cat'].cat.codes

# Drop the additional categorical columns if they are not needed
clicked_news = clicked_news.drop(columns=['user_id_cat', 'news_id_cat'])

# Creating reverse mappings from original IDs to encoded IDs
user_to_id = {v: k for k, v in id_to_user.items()}
news_to_id = {v: k for k, v in id_to_news.items()}

In [13]:
# Create a sparse user-item interaction matrix
interaction_matrix = coo_matrix((np.ones(clicked_news_encoded.shape[0]),
                                 (clicked_news_encoded['user'], clicked_news_encoded['item'])))

print(f"users: {interaction_matrix.shape[0]} \nitems: {interaction_matrix.shape[1]}")

users: 15418 
items: 6522


In [14]:
#transform into compressed sparse row
interaction_matrix_csr = interaction_matrix.tocsr()

In [15]:
clicked_news_encoded['rating'] = np.ones(len(clicked_news_encoded))

In [16]:
print(clicked_news_encoded)

         user  item  rating
1       14889   756     1.0
5        1803  1190     1.0
5        1803  2565     1.0
9        7888  4810     1.0
10      14512  5700     1.0
...       ...   ...     ...
156963   6291  1261     1.0
156963   6291  4396     1.0
156963   6291  2904     1.0
156963   6291   915     1.0
156963   6291  1137     1.0

[155443 rows x 3 columns]


In [17]:
user_click_counts = clicked_news['user_id'].value_counts().reset_index()
user_click_counts.columns = ['user_id', 'num_of_clicks']
user_click_counts_sorted = user_click_counts.sort_values(by='num_of_clicks', ascending=False)
print(user_click_counts_sorted)

      user_id  num_of_clicks
0      U53220            125
1      U70550            118
2      U63482            109
3      U20833             95
4      U32322             94
...       ...            ...
13230  U91963              5
13231  U65567              5
13232    U417              5
13233  U63788              5
15417   U5480              5

[15418 rows x 2 columns]


# Content based recommendations

In [18]:
# Vectorizing the 'abstract' column of news
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
news_profiles = vectorizer.fit_transform(news['abstract'])

In [19]:
# Reversing the vocabulary dictionary
reverse_vocab = {v: k for k, v in vectorizer.vocabulary_.items()}

print(reverse_vocab.get(3845))

royals


This is a matrix that holds the similarity scores between all pairs of news items. It's a  square matrix where each row and column corresponds to a news item, and the entry [i, j] gives the similarity between news item i and news item j. the diagonal is full of ones for obvious reasons

In [20]:
# Compute the cosine similarity matrix
news_similarity = cosine_similarity(news_profiles, dense_output=False)

In [21]:
all_id_to_news = news['news_id'].to_dict()
all_news_to_id = {v: k for k, v in all_id_to_news.items()}

In [22]:
def recommend_news(user_id, interaction_matrix, news_similarity, user_to_id = user_to_id, top_n=5):
    """
    Recommend top N news items for a given user.
    
    Parameters:
        user_id (str): The user for whom to make recommendations.
        interaction_matrix (csr_matrix): User-item interaction matrix.
        news_similarity (csr_matrix): Item-item similarity matrix.
        top_n (int): Number of items to recommend.
        
    Returns:
        list: List of recommended item indices.
    """
    user_id = user_to_id[user_id]
    
    # Get the user's past interactions (list of news ids that the user has interacted with)
    user_interactions = interaction_matrix[user_id, :].nonzero()[1]
    
    # Sum the similarities of the interacted items 
    similarity_sum = np.sum(news_similarity[user_interactions, :], axis=0) #shape = (n_interactions, all_news)
    
    # Remove already interacted items
    similarity_sum[0, user_interactions] = 0
    
    # Get top N item indices
    recommended_news_ids = np.argsort(similarity_sum)[0, -top_n:][::-1]

    #get similarity scores of top N items
    similarity_scores = [similarity_sum[0, idx] for idx in recommended_news_ids.tolist()[0][::-1]]

    # Convert indices to original news IDs
    recommended_news = [all_id_to_news[idx] for idx in recommended_news_ids.tolist()[0]]
    
    return recommended_news, similarity_scores

In [23]:
def explain_individual(user_id, recommended_items, interaction_matrix, similarity_scores):

    # Get the user's past interactions (list of news ids that the user has interacted with)
    user_id_letter = user_id
    user_id = user_to_id[user_id]
    user_interactions = interaction_matrix[user_id, :].nonzero()[1][:5]
    explanation = f"{user_id_letter}: You previously watched these articles:\n{user_interactions}\n"
    explanation += f"Based off of your click history and similar content, we recommend these articles:\n{recommended_items}.\nThese are their respective similarity scores:\n{similarity_scores}"

    return explanation

In [24]:
# Recommend for a user
user_id = 'U53220'
recommended_news_indices, similarities = recommend_news(user_id, interaction_matrix_csr, news_similarity, top_n=10)

print(explain_individual(user_id, recommended_news_indices, interaction_matrix_csr, similarities))

U53220: You previously watched these articles:
[ 64  91 181 184 208]
Based off of your click history and similar content, we recommend these articles:
['N4830', 'N17496', 'N11545', 'N64885', 'N35290', 'N6405', 'N2445', 'N15320', 'N45022', 'N12262'].
These are their respective similarity scores:
[2.985750511470631, 2.756920555907457, 2.744329483002449, 2.6892713962617187, 2.6892713962617187, 2.6565837180483913, 2.6106609239726457, 2.562446860836066, 2.5504442878516675, 2.5362196562507755]


## Test set

In [25]:
# Load data
test_behaviors = pd.read_csv('./small_test_data/behaviors.tsv', delimiter='\t', header=None)
test_news = pd.read_csv('./small_test_data/news.tsv', delimiter='\t', header=None)

# Naming columns
test_behaviors.columns = ["impression_id", "user_id", "time", "history", "impressions"]
test_news.columns = ["news_id", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"]

In [26]:
# Remove NaN values in the 'abstract' column
test_news = test_news.dropna(subset=['abstract'])

# list of valid news (with some abstract)
test_valid_news_ids = set(test_news['news_id'])

In [27]:
# Extracting clicked news from behaviors, this is a column of lists of the clicked news (tagget with 1) for each impression
test_behaviors['clicked_news'] = test_behaviors['impressions'].apply(lambda x: [imp.split('-')[0] for imp in x.split() if imp.split('-')[1] == '1'])

#removing unused columns
test_behaviors = test_behaviors[["impression_id", "user_id", "clicked_news"]]

# Flattening the clicked news and associating with user_id, that means we divide the lists into one row for each clicked news
test_clicked_news = test_behaviors.explode('clicked_news')[['user_id', 'clicked_news']]

#remove non valid news from interactions
test_clicked_news = test_clicked_news[test_clicked_news['clicked_news'].isin(test_valid_news_ids)]

# Rename 'clicked_news' column to 'news_id'
test_data = test_clicked_news.rename(columns={'clicked_news': 'news_id'})

In [28]:
print(f"Number of unique clicked news: {test_data['news_id'].nunique()}")
print(f"Number of unique users:        {test_data['user_id'].nunique()} \n")

Number of unique clicked news: 2115
Number of unique users:        48139 



### Since it's a content-based recommender system we ran into the cold start problem from users present in the test data but not in the training data, so we just recommend them items from the most popular news 

In [29]:
news_popularity = clicked_news['news_id'].value_counts()

# Calculate the click threshold for the top percentile
threshold = np.percentile(news_popularity, 95)

# Get the most popular news items
popular_news = news_popularity[news_popularity >= threshold].index.tolist()

## Model evaluation - Precision@k and Recall@k and nCDG@k

In [30]:
recommendations = {} #{'user_id': [list of recommended news_ids]}

In [31]:
# Set of user_ids from test_data and training data
test_data_user_ids = set(test_data['user_id'].unique())

train_data_user_ids = set(clicked_news['user_id'].unique())

In [32]:
for user_id in test_data_user_ids:
    if user_id not in train_data_user_ids:
        # New user or user not in test data
        recommendations[user_id] = random.sample(popular_news, 5)  # Recommend random news from popular news items
    else:
        recommended_news_indices = recommend_news(user_id, interaction_matrix_csr, news_similarity, top_n=10)[0]
        recommendations[user_id] = [idx for idx in recommended_news_indices]

In [33]:
def precision_at_k(recommended_list, relevant_list, k):
    """
    Compute Precision at K.
    
    Parameters:
        recommended_list (list): List of recommended items.
        relevant_list (list): List of relevant items.
        k (int): Number of recommendations to consider.
        
    Returns:
        float: Precision at K score.
    """
    return len(set(recommended_list[:k]) & set(relevant_list)) / k

In [34]:
def recall_at_k(recommended_list, relevant_set, k):
    """Return the recall at k."""
    return len(set(recommended_list[:k]) & set(relevant_set)) / len(relevant_set)

In [35]:
def ndcg_at_k(recommended_list, relevant_set, k):
    """Return the NDCG at k."""
    dcg = 0
    idcg = sum([1 / math.log(i + 2, 2) for i in range(min(k, len(relevant_set)))])
    for i, item in enumerate(recommended_list[:k]):
        if item in relevant_set:
            dcg += 1 / math.log(i + 2, 2)
    return dcg / idcg

In [36]:
k = 5 # or any value you choose

precision_values = []
recall_values = []
ndcg_values = []

for user_id, recommended_items in recommendations.items():
    relevant_items = tuple(test_data[test_data['user_id'] == user_id]['news_id'].tolist()) 
    if relevant_items:  # if the user has any relevant items
        precision = precision_at_k(recommended_items, relevant_items, k)
        recall = recall_at_k(recommended_items, relevant_items, k)
        ndcg = ndcg_at_k(recommended_items, relevant_items, k)
        
        precision_values.append(precision)
        recall_values.append(recall)
        ndcg_values.append(ndcg)

# Averaging across all users to get the final metric value
mean_precision = np.mean(precision_values)
mean_recall = np.mean(recall_values)
mean_ndcg = np.mean(ndcg_values)

print(f"Mean Precision@{k}: {mean_precision:.4f}")
print(f"Mean Recall@{k}: {mean_recall:.4f}")
print(f"Mean NDCG@{k}: {mean_ndcg:.4f}")

Mean Precision@5: 0.0004
Mean Recall@5: 0.0008
Mean NDCG@5: 0.0007


I think it's due to the sparsity and cold start problem, we have many users without interactions

In [37]:
filtered_test_data = test_data.groupby('user_id').filter(lambda x: len(x) > 15)

In [38]:
recommendations = {} #{'user_id': [list of recommended news_ids]}

# Set of user_ids from test_data and training data
test_data_user_ids = set(filtered_test_data['user_id'].unique())

train_data_user_ids = set(clicked_news['user_id'].unique())

for user_id in test_data_user_ids:
    if user_id not in train_data_user_ids:
        # New user or user not in test data
        recommendations[user_id] = random.sample(popular_news, 5)  # Recommend random news from popular news items
    else:
        recommended_news_indices = recommend_news(user_id, interaction_matrix_csr, news_similarity, top_n=10)[0]
        recommendations[user_id] = [idx for idx in recommended_news_indices]

In [39]:
k = 5  # or any value you choose

precision_values = []
recall_values = []
ndcg_values = []

for user_id, recommended_items in recommendations.items():
    relevant_items = test_data[test_data['user_id'] == user_id]['news_id'].tolist()
    
    if relevant_items:  # if the user has any relevant items
        
        precision = precision_at_k(recommended_items, relevant_items, k)
        recall = recall_at_k(recommended_items, relevant_items, k)
        ndcg = ndcg_at_k(recommended_items, relevant_items, k)
        
        precision_values.append(precision)
        recall_values.append(recall)
        ndcg_values.append(ndcg)

# Averaging across all users to get the final metric value
mean_precision = np.mean(precision_values)
mean_recall = np.mean(recall_values)
mean_ndcg = np.mean(ndcg_values)

print(f"Mean Precision@{k}: {mean_precision:.4f}")
print(f"Mean Recall@{k}: {mean_recall:.4f}")
print(f"Mean NDCG@{k}: {mean_ndcg:.4f}")

Mean Precision@5: 0.0063
Mean Recall@5: 0.0014
Mean NDCG@5: 0.0065


### In fact we can see that is precision increases if we consider users with more than n = 10 interactions

# Group recommendations

In [40]:
from lenskit.algorithms.als import ImplicitMF

In [41]:
# Load JSON string from a file
with open("grouped_dict.json", "r") as f:
    grouped_dict_json = f.read()

# Convert the JSON string back to a dictionary
groups_dict = json.loads(grouped_dict_json)

In [42]:
all_clicks = clicked_news_encoded.reset_index()
all_clicks = all_clicks[['user', 'item', 'rating']]

In [43]:
# Create an implicit feedback model with 50 features and 20 iterations
model = ImplicitMF(features=50, iterations=20, reg=0.1)

# Train the model
model.fit(all_clicks)

<lenskit.algorithms.als.ImplicitMF at 0x143e25b0f10>

In [44]:
def group_recommendations(user_group, model, all_items, user_item_data):
    #getting user_idexes
    user_group = [user_to_id[user] for user in user_group if user in user_to_id]
    
    # A dictionary to aggregate scores
    aggregated_scores = {}

    # A set to collect items already read by any user in the group
    read_by_group = set()

    for user in user_group:
        # Get individual user recommendations
        recs_user = model.predict_for_user(user, all_items)
        
        # Exclude items already read by the user
        read_items = user_item_data[user_item_data['user'] == user]['item'].tolist()
        recs_user = recs_user.drop(read_items, errors='ignore')

        # Add these read items to the group set
        read_by_group.update(read_items)

        # Aggregate scores
        for item, score in recs_user.items():
            if item in aggregated_scores:
                aggregated_scores[item] += score
            else:
                aggregated_scores[item] = score

    # Convert to pandas series for easier manipulation
    aggregated_scores = pd.Series(aggregated_scores)

    # Remove items read by any user in the group
    aggregated_scores = aggregated_scores.drop(list(read_by_group), errors='ignore')

    # Sort and get the top items
    top_items = aggregated_scores.sort_values(ascending=False).head(10)

    # Convert the index to news ids using the dictionary
    top_items.index = top_items.index.map(id_to_news)
    
    return top_items

In [45]:
def explain_group_recs(group_users, news, top_items):
    explanations = {}
    
    for item in top_items:
        # Check for topic
        filtered_news = news.loc[news['news_id'] == item, 'subcategory']
        if not filtered_news.empty:
            topic = filtered_news.values[0]
        else:
            topic = "Unknown"  
        
        user_group = [user for user in group_users if user in id_to_user]
        
        # Generate explanation string
        explanations[item] = f"Recommended because users {group_users} showed a high average preference score for {topic}."

    return explanations

In [46]:
all_items = clicked_news_encoded.item.unique().tolist()

group = groups_dict['Group 6']
group

['U69084', 'U31631', 'U57214', 'U21331', 'U1331', 'U64554', 'U42643', 'U80596']

In [47]:
top_10_group_recs = group_recommendations(group, model, all_items, all_clicks)

print("Top 10 recommended items for the group:")
print(top_10_group_recs)

Top 10 recommended items for the group:
N33885    4.710758
N7821     4.303284
N56211    4.020681
N49685    3.952753
N62366    3.654688
N59252    3.513958
N28047    3.503249
N20134    3.428160
N41881    3.335301
N12029    3.222161
dtype: float64


In [48]:
explanations = explain_group_recs(group, news, top_10_group_recs.index)

In [49]:
explanations

{'N33885': "Recommended because users ['U69084', 'U31631', 'U57214', 'U21331', 'U1331', 'U64554', 'U42643', 'U80596'] showed a high average preference score for finance-companies.",
 'N7821': "Recommended because users ['U69084', 'U31631', 'U57214', 'U21331', 'U1331', 'U64554', 'U42643', 'U80596'] showed a high average preference score for football_nfl.",
 'N56211': "Recommended because users ['U69084', 'U31631', 'U57214', 'U21331', 'U1331', 'U64554', 'U42643', 'U80596'] showed a high average preference score for newsus.",
 'N49685': "Recommended because users ['U69084', 'U31631', 'U57214', 'U21331', 'U1331', 'U64554', 'U42643', 'U80596'] showed a high average preference score for music-celebrity.",
 'N62366': "Recommended because users ['U69084', 'U31631', 'U57214', 'U21331', 'U1331', 'U64554', 'U42643', 'U80596'] showed a high average preference score for newspolitics.",
 'N59252': "Recommended because users ['U69084', 'U31631', 'U57214', 'U21331', 'U1331', 'U64554', 'U42643', 'U8059

# Survey recommendations

In [50]:
survey_news = ["N55468", "N8091", "N61914", "N7857", "N32907", "N33584", "N50566", "N47845", "N20212", "N37038", "N39173", "N54950", "N63975", "N28867", "N62996",
               "N35703", "N9680", "N50299", "N61837", "N59295", "N13113", "N57018", "N48320", "N49981", "N11200", "N10470", "N46481", "N27435", "N39125", "N24808",
               "N1587", "N22605", "N10886", "N52386", "N52620", "N30867", "N47140", "N29552", "N55528", "N55610"]

In [51]:
#news for which we don't have the mapping for
invalid_list = [item for item in survey_news if item not in all_news_to_id]

In [52]:
print(invalid_list)

['N63975', 'N28867', 'N57018', 'N46481', 'N22605']


In [53]:
survey_news_ids = [all_news_to_id[item] for item in survey_news if item in all_news_to_id]

In [54]:
# Create a list of letters A to O
users = [chr(i) for i in range(ord('A'), ord('O')+1)]

# Create a DataFrame with 'user' column filled with letters A to O and 'clicked_news' with empty lists
survey_interactions = pd.DataFrame({
    'user': users,
    'item': [[] for _ in users]  # Create an empty list for each user
})


In [55]:
survey_interactions.at[0, 'item'].extend(["N7857", "N33584", "N50566", "N47845", "N39173", "N54950", "N11200", "N22605", "N52620"])
survey_interactions.at[1, 'item'].extend(["N55468", "N61914", "N50566", "N20212", "N37038", "N39173", "N54950", "N35703", "N50299", "N61837", "N11200", "N10470", "N46481", "N27435", "N1587", "N22605", "N10886", "N52386", "N30867", "N55528"])
survey_interactions.at[2, 'item'].extend(["N55468", "N8091", "N7857", "N32907", "N33584", "N50566", "N47845", "N20212", "N37038", "N39173", "N54950", "N22605", "N10886"])
survey_interactions.at[3, 'item'].extend(["N8091","N7857","N50566","N47845","N20212","N37038","N39173","N54950","N28867","N62996","N35703","N9680","N50299","N61837","N59295","N48320","N46481","N39125","N24808","N22605","N10886","N52386","N52620","N29552","N55528","N55610"])
survey_interactions.at[4, 'item'].extend(["N61914","N47845","N20212","N39173","N28867","N62996","N35703","N61837","N13113","N48320","N11200","N46481","N39125","N24808","N10886","N52386","N47140","N29552","N55528"])
survey_interactions.at[5, 'item'].extend(["N55468","N8091","N7857","N47845","N20212","N35703","N9680","N59295","N48320","N49981","N10470","N24808","N1587","N22605","N10886","N30867"])
survey_interactions.at[6, 'item'].extend(["N55468","N7857","N47845","N62996","N9680","N11200","N10886","N47140","N29552"])
survey_interactions.at[7, 'item'].extend(["N55468","N8091","N7857","N33584","N50566","N47845","N39173","N59295","N11200","N39125","N22605","N10886","N30867","N29552"])
survey_interactions.at[8, 'item'].extend(["N55468","N61914","N50566","N47845","N20212","N37038","N39173","N54950","N62996","N35703","N61837","N59295","N57018","N39125","N24808","N1587","N52620","N29552"])
survey_interactions.at[9, 'item'].extend(["N8091","N7857","N33584","N61837","N57018","N10470","N1587","N10886","N52620","N55528","N55610"])
survey_interactions.at[10, 'item'].extend(["N8091","N61914","N7857","N33584","N47845","N37038","N24808","N10886","N52620","N30867"])
survey_interactions.at[11, 'item'].extend(["N61914","N33584","N20212","N37038","N28867","N57018","N49981","N11200","N10470","N27435","N39125","N24808","N10886","N52386","N52620","N30867","N47140","N29552"])
survey_interactions.at[12, 'item'].extend(["N7857","N32907","N50566","N47845","N62996","N35703","N39125","N24808"])
survey_interactions.at[13, 'item'].extend(["N55468","N8091","N50566","N20212","N62996","N35703","N59295","N57018","N10470","N39125","N22605","N10886","N52620","N30867"])
survey_interactions.at[14, 'item'].extend(["N39173","N54950","N59295","N13113","N10886"])

In [56]:
survey_interactions['item'] = survey_interactions['item'].apply(lambda x: [item for item in x if item not in invalid_list])

In [57]:
survey_interactions

Unnamed: 0,user,item
0,A,"[N7857, N33584, N50566, N47845, N39173, N54950..."
1,B,"[N55468, N61914, N50566, N20212, N37038, N3917..."
2,C,"[N55468, N8091, N7857, N32907, N33584, N50566,..."
3,D,"[N8091, N7857, N50566, N47845, N20212, N37038,..."
4,E,"[N61914, N47845, N20212, N39173, N62996, N3570..."
5,F,"[N55468, N8091, N7857, N47845, N20212, N35703,..."
6,G,"[N55468, N7857, N47845, N62996, N9680, N11200,..."
7,H,"[N55468, N8091, N7857, N33584, N50566, N47845,..."
8,I,"[N55468, N61914, N50566, N47845, N20212, N3703..."
9,J,"[N8091, N7857, N33584, N61837, N10470, N1587, ..."


In [58]:
# Flattening the clicked news and associating with user_id, that means we divide the lists into one row for each clicked news
survey_interactions = survey_interactions.explode('item')[['user', 'item']]

In [59]:
survey_interactions

Unnamed: 0,user,item
0,A,N7857
0,A,N33584
0,A,N50566
0,A,N47845
0,A,N39173
...,...,...
14,O,N39173
14,O,N54950
14,O,N59295
14,O,N13113


In [60]:
print(f"Number of unique users:        {survey_interactions['user'].nunique()} \n")
print(f"Number of unique clicked news: {survey_interactions['item'].nunique()}")

Number of unique users:        15 

Number of unique clicked news: 35


In [61]:
#concat new user data
clicked_news.columns = ['user', 'item']
survey_interactions = pd.concat([clicked_news, survey_interactions], ignore_index=True, axis = 0)
# Create categorical types without encoding them yet
survey_interactions['user_id_cat'] = survey_interactions['user'].astype("category")
survey_interactions['news_id_cat'] = survey_interactions['item'].astype("category")

# Creating mappings from original IDs to encoded IDs
id_to_user = dict(enumerate(survey_interactions['user_id_cat'].cat.categories))
id_to_news = dict(enumerate(survey_interactions['news_id_cat'].cat.categories))

# Convert categories to codes (integer encoding)
survey_interactions_encoded = pd.DataFrame(columns=['user', 'item'])
survey_interactions_encoded['user'] = survey_interactions['user_id_cat'].cat.codes
survey_interactions_encoded['item'] = survey_interactions['news_id_cat'].cat.codes

# Drop the additional categorical columns if they are not needed
survey_interactions = survey_interactions.drop(columns=['user_id_cat', 'news_id_cat'])

# Creating reverse mappings from original IDs to encoded IDs
user_to_id = {v: k for k, v in id_to_user.items()}
news_to_id = {v: k for k, v in id_to_news.items()}

# Create a sparse user-item interaction matrix
interaction_matrix = coo_matrix((np.ones(survey_interactions_encoded.shape[0]),
                                 (survey_interactions_encoded['user'], survey_interactions_encoded['item'])))

print(f"users: {interaction_matrix.shape[0]} \nitems: {interaction_matrix.shape[1]}")

#transform into compressed sparse row
interaction_matrix_csr = interaction_matrix.tocsr()

survey_interactions_encoded['rating'] = np.ones(len(survey_interactions_encoded))

print(survey_interactions_encoded)

users: 15433 
items: 6556
         user  item  rating
0       14904   760     1.0
1        1818  1195     1.0
2        1818  2576     1.0
3        7903  4833     1.0
4       14527  5728     1.0
...       ...   ...     ...
155631     14  3230     1.0
155632     14  4978     1.0
155633     14  5454     1.0
155634     14   355     1.0
155635     14    98     1.0

[155636 rows x 3 columns]


In [62]:
user_id = 'B'
recommended_news_indices = recommend_news('A', interaction_matrix_csr, news_similarity, user_to_id, top_n=10)

In [63]:
users = list('ABCDEFGHIJKLMNO')

In [64]:
recommendations = {}


# Iterate through all users and print individual explanations
for user in users:
    recommended_news_indices, similarity_scores = recommend_news(user, interaction_matrix_csr, news_similarity, user_to_id, top_n=5)
    print(explain_individual(user, recommended_news_indices, interaction_matrix_csr, similarity_scores))
    recommendations[user] = recommended_news_indices

A: You previously watched these articles:
[ 124 2573 3230 4177 4499]
Based off of your click history and similar content, we recommend these articles:
['N51923', 'N35778', 'N62171', 'N63004', 'N13589'].
These are their respective similarity scores:
[0.5798973804199299, 0.5538830162498715, 0.525525562409543, 0.5217986832414446, 0.5129110803151521]
B: You previously watched these articles:
[  54   98  124  627 1086]
Based off of your click history and similar content, we recommend these articles:
['N10617', 'N732', 'N64778', 'N723', 'N20167'].
These are their respective similarity scores:
[0.9901153258164977, 0.8059423130067741, 0.7685992367111084, 0.7581549423846585, 0.7543851863944394]
C: You previously watched these articles:
[  98 1086 2505 2573 2982]
Based off of your click history and similar content, we recommend these articles:
['N14598', 'N55628', 'N37062', 'N9470', 'N58055'].
These are their respective similarity scores:
[0.7922353633086818, 0.7434540610570919, 0.70385598537113

In [65]:
recommendations_df = pd.DataFrame.from_dict(recommendations, orient='index')
recommendations_df

Unnamed: 0,0,1,2,3,4
A,N51923,N35778,N62171,N63004,N13589
B,N10617,N732,N64778,N723,N20167
C,N14598,N55628,N37062,N9470,N58055
D,N57715,N37062,N21212,N40099,N22556
E,N39961,N36954,N57715,N53891,N22556
F,N9800,N9261,N20536,N57796,N37062
G,N63406,N48316,N40683,N13589,N9470
H,N26457,N22556,N55628,N39961,N57715
I,N3081,N39961,N57715,N22556,N40099
J,N12352,N4244,N732,N56687,N20306


In [66]:
filtered_news = pd.DataFrame(columns=news.columns)
for recommended_indices_arr in recommendations_df[0]:
    for recommended_index in recommended_indices_arr:
        recommended_item_row = news.loc[news['news_id'] == recommended_index]
        filtered_news = pd.concat([filtered_news, recommended_item_row])
filtered_news

Unnamed: 0,index,news_id,category,subcategory,title,abstract,url,title_entities,abstract_entities


## Grouping

In [67]:
survey_groups = {
    'Group 1': ['A', 'C', 'E', 'G', 'I'],
    'Group 2': ['B', 'D', 'F', 'H', 'J'],
    'Group 3': ['K', 'L', 'O', 'A', 'B'],
    'Group 4': ['M', 'N', 'K', 'F', 'I']
}

In [68]:
all_clicks = survey_interactions_encoded.reset_index()

In [69]:
all_clicks

Unnamed: 0,index,user,item,rating
0,0,14904,760,1.0
1,1,1818,1195,1.0
2,2,1818,2576,1.0
3,3,7903,4833,1.0
4,4,14527,5728,1.0
...,...,...,...,...
155631,155631,14,3230,1.0
155632,155632,14,4978,1.0
155633,155633,14,5454,1.0
155634,155634,14,355,1.0


In [70]:
# Create an implicit feedback model with 50 features and 20 iterations
model = ImplicitMF(features=50, iterations=20, reg=0.1)

# Train the model
model.fit(all_clicks)

<lenskit.algorithms.als.ImplicitMF at 0x143e37d51c0>

In [71]:
all_items = survey_interactions_encoded.item.unique().tolist()
survey_groups_recommendations = {}
explanations = []

In [72]:
for group_name, group in survey_groups.items():
    top_10_group_recs = group_recommendations(group, model, all_items, all_clicks)
    explanations.append(explain_group_recs(group, news, top_10_group_recs.index))
    survey_groups_recommendations[group_name] = top_10_group_recs

In [73]:
for group_name, recommendations in survey_groups_recommendations.items():
    print(group_name)
    print(recommendations)

Group 1
N32691    4.916315
N43502    4.841764
N59267    4.436607
N43083    3.855361
N14184    3.845436
N39010    3.824397
N52723    3.753384
N8957     3.626516
N23814    3.613584
N64851    3.579932
dtype: float64
Group 2
N32691    5.356681
N43502    5.262572
N59267    4.837103
N43083    4.188864
N14184    4.169224
N39010    4.150050
N52723    4.093013
N23814    3.963080
N8957     3.931205
N64851    3.897182
dtype: float64
Group 3
N32691    4.653915
N43502    4.578521
N59267    4.200742
N43083    3.638959
N14184    3.621068
N39010    3.600225
N52723    3.550620
N23814    3.428664
N8957     3.418934
N64851    3.387455
dtype: float64
Group 4
N32691    4.918325
N43502    4.836203
N59267    4.438738
N43083    3.845624
N14184    3.828534
N39010    3.806192
N52723    3.754688
N23814    3.627754
N8957     3.611449
N64851    3.578092
dtype: float64


In [74]:
for explanation in explanations:
    pretty_string = json.dumps(explanation, indent=0)
    print(f"{pretty_string}\n\n")

{
"N32691": "Recommended because users ['A', 'C', 'E', 'G', 'I'] showed a high average preference score for lifestylebeauty.",
"N43502": "Recommended because users ['A', 'C', 'E', 'G', 'I'] showed a high average preference score for newspolitics.",
"N59267": "Recommended because users ['A', 'C', 'E', 'G', 'I'] showed a high average preference score for music-celebrity.",
"N43083": "Recommended because users ['A', 'C', 'E', 'G', 'I'] showed a high average preference score for lifestylehomeandgarden.",
"N14184": "Recommended because users ['A', 'C', 'E', 'G', 'I'] showed a high average preference score for newsus.",
"N39010": "Recommended because users ['A', 'C', 'E', 'G', 'I'] showed a high average preference score for newsus.",
"N52723": "Recommended because users ['A', 'C', 'E', 'G', 'I'] showed a high average preference score for newsus.",
"N8957": "Recommended because users ['A', 'C', 'E', 'G', 'I'] showed a high average preference score for cma-awards.",
"N23814": "Recommended beca

In [75]:
filtered_news_groups = pd.DataFrame(columns=news.columns)
keys = []
print(recommendations)
for group_name, recommendations in survey_groups_recommendations.items():

    for key, value in recommendations.items():
        #print(key)
        keys.append(key)
for key in keys:
    recommended_item_row = news[news['news_id'] == key]
    filtered_news_groups = pd.concat([filtered_news_groups, recommended_item_row])

filtered_news_groups

N32691    4.918325
N43502    4.836203
N59267    4.438738
N43083    3.845624
N14184    3.828534
N39010    3.806192
N52723    3.754688
N23814    3.627754
N8957     3.611449
N64851    3.578092
dtype: float64


Unnamed: 0,index,news_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
41591,43766,N32691,lifestyle,lifestylebeauty,"When You See Paris Jackson's Wheat-Blond Hair,...",Paris Jackson stepped out on Nov.,https://assets.msn.com/labs/mind/BBWtvGv.html,"[{""Label"": ""Blond"", ""Type"": ""C"", ""WikidataId"":...",[]
48319,50960,N43502,news,newspolitics,Donald Trump Jr walks out of Triggered book la...,Event at University of California is cut short...,https://assets.msn.com/labs/mind/BBWyrYC.html,"[{""Label"": ""Donald Trump Jr."", ""Type"": ""P"", ""W...","[{""Label"": ""University of California"", ""Type"":..."
39973,42049,N59267,music,music-celebrity,Celine Dion Shares What She Misses Most About ...,"'Rene will always be with me, but I'm not in p...",https://assets.msn.com/labs/mind/BBWL8jB.html,"[{""Label"": ""Celine Dion"", ""Type"": ""P"", ""Wikida...","[{""Label"": ""Ren\u00e9 Ang\u00e9lil"", ""Type"": ""..."
20938,21975,N43083,lifestyle,lifestylehomeandgarden,What It Was Like Inside the Homes of the Pilgrims,There's a lot of folklore surrounding the firs...,https://assets.msn.com/labs/mind/AAJOJle.html,"[{""Label"": ""Pilgrims (Plymouth Colony)"", ""Type...","[{""Label"": ""Pilgrims (Plymouth Colony)"", ""Type..."
38481,40456,N14184,news,newsus,Man cuffed for eating sandwich on train platfo...,Bay Area Rapid Transit was slammed for its han...,https://assets.msn.com/labs/mind/BBWBDWu.html,[],"[{""Label"": ""Bay Area Rapid Transit"", ""Type"": ""..."
34604,36332,N39010,news,newsus,Ruth Bader Ginsburg misses court due to illness,Supreme Court Justice Ruth Bader Ginsburg was ...,https://assets.msn.com/labs/mind/BBWHJti.html,"[{""Label"": ""Ruth Bader Ginsburg"", ""Type"": ""P"",...","[{""Label"": ""Ruth Bader Ginsburg"", ""Type"": ""P"",..."
30966,32479,N52723,news,newsus,"Cat ""repeatedly"" breaks his friends out of ani...",A mischievous cat named Quilty has earned inte...,https://assets.msn.com/labs/mind/BBWGeR0.html,[],"[{""Label"": ""Texas"", ""Type"": ""G"", ""WikidataId"":..."
30318,31795,N8957,music,cma-awards,Best country music albums of all time,Stacker has compiled a list of the best countr...,https://assets.msn.com/labs/mind/BBVxt0T.html,"[{""Label"": ""Country music"", ""Type"": ""B"", ""Wiki...","[{""Label"": ""Stac Electronics"", ""Type"": ""O"", ""W..."
5523,5766,N23814,lifestyle,voices,I moved from the US to the UK. Here are the 8 ...,I've found that living in England comes with a...,https://assets.msn.com/labs/mind/AAHr37p.html,"[{""Label"": ""United States"", ""Type"": ""G"", ""Wiki...","[{""Label"": ""England"", ""Type"": ""G"", ""WikidataId..."
39684,41741,N64851,news,newsus,Former U.S. President Carter recovering in hos...,Former U.S. President Jimmy Carter was recover...,https://assets.msn.com/labs/mind/BBWEoZ6.html,"[{""Label"": ""Jimmy Carter"", ""Type"": ""P"", ""Wikid...","[{""Label"": ""Jimmy Carter"", ""Type"": ""P"", ""Wikid..."
