In [1]:
from lenskit.algorithms.als import ImplicitMF
import numpy as np
import pandas as pd
from lenskit import batch, topn, util
from lenskit import crossfold as xf

from scipy.sparse import coo_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from lenskit.algorithms.user_knn import UserUser

from lenskit.algorithms import Recommender, Predictor
from lenskit.algorithms.item_knn import ItemItem
from lenskit.algorithms.basic import Bias
from lenskit.metrics.predict import rmse
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the raw impression log and the news catalogue (tab-separated, no header row)
all_behaviors = pd.read_csv('./small_training_data/behaviors.tsv', delimiter='\t', header=None)
all_news = pd.read_csv('./small_training_data/news.tsv', delimiter='\t', header=None)

# Attach column names to both frames
all_behaviors.columns = ["impression_id", "user_id", "time", "history", "impressions"]
all_news.columns = ["news_id", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"]

# Keep only the news rows that have an abstract
all_news = all_news.dropna(subset=['abstract'])


def _clicked_news_ids(impressions):
    """Return the news ids tagged with '1' (clicked) in one whitespace-separated impression string."""
    clicked = []
    for entry in impressions.split():
        parts = entry.split('-')
        if parts[1] == '1':
            clicked.append(parts[0])
    return clicked


# Per impression: the list of news ids that were clicked
all_behaviors['clicked_news'] = all_behaviors['impressions'].apply(_clicked_news_ids)

# One row per (user, clicked item); impressions without any click are dropped
all_clicks = (
    all_behaviors.explode('clicked_news')[['user_id', 'clicked_news']]
    .dropna()
    .rename(columns={'user_id': 'user', 'clicked_news': 'item'})
)
all_clicks['rating'] = 1.0

# Strip the one-character prefix from the ids and store them as integers
for id_column in ('user', 'item'):
    all_clicks[id_column] = all_clicks[id_column].str[1:].astype(int)

all_clicks

Unnamed: 0,user,item,rating
0,13740,55689,1.0
1,91836,17059,1.0
2,73700,23814,1.0
3,34670,49685,1.0
4,8125,8400,1.0
...,...,...,...
156963,44625,50007,1.0
156963,44625,366,1.0
156963,44625,18573,1.0
156963,44625,20630,1.0


In [3]:
# Hyper-parameters of the implicit-feedback matrix factorisation model
N_FEATURES = 50
N_ITERATIONS = 20
REGULARIZATION = 0.1

model = ImplicitMF(features=N_FEATURES, iterations=N_ITERATIONS, reg=REGULARIZATION)

# Fit on the click log (every click carries rating 1.0)
model.fit(all_clicks)

<lenskit.algorithms.als.ImplicitMF at 0x1cf08841550>

In [4]:

# Recommendations for a single user
user = 62470

# Score every distinct item that appears in the click log for this user
candidate_items = all_clicks['item'].unique().tolist()
recs_user = model.predict_for_user(user, candidate_items)

# Items the user has already clicked must not be recommended again
read_items = all_clicks.loc[all_clicks['user'] == user, 'item'].tolist()
already_read = recs_user.index.isin(read_items)
recs_user = recs_user[~already_read]

# Highest predicted scores first; keep the ten best
top_10_items = recs_user.sort_values(ascending=False).head(10)

print(f"Top 10 recommended items for user {user}:")
print(top_10_items)

Top 10 recommended items for user 62470:
23877    1.278271
16148    1.245736
7809     1.217857
10960    1.100213
55689    1.055545
3123     1.047330
41578    0.996690
49712    0.978590
21420    0.963932
10812    0.956730
dtype: float64


In [5]:
# Group recommendations: aggregate the individual predicted scores of all group members

In [6]:
def group_recommendations(user_group, model, all_items, user_item_data, n=10):
    """Recommend items to a group by summing each member's predicted scores.

    Parameters
    ----------
    user_group : iterable
        User ids of the group members.
    model : object
        Fitted model exposing ``predict_for_user(user, items)`` that returns
        a pandas Series of scores indexed by item id.
    all_items : list
        Candidate item ids to score.
    user_item_data : pandas.DataFrame
        Interaction log with at least the columns ``'user'`` and ``'item'``.
    n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Aggregated scores indexed by item id, sorted in descending order and
        truncated to ``n`` entries. Items already read by any group member
        are excluded. The result is empty when no member could be scored.
    """
    members = list(user_group)

    # Items read by at least one member are never recommended to the group.
    # Removing them once at the end also covers the per-user exclusion.
    read_mask = user_item_data['user'].isin(members)
    read_by_group = set(user_item_data.loc[read_mask, 'item'])

    per_user_scores = []
    for user in members:
        scores = model.predict_for_user(user, all_items)
        # NOTE(review): LensKit predictors are expected to return NaN for
        # users/items they cannot score -- confirm for the installed version.
        # A NaN must not enter the sum, otherwise a single unscorable member
        # would turn the aggregated score of every item into NaN.
        scores = scores.dropna()
        if not scores.empty:
            per_user_scores.append(scores)

    if not per_user_scores:
        return pd.Series(dtype=float)

    # Sum the scores per item across members; sort=False keeps the items in
    # order of first appearance.
    aggregated_scores = (
        pd.concat(per_user_scores)
        .groupby(level=0, sort=False)
        .sum()
    )

    aggregated_scores = aggregated_scores.drop(list(read_by_group), errors='ignore')

    return aggregated_scores.sort_values(ascending=False).head(n)

In [7]:
import json

# Load the saved groups (group name -> list of user id strings).
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding.
with open("grouped_dict.json", "r", encoding="utf-8") as f:
    groups_dict = json.load(f)

# Convert the ids of one group to the integer form used in all_clicks:
# drop the first character and parse the rest when it is purely numeric,
# otherwise keep the entry unchanged.
group = groups_dict['Group 6']
group = [int(item[1:]) if item[1:].isdigit() else item for item in group]
group

[69084, 31631, 57214, 21331, 1331, 64554, 42643, 80596]

In [8]:
# Candidate pool: every distinct item that appears in the click log
all_items = all_clicks['item'].unique().tolist()

# Aggregate the members' predicted scores into one ranked list for the group
top_10_group_recs = group_recommendations(
    user_group=group,
    model=model,
    all_items=all_items,
    user_item_data=all_clicks,
)

print("Top 10 recommended items for the group:")
print(top_10_group_recs)

Top 10 recommended items for the group:
12029    4.805334
56211    4.501323
45523    3.842434
28047    3.820224
57132    3.709945
17307    3.692024
63106    3.536385
42515    3.529362
36545    3.426949
7821     3.350823
dtype: float64


In [9]:
def explain_group_recs(group_users, model, user_item_data, news, all_items):
    """Build a textual explanation for each given item based on its subcategory.

    Parameters
    ----------
    group_users : iterable
        User ids of the group members; they are listed in every explanation.
    model : object
        Not used by this function; kept so existing callers keep working.
    user_item_data : pandas.DataFrame
        Not used by this function; kept so existing callers keep working.
    news : pandas.DataFrame
        News catalogue with the columns ``'news_id'`` (ids of the form
        ``'N<number>'``) and ``'subcategory'``.
    all_items : iterable
        Numeric item ids to explain.

    Returns
    -------
    dict
        Maps each item id to its explanation string. Items that are missing
        from ``news`` or have no subcategory get the topic ``"Unknown"``.
    """
    # Build the id -> subcategory lookup once instead of scanning the whole
    # catalogue for every item. On duplicated ids the first row wins.
    topic_by_news_id = (
        news.drop_duplicates(subset='news_id')
        .set_index('news_id')['subcategory']
        .to_dict()
    )

    # The member list is identical for every item, so format it only once.
    user_string = ', '.join(map(str, group_users))

    explanations = {}
    for item in all_items:
        topic = topic_by_news_id.get('N' + str(item))
        # Covers both an id missing from the catalogue (None) and a missing
        # subcategory value (NaN), which would otherwise be printed as "nan".
        if pd.isna(topic):
            topic = "Unknown"

        explanations[item] = f"Recommended because users {user_string} showed a high average preference score for {topic}.\n"

    return explanations


In [None]:
# Example usage: explain only the items that were recommended to the group.
# Passing the whole catalogue (all_items) would build and print an explanation
# for every item, including items that were never recommended.
recommended_items = top_10_group_recs.index.tolist()
explanations = explain_group_recs(group, model, all_clicks, all_news, recommended_items)

for item, explanation in explanations.items():
    # Every explanation already ends with a newline.
    print(f"N{item}: {explanation}", end="")