In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import coo_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from lenskit.algorithms.user_knn import UserUser

from lenskit.batch import predict
from lenskit import batch, topn, util
from lenskit import crossfold as xf
from lenskit.algorithms import Recommender, Predictor
from lenskit.algorithms.item_knn import ItemItem
from lenskit.algorithms.basic import Bias
from lenskit.metrics.predict import rmse
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load data
behaviors = pd.read_csv('./small_training_data/behaviors.tsv', delimiter='\t', header=None)
news = pd.read_csv('./small_training_data/news.tsv', delimiter='\t', header=None)

# Naming columns
behaviors.columns = ["impression_id", "user_id", "time", "history", "impressions"]
news.columns = ["news_id", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"]

In [3]:
# Remove NaN values in the 'abstract' column
news = news.dropna(subset=['abstract'])

# list of valid news (with some abstract)
valid_news_ids = set(news['news_id'])

In [4]:
# Extracting clicked news from behaviors, this is a column of lists of the clicked news (tagget with 1) for each impression
behaviors['clicked_news'] = behaviors['impressions'].apply(lambda x: [imp.split('-')[0] for imp in x.split() if imp.split('-')[1] == '1'])

In [5]:
#removing unused columns
behaviors = behaviors[["impression_id", "user_id", "clicked_news"]]

In [6]:
# Flattening the clicked news and associating with user_id, that means we divide the lists into one row for each clicked news
clicked_news = behaviors.explode('clicked_news')[['user_id', 'clicked_news']]

In [7]:
#remove non valid news from interactions
clicked_news = clicked_news[clicked_news['clicked_news'].isin(valid_news_ids)]

In [8]:
# Filtering users with more than 4 news clicked
clicked_news = clicked_news.groupby('user_id').filter(lambda x: len(x) > 10)

In [9]:
# Rename 'clicked_news' column to 'news_id'
clicked_news = clicked_news.rename(columns={'clicked_news': 'news_id'})

In [10]:
print(f"Number of unique clicked news: {clicked_news['news_id'].nunique()}")
print(f"Number of unique users:        {clicked_news['user_id'].nunique()} \n")

Number of unique clicked news: 5213
Number of unique users:        4615 



## in the cell below we create mappings from real id's to indexes, and create clicked_news_encoded

In [11]:
# Create categorical types without encoding them yet
clicked_news['user_id_cat'] = clicked_news['user_id'].astype("category")
clicked_news['news_id_cat'] = clicked_news['news_id'].astype("category")

# Creating mappings from original IDs to encoded IDs
id_to_user = dict(enumerate(clicked_news['user_id_cat'].cat.categories))
id_to_news = dict(enumerate(clicked_news['news_id_cat'].cat.categories))

# Convert categories to codes (integer encoding)
clicked_news_encoded = pd.DataFrame(columns=['user', 'item'])
clicked_news_encoded['user'] = clicked_news['user_id_cat'].cat.codes
clicked_news_encoded['item'] = clicked_news['news_id_cat'].cat.codes

# Drop the additional categorical columns if they are not needed
clicked_news = clicked_news.drop(columns=['user_id_cat', 'news_id_cat'])

# Creating reverse mappings from original IDs to encoded IDs
user_to_id = {v: k for k, v in id_to_user.items()}
news_to_id = {v: k for k, v in id_to_news.items()}

In [12]:
# Create a sparse user-item interaction matrix
interaction_matrix = coo_matrix((np.ones(clicked_news.shape[0]),
                                 (clicked_news_encoded['user'], clicked_news_encoded['item'])))

print(f"users: {interaction_matrix.shape[0]} \nitems: {interaction_matrix.shape[1]}")

users: 4615 
items: 5213


In [13]:
#transform into compressed sparse row
interaction_matrix_csr = interaction_matrix.tocsr()

In [14]:
clicked_news_encoded['rating'] = np.ones(len(clicked_news_encoded))

In [15]:
print(clicked_news_encoded)

        user  item  rating
1       4459   589     1.0
5        516   942     1.0
5        516  2041     1.0
10      4340  4557     1.0
10      4340  1172     1.0
...      ...   ...     ...
156963  1854   997     1.0
156963  1854  3515     1.0
156963  1854  2314     1.0
156963  1854   715     1.0
156963  1854   893     1.0

[82698 rows x 3 columns]


In [16]:
# Train User-User Collaborative Filtering Model
user_user = UserUser(15, min_nbrs= 3)  # 15 neighbors, minimum 3 neighbors for prediction
user_user.fit(clicked_news_encoded)

could not load LIBBLAS: Could not find module 'libblas' (or one of its dependencies). Try using the full path with constructor syntax.


<lenskit.algorithms.user_knn.UserUser at 0x15c3b841b80>

In [17]:
def recommend_user_user (user_id, user_user = user_user, clicked_news = clicked_news_encoded, n=10):
    """
    Recommend top-N items for a user using the UserUser collaborative filtering model.
    
    Parameters:
    - user_user: Trained UserUser collaborative filtering model.
    - user_id: The internal ID of the user for whom to generate recommendations.
    - all_item_ids: List of all possible item IDs to consider for recommendation.
    - n: Number of recommendations to generate.
    
    Returns:
    A DataFrame containing the top-N recommended items and their predicted ratings.
    """
    
    all_item_ids = clicked_news['item'].unique()
    
    user_item_df = pd.DataFrame({
    'user': [user_id] * len(all_item_ids),
    'item': all_item_ids
    })
    
    # Predict ratings for all user-item pairs
    all_predictions = predict(user_user, user_item_df)
    
    # Select top-N items
    top_items = all_predictions.nlargest(10, 'prediction')
    
    return top_items

In [18]:
recommended_items_user_user = recommend_user_user('U53220', n = 100)  # requesting more to ensure we have enough after merging

print(f"useruser recommendations\n{recommended_items_user_user}")

useruser recommendations
     user  item  prediction
0  U53220   589         NaN
1  U53220   942         NaN
2  U53220  2041         NaN
3  U53220  4557         NaN
4  U53220  1172         NaN
5  U53220  4039         NaN
6  U53220  5075         NaN
7  U53220   793         NaN
8  U53220  2248         NaN
9  U53220  3983         NaN
