In [24]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from lenskit.algorithms.user_knn import UserUser

from lenskit import batch, topn, util
from lenskit import crossfold as xf
from lenskit.algorithms import Recommender, Predictor
from lenskit.algorithms.item_knn import ItemItem
from lenskit.algorithms.basic import Bias
from lenskit.metrics.predict import rmse
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Load data
behaviors = pd.read_csv('./small_training_data/behaviors.tsv', delimiter='\t', header=None)
news = pd.read_csv('./small_training_data/news.tsv', delimiter='\t', header=None)

# Naming columns
behaviors.columns = ["impression_id", "user_id", "time", "history", "impressions"]
news.columns = ["news_id", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"]

# Remove NaN values in the 'abstract' column
news = news.dropna(subset=['abstract'])

# Extracting clicked news from behaviors, this is a column of lists of the clicked news (tagget with 1) for each impression
behaviors['clicked_news'] = behaviors['impressions'].apply(lambda x: [imp.split('-')[0] for imp in x.split() if imp.split('-')[1] == '1'])

# Flattening the clicked news and associating with user_id, that means we divide the lists into one row for each clicked news
clicked_news = behaviors.explode('clicked_news')[['user_id', 'clicked_news']].dropna()

# Remove clicked news that were removed from the news DataFrame
valid_news_ids = set(news['news_id'])
# Remove clicked news that were removed from the news DataFrame
valid_news_ids = set(news['news_id'])
clicked_news = clicked_news[clicked_news['clicked_news'].isin(valid_news_ids)].copy()

# Encoding user_id and news_id as categorical variables for memory and computation efficiency
clicked_news['user_id'] = clicked_news['user_id'].astype("category")
clicked_news['clicked_news'] = clicked_news['clicked_news'].astype("category")

print(f"Total number of news items: {news.shape[0]}") #48616 unique news
print(f"Number of unique clicked news: {clicked_news['clicked_news'].nunique()}") #7307 unique news have been clicked
print(f"Number of unique users: {clicked_news['user_id'].nunique()}")

Total number of news items: 48616
Number of unique clicked news: 7307
Number of unique users: 49445


In [5]:
# Ensure 'user_id' and 'news_id' are strings
clicked_news['user_id'] = clicked_news['user_id'].astype(str)
clicked_news['clicked_news'] = clicked_news['clicked_news'].astype(str)

# Rename 'clicked_news' column to 'news_id'
clicked_news = clicked_news.rename(columns={'clicked_news': 'news_id'})

In [6]:
# Ensure 'user_id' and 'news_id' are categorical and encode them as integer codes
clicked_news['user_id'] = clicked_news['user_id'].astype("category").cat.codes
clicked_news['news_id'] = clicked_news['news_id'].astype("category").cat.codes

In [7]:
# Creating mappings from encoded IDs to original IDs
id_to_user = dict(enumerate(clicked_news['user_id'].astype("category").cat.categories))
id_to_news = dict(enumerate(clicked_news['news_id'].astype("category").cat.categories))

# Creating reverse mappings from original IDs to encoded IDs
user_to_id = {v: k for k, v in id_to_user.items()}
news_to_id = {v: k for k, v in id_to_news.items()}

In [8]:
# Create a sparse user-item interaction matrix
interaction_matrix = coo_matrix((np.ones(clicked_news.shape[0]),
                                 (clicked_news['user_id'], clicked_news['news_id'])))

print(f"users: {interaction_matrix.shape[0]} \nitems: {interaction_matrix.shape[1]}")

users: 49445 
items: 7307


In [9]:
interaction_matrix_csr = interaction_matrix.tocsr()

In [10]:
clicked_news_lenskit = clicked_news.rename(columns={'user_id': 'user', 'news_id': 'item'})

clicked_news_lenskit['user'] = clicked_news_lenskit['user'].astype(int)
clicked_news_lenskit['item'] = clicked_news_lenskit['item'].astype(int)

In [11]:
duplicates = clicked_news_lenskit.duplicated(subset=['user', 'item'])
print(f"Number of duplicate entries: {duplicates.sum()}")

Number of duplicate entries: 1806


In [22]:
clicked_news_lenskit['rating'] = np.ones(len(clicked_news_lenskit))

In [23]:
# Step 1: Train User-User Collaborative Filtering Model
user_user = UserUser(15, min_nbrs=3)  # 15 neighbors, minimum 3 neighbors for prediction
user_user.fit(clicked_news_lenskit)

<lenskit.algorithms.user_knn.UserUser at 0x276b75b6c40>

In [26]:
# Step 2: Train Item-Item Collaborative Filtering Model
item_item = ItemItem(15, min_nbrs=3)  # 15 neighbors, minimum 3 neighbors for prediction
item_item.fit(clicked_news_lenskit)

normalized ratings are zero, centering is not recommended


<lenskit.algorithms.item_knn.ItemItem at 0x276b75b6d60>