In [None]:
pip install lenskit

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from lenskit.algorithms.user_knn import UserUser

from lenskit import batch, topn, util
from lenskit import crossfold as xf
from lenskit.algorithms import Recommender, Predictor
from lenskit.algorithms.item_knn import ItemItem
from lenskit.algorithms.basic import Bias
from lenskit.metrics.predict import rmse
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load data (tab-separated files without a header row)
behaviors = pd.read_csv('behaviors.tsv', delimiter='\t', header=None)
news = pd.read_csv('news.tsv', delimiter='\t', header=None)

# Naming columns
behaviors.columns = ["impression_id", "user_id", "time", "history", "impressions"]
news.columns = ["news_id", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"]

# Remove NaN values in the 'abstract' column
news = news.dropna(subset=['abstract'])

# Extract the clicked news of each impression: every token looks like
# "<news_id>-<label>" and label "1" marks a click. Testing the "-1" suffix
# (instead of indexing the result of split('-')) keeps unlabeled tokens and
# missing impression strings from raising; they simply yield no clicks.
behaviors['clicked_news'] = behaviors['impressions'].fillna('').apply(
    lambda impressions: [imp[:-2] for imp in impressions.split() if imp.endswith('-1')]
)

# Flatten the click lists into one (user_id, clicked_news) row per click;
# impressions without any click explode to NaN and are dropped.
clicked_news = behaviors.explode('clicked_news')[['user_id', 'clicked_news']].dropna()

# Keep only clicks on news that survived the 'abstract' filter above
valid_news_ids = set(news['news_id'])
clicked_news = clicked_news[clicked_news['clicked_news'].isin(valid_news_ids)].copy()

# Encoding user_id and news_id as categorical variables for memory and computation efficiency
clicked_news['user_id'] = clicked_news['user_id'].astype("category")
clicked_news['clicked_news'] = clicked_news['clicked_news'].astype("category")

print(f"Total number of news items: {news.shape[0]}") #48616 unique news
print(f"Number of unique clicked news: {clicked_news['clicked_news'].nunique()}") #7307 unique news have been clicked
print(f"Number of unique users: {clicked_news['user_id'].nunique()}")

In [None]:
# Cast both ID columns to plain strings, then give the clicked-news column
# its final name, 'news_id'.
clicked_news = (
    clicked_news
    .astype({'user_id': str, 'clicked_news': str})
    .rename(columns={'clicked_news': 'news_id'})
)

In [None]:
# Ensure 'user_id' and 'news_id' are categorical and encode them as integer codes
clicked_news['user_id'] = clicked_news['user_id'].astype("category").cat.codes
clicked_news['news_id'] = clicked_news['news_id'].astype("category").cat.codes

In [None]:
# Creating mappings from encoded IDs to original IDs
id_to_user = dict(enumerate(clicked_news['user_id'].astype("category").cat.categories))
id_to_news = dict(enumerate(clicked_news['news_id'].astype("category").cat.categories))

# Creating reverse mappings from original IDs to encoded IDs
user_to_id = {v: k for k, v in id_to_user.items()}
news_to_id = {v: k for k, v in id_to_news.items()}

In [None]:
# Spot check: print whatever id_to_user stores under encoded ID 25103.
# Raises KeyError if that code is not a key of the mapping.
print(id_to_user[25103])

In [None]:
# Sparse user x item click matrix in COO form: one stored value of 1.0 per
# click row, with the encoded user ID as row index and the encoded news ID as
# column index. No explicit shape is given, so SciPy infers it from the indices.
interaction_matrix = coo_matrix(
    (
        np.ones(len(clicked_news)),
        (clicked_news['user_id'], clicked_news['news_id']),
    )
)

print(f"users: {interaction_matrix.shape[0]} \nitems: {interaction_matrix.shape[1]}")

In [None]:
# Convert the COO interaction matrix to CSR format.
# NOTE(review): per the SciPy docs, repeated (row, col) entries are summed
# during this conversion, so cells may hold click counts > 1 — confirm intended.
interaction_matrix_csr = interaction_matrix.tocsr()

In [None]:
# Build the frame handed to LensKit: same rows as clicked_news, with the ID
# columns renamed to 'user' / 'item' and cast to int.
clicked_news_lenskit = (
    clicked_news
    .rename(columns={'user_id': 'user', 'news_id': 'item'})
    .astype({'user': int, 'item': int})
)

In [None]:
# Boolean mask marking every (user, item) row that repeats an earlier row;
# the first occurrence of each pair is not flagged (pandas default keep='first').
# NOTE(review): the duplicates are only counted here, not removed — confirm
# whether repeated clicks should be dropped before the train/test split.
duplicates = clicked_news_lenskit.duplicated(subset=['user', 'item'])
print(f"Number of duplicate entries: {duplicates.sum()}")

In [None]:
# Every click row gets the same float rating of 1.0.
clicked_news_lenskit = clicked_news_lenskit.assign(rating=1.0)

In [None]:
# Preview only the first rows instead of rendering the whole interaction frame.
clicked_news_lenskit.head()

In [None]:
# Replace the current index with a fresh 0..n-1 index; drop=True discards the
# old index instead of keeping it as a column.
clicked_news_lenskit = clicked_news_lenskit.reset_index(drop=True)

In [None]:
from lenskit import batch, topn, util
from lenskit import crossfold as xf
from lenskit.algorithms import als, Recommender

# Fixed seed so the train/test split and the ALS initialisation are the same
# on every Restart & Run All.
SEED = 42

# Set up and train the algorithm
# NOTE(review): every 'rating' in clicked_news_lenskit is 1, so an
# explicit-feedback model such as BiasedMF has little signal to learn from;
# consider als.ImplicitMF for click data — confirm the intended model.
algo = als.BiasedMF(100, rng_spec=SEED)  # 100 latent factors; tune as needed

# A single partition: each user has 20% of their rows sampled into `test`,
# with the remaining rows forming `train`.
train, test = next(
    xf.partition_users(
        clicked_news_lenskit,
        1,
        xf.SampleFrac(0.2, rng_spec=SEED),
        rng_spec=SEED,
    )
)
model = algo.fit(train)  # fit() trains the model

# Per-user top lists produced by the most recent group_recs() call.
all_recommendations = []


def group_recs(group_users, model, train, top_n=10):
    """Recommend items to a group of users with least-misery aggregation.

    Every distinct item in ``train`` is scored for every group member; an
    item's group score is the minimum of its members' scores, and the
    ``top_n`` items with the highest group score are returned.

    Args:
        group_users: iterable of user IDs forming the group.
        model: fitted predictor exposing ``predict_for_user(user, items)``
            that returns a pandas Series of scores indexed by item.
        train: DataFrame with an ``'item'`` column; its unique values are the
            candidate items. Items a member already interacted with are not
            excluded.
        top_n: number of items to return (and length of each per-user list).

    Returns:
        DataFrame with columns ``'item'`` and ``'score'``, sorted by score in
        descending order, holding at most ``top_n`` rows. Empty when
        ``group_users`` is empty.

    Side effects:
        Replaces the contents of the module-level ``all_recommendations`` list
        with one top-``top_n`` DataFrame per group member, so results of
        earlier calls no longer leak into later ones.
    """
    candidate_items = train['item'].unique()

    per_user_scores = []
    per_user_top = []
    for user in group_users:
        # One batched call per user instead of one call per item.
        predictions = model.predict_for_user(user, candidate_items)
        # Items absent from the returned series get 0; NaN scores stay NaN.
        aligned = predictions.reindex(candidate_items, fill_value=0)
        user_scores = pd.DataFrame({'item': candidate_items, 'score': aligned.to_numpy()})
        per_user_scores.append(user_scores)
        per_user_top.append(user_scores.sort_values(by='score', ascending=False).head(top_n))

    # Mutate in place so existing references to the list see the fresh result.
    all_recommendations[:] = per_user_top

    if not per_user_scores:
        return pd.DataFrame(columns=['item', 'score'])

    # Least misery must look at every member's score for an item, not only at
    # the members whose personal top list happens to contain it.
    # (groupby().min() skips NaN scores.)
    score_table = pd.concat(per_user_scores, ignore_index=True)
    least_misery_scores = score_table.groupby('item').score.min().reset_index()
    group_top_recs = least_misery_scores.sort_values(by='score', ascending=False).head(top_n)
    return group_top_recs