In [16]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from lenskit.algorithms.user_knn import UserUser

from lenskit import batch, topn, util
from lenskit import crossfold as xf
from lenskit.algorithms import Recommender, Predictor
from lenskit.algorithms.item_knn import ItemItem
from lenskit.algorithms.basic import Bias
from lenskit.metrics.predict import rmse
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
# Load data
behaviors = pd.read_csv('./small_training_data/behaviors.tsv', delimiter='\t', header=None)
news = pd.read_csv('./small_training_data/news.tsv', delimiter='\t', header=None)

# Naming columns
behaviors.columns = ["impression_id", "user_id", "time", "history", "impressions"]
news.columns = ["news_id", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"]

# Remove NaN values in the 'abstract' column
news = news.dropna(subset=['abstract'])

# Extract the clicked news of every impression as a list per row.
# Each whitespace-separated token is "<news_id>-<label>" and label '1' marks a click.
# rpartition splits once, on the last '-', so a token is parsed a single time and a
# token without a '-' is simply skipped instead of raising IndexError.
behaviors['clicked_news'] = behaviors['impressions'].apply(
    lambda x: [
        news_id
        for news_id, _, label in (imp.rpartition('-') for imp in x.split())
        if label == '1'
    ]
)

# Flatten the lists: one row per (user, clicked news); impressions without clicks become NaN and are dropped
clicked_news = behaviors.explode('clicked_news')[['user_id', 'clicked_news']].dropna()

# Remove clicked news that were removed from the news DataFrame
valid_news_ids = set(news['news_id'])
clicked_news = clicked_news[clicked_news['clicked_news'].isin(valid_news_ids)].copy()

# Encoding user_id and news_id as categorical variables for memory and computation efficiency
clicked_news['user_id'] = clicked_news['user_id'].astype("category")
clicked_news['clicked_news'] = clicked_news['clicked_news'].astype("category")

print(f"Total number of news items: {news.shape[0]}") #48616 unique news
print(f"Number of unique clicked news: {clicked_news['clicked_news'].nunique()}") #7307 unique news have been clicked
print(f"Number of unique users: {clicked_news['user_id'].nunique()}")

Total number of news items: 48616
Number of unique clicked news: 7307
Number of unique users: 49445


In [18]:
# Cast both ID columns ('user_id' and 'clicked_news') to plain strings, then
# rename 'clicked_news' to 'news_id'
clicked_news = (
    clicked_news
    .astype({'user_id': str, 'clicked_news': str})
    .rename(columns={'clicked_news': 'news_id'})
)

In [19]:
# Ensure 'user_id' and 'news_id' are categorical and encode them as integer codes
clicked_news['user_id'] = clicked_news['user_id'].astype("category").cat.codes
clicked_news['news_id'] = clicked_news['news_id'].astype("category").cat.codes

In [20]:
# Creating mappings from encoded IDs to original IDs
id_to_user = dict(enumerate(clicked_news['user_id'].astype("category").cat.categories))
id_to_news = dict(enumerate(clicked_news['news_id'].astype("category").cat.categories))

# Creating reverse mappings from original IDs to encoded IDs
user_to_id = {v: k for k, v in id_to_user.items()}
news_to_id = {v: k for k, v in id_to_news.items()}

In [30]:
# Spot-check the mapping: print the value stored in id_to_user under encoded user ID 25103
print(id_to_user[25103])

25103


In [21]:
# Create a sparse user-item interaction matrix
interaction_matrix = coo_matrix((np.ones(clicked_news.shape[0]),
                                 (clicked_news['user_id'], clicked_news['news_id'])))

print(f"users: {interaction_matrix.shape[0]} \nitems: {interaction_matrix.shape[1]}")

users: 49445 
items: 7307


In [22]:
# Convert the COO interaction matrix to CSR format
interaction_matrix_csr = interaction_matrix.tocsr()

In [23]:
# Copy of the click table with the 'user' / 'item' column names, both cast to int
clicked_news_lenskit = (
    clicked_news
    .rename(columns={'user_id': 'user', 'news_id': 'item'})
    .astype({'user': int, 'item': int})
)

In [24]:
# Boolean mask marking every (user, item) pair that repeats an earlier row
duplicates = clicked_news_lenskit.duplicated(['user', 'item'])
print(f"Number of duplicate entries: {duplicates.sum()}")

Number of duplicate entries: 1806


In [25]:
# Every row gets the same rating of 1.0 (float column)
clicked_news_lenskit['rating'] = 1.0

In [26]:
# Step 1: Train User-User Collaborative Filtering Model
user_user = UserUser(15, min_nbrs=3)  # 15 neighbors, minimum 3 neighbors for prediction
user_user.fit(clicked_news_lenskit)

<lenskit.algorithms.user_knn.UserUser at 0x1f6fa58d370>

In [29]:
# Number of click rows recorded for every encoded user ID
user_click_counts = (
    clicked_news
    .groupby('user_id')
    .size()
    .reset_index(name='num_clicks')
)

# The ten users with the most clicks, with the encoded ID translated through id_to_user
top_users = (
    user_click_counts
    .sort_values(by='num_clicks', ascending=False)
    .head(10)
    .assign(real_user_id=lambda frame: frame['user_id'].map(id_to_user))
)

# Display the top 10 users with real user IDs
print(top_users[['real_user_id', 'num_clicks']])


       real_user_id  num_clicks
25103         25103         125
35240         35240         118
31128         31128         109
6392           6392          95
13091         13091          94
40439         40439          87
36417         36417          83
40589         40589          82
17935         17935          82
21478         21478          76


In [None]:
# TF-IDF vectorizer with English stop words removed and at most 1000 features
vectorizer = TfidfVectorizer(
    max_features=1000,  # or another number that suits your data
    stop_words='english',
)

# Fit on the abstracts and build the item profiles: one row per row of `news`, in the same order.
# NOTE(review): row positions follow `news`, whereas clicked_news['news_id'] holds integer
# codes over the clicked news only - confirm the two index spaces agree before indexing
# item_profiles with those codes.
item_profiles = vectorizer.fit_transform(news['abstract'])

In [12]:
def get_clicked_news(real_user_id, clicked_news, user_to_id=None):
    """Return the internal news IDs clicked by one user.

    Parameters
    ----------
    real_user_id
        Original (non-encoded) user ID; must be a key of ``user_to_id``.
    clicked_news : pandas.DataFrame
        Click table with the columns 'user_id' and 'news_id' (internal IDs).
    user_to_id : dict, optional
        Mapping from original user ID to internal user ID. When omitted, the
        notebook-level ``user_to_id`` mapping is used, as before.

    Returns
    -------
    list
        The 'news_id' values of all rows belonging to the user, in table order
        (repeated clicks are kept).

    Raises
    ------
    KeyError
        If ``real_user_id`` is not present in the mapping.
    """
    if user_to_id is None:
        # The parameter shadows the notebook-level name, so fetch the global explicitly
        user_to_id = globals()['user_to_id']

    # Convert real user ID to internal user ID
    user_id = user_to_id[real_user_id]
    return clicked_news.loc[clicked_news['user_id'] == user_id, 'news_id'].tolist()


In [13]:
from sklearn.metrics.pairwise import cosine_similarity

def compute_partial_similarity(clicked_item_profiles, all_item_profiles):
    """Cosine similarity between the clicked item profiles and all item profiles.

    Returns the result of ``cosine_similarity(clicked_item_profiles, all_item_profiles)``:
    one row per clicked profile and one column per profile in ``all_item_profiles``.
    """
    return cosine_similarity(clicked_item_profiles, all_item_profiles)

In [14]:
def get_top_n_similar(news_id, similarity_matrix, n=10):
    """Return the column indices of the n most similar items for one matrix row.

    ``news_id`` selects the row of ``similarity_matrix``. Columns are ordered by
    descending similarity and the single highest-scoring column is left out, so
    the result starts with the second-best match and has at most ``n`` entries.
    """
    ascending = np.argsort(similarity_matrix[news_id, :])
    descending = ascending[::-1]
    # Skip position 0 (the best match) and keep the following n columns
    return descending[1:n + 1]

In [15]:
def recommend_for_user(real_user_id, clicked_news, item_profiles, user_to_id, id_to_news, n=10):
    """Recommend up to ``n`` items that are content-similar to the user's clicked items.

    Parameters
    ----------
    real_user_id
        Original user ID; must be a key of ``user_to_id``.
    clicked_news : pandas.DataFrame
        Click table with the columns 'user_id' and 'news_id' (internal IDs).
    item_profiles
        Item profile matrix, indexed by row with the internal news IDs.
        NOTE(review): this assumes row i of ``item_profiles`` is the profile of the
        item whose internal ID is i and that ``id_to_news`` has a key for every
        row - confirm against the cell that builds ``item_profiles``.
    user_to_id : dict
        Mapping from original user ID to internal user ID.
    id_to_news : dict
        Mapping from internal news ID to original news ID.
    n : int
        Number of neighbors fetched per clicked item and maximum number of
        recommendations returned.

    Returns
    -------
    list
        Original news IDs, best match first; empty when the user has no clicks.

    Raises
    ------
    KeyError
        If the user is unknown or a recommended row has no entry in ``id_to_news``.
    """
    # Look the user up with the mapping that was passed in
    user_id = user_to_id[real_user_id]
    user_clicks = clicked_news.loc[clicked_news['user_id'] == user_id, 'news_id']

    # Repeated clicks on one item add nothing; keep each item once, in first-seen order
    clicked_items = list(dict.fromkeys(user_clicks.tolist()))
    if not clicked_items or n <= 0:
        return []

    # Extract item profiles for clicked items and compare them with every item
    clicked_item_profiles = item_profiles[clicked_items, :]
    similarity_matrix = compute_partial_similarity(clicked_item_profiles, item_profiles)

    # For every candidate keep its best similarity to any clicked item, so the final
    # list can be ranked instead of being cut from an unordered set
    already_clicked = set(clicked_items)
    best_score = {}
    for row in range(len(clicked_items)):
        for candidate in get_top_n_similar(row, similarity_matrix, n):
            candidate = int(candidate)
            # Remove items that the user has already clicked on
            if candidate in already_clicked:
                continue
            score = float(similarity_matrix[row, candidate])
            if score > best_score.get(candidate, float('-inf')):
                best_score[candidate] = score

    # Highest similarity first; the internal ID breaks ties so the order is deterministic
    ranked = sorted(best_score, key=lambda item_id: (-best_score[item_id], item_id))

    # Convert internal item IDs to real item IDs
    return [id_to_news[item_id] for item_id in ranked[:n]]

In [None]:
# Example run: recommend for the user with the most clicks (first row of top_users)
user_id = top_users['real_user_id'].iloc[0]
n_recommendations = 10

# Pass both ID mappings explicitly and give n by keyword so that every argument
# lands in the parameter recommend_for_user expects
recommended_items = recommend_for_user(
    user_id,
    clicked_news,
    item_profiles,
    user_to_id,
    id_to_news,
    n=n_recommendations,
)

print(f"Recommended items for user {user_id}: {recommended_items}")