In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.sparse import coo_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from lenskit.algorithms.user_knn import UserUser
from lenskit.batch import predict
from lenskit import batch, topn, util
from lenskit import crossfold as xf
from lenskit.algorithms import Recommender, Predictor
from lenskit.algorithms.item_knn import ItemItem
from lenskit.algorithms.basic import Bias
from lenskit.metrics.predict import rmse
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load data
behaviors = pd.read_csv('./small_training_data/behaviors.tsv', delimiter='\t', header=None)
news = pd.read_csv('./small_training_data/news.tsv', delimiter='\t', header=None)

# Naming columns
behaviors.columns = ["impression_id", "user_id", "time", "history", "impressions"]
news.columns = ["news_id", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"]

# Remove NaN values in the 'abstract' column
news = news.dropna(subset=['abstract'])

# Extracting clicked news from behaviors, this is a column of lists of the clicked news (tagget with 1) for each impression
behaviors['clicked_news'] = behaviors['impressions'].apply(lambda x: [imp.split('-')[0] for imp in x.split() if imp.split('-')[1] == '1'])

# Flattening the clicked news and associating with user_id, that means we divide the lists into one row for each clicked news
clicked_news = behaviors.explode('clicked_news')[['user_id', 'clicked_news']].dropna()

# Remove clicked news that were removed from the news DataFrame
valid_news_ids = set(news['news_id'])
# Remove clicked news that were removed from the news DataFrame
valid_news_ids = set(news['news_id'])
clicked_news = clicked_news[clicked_news['clicked_news'].isin(valid_news_ids)].copy()

# Encoding user_id and news_id as categorical variables for memory and computation efficiency
clicked_news['user_id'] = clicked_news['user_id'].astype("category")
clicked_news['clicked_news'] = clicked_news['clicked_news'].astype("category")

print(f"Total number of news items: {news.shape[0]}") #48616 unique news
print(f"Number of unique clicked news: {clicked_news['clicked_news'].nunique()}") #7307 unique news have been clicked
print(f"Number of unique users: {clicked_news['user_id'].nunique()}")


Total number of news items: 48616
Number of unique clicked news: 7307
Number of unique users: 49445


In [3]:
# Ensure 'user_id' and 'clicked_news' are strings
clicked_news['user_id'] = clicked_news['user_id'].astype(str)
clicked_news['clicked_news'] = clicked_news['clicked_news'].astype(str)

# Create categorical types without encoding them yet
clicked_news['user_id_cat'] = clicked_news['user_id'].astype("category")
clicked_news['news_id_cat'] = clicked_news['news_id'].astype("category")

# Creating mappings from original IDs to encoded IDs
id_to_user = dict(enumerate(clicked_news['user_id_cat'].cat.categories))
id_to_news = dict(enumerate(clicked_news['news_id_cat'].cat.categories))

# Convert categories to codes (integer encoding)
clicked_news['user_id'] = clicked_news['user_id_cat'].cat.codes
clicked_news['news_id'] = clicked_news['news_id_cat'].cat.codes

# Drop the additional categorical columns if they are not needed
clicked_news = clicked_news.drop(columns=['user_id_cat', 'news_id_cat'])

# Creating reverse mappings from original IDs to encoded IDs
user_to_id = {v: k for k, v in id_to_user.items()}
news_to_id = {v: k for k, v in id_to_news.items()}


KeyError: 'news_id'

In [None]:
# Create a sparse user-item interaction matrix
interaction_matrix = coo_matrix((np.ones(clicked_news.shape[0]),
                                 (clicked_news['user_id'], clicked_news['news_id'])))

print(f"users: {interaction_matrix.shape[0]} \nitems: {interaction_matrix.shape[1]}")

In [None]:
interaction_matrix_csr = interaction_matrix.tocsr()

In [None]:
clicked_news_lenskit = clicked_news.rename(columns={'user_id': 'user', 'news_id': 'item'})

clicked_news_lenskit['user'] = clicked_news_lenskit['user'].astype(int)
clicked_news_lenskit['item'] = clicked_news_lenskit['item'].astype(int)

In [None]:
clicked_news_lenskit['rating'] = np.ones(len(clicked_news_lenskit))

In [None]:
print(clicked_news_lenskit)

In [None]:
# Train User-User Collaborative Filtering Model
user_user = UserUser(5, min_nbrs= 1)  # 15 neighbors, minimum 3 neighbors for prediction
user_user.fit(clicked_news_lenskit)

In [None]:
# Group by 'user_id' and count the number of clicks per user
user_click_counts = clicked_news.groupby('user_id').size().reset_index(name='num_clicks')

# Sort users by the number of clicks in descending order and get the top 10
top_users = user_click_counts.sort_values(by='num_clicks', ascending=False).head(10)

# Map internal user IDs to real user IDs
top_users['real_user_id'] = top_users['user_id'].map(id_to_user)

# Display the top 10 users with real user IDs
print(top_users[['real_user_id', 'num_clicks']])


Item_profiles matrix is in the Compressed Sparse Row (CSR) format, which is a memory-efficient way to store large sparse matrices. In each tuple (i,j): i corresponds to a news item and j corresponds to a specific word in the vocabulary from vectorizer.

The TF-IDF value represents the importance of term j in the abstract of news item i.

In [None]:
# Initialize a TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)  # or another number that suits your data

# Fit and transform the abstracts to create item profiles
item_profiles = vectorizer.fit_transform(news['abstract'])

In [12]:
print(item_profiles[:5])

  (0, 517)	1.0
  (1, 368)	1.0
  (2, 585)	0.5096714647312968
  (2, 929)	0.4937392329610019
  (2, 957)	0.5212708291507341
  (2, 512)	0.47406042921788183
  (3, 596)	0.36665808943657396
  (3, 309)	0.4406935092526457
  (3, 401)	0.33416629676258836
  (3, 261)	0.3629959351817992
  (3, 975)	0.41711744807412415
  (3, 593)	0.4080566359127731
  (3, 510)	0.2956699678737417
  (4, 70)	0.5522490166848188
  (4, 40)	0.4038523720269147
  (4, 688)	0.5213453133241538
  (4, 368)	0.5100228911076689


In [13]:
#let's check what is the word mapped to id
terms = vectorizer.get_feature_names_out()
print(terms[368])

good


In [14]:
# Retrieve and print the abstract of item 0
original_news_id = id_to_news[1]

abstract = news[news['news_id'] == original_news_id]['abstract'].values[0]
print(f"Abstract for news item {original_news_id}:\n\n{abstract}")

Abstract for news item N10051:

From 2023 on, Volkswagen intends to produce 1.4 million of similar electric drive units annually.


 # the above trained item profiles don't really match with the respective abstracts!!!!

In [15]:
def get_clicked_news(real_user_id, clicked_news = clicked_news):
    """
    Retrieve the news item indexes clicked by a specific user.
    
    Parameters:
    - real_user_id (str): The original user ID from the dataset.
    - clicked_news (DataFrame): The DataFrame containing user interactions with news items.
                                 Expected columns are ['user_id', 'news_id'].
    
    Returns:
    - list of str: A list containing the IDs of news items clicked by the specified user.
    
    Example:
    >>> get_clicked_news('U53220', clicked_news)
    [6583, 4676, 6715, ...]
    """
    
    # Convert real user ID to internal user ID
    user_id = user_to_id[real_user_id]
    
    # Retrieve and return the clicked news IDs for the specified user
    return clicked_news[clicked_news['user_id'] == user_id]['news_id'].tolist()

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

def compute_partial_similarity(clicked_item_profiles, all_item_profiles):
    # Compute the similarity between clicked items and all items
    similarity_matrix = cosine_similarity(clicked_item_profiles, all_item_profiles)
    return similarity_matrix

In [17]:
def get_top_n_similar(news_id, similarity_matrix, n=10):
    # Get the top-N similar items for a specific item
    similar_items = np.argsort(similarity_matrix[news_id, :])[-n-1:-1][::-1]
    return similar_items

In [18]:
def recommend_user_user (user_id, user_user = user_user, clicked_news = clicked_news_lenskit, n=10):
    """
    Recommend top-N items for a user using the UserUser collaborative filtering model.
    
    Parameters:
    - user_user: Trained UserUser collaborative filtering model.
    - user_id: The internal ID of the user for whom to generate recommendations.
    - all_item_ids: List of all possible item IDs to consider for recommendation.
    - n: Number of recommendations to generate.
    
    Returns:
    A DataFrame containing the top-N recommended items and their predicted ratings.
    """
    
    all_item_ids = clicked_news['item'].unique()
    
    user_item_df = pd.DataFrame({
    'user': [user_id] * len(all_item_ids),
    'item': all_item_ids
    })
    
    # Predict ratings for all user-item pairs
    all_predictions = predict(user_user, user_item_df)
    
    # Select top-N items
    top_items = all_predictions.nlargest(1000, 'prediction')
    
    return top_items

In [29]:
def recommend_for_user(real_user_id, clicked_news = clicked_news, item_profiles = item_profiles, user_to_id = user_to_id, id_to_news = id_to_news, user_user = user_user, n=10, w_content=0.5, w_user_user=0.5):
    """
    Recommend items for a user based on a hybrid approach combining collaborative filtering and content-based recommendations.
    
    Parameters:
    - real_user_id: The real ID of the user for whom we want to generate recommendations.
    - clicked_news: DataFrame containing user-item interactions.
    - item_profiles: Sparse matrix containing item profiles (TF-IDF values).
    - user_to_id: Dictionary mapping real user IDs to internal user IDs.
    - id_to_news: Dictionary mapping internal item IDs to real item IDs.
    - user_user: Trained UserUser collaborative filtering model.
    - n: Number of recommendations to generate.
    - w_content: Weight for the content-based model in the hybrid recommendation.
    - w_user_user: Weight for the user-user collaborative filtering model in the hybrid recommendation.
    
    Returns:
    A list of n recommended items for the user.
    """
    # Convert real user ID to internal user ID
    user_id = user_to_id[real_user_id]
    
    # Get clicked news by the user
    clicked_items = get_clicked_news(real_user_id)
    
    # Extract item profiles for clicked items
    clicked_item_profiles = item_profiles[clicked_items, :]
    
    # Compute similarity matrix
    similarity_matrix = compute_partial_similarity(clicked_item_profiles, item_profiles)
    
    # Get top-N similar items for each clicked item from content-based model
    recommended_items_content_based = set()
    for idx, item_id in enumerate(clicked_items):
        similar_items = get_top_n_similar(idx, similarity_matrix, n)
        recommended_items_content_based.update(similar_items)
    
    # Remove items that the user has already clicked on
    recommended_items_content_based = recommended_items_content_based - set(clicked_items)
    
    print(f"content-based recommendations \n{recommended_items_content_based}\n\n")
    
    # Get top-N similar items from user-user collaborative filtering model
    recommended_items_user_user = recommend_user_user(user_id, n = n*2)  # requesting more to ensure we have enough after merging
    
    print(f"useruser recommendations\n{recommended_items_user_user}")
    
    # Create a DataFrame to store the scores from both models
    all_recommended_items = pd.DataFrame(list(recommended_items_content_based) + recommended_items_user_user['item'].tolist(), columns=['item'])
    all_recommended_items = all_recommended_items.drop_duplicates().reset_index(drop=True)
    
    # Add scores from content-based model
    all_recommended_items['score_content'] = 0  # initialize with 0
    for item_id in recommended_items_content_based:
        all_recommended_items.loc[all_recommended_items['item'] == item_id, 'score_content'] = 1  # or another score
    
    # Add scores from user-user model
    all_recommended_items = all_recommended_items.merge(recommended_items_user_user, on='item', how='left').fillna(0)
    
    # Compute the hybrid score as a weighted sum of the scores from both models
    all_recommended_items['hybrid_score'] = w_content * all_recommended_items['score_content'] + w_user_user * all_recommended_items['score']
    
    # Select the top-N items based on the hybrid score
    recommended_items = all_recommended_items.nlargest(n, 'hybrid_score')['item'].tolist()
    
    # Convert internal item IDs to real item IDs
    recommended_real_items = [id_to_news[item_id] for item_id in recommended_items]
    
    return recommended_real_items[:n]


In [30]:
recommended_items = recommend_for_user('U53220', n = 5)

print(f"Recommended items for user {user_id}: {recommended_items}")

content-based recommendations 
{1, 20487, 38928, 45072, 38929, 20502, 28697, 28700, 12316, 4124, 47134, 14381, 24624, 8242, 51, 64, 18497, 22599, 4175, 2127, 10323, 83, 26711, 18522, 18523, 18527, 2144, 41056, 43108, 28774, 8296, 6262, 22649, 39038, 22655, 4225, 12417, 32898, 22663, 6280, 28811, 16529, 8345, 22685, 24735, 32928, 28832, 163, 37030, 8358, 18600, 12456, 28842, 26791, 28843, 16559, 26802, 2232, 189, 35006, 24768, 30924, 2252, 37069, 6349, 32977, 18650, 224, 20706, 8419, 24814, 18671, 20720, 41208, 10488, 16637, 259, 24838, 30985, 14602, 16649, 8461, 28943, 14609, 14615, 41239, 37161, 20778, 45356, 18733, 12590, 10545, 8500, 22837, 8509, 41284, 33092, 35157, 12630, 2389, 6498, 6501, 20840, 45417, 45423, 6511, 39282, 41333, 41342, 14718, 47487, 47490, 2438, 12680, 10636, 22924, 2453, 8597, 4501, 47509, 14745, 16806, 18858, 27052, 39341, 47534, 29103, 6577, 18865, 35254, 6584, 20921, 2491, 4546, 451, 41412, 8647, 43466, 20940, 35277, 43469, 25046, 8664, 23003, 477, 43485, 189

KeyError: 'score'