In [15]:
import pandas as pd
import numpy as np
import lenskit
import lenskit.crossfold as xf
from scipy.sparse import lil_matrix
from scipy.sparse import coo_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from lenskit.batch import predict
from lenskit.metrics.predict import rmse
from lenskit.algorithms import Recommender, Predictor
from lenskit.algorithms.user_knn import UserUser
from lenskit import topn
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Column layouts of the two headerless, tab-separated input files.
BEHAVIOR_COLUMNS = ["impression_id", "user_id", "time", "history", "impressions"]
NEWS_COLUMNS = ["news_id", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"]

# Read the impression logs and attach column names.
behaviors = pd.read_csv('./small_training_data/behaviors.tsv', sep='\t', header=None)
behaviors.columns = BEHAVIOR_COLUMNS

# Read the article metadata and attach column names.
news = pd.read_csv('./small_training_data/news.tsv', sep='\t', header=None)
news.columns = NEWS_COLUMNS

# Keep only the articles that actually have an abstract (rows with NaN are dropped).
news = news[news['abstract'].notna()]

def _clicked_ids(impression_log):
    """Return the news IDs tagged as clicked in one impression string.

    The string is split on whitespace into tokens; each token is split on '-'
    and the first field is kept whenever the second field equals '1'.
    """
    clicked = []
    for token in impression_log.split():
        fields = token.split('-')
        if fields[1] == '1':
            clicked.append(fields[0])
    return clicked


# One list of clicked news IDs (those tagged with 1) per impression row.
behaviors['clicked_news'] = behaviors['impressions'].apply(_clicked_ids)

# Flatten the lists: one row per (user, clicked news) pair. Impressions without
# any click explode to NaN and are removed by dropna().
clicked_news = (
    behaviors
    .explode('clicked_news')
    [['user_id', 'clicked_news']]
    .dropna()
)

# Keep only the clicks whose article is still present in `news` (articles with a
# NaN abstract were dropped from `news` earlier in this cell). The explicit
# .copy() makes `clicked_news` an independent frame, so the column assignments
# that follow do not operate on a slice of the exploded frame.
valid_news_ids = set(news['news_id'])
clicked_news = clicked_news[clicked_news['clicked_news'].isin(valid_news_ids)].copy()

# Store both ID columns as pandas categoricals (memory / computation efficiency).
for id_column in ('user_id', 'clicked_news'):
    clicked_news[id_column] = clicked_news[id_column].astype("category")

# Summary counts; the figures in the trailing comments are from the recorded run.
print(f"Total number of news items: {news.shape[0]}")  # 48616 news items
print(f"Number of unique clicked news: {clicked_news['clicked_news'].nunique()}")  # 7307 distinct clicked news
print(f"Number of unique users: {clicked_news['user_id'].nunique()}")

Total number of news items: 48616
Number of unique clicked news: 7307
Number of unique users: 49445


### Content-based filtering
Here we choose the `max_features` hyperparameter of the TF-IDF vectorizer. Because the abstracts are short texts, a very large number of features might lead to overly sparse representations that fail to capture the meaningful information effectively, so we cap the number of features.

In [4]:
# Collect every distinct whitespace-separated token that occurs in any abstract.
total_vocabulary = set()
for abstract in news['abstract']:
    total_vocabulary.update(abstract.split())
print(f"Total vocabulary size: {len(total_vocabulary)}")

# Cap the TF-IDF feature count at 1000, or at the vocabulary size if that is
# smaller. Other caps can be tried here to compare results.
max_features = min(1000, len(total_vocabulary))

Total vocabulary size: 120090


In [5]:
# Build the item profiles from the abstracts of the news articles.

# TF-IDF vectorizer with English stop words removed and the feature count
# capped at `max_features`.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=max_features,
)

# Learn the vocabulary and produce one TF-IDF row per row of `news`.
item_profiles = vectorizer.fit_transform(news['abstract'])


### Collaborative filtering

In [6]:
# Cast both ID columns to plain strings and rename 'clicked_news' to 'news_id'
# in a single chained expression.
clicked_news = (
    clicked_news
    .astype({'user_id': str, 'clicked_news': str})
    .rename(columns={'clicked_news': 'news_id'})
)

In [7]:
# Encode 'user_id' and 'news_id' as integer codes and keep lookup tables between
# the codes and the original string IDs.
#
# The categoricals are built once, BEFORE the columns are overwritten with their
# codes. Building the lookup tables from the columns after they already hold
# integer codes would only yield code -> code identity maps, and the original
# IDs would be unrecoverable.
# This cell expects the ID columns to still hold the original string IDs, so it
# should be run once per kernel session (re-running it on encoded columns would
# rebuild the tables from the codes).
user_categories = clicked_news['user_id'].astype("category")
news_categories = clicked_news['news_id'].astype("category")

# Mappings from encoded IDs (integer codes) to original IDs
id_to_user = dict(enumerate(user_categories.cat.categories))
id_to_news = dict(enumerate(news_categories.cat.categories))

# Reverse mappings from original IDs to encoded IDs
user_to_id = {v: k for k, v in id_to_user.items()}
news_to_id = {v: k for k, v in id_to_news.items()}

# Replace the ID columns with their integer codes; code i corresponds to
# id_to_user[i] / id_to_news[i].
clicked_news['user_id'] = user_categories.cat.codes
clicked_news['news_id'] = news_categories.cat.codes

In [9]:
# Sparse user-item interaction matrix in COO format: one entry of weight 1.0 per
# row of `clicked_news`, placed at (encoded user, encoded news item). The matrix
# shape is inferred from the largest index on each axis.
click_weights = np.ones(clicked_news.shape[0])
user_index = clicked_news['user_id']
item_index = clicked_news['news_id']
interaction_matrix = coo_matrix((click_weights, (user_index, item_index)))

print(f"users: {interaction_matrix.shape[0]} \nitems: {interaction_matrix.shape[1]}")

users: 49445 
items: 7307


In [11]:
# Collaborative filtering model: user-user k-nearest-neighbours.
N_NEIGHBORS = 15    # neighbours consulted per prediction
MIN_NEIGHBORS = 3   # minimum number of neighbours required for a prediction

# NOTE(review): the interaction data built in this notebook is unary (every
# click has weight 1) - confirm that UserUser's default settings are appropriate
# for that kind of feedback before fitting.
user_user = UserUser(N_NEIGHBORS, min_nbrs=MIN_NEIGHBORS)

In [16]:
# Content-based model.
# Allocate an empty square sparse matrix (LIL format, cheap to fill row by row)
# with one row and one column per item profile; it holds the item-item
# similarities.
n_profiles = item_profiles.shape[0]
item_similarities = lil_matrix((n_profiles, n_profiles))

In [None]:
# Compute and store only the top-k most similar OTHER items for each item.
#
# An item's cosine similarity with itself is the largest value in its row (1.0
# for any non-zero profile). If the diagonal is not masked, a top-1 search
# returns the item itself and the similarity matrix degenerates to the diagonal,
# so the item's own entry is excluded before the top-k selection.
k = 1

n_items = item_profiles.shape[0]
# Never request more neighbours than there are other items; argpartition
# rejects an out-of-range k, and a slice of [-0:] would select the whole row.
n_neighbors = min(k, n_items - 1)

if n_neighbors > 0:
    for i in range(n_items):
        # Similarities between item i and every item (dense 1-D array of length n_items)
        similarities = cosine_similarity(item_profiles[i], item_profiles).flatten()

        # Exclude the item itself from its own neighbour list
        similarities[i] = -np.inf

        # Indices of the n_neighbors largest similarities (in no particular order)
        top_k_indices = np.argpartition(similarities, -n_neighbors)[-n_neighbors:]

        # Store the selected similarities in row i of the sparse matrix
        item_similarities[i, top_k_indices] = similarities[top_k_indices]

In [None]:
# NOTE(review): prints a bare marker string; this looks like a leftover
# "did the previous cell finish?" debug cell - confirm and remove it before
# sharing the notebook.
print("a")