## Collaborative Filtering

Due to the relatively high number of binary ratings per user, we've chosen to utilize User-Based Collaborative Filtering to suggest news articles

User-Based Collaborative Filtering Methodology in a nutshell:
1. Calculate User Similarity
2. Find the k most similar users to the target user
3. Extract the articles the target user has not interacted with for k most similar users
4. Sort article recommendations by score in descending order


### Installations
pip install pandas
pip install numpy
pip install matplotlib
pip install -U scikit-learn

In [32]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt

# load the dataset

behavior = pd.read_csv("data/MINDsmall_dev/behaviors.tsv", sep="\t", header=None, names=["ImpressionID", "UserID", "Time", "History", "Impressions"])
news = pd.read_csv("data/MINDsmall_dev/news.tsv", sep="\t", header=None, names=["NewsID", "Category", "Subcategory", "Title", "Abstract", "URL", "TitleEntities", "AbstractEntities", "TitleTopics", "AbstractTopics"])

print(behavior)
print("\n\n")
print(news)


       ImpressionID  UserID                    Time  \
0                 1  U80234  11/15/2019 12:37:50 PM   
1                 2  U60458   11/15/2019 7:11:50 AM   
2                 3  U44190   11/15/2019 9:55:12 AM   
3                 4  U87380   11/15/2019 3:12:46 PM   
4                 5   U9444   11/15/2019 8:25:46 AM   
...             ...     ...                     ...   
73147         73148  U77536   11/15/2019 8:40:16 PM   
73148         73149  U56193   11/15/2019 1:11:26 PM   
73149         73150  U16799   11/15/2019 3:37:06 PM   
73150         73151   U8786   11/15/2019 8:29:26 AM   
73151         73152  U68182  11/15/2019 11:54:34 AM   

                                                 History  \
0      N55189 N46039 N51741 N53234 N11276 N264 N40716...   
1      N58715 N32109 N51180 N33438 N54827 N28488 N611...   
2      N56253 N1150 N55189 N16233 N61704 N51706 N5303...   
3      N63554 N49153 N28678 N23232 N43369 N58518 N444...   
4                     N51692 N18285 N26

In [33]:
# Preprocess data: 
# The goal is to create a matrix where rows represent users, columns represent news articles, and values indicate whether a user clicked on an article.

# Expand the Impressions into separate rows
impressions_expanded = behavior['Impressions'].str.split(' ', expand=True).stack().reset_index(level=1, drop=True).to_frame('Impression')
impressions_expanded['NewsID'] = impressions_expanded['Impression'].apply(lambda x: x.split('-')[0])
impressions_expanded['Clicked'] = impressions_expanded['Impression'].apply(lambda x: int(x.split('-')[1]))

# Filter for clicked articles only
clicked_impressions = impressions_expanded[impressions_expanded['Clicked'] == 1].drop('Impression', axis=1)

# Merge back to get the UserID for each click
clicked_data = clicked_impressions.merge(behavior[['UserID']], left_index=True, right_index=True)

print(clicked_data)


       NewsID  Clicked  UserID
0      N31958        1  U80234
1      N23513        1  U60458
2       N5940        1  U44190
3      N15347        1  U87380
4       N5940        1   U9444
...       ...      ...     ...
73148  N11390        1  U56193
73149  N60215        1  U16799
73149  N54562        1  U16799
73150  N20036        1   U8786
73151  N21679        1  U68182

[111383 rows x 3 columns]


In [38]:
# Pivot table to create a user-item matrix
user_news_matrix = clicked_data.pivot_table(index='UserID', columns='NewsID', values='Clicked', fill_value=0)
print(user_news_matrix)

# Subset of the user-news matrix to run code due to memory usage
# subset_user_news_matrix = user_news_matrix.iloc[:1000, :1000]
#print(subset_user_news_matrix)

NewsID  N10032  N10050  N10051  N10083  N10142  N10226  N10278  N10282  N103  \
UserID                                                                         
U1         0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   0.0   
U10        0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   0.0   
U10000     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   0.0   
U10002     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   0.0   
U10004     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   0.0   
...        ...     ...     ...     ...     ...     ...     ...     ...   ...   
U9990      0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   0.0   
U9994      0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   0.0   
U9996      0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   0.0   
U9998      0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   0.0   
U9999      0.0     0.0     0.0     0.0  

In [39]:
# Convert to sparse matrix for efficiency
sparse_user_news_matrix = csr_matrix(user_news_matrix.values)

# Calculate cosine similarity between users
user_similarity = cosine_similarity(sparse_user_news_matrix)

# Convert similarity matrix to DataFrame for easier manipulation
user_similarity_df = pd.DataFrame(user_similarity, index=user_news_matrix.index, columns=user_news_matrix.index)

print(user_similarity_df)

MemoryError: Unable to allocate 18.6 GiB for an array with shape (50000, 50000) and data type float64

In [None]:
def recommend_articles(user_id, user_similarity_df, user_news_matrix, top_n=5):
    similar_users = user_similarity_df[user_id].sort_values(ascending=False).index[1:]  # exclude self
    recommended_articles = pd.Series(dtype='float64')
    for similar_user in similar_users:
        # Get articles rated by similar users but not yet by the target user
        articles_to_consider = user_news_matrix.loc[similar_user]
        articles_to_consider = articles_to_consider[articles_to_consider > 0]  # only consider clicked articles
        recommended_articles = recommended_articles.append(articles_to_consider)
    
    # Sum the scores for articles recommended by multiple users
    recommended_articles = recommended_articles.groupby(recommended_articles.index).sum()
    
    # Exclude articles already seen by the user
    already_seen = user_news_matrix.loc[user_id][user_news_matrix.loc[user_id] > 0].index
    recommended_articles = recommended_articles.drop(already_seen, errors='ignore')
    
    return recommended_articles.sort_values(ascending=False).head(top_n).index.tolist()

# Example usage
user_id = 'U19739'
recommended_articles = recommend_articles(user_id, user_similarity_df, user_news_matrix)
print(f"Recommended articles for user {user_id}: {recommended_articles}")

KeyError: 'U19739'