# News Recommender System. Collaborative Filtering (Item-Based)

This the next part of the project for the AI Course at UCU, 2021.    

In this section, we will implement the collaborative filtering recommender based on item ratings similarity. Additionally, the model's performance will be evaluated and later on compared to other recommendation approaches.

**Authors**: Dmytro Lopushanskyy, Volodymyr Savchuk.

## Imports

In [1]:
import pandas as pd
import random
import time
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

## Load The Data

We are using MIND data set for our recommendation system. It contains two main files: behaviors and English news articles data.

In [2]:
filtered_behaviors = pd.read_csv('files/filtered_behaviours.csv', sep='\t')
filtered_articles = pd.read_csv('files/filtered_articles.csv', sep='\t')

del filtered_behaviors['Unnamed: 0']
del filtered_articles['Unnamed: 0']

train_filtered_behaviours = pd.read_csv('files/train_filtered_behaviours.csv', sep='\t').set_index('UserID')
test_filtered_behaviours = pd.read_csv('files/test_filtered_behaviours.csv', sep='\t').set_index('UserID')
full_filtered_behaviours = train_filtered_behaviours.append(test_filtered_behaviours)

In [3]:
# group by userID back to aggregated values
train_filtered_behaviours = train_filtered_behaviours.groupby(['UserID'])['NewsID'].apply(list).reset_index().set_index('UserID')
# train_filtered_behaviours.rename(columns={'NewsID': 'All_History'}, inplace=True)

test_filtered_behaviours = test_filtered_behaviours.groupby(['UserID'])['NewsID'].apply(list).reset_index().set_index('UserID')
# test_filtered_behaviours.rename(columns={'NewsID': 'All_History'}, inplace=True)

In [4]:
# implement filtering
train_filtered_behaviours = train_filtered_behaviours[train_filtered_behaviours.index.isin(test_filtered_behaviours.index.values.tolist())]
test_filtered_behaviours = test_filtered_behaviours[test_filtered_behaviours.index.isin(train_filtered_behaviours.index.values.tolist())]

## Collaborative Filtering

We need to take all of the news articles available to us and the train behaviours dataset.

Since CF is taking quite a lot of memory, we will start by using 100 users and all articles.

In [5]:
LIMIT = 50
limited_users = train_filtered_behaviours.index[:LIMIT]

ratings_df = pd.DataFrame(data=0, columns=filtered_articles.NewsID, index=limited_users)

for i in range(LIMIT):
    user_history = train_filtered_behaviours.iloc[i].tolist()[0]
    for news_id in user_history:
        ratings_df.iloc[i][news_id] = 1

In [6]:
ratings_df = ratings_df.T
ratings_df

UserID,U1,U10,U10000,U10002,U10004,U10006,U10008,U10009,U10012,U10013,...,U1009,U10090,U10093,U10095,U10097,U101,U1010,U10103,U10114,U10116
NewsID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
N55528,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
N61837,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
N53526,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
N38324,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
N2073,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
N42491,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
N13097,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
N63550,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
N30345,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(ratings_df.values)
distances, indices = knn.kneighbors(ratings_df.values, n_neighbors=10)

In [8]:
# get the index for article
index_for_article = ratings_df.index.tolist().index('N61837')

# find the indices for the similar articles
sim_articles = indices[index_for_article].tolist()

# distances between article and the similar article
articles_distances = distances[index_for_article].tolist()

if index_for_article in sim_articles:  # might sometimes happen
    # the position of article in the list sim_articles
    id_article = sim_articles.index(index_for_article)

    # remove user from the list sim_users 
    sim_articles.remove(index_for_article)

    # remove user from the list user_distances
    articles_distances.pop(id_article)

print('The Nearest Users to N61837:', sim_articles)
print('The Distance from N61837:', articles_distances)

The Nearest Users to N61837: [26486, 26488, 26480, 26481, 26482, 26483, 26484, 26485, 26478, 26487]
The Distance from N61837: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


In [9]:
def augment_rating_df(ratings):
    augmented_ratings = ratings.copy()

    # find the nearest neighbors using NearestNeighbors(n_neighbors=10)
    number_neighbors = 10
    knn = NearestNeighbors(metric='cosine', algorithm='brute')
    knn.fit(ratings.values)
    distances, indices = knn.kneighbors(ratings.values, n_neighbors=number_neighbors)
    
    start = time.time()

    for user_loc, user_id in list(enumerate(ratings.columns)):
        if user_loc % 4 == 0:
                print(f'Number of users processed: {user_loc} / {len(ratings.columns)}. Minutes passed: {int((time.time() - start) / 60)}')
                
        for news_idx, news_id in enumerate(ratings.index):
            news_index = filtered_articles.NewsID.tolist().index(news_id)

            # find news without ratings by user
            if ratings.iloc[news_index, user_loc] == 0:
                sim_articles = indices[news_idx].tolist()
                articles_distances = distances[news_idx].tolist()
                
                # Generally, this is the case. The user itself is in the first place.
                if news_id in sim_articles:
                    news_idxx = sim_articles.index(news_id)
                    sim_articles.remove(news_id)
                    articles_distances.pop(news_idxx) 

                # However, sometimes even the movie itself cannot be included in the indices.
                # In that case, we take off the farthest movie in the list.
                else:
                    sim_articles = sim_articles[:number_neighbors - 1]
                    articles_distances = articles_distances[:number_neighbors - 1]
                    
                # article_similarity = 1 - article_similarity
                article_similarity = [1 - x for x in articles_distances]
                article_similarity_copy = article_similarity.copy()
                nominator = 0

                # for each similar article
                for i in range(len(sim_articles)):
                    # check if the rating of a similar news is zero
                    if ratings.iloc[sim_articles[i], user_loc] == 0:
                        # if the rating is zero, ignore the rating and the similarity in calculating the predicted rating
                        if len(article_similarity_copy) == (number_neighbors - 1):
                            article_similarity_copy.pop(i)
                        else:
                            article_similarity_copy.pop(i - (len(article_similarity) - len(article_similarity_copy)))

                    # if the rating is not zero, use the rating and similarity in the calculation
                    else:
                        nominator += article_similarity[i] * ratings.iloc[sim_articles[i], user_loc]

                # check if the number of the ratings with non-zero is positive
                if len(article_similarity_copy) > 0:
                    # check if the sum of the ratings of the similar movies is positive.
                    if sum(article_similarity_copy) > 0:
                        predicted_r = nominator / sum(article_similarity_copy)

                    # Even if there are some news for which the ratings are positive, some movies have zero similarity even though they are selected as similar movies.
                    # in this case, the predicted rating becomes zero as well  
                    else:
                        predicted_r = 0

                # if all the ratings of the similar news are zero, then predicted rating should be zero
                else:
                    predicted_r = 0

                # place the predicted rating into the augmented original dataset
                augmented_ratings.iloc[news_index, user_loc] = predicted_r
    end = time.time()
    print(f"Processing finished. Total time: {int((end - start) / 60)} minutes")
    return augmented_ratings
            

In [10]:
augmented_ratings_item_based = augment_rating_df(ratings_df)
augmented_ratings_item_based

Number of users processed: 0 / 50. Minutes passed: 0
Number of users processed: 4 / 50. Minutes passed: 4
Number of users processed: 8 / 50. Minutes passed: 9
Number of users processed: 12 / 50. Minutes passed: 12
Number of users processed: 16 / 50. Minutes passed: 16
Number of users processed: 20 / 50. Minutes passed: 20
Number of users processed: 24 / 50. Minutes passed: 24
Number of users processed: 28 / 50. Minutes passed: 27
Number of users processed: 32 / 50. Minutes passed: 31
Number of users processed: 36 / 50. Minutes passed: 35
Number of users processed: 40 / 50. Minutes passed: 39
Number of users processed: 44 / 50. Minutes passed: 43
Number of users processed: 48 / 50. Minutes passed: 47
Processing finished. Total time: 49 minutes


UserID,U1,U10,U10000,U10002,U10004,U10006,U10008,U10009,U10012,U10013,...,U1009,U10090,U10093,U10095,U10097,U101,U1010,U10103,U10114,U10116
NewsID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
N55528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0
N61837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0
N53526,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0
N38324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0
N2073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
N42491,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0
N13097,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0
N63550,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0
N30345,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0


In [11]:
def recommend_news(user, items_to_ignore, num_recommended_news, ignore_interacted=True, verbose=False):
    if verbose:
        print('The list of the News {} Has Clicked on \n'.format(user))

        for news_id in ratings_df.loc[:, user][ratings_df.loc[:, user] > 0].index.tolist():
            print(news_id)

        print('\n')

    recommended_news = []

    for news_id in ratings_df.loc[:, user][ratings_df.loc[:, user] == 0].index.tolist():
        predicted_rating = augmented_ratings_item_based.loc[news_id, user]
        recommended_news.append((news_id, predicted_rating))

    sorted_rm = sorted(recommended_news, key=lambda x: x[1], reverse=True)
    if not ignore_interacted:
        # filter from items to ignore
        sorted_rm = list(filter(lambda x: x[0] not in items_to_ignore, sorted_rm)) 
        
    # filter from non-clickable news
    # sorted_rm = list(filter(lambda x: x[1] != 0, sorted_rm))  
    
    if verbose:
        print('The list of the Recommended News \n')
        rank = 1
        for recommended_news in sorted_rm[:num_recommended_news]:
            print('{}: {} - predicted rating: {}'.format(rank, recommended_news[0], recommended_news[1]))
            rank = rank + 1
        
    return [news[0] for news in sorted_rm[:num_recommended_news]]

In [12]:
recommend_news('U1', [], 5, verbose=True)

The list of the News U1 Has Clicked on 

N596
N52301
N13374
N24356
N32607
N57737
N40207
N62058
N10646
N25682


The list of the Recommended News 

1: N8179 - predicted rating: 1.0
2: N16771 - predicted rating: 1.0
3: N56490 - predicted rating: 1.0
4: N30637 - predicted rating: 1.0
5: N30662 - predicted rating: 1.0


['N8179', 'N16771', 'N56490', 'N30637', 'N30662']

In [23]:
# Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

class ModelEvaluatorCF:
    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42):
        interacted_items = self.get_items_interacted(person_id, full_filtered_behaviours)
        all_items = set(filtered_articles['NewsID'])
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)
    
    def get_items_interacted(self, person_id, interactions_df):
        # Get the user's data and merge in the news information.
        interacted_items = interactions_df.loc[person_id]['NewsID']
        return set(interacted_items if type(interacted_items) == pd.Series else interacted_items)

    def _verify_hit_top_n(self, item_id, recommended_items, topn): 
        try:
            item_idx = recommended_items.index(item_id)
        except:
            item_idx = -1
        hit = int(item_idx in range(0, topn))
        return hit, item_idx

    def evaluate_model_for_user(self, person_id):
        # Getting the items in test set
        interacted_values_testset = test_filtered_behaviours.loc[person_id]
        if type(interacted_values_testset['NewsID']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['NewsID'])
        else:
            person_interacted_items_testset = set(interacted_values_testset['NewsID'])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        # Getting a ranked recommendation list from a model for a given user
        person_recs = recommend_news(
            person_id, 
            items_to_ignore=self.get_items_interacted(person_id, train_filtered_behaviours), 
            num_recommended_news=100000, ignore_interacted=False)
        
        hits_at_5_count = 0
        hits_at_10_count = 0
        # For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            # Getting a random sample (100) items the user has not interacted 
            # (to represent items that are assumed to be no relevant to the user)
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                          seed=random.randint(0, 2**32))

            # Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            # Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs = list(filter(lambda x : x in items_to_filter_recs, person_recs))
            # Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        # Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        # when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count': hits_at_5_count, 
                          'hits@10_count': hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self):
        print('Running evaluation for users')
        people_metrics = []
        filtered_users = list(filter(lambda user_id : user_id in limited_users, list(test_filtered_behaviours.index.unique().values[:])))
        for idx, person_id in enumerate(filtered_users):
            if idx % 10 == 0 and idx > 0:
                print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(person_id)  
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)
        print('%d users processed' % len(filtered_users))

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': 'Item-Based CF',
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    
model_evaluator = ModelEvaluatorCF()    


In [24]:
print('Evaluating Collaborative Item-Based Filtering model...')
cb_global_metrics, cb_detailed_results_df = model_evaluator.evaluate_model()

Evaluating Collaborative Item-Based Filtering model...
Running evaluation for users
10 users processed
20 users processed
30 users processed
40 users processed
50 users processed


In [25]:
print('\nGlobal metrics:\n%s' % cb_global_metrics)
cb_detailed_results_df.sort_values('recall@10', ascending=False).head(20)


Global metrics:
{'modelName': 'Item-Based CF', 'recall@5': 0.04918032786885246, 'recall@10': 0.09508196721311475}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
14,0,1,2,0.0,0.5,U10029
13,1,1,2,0.5,0.5,U10028
32,0,1,2,0.0,0.5,U10073
15,1,1,2,0.5,0.5,U10031
23,0,1,2,0.0,0.5,U10050
8,2,2,4,0.5,0.5,U10012
42,1,1,2,0.5,0.5,U10093
37,1,1,3,0.333333,0.333333,U10084
41,2,2,6,0.333333,0.333333,U10090
35,0,1,4,0.0,0.25,U1008
