# News Recommender System. Collaborative Filtering (User-Based)

This the next part of the project for the AI Course at UCU, 2021.    

In this section, we will implement the collaborative filtering recommender based on user similarity. Additionally, the model's performance will be evaluated and leter on compared to other recommendation approaches.

**Authors**: Dmytro Lopushanskyy, Volodymyr Savchuk.

## Imports

In [11]:
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [12]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dmytrolopushanskyy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dmytrolopushanskyy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/dmytrolopushanskyy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Load The Data

We are using MIND data set for our recommendation system. It contains two main files: behaviors and English news articles data.

In [13]:
behaviors_header = ['ImpressionID', 'UserID', 'Time', 'History', 'Impressions']
behaviors_df = pd.read_csv('./MINDsmall_dev/behaviors.tsv', sep='\t', names=behaviors_header)

news_header = ['NewsID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'TitleEntities', 'AbstractEntities']
articles_df = pd.read_csv('./MINDsmall_dev/news.tsv', sep='\t', names=news_header)

Filtering

In [14]:
filtered_behaviors = behaviors_df[behaviors_df.History.str.split().str.len() > 5]

# Drop articles with duplicate titles
filtered_articles = articles_df.drop_duplicates(subset=['Title'])

# Drop articles with empty Abstract
filtered_articles = filtered_articles[filtered_articles.Abstract.notna()]

# Drop articles with titles less than 4 words
filtered_articles = filtered_articles[filtered_articles['Title'].apply((lambda x: len(x.split())>=4))]

## Text Preprocessing

In [15]:
# This function is to remove stopwords from a particular column and to tokenize it
def rem_stopwords_tokenize(data,name):
      
    def getting(sen):
        example_sent = sen

        stop_words = set(stopwords.words('english')) 

        word_tokens = word_tokenize(example_sent) 

        filtered_sentence = [w for w in word_tokens if not w in stop_words] 

        filtered_sentence = [] 

        for w in word_tokens: 
            if w not in stop_words: 
                filtered_sentence.append(w) 
        return filtered_sentence
    x=[]
    for i in data[name].values:
        x.append(getting(i))
    data[name]=x

In [16]:
# Making a function to lemmatize all the words
lemmatizer = WordNetLemmatizer() 
def lemmatize_all(data,name):
    arr = data[name]
    a = []
    for i in arr:
        b = []
        for j in i:
            x = lemmatizer.lemmatize(j,pos='a')
            x = lemmatizer.lemmatize(x)
            b.append(x)
        a.append(b)
    data[name] = a

In [17]:
def convert_to_string(data,name):
    t=data[name].values
    p=[]
    for i in t:
        listToStr = ' '.join(map(str, i))
        p.append(listToStr)
    data[name]=p

In [18]:
# Removing Stop words from Title Column
rem_stopwords_tokenize(filtered_articles, 'Title')

# Lemmatizing the Title column
lemmatize_all(filtered_articles, 'Title')

# Back to string
convert_to_string(filtered_articles, 'Title')

In [19]:
# Removing Stop words from Abstract Column
rem_stopwords_tokenize(filtered_articles, 'Abstract')

# Lemmatizing the Abstract column
lemmatize_all(filtered_articles, 'Abstract')

# Back to string
convert_to_string(filtered_articles, 'Abstract')

## Split into train and test data sets

In [31]:
filtered_behaviors.set_index('UserID')
filtered_behaviors['All_History'] = filtered_behaviors.groupby(['UserID']).History.transform(lambda x: ' '.join(x)).transform(lambda x: list(set(x.split())))

In [21]:
all_history = filtered_behaviors.drop_duplicates(subset=['UserID'])
all_history = all_history.filter(['UserID', 'All_History'])
all_history = all_history.set_index('UserID')
all_history

Unnamed: 0_level_0,All_History
UserID,Unnamed: 1_level_1
U80234,"[N38895, N30924, N35671, N6616, N51741, N55189..."
U60458,"[N30924, N28488, N58715, N6778, N50020, N33742..."
U44190,"[N56253, N1150, N16233, N3259, N51706, N55189,..."
U87380,"[N45794, N49153, N43369, N30765, N63554, N3445..."
U69606,"[N4607, N34140, N19591, N879, N21503, N43142, ..."
...,...
U11,"[N49023, N4647, N31820, N18870, N33271, N5905]"
U77536,"[N46259, N55438, N55024, N16912, N60340, N8024..."
U56193,"[N31099, N53531, N38311, N58782, N28088, N4705..."
U16799,"[N42078, N15670, N46845, N64536, N52294, N1529..."


In [22]:
expanded_behaviors = all_history.explode('All_History').reset_index() 
expanded_behaviors.rename(columns={'All_History': 'NewsID'}, inplace=True)

In [23]:
behaviours_train_df, behaviours_test_df = train_test_split(expanded_behaviors,
                                   stratify=expanded_behaviors['UserID'], 
                                   test_size=0.20,
                                   random_state=42)

print('# interactions on Train set: %d' % len(behaviours_train_df))
print('# interactions on Test set: %d' % len(behaviours_test_df))

# interactions on Train set: 983294
# interactions on Test set: 245824


In [24]:
# Indexing by UserID to speed up the searches during evaluation
behaviours_full_indexed_df = expanded_behaviors.set_index('UserID')
behaviours_train_indexed_df = behaviours_train_df.set_index('UserID')
behaviours_test_indexed_df = behaviours_test_df.set_index('UserID')

In [25]:
# group by userID back to aggregated values
history_train_indexed_df = behaviours_train_indexed_df.groupby(['UserID'])['NewsID'].apply(list).reset_index().set_index('UserID')
history_train_indexed_df.rename(columns={'NewsID': 'All_History'}, inplace=True)

history_test_indexed_df = behaviours_test_indexed_df.groupby(['UserID'])['NewsID'].apply(list).reset_index().set_index('UserID')
history_test_indexed_df.rename(columns={'NewsID': 'All_History'}, inplace=True)

In [27]:
# implement filtering
history_test_indexed_df = history_test_indexed_df[history_test_indexed_df.index.isin(history_train_indexed_df.index.values.tolist())]
behaviours_test_indexed_df = behaviours_test_indexed_df[behaviours_test_indexed_df.index.isin(history_train_indexed_df.index.values.tolist())]

In [28]:
print('Full size: ', behaviours_full_indexed_df.size)
print('Train size: ', behaviours_train_indexed_df.size)
print('Test size: ', behaviours_test_indexed_df.size)

Full size:  1229118
Train size:  983294
Test size:  245824


# Collaborative Filtering

We need to take all of the news articles available to us and the train behaviours dataset.

Since CF is taking quite a lot of memory, we will start by using 100 users and all articles.

In [37]:
LIMIT = 100
limited_users = history_train_indexed_df.index[:LIMIT]

ratings_df = pd.DataFrame(data=0, columns=filtered_articles.NewsID, index=limited_users)

for i in range(LIMIT):
    user_history = history_train_indexed_df.iloc[i].tolist()[0]
    for news_id in user_history:
        ratings_df.iloc[i][news_id] = 1

In [38]:
ratings_df

NewsID,N55528,N61837,N53526,N38324,N2073,N11429,N49186,N2131,N59295,N24510,...,N16016,N25854,N7618,N16804,N19926,N42491,N13097,N63550,N30345,N30135
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U10000,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U10002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U10004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
U10227,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U10228,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U1023,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U10233,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
from sklearn.neighbors import NearestNeighbors

knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(ratings_df.values)
distances, indices = knn.kneighbors(ratings_df.values, n_neighbors=10)

In [45]:
# get the index for user
index_for_user = ratings_df.index.tolist().index('U1')

# find the indices for the similar users
sim_users = indices[index_for_user].tolist()

# distances between user and the similar users
user_distances = distances[index_for_user].tolist()

# the position of user in the list sim_users
id_user = sim_users.index(index_for_user)

# remove user from the list sim_users
sim_users.remove(index_for_user)

# remove user from the list user_distances
user_distances.pop(id_user)

print('The Nearest Users to U1:', sim_users)
print('The Distance from U1:', user_distances)

The Nearest Users to U1: [5, 54, 42, 73, 95, 91, 3, 70, 2]
The Distance from U1: [0.8585786437626906, 0.8881966011250105, 0.894590744661054, 0.9183503419072274, 0.9233035011152629, 0.9309934440657646, 0.9344174164216047, 0.9354502775632098, 0.9379826327053957]


In [47]:
def augment_rating_df(ratings):
    augmented_ratings = ratings.copy()

    # find the nearest neighbors using NearestNeighbors(n_neighbors=10)
    number_neighbors = 10
    knn = NearestNeighbors(metric='cosine', algorithm='brute')
    knn.fit(ratings.values)
    distances, indices = knn.kneighbors(ratings.values, n_neighbors=number_neighbors)

    for news_id in filtered_articles.NewsID[:1000]:
        news_index = filtered_articles.NewsID.tolist().index(news_id)

        for user_loc, user_id in list(enumerate(ratings.index)):
            # find news without ratings by user
            if ratings.iloc[user_loc, news_index] == 0:
                sim_users = indices[user_loc].tolist()
                users_distances = distances[user_loc].tolist()
                
                # Generally, this is the case. The user itself is in the first place.
                if user_id in sim_users:
                    user_idx = sim_users.index(user_id)
                    sim_users.remove(user_id)
                    users_distances.pop(user_idx) 

                # However, sometimes even the movie itself cannot be included in the indices.
                # In that case, we take off the farthest movie in the list.
                else:
                    sim_users = sim_users[:number_neighbors - 1]
                    users_distances = users_distances[:number_neighbors - 1]
                    
                # user_similarity = 1 - users_distances
                user_similarity = [1 - x for x in users_distances]
                user_similarity_copy = user_similarity.copy()
                nominator = 0

                # for each similar user
                for i in range(len(sim_users)):
                    # check if the rating of a similar news is zero
                    if ratings.iloc[sim_users[i], news_index] == 0:
                        # if the rating is zero, ignore the rating and the similarity in calculating the predicted rating
                        if len(user_similarity_copy) == (number_neighbors - 1):
                            user_similarity_copy.pop(i)
                        else:
                            user_similarity_copy.pop(i - (len(user_similarity) - len(user_similarity_copy)))

                    # if the rating is not zero, use the rating and similarity in the calculation
                    else:
                        nominator += user_similarity[i] * ratings.iloc[sim_users[i], news_index]

                # check if the number of the ratings with non-zero is positive
                if len(user_similarity_copy) > 0:
                    # check if the sum of the ratings of the similar movies is positive.
                    if sum(user_similarity_copy) > 0:
                        predicted_r = nominator / sum(user_similarity_copy)

                    # Even if there are some news for which the ratings are positive, some movies have zero similarity even though they are selected as similar movies.
                    # in this case, the predicted rating becomes zero as well  
                    else:
                        predicted_r = 0

                # if all the ratings of the similar news are zero, then predicted rating should be zero
                else:
                    predicted_r = 0

                # place the predicted rating into the augmented original dataset
                augmented_ratings.iloc[user_loc, news_index] = predicted_r
    return augmented_ratings
            

In [49]:
augmented_ratings_user_based = augment_rating_df(ratings_df)
augmented_ratings_user_based

NewsID,N55528,N61837,N53526,N38324,N2073,N11429,N49186,N2131,N59295,N24510,...,N16016,N25854,N7618,N16804,N19926,N42491,N13097,N63550,N30345,N30135
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
U1,0,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U10,0,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U10000,0,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U10002,0,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U10004,0,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
U10227,0,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U10228,0,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U1023,0,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
U10233,0,0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [111]:
def recommend_news(user, items_to_ignore, num_recommended_news, ignore_interacted=True, verbose=False):
    if verbose:
        print('The list of the News {} Has Clicked on \n'.format(user))

        for news_id in ratings_df.loc[user, :][ratings_df.loc[user, :] > 0].index.tolist():
            print(news_id)

        print('\n')

    recommended_news = []

    for news_id in ratings_df.loc[user, :][ratings_df.loc[user, :] == 0].index.tolist():
        predicted_rating = augmented_ratings_user_based.loc[user, news_id]
        recommended_news.append((news_id, predicted_rating))

    sorted_rm = sorted(recommended_news, key=lambda x: x[1], reverse=True)
    if not ignore_interacted:
        # filter from items to ignore
        sorted_rm = list(filter(lambda x: x[0] not in items_to_ignore, sorted_rm)) 
        
    # filter from non-clickable news
    # sorted_rm = list(filter(lambda x: x[1] != 0, sorted_rm))  
    
    if verbose:
        print('The list of the Recommended News \n')
        rank = 1
        for recommended_news in sorted_rm[:num_recommended_news]:
            print('{}: {} - predicted rating: {}'.format(rank, recommended_news[0], recommended_news[1]))
            rank = rank + 1
        
    return [news[0] for news in sorted_rm[:num_recommended_news]]

In [112]:
recommend_news('U1', [], 5, verbose=True)

The list of the News U1 Has Clicked on 

N596
N52301
N13374
N23571
N24356
N32607
N40207
N62058
N10646
N58267


The list of the Recommended News 

1: N12676 - predicted rating: 1.0
2: N27608 - predicted rating: 1.0
3: N50333 - predicted rating: 1.0
4: N55528 - predicted rating: 0
5: N61837 - predicted rating: 0


['N12676', 'N27608', 'N50333', 'N55528', 'N61837']

In [143]:
# Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

class ModelEvaluatorCF:
    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42):
        interacted_items = self.get_items_interacted(person_id, behaviours_full_indexed_df)
        all_items = set(filtered_articles['NewsID'])
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)
    
    def get_items_interacted(self, person_id, interactions_df):
        # Get the user's data and merge in the news information.
        interacted_items = interactions_df.loc[person_id]['NewsID']
        return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

    def _verify_hit_top_n(self, item_id, recommended_items, topn): 
        try:
            item_idx = recommended_items.index(item_id)
        except:
            item_idx = -1
        hit = int(item_idx in range(0, topn))
        return hit, item_idx

    def evaluate_model_for_user(self, person_id):
        # Getting the items in test set
        interacted_values_testset = behaviours_test_indexed_df.loc[person_id]
        if type(interacted_values_testset['NewsID']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['NewsID'])
        else:
            person_interacted_items_testset = set([interacted_values_testset['NewsID']])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        # Getting a ranked recommendation list from a model for a given user
        person_recs = recommend_news(
            person_id, 
            items_to_ignore=self.get_items_interacted(person_id, behaviours_train_indexed_df), 
            num_recommended_news=1000, ignore_interacted=False)
        
        hits_at_5_count = 0
        hits_at_10_count = 0
        # For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            # Getting a random sample (100) items the user has not interacted 
            # (to represent items that are assumed to be no relevant to the user)
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                          seed=random.randint(0, 2**32))

            # Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            # Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs = list(filter(lambda x : x in items_to_filter_recs, person_recs))
            # Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        # Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        # when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count': hits_at_5_count, 
                          'hits@10_count': hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self):
        print('Running evaluation for users')
        people_metrics = []
        filtered_users = list(filter(lambda user_id : user_id in limited_users, list(behaviours_test_indexed_df.index.unique().values[:])))
        for idx, person_id in enumerate(filtered_users):
            if idx % 10 == 0 and idx > 0:
                print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(person_id)  
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)
        print('%d users processed' % len(filtered_users))

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': 'User-Based CF',
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    
model_evaluator = ModelEvaluatorCF()    


In [144]:
print('Evaluating Content-Based Filtering model...')
cb_global_metrics, cb_detailed_results_df = model_evaluator.evaluate_model()

Evaluating Content-Based Filtering model...
Running evaluation for users
10 users processed
20 users processed
30 users processed
40 users processed
50 users processed
60 users processed
70 users processed
80 users processed
90 users processed
100 users processed


In [145]:
print('\nGlobal metrics:\n%s' % cb_global_metrics)
cb_detailed_results_df.sort_values('recall@10', ascending=False).head(20)


Global metrics:
{'modelName': 'User-Based CF', 'recall@5': 0.02377179080824089, 'recall@10': 0.02377179080824089}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
11,1,1,5,0.2,0.2,U102
43,1,1,5,0.2,0.2,U10158
73,1,1,5,0.2,0.2,U10008
61,1,1,6,0.166667,0.166667,U10222
83,1,1,6,0.166667,0.166667,U10090
6,1,1,7,0.142857,0.142857,U10037
48,1,1,8,0.125,0.125,U10156
20,3,3,25,0.12,0.12,U10173
7,2,2,23,0.086957,0.086957,U10208
5,1,1,13,0.076923,0.076923,U1009
