# News Recommender System. Collaborative Filtering (UV Decomposition)

This the next part of the project for the AI Course at UCU, 2021.    

In this section, we will implement the collaborative filtering recommender based on UV decomposition. Additionally, the model's performance will be evaluated and later on compared to other recommendation approaches.

**Authors**: Dmytro Lopushanskyy, Volodymyr Savchuk.

## Imports

In [1]:
import pandas as pd
import numpy as np
import random
import time
from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

### Loading Data

In [2]:
filtered_behaviors = pd.read_csv('files/filtered_behaviours.csv', sep='\t')
filtered_articles = pd.read_csv('files/filtered_articles.csv', sep='\t')

behaviours_train_indexed_df = pd.read_csv('files/train_filtered_behaviours.csv', sep='\t')
behaviours_test_indexed_df = pd.read_csv('files/test_filtered_behaviours.csv', sep='\t')

In [3]:
filtered_behaviors.set_index('UserID')
filtered_behaviors['All_History'] = filtered_behaviors.groupby(['UserID']).History.transform(lambda x: ' '.join(x)).transform(lambda x: list(set(x.split())))

In [4]:
all_history = filtered_behaviors.drop_duplicates(subset=['UserID'])
all_history = all_history.filter(['UserID', 'All_History'])
all_history = all_history.set_index('UserID')

In [5]:
expanded_behaviors = all_history.explode('All_History').reset_index() 
expanded_behaviors.rename(columns={'All_History': 'NewsID'}, inplace=True)

In [6]:
behaviours_train_df, behaviours_test_df = train_test_split(expanded_behaviors,
                                   stratify=expanded_behaviors['UserID'], 
                                   test_size=0.20,
                                   random_state=42)

print('# interactions on Train set: %d' % len(behaviours_train_df))
print('# interactions on Test set: %d' % len(behaviours_test_df))

# interactions on Train set: 983294
# interactions on Test set: 245824


In [7]:
# Indexing by UserID to speed up the searches during evaluation
behaviours_full_indexed_df = expanded_behaviors.set_index('UserID')
behaviours_train_indexed_df = behaviours_train_df.set_index('UserID')
behaviours_test_indexed_df = behaviours_test_df.set_index('UserID')

In [8]:
# group by userID back to aggregated values
history_train_indexed_df = behaviours_train_indexed_df.groupby(['UserID'])['NewsID'].apply(list).reset_index().set_index('UserID')
history_train_indexed_df.rename(columns={'NewsID': 'All_History'}, inplace=True)

history_test_indexed_df = behaviours_test_indexed_df.groupby(['UserID'])['NewsID'].apply(list).reset_index().set_index('UserID')
history_test_indexed_df.rename(columns={'NewsID': 'All_History'}, inplace=True)

In [9]:
# implement filtering
history_test_indexed_df = history_test_indexed_df[history_test_indexed_df.index.isin(history_train_indexed_df.index.values.tolist())]
behaviours_test_indexed_df = behaviours_test_indexed_df[behaviours_test_indexed_df.index.isin(history_train_indexed_df.index.values.tolist())]

In [10]:
LIMIT = 200 # len(history_train_indexed_df)  # no limit
limited_users = history_train_indexed_df.index[:LIMIT]

ratings_df = pd.DataFrame(data=0, columns=filtered_articles.NewsID, index=limited_users.unique())

for i in range(LIMIT):
    user_history = history_train_indexed_df.iloc[i].tolist()[0]
    for news_id in user_history:
        ratings_df.iloc[i][news_id] = 1

In [11]:
users_items_pivot_sparse_matrix = csr_matrix(ratings_df)
users_items_pivot_sparse_matrix

<100x39726 sparse matrix of type '<class 'numpy.int64'>'
	with 2448 stored elements in Compressed Sparse Row format>

In [12]:
#The number of factors to factor the user-item matrix.
NUMBER_OF_FACTORS_MF = 20
# Performs matrix factorization of the original user item matrix
# U, sigma, Vt = svds(users_items_pivot_matrix, k = NUMBER_OF_FACTORS_MF)
users_items_pivot_sparse_matrix = users_items_pivot_sparse_matrix.asfptype()
U, sigma, Vt = svds(users_items_pivot_sparse_matrix, k = NUMBER_OF_FACTORS_MF)

In [13]:
U.shape

(100, 20)

In [14]:
Vt.shape

(20, 39726)

In [15]:
sigma = np.diag(sigma)
sigma.shape

(20, 20)

In [16]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 
all_user_predicted_ratings

array([[ 0.00000000e+00,  0.00000000e+00,  7.01605538e-04, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00, -2.50018511e-04, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  3.00362326e-03, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       ...,
       [ 0.00000000e+00,  0.00000000e+00,  1.88446722e-03, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00, -8.76478207e-04, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  1.78049381e-17, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00]])

In [17]:
# Converting the reconstructed matrix back to a Pandas dataframe
users_ids = list(ratings_df.index)
cf_preds_df = pd.DataFrame(all_user_predicted_ratings, columns=ratings_df.columns, index=users_ids).transpose()
cf_preds_df.head(10)

Unnamed: 0_level_0,U1,U10,U10000,U10002,U10004,U10006,U10008,U10009,U10012,U10013,...,U10221,U10222,U10223,U10224,U10225,U10227,U10228,U1023,U10233,U10236
NewsID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
N55528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
N61837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
N53526,0.000702,-0.00025,0.003004,-0.003763,-3.2795989999999996e-19,0.000666,-0.000307,-0.007043,-0.000475,0.002607,...,-0.011321,0.009184,0.000583,-0.000384,0.001638,0.001036,-5.7e-05,0.001884,-0.000876,1.7804940000000002e-17
N38324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
N2073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
N11429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
N49186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
N2131,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
N59295,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
N24510,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
class CFRecommender:
    MODEL_NAME = 'Collaborative Filtering'
    
    def __init__(self, cf_predictions_df, items_df=None):
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # Get and sort the user's predictions
        sorted_user_predictions = self.cf_predictions_df[user_id].sort_values(ascending=False) \
                                    .reset_index().rename(columns={user_id: 'Click'})

        # Recommend the highest predicted rating movies that the user hasn't seen yet.
        recommendations_df = sorted_user_predictions[~sorted_user_predictions['NewsID'].isin(items_to_ignore)] \
                               .sort_values('Click', ascending = False) \
                               .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'NewsID', 
                                                          right_on = 'NewsID')[['Click', 'NewsID', 'Title']]


        return recommendations_df
    
cf_recommender_model = CFRecommender(cf_preds_df, filtered_articles)

In [19]:
cf_recommender_model.recommend_items('U10006')

Unnamed: 0,NewsID,Click
0,N12907,0.072231
1,N4607,0.07078
2,N33096,0.052958
3,N38701,0.049284
4,N49475,0.047415
5,N24591,0.04502
6,N55743,0.044639
7,N24356,0.042891
8,N58641,0.041417
9,N8448,0.041389


In [20]:
behaviours_test_indexed_df

Unnamed: 0_level_0,NewsID
UserID,Unnamed: 1_level_1
U81837,N56109
U10057,N21005
U15329,N45437
U85850,N63855
U82226,N22007
...,...
U20689,N63411
U28431,N36971
U88752,N19620
U67693,N33286


In [35]:
# behaviours_test_indexed_df = behaviours_test_indexed_df.set_index('UserID')
# behaviours_train_indexed_df = behaviours_train_indexed_df.set_index('UserID')
# history_test_indexed_df = history_test_indexed_df.set_index('UserID')
# all_history.rename(columns={'All_History': 'NewsID'}, inplace=True)
# history_test_indexed_df.rename(columns={'All_History': 'NewsID'}, inplace=True)
history_train_indexed_df = history_train_indexed_df.set_index('UserID')
history_train_indexed_df.rename(columns={'All_History': 'NewsID'}, inplace=True)
    

KeyError: "None of ['UserID'] are in the columns"

In [41]:
# Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

class ModelEvaluatorCF:
    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42):
        interacted_items = self.get_items_interacted(person_id, all_history)
        all_items = set(filtered_articles['NewsID'])
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)
    
    def get_items_interacted(self, person_id, interactions_df):
        # Get the user's data and merge in the news information.
        interacted_items = interactions_df.loc[person_id]['NewsID']
        return set(interacted_items if type(interacted_items) == pd.Series else interacted_items)
 
    def _verify_hit_top_n(self, item_id, recommended_items, topn): 
        try:
            item_idx = recommended_items.index(item_id)
        except:
            item_idx = -1
        hit = int(item_idx in range(0, topn))
        return hit, item_idx

    def evaluate_model_for_user(self, person_id):
        # Getting the items in test set
        interacted_values_testset = history_test_indexed_df.loc[person_id]
        
        person_interacted_items_testset = set(interacted_values_testset['NewsID'])
        
        interacted_items_count_testset = len(person_interacted_items_testset) 

        # Getting a ranked recommendation list from a model for a given user
        person_recs = cf_recommender_model.recommend_items(
            person_id, 
            items_to_ignore=self.get_items_interacted(person_id, history_train_indexed_df), topn=100000)
        
        hits_at_5_count = 0
        hits_at_10_count = 0
        # For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            # Getting a random sample (100) items the user has not interacted 
            # (to represent items that are assumed to be no relevant to the user)
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                          seed=random.randint(0, 2**32))

            # Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))           
            # Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs[person_recs['NewsID'].isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df['NewsID'].values.tolist()
            # Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        # Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        # when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count': hits_at_5_count, 
                          'hits@10_count': hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self):
        print('Running evaluation for users')
        people_metrics = []
        filtered_users = history_train_indexed_df.index[:LIMIT] # list(filter(lambda user_id : user_id in limited_users, list(history_test_indexed_df.index.unique())))
        for idx, person_id in enumerate(filtered_users[:LIMIT]):
            person_metrics = self.evaluate_model_for_user(person_id) 
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': 'UV decomposition CF',
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    
model_evaluator = ModelEvaluatorCF() 

In [42]:
print('Evaluating Collaborative Filtering model with UV decomposition ...')
cb_global_metrics, cb_detailed_results_df = model_evaluator.evaluate_model()

Evaluating Collaborative Filtering model with UV decomposition ...
Running evaluation for users


In [43]:
print('\nGlobal metrics:\n%s' % cb_global_metrics)
cb_detailed_results_df.sort_values('recall@10', ascending=False).head(20)


Global metrics:
{'modelName': 'UV decomposition CF', 'recall@5': 0.25673534072900156, 'recall@10': 0.26782884310618066}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
82,2,2,2,1.0,1.0,U10195
93,1,1,1,1.0,1.0,U10224
22,2,2,2,1.0,1.0,U1005
5,1,1,1,1.0,1.0,U10006
45,2,2,2,1.0,1.0,U101
48,1,1,1,1.0,1.0,U10114
1,1,1,1,1.0,1.0,U10
78,1,1,1,1.0,1.0,U1019
50,1,1,1,1.0,1.0,U10117
12,2,2,2,1.0,1.0,U10020
