In [156]:
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from datetime import datetime
import time

In [157]:
#read dfs
shared_articles_df = pd.read_csv('shared_articles.csv')
users_interactions_df = pd.read_csv('users_interactions.csv')

In [158]:
event_type_score = {
   'VIEW': 1.0,
   'LIKE': 4.0, 
   'COMMENT CREATED': 10.0,
   'FOLLOW': 25.0,
   'BOOKMARK': 100.0,
}

users_interactions_df['eventScore'] = users_interactions_df['eventType'].apply(lambda x: event_type_score[x])

In [159]:
shared_articles_earliest_ts, shared_articles_latest_ts = min(shared_articles_df['timestamp']),max(shared_articles_df['timestamp'])
earliest_ts = int(shared_articles_earliest_ts)
latest_ts = int(shared_articles_latest_ts)
print("shared articles earliest timestamp is ", datetime.utcfromtimestamp(earliest_ts).strftime('%Y-%m-%d %H:%M:%S'))
print("shared articles latest timestamp is ", datetime.utcfromtimestamp(latest_ts).strftime('%Y-%m-%d %H:%M:%S'))

shared articles earliest timestamp is  2016-03-28 19:19:39
shared articles latest timestamp is  2017-02-28 18:51:11


In [160]:
users_interactions_earliest_ts, users_interactions_latest_ts = min(users_interactions_df['timestamp']),max(users_interactions_df['timestamp'])
earliest_ts = int(users_interactions_earliest_ts)
latest_ts = int(users_interactions_latest_ts)
print("users_interactions earliest timestamp is ", datetime.utcfromtimestamp(earliest_ts).strftime('%Y-%m-%d %H:%M:%S'))
print("users_interactions latest timestamp is ", datetime.utcfromtimestamp(latest_ts).strftime('%Y-%m-%d %H:%M:%S'))

users_interactions earliest timestamp is  2016-03-14 13:54:36
users_interactions latest timestamp is  2017-02-28 19:21:51


## setting reference date for train/test split

In [161]:
reference_date = '2017-01-01'
#convert to unix timestamp
reference_ts = time.mktime(datetime.strptime(reference_date, "%Y-%m-%d").timetuple())

## train test split based on reference date

In [162]:
#split into train,test
interactions_train_df = users_interactions_df[users_interactions_df['timestamp']<reference_ts]
interactions_test_df = users_interactions_df[users_interactions_df['timestamp']>reference_ts]

In [163]:
interactions_train_count_df = interactions_train_df.groupby(['personId', 'contentId']).size().groupby('personId').size()
print('Total no. of users in training set: %d' % len(interactions_train_count_df))
train_users_with_enough_interactions_df = interactions_train_count_df[interactions_train_count_df >= 10].reset_index()[['personId']]
print('No. of users in training set with at least 10 interactions: %d' % len(train_users_with_enough_interactions_df))

Total no. of users in training set: 1823
No. of users in training set with at least 10 interactions: 704


In [164]:
interactions_test_count_df = interactions_test_df.groupby(['personId', 'contentId']).size().groupby('personId').size()
print('Total no. of users in test set: %d' % len(interactions_test_count_df))
test_users_with_enough_interactions_df = interactions_test_count_df[interactions_test_count_df >= 10].reset_index()[['personId']]
print('No. of users in test set with at least 10 interactions: %d' % len(test_users_with_enough_interactions_df))

Total no. of users in test set: 965
No. of users in test set with at least 10 interactions: 89


In [165]:
print('# of interactions: %d' % len(users_interactions_df))
train_interactions_from_selected_users_df = users_interactions_df.merge(train_users_with_enough_interactions_df, 
               how = 'right',
               left_on = 'personId',
               right_on = 'personId')
print('# of interactions from train users with at least 10 interactions: %d' % len(train_interactions_from_selected_users_df))

# of interactions: 72312
# of interactions from train users with at least 10 interactions: 64901


In [166]:
print('# of interactions: %d' % len(users_interactions_df))
test_interactions_from_selected_users_df = users_interactions_df.merge(test_users_with_enough_interactions_df, 
               how = 'right',
               left_on = 'personId',
               right_on = 'personId')
print('# of interactions from test users with at least 10 interactions: %d' % len(test_interactions_from_selected_users_df))

# of interactions: 72312
# of interactions from test users with at least 10 interactions: 22349


# Final Processed train and test df

In [167]:
def smooth_user_preference(x):
    return math.log(1+x, 2)
    
interactions_train_df = train_interactions_from_selected_users_df \
                    .groupby(['personId', 'contentId'])['eventScore'].sum() \
                    .apply(smooth_user_preference).reset_index()
print('# of unique user/item interactions: %d' % len(interactions_train_df))
interactions_train_df.head(10)

# of unique user/item interactions: 35871


Unnamed: 0,personId,contentId,eventScore
0,-9223121837663643404,-8949113594875411859,1.0
1,-9223121837663643404,-8377626164558006982,1.0
2,-9223121837663643404,-8208801367848627943,1.0
3,-9223121837663643404,-8187220755213888616,1.0
4,-9223121837663643404,-7423191370472335463,3.169925
5,-9223121837663643404,-7331393944609614247,1.0
6,-9223121837663643404,-6872546942144599345,1.0
7,-9223121837663643404,-6728844082024523434,1.0
8,-9223121837663643404,-6590819806697898649,1.0
9,-9223121837663643404,-6558712014192834002,1.584963


In [168]:
def smooth_user_preference(x):
    return math.log(1+x, 2)
    
interactions_test_df = test_interactions_from_selected_users_df \
                    .groupby(['personId', 'contentId'])['eventScore'].sum() \
                    .apply(smooth_user_preference).reset_index()
print('# of unique user/item interactions: %d' % len(interactions_test_df))
interactions_test_df.head(10)

# of unique user/item interactions: 12211


Unnamed: 0,personId,contentId,eventScore
0,-9016528795238256703,-9192549002213406534,1.0
1,-9016528795238256703,-9176143510534135851,1.0
2,-9016528795238256703,-9019233957195913605,2.0
3,-9016528795238256703,-8954346068661072425,1.584963
4,-9016528795238256703,-8800029253812071912,1.0
5,-9016528795238256703,-8796191278504623694,1.0
6,-9016528795238256703,-8762137947059829459,1.0
7,-9016528795238256703,-8728075196312712282,5.209453
8,-9016528795238256703,-8657415528200615063,1.0
9,-9016528795238256703,-8627051188605351707,5.426265


In [169]:
#ensure same persons are present in both train and test set
# train_personId_idx = pd.Index(set(interactions_train_df['personId']))
# test_personId_idx = pd.Index(set(interactions_test_df['personId']))
personIds_train = list(set(interactions_train_df['personId']))

#print("before filtering",len(interactions_train_df),len(interactions_test_df ))
#keep only common personIds in both train and test set
#interactions_train_df = interactions_train_df[interactions_train_df.personId.isin(personIds_common)]
interactions_test_df = interactions_test_df[interactions_test_df.personId.isin(personIds_train)]
#print("after filtering",len(interactions_train_df),len(interactions_test_df))

# Evaluation

In [170]:
#Indexing by personId to speed up the searches during evaluation
interactions_full_df = pd.concat([interactions_train_df,interactions_test_df])
print(len(interactions_full_df))
interactions_full_indexed_df = interactions_full_df.set_index('personId')
interactions_train_indexed_df = interactions_train_df.set_index('personId')
interactions_test_indexed_df = interactions_test_df.set_index('personId')

47998


In [171]:
def get_items_interacted(person_id, interactions_df):
    # Get the user's data and merge in the movie information.
    interacted_items = interactions_df.loc[person_id]['contentId']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

In [172]:
#Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

class ModelEvaluator:


    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42):
        interacted_items = get_items_interacted(person_id, interactions_full_indexed_df)
        all_items = set(shared_articles_df['contentId'])
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):        
            try:
                index = next(i for i, c in enumerate(recommended_items) if c == item_id)
            except:
                index = -1
            hit = int(index in range(0, topn))
            return hit, index

    def evaluate_model_for_user(self, model, person_id):
        #Getting the items in test set
        interacted_values_testset = interactions_test_indexed_df.loc[person_id]
        if type(interacted_values_testset['contentId']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['contentId'])
        else:
            person_interacted_items_testset = set([int(interacted_values_testset['contentId'])])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_items(person_id, 
                                               items_to_ignore=get_items_interacted(person_id, 
                                                                                    interactions_train_indexed_df), 
                                               topn=10000000000)

        hits_at_5_count = 0
        hits_at_10_count = 0
        #For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                          seed=item_id%(2**32))

            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['contentId'].isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df['contentId'].values
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        #when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self, model):
        #print('Running evaluation for users')
        people_metrics = []
        for idx, person_id in enumerate(list(interactions_test_indexed_df.index.unique().values)):
            #if idx % 100 == 0 and idx > 0:
            #    print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(model, person_id)  
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    
model_evaluator = ModelEvaluator() 

# Popularity model

A common (and usually hard-to-beat) baseline approach is the Popularity model. This model is not actually personalized - it simply recommends to a user the most popular items that the user has not previously consumed. As the popularity accounts for the "wisdom of the crowds", it usually provides good recommendations, generally interesting for most people.
Ps. The main objective of a recommender system is to leverage the long-tail items to the users with very specific interests, which goes far beyond this simple technique.

In [173]:
#Computes the most popular items
item_popularity_df = interactions_full_df.groupby('contentId')['eventScore'].sum().sort_values(ascending=False).reset_index()
item_popularity_df.head(10)

Unnamed: 0,contentId,eventScore
0,-4029704725707465084,356.72527
1,-8208801367848627943,293.850208
2,-6843047699859121724,291.589624
3,2581138407738454418,270.316315
4,1469580151036142903,266.701445
5,3367026768872537336,262.210398
6,-6783772548752091658,259.404772
7,-6156751702010469220,248.239362
8,-133139342397538859,244.519064
9,8224860111193157980,243.76814


In [174]:
class PopularityRecommender:
    
    MODEL_NAME = 'Popularity'
    
    def __init__(self, popularity_df, items_df=None):
        self.popularity_df = popularity_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # Recommend the more popular items that the user hasn't seen yet.
        recommendations_df = self.popularity_df[~self.popularity_df['contentId'].isin(items_to_ignore)] \
                               .sort_values('eventScore', ascending = False) \
                               .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'contentId', 
                                                          right_on = 'contentId')[['eventScore', 'contentId', 'title', 'url', 'lang']]


        return recommendations_df
    
popularity_model = PopularityRecommender(item_popularity_df, shared_articles_df)

In [175]:
print('Evaluating Popularity recommendation model...')
pop_global_metrics, pop_detailed_results_df = model_evaluator.evaluate_model(popularity_model)
print('\nGlobal metrics:\n%s' % pop_global_metrics)
pop_detailed_results_df.head(10)

Evaluating Popularity recommendation model...
84 users processed

Global metrics:
{'modelName': 'Popularity', 'recall@5': 0.0, 'recall@10': 0.0}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
62,0,0,961,0.0,0.0,3609194402293569455
40,0,0,648,0.0,0.0,-1032019229384696495
30,0,0,437,0.0,0.0,-2979881261169775358
24,0,0,401,0.0,0.0,-3596626804281480007
50,0,0,366,0.0,0.0,1116121227607581999
48,0,0,346,0.0,0.0,692689608292948411
0,0,0,342,0.0,0.0,-9016528795238256703
63,0,0,339,0.0,0.0,3636910968448833585
55,0,0,320,0.0,0.0,2416280733544962613
67,0,0,291,0.0,0.0,4313045637915476309


# Content-Based Filtering model

In [135]:
#Ignoring stopwords (words with no semantics) from English and Portuguese (as we have a corpus with mixed languages)
stopwords_list = stopwords.words('english') + stopwords.words('portuguese')

#Trains a model whose vectors size is 5000, composed by the main unigrams and bigrams found in the corpus, ignoring stopwords
vectorizer = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 2),
                     min_df=0.003,
                     max_df=0.5,
                     max_features=5000,
                     stop_words=stopwords_list)

item_ids = shared_articles_df['contentId'].tolist()
tfidf_matrix = vectorizer.fit_transform(shared_articles_df['title'] + "" + shared_articles_df['text'])
tfidf_feature_names = vectorizer.get_feature_names()
tfidf_matrix

<3122x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 652972 stored elements in Compressed Sparse Row format>

In [136]:
def get_item_profile(item_id):
    idx = item_ids.index(item_id)
    item_profile = tfidf_matrix[idx:idx+1]
    return item_profile

def get_item_profiles(ids):
    item_profiles_list = [get_item_profile(x) for x in ids]
    item_profiles = scipy.sparse.vstack(item_profiles_list)
    return item_profiles

def build_users_profile(person_id, interactions_indexed_df):
    interactions_person_df = interactions_indexed_df.loc[person_id]
    user_item_profiles = get_item_profiles(interactions_person_df['contentId'])
    
    user_item_strengths = np.array(interactions_person_df['eventScore']).reshape(-1,1)
    #Weighted average of item profiles by the interactions strength
    user_item_strengths_weighted_avg = np.sum(user_item_profiles.multiply(user_item_strengths), axis=0) / np.sum(user_item_strengths)
    user_profile_norm = sklearn.preprocessing.normalize(user_item_strengths_weighted_avg)
    return user_profile_norm

def build_users_profiles(): 
    interactions_indexed_df = interactions_train_df[interactions_train_df['contentId'] \
                                                   .isin(shared_articles_df['contentId'])].set_index('personId')
    user_profiles = {}
    for person_id in interactions_indexed_df.index.unique():
        user_profiles[person_id] = build_users_profile(person_id, interactions_indexed_df)
    return user_profiles

In [137]:
user_profiles = build_users_profiles()
len(user_profiles)

1070

In [138]:
myprofile = user_profiles[-9223121837663643404]
print(myprofile.shape)
pd.DataFrame(sorted(zip(tfidf_feature_names, 
                        user_profiles[-9223121837663643404].flatten().tolist()), key=lambda x: -x[1])[:20],
             columns=['token', 'relevance'])

(1, 5000)


Unnamed: 0,token,relevance
0,google,0.261154
1,android,0.138379
2,teste,0.116157
3,code,0.114111
4,candidate,0.104937
5,cloud,0.104553
6,espresso,0.103203
7,app,0.100094
8,apple,0.095503
9,digital,0.091929


In [139]:
class ContentBasedRecommender:
    
    MODEL_NAME = 'Content-Based'
    
    def __init__(self, items_df=None):
        self.item_ids = item_ids
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def _get_similar_items_to_user_profile(self, person_id, topn=1000):
        #Computes the cosine similarity between the user profile and all item profiles
        cosine_similarities = cosine_similarity(user_profiles[person_id], tfidf_matrix)
        #Gets the top similar items
        similar_indices = cosine_similarities.argsort().flatten()[-topn:]
        #Sort the similar items by similarity
        similar_items = sorted([(item_ids[i], cosine_similarities[0,i]) for i in similar_indices], key=lambda x: -x[1])
        return similar_items
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        similar_items = self._get_similar_items_to_user_profile(user_id)
        #Ignores items the user has already interacted
        similar_items_filtered = list(filter(lambda x: x[0] not in items_to_ignore, similar_items))
        
        recommendations_df = pd.DataFrame(similar_items_filtered, columns=['contentId', 'recStrength']) \
                                    .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'contentId', 
                                                          right_on = 'contentId')[['recStrength', 'contentId', 'title', 'url', 'lang']]


        return recommendations_df
    
content_based_recommender_model = ContentBasedRecommender(shared_articles_df)

In [140]:
print('Evaluating Content-Based Filtering model...')
cb_global_metrics, cb_detailed_results_df = model_evaluator.evaluate_model(content_based_recommender_model)
print('\nGlobal metrics:\n%s' % cb_global_metrics)
cb_detailed_results_df.head(10)

Evaluating Content-Based Filtering model...
686 users processed

Global metrics:
{'modelName': 'Content-Based', 'recall@5': 0.0, 'recall@10': 0.0}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
463,0,0,961,0.0,0.0,3609194402293569455
243,0,0,669,0.0,0.0,-2626634673110551643
293,0,0,648,0.0,0.0,-1032019229384696495
278,0,0,585,0.0,0.0,-1443636648652872475
233,0,0,437,0.0,0.0,-2979881261169775358
207,0,0,401,0.0,0.0,-3596626804281480007
371,0,0,366,0.0,0.0,1116121227607581999
355,0,0,346,0.0,0.0,692689608292948411
5,0,0,342,0.0,0.0,-9016528795238256703
466,0,0,339,0.0,0.0,3636910968448833585


# Collaborative Filtering model

## Matrix Factorization

Latent factor models compress user-item matrix into a low-dimensional representation in terms of latent factors. One advantage of using this approach is that instead of having a high dimensional matrix containing abundant number of missing values we will be dealing with a much smaller matrix in lower-dimensional space.
A reduced presentation could be utilized for either user-based or item-based neighborhood algorithms that are presented in the previous section. There are several advantages with this paradigm. It handles the sparsity of the original matrix better than memory based ones. Also comparing similarity on the resulting matrix is much more scalable especially in dealing with large sparse datasets.

Here we a use popular latent factor model named Singular Value Decomposition (SVD). There are other matrix factorization frameworks more specific to CF you might try, like surprise, mrec or python-recsys. We chose a SciPy implemenation of SVD because it is available on Kaggle kernels. P.s. See an example of SVD on a movies dataset in this blog post.

An important decision is the number of factors to factor the user-item matrix. The higher the number of factors, the more precise is the factorization in the original matrix reconstructions. Therefore, if the model is allowed to memorize too much details of the original matrix, it may not generalize well for data it was not trained on. Reducing the number of factors increases the model generalization.

In [142]:
#Creating a sparse pivot table with users in rows and items in columns
users_items_pivot_matrix_df = interactions_train_df.pivot(index='personId', 
                                                          columns='contentId', 
                                                          values='eventScore').fillna(0)

users_items_pivot_matrix_df.head(10)

contentId,-9222795471790223670,-9216926795620865886,-9194572880052200111,-9192549002213406534,-9190737901804729417,-9189659052158407108,-9184137057748005562,-9176143510534135851,-9172673334835262304,-9171475473795142532,...,9191014301634017491,9207286802575546269,9208127165664287660,9209629151177723638,9209886322932807692,9213260650272029784,9215261273565326920,9217155070834564627,9220445660318725468,9222265156747237864
personId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9223121837663643404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9212075797126931087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9207251133131336884,0.0,2.584963,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9199575329909162940,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9188188261933657343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9172914609055320039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9156344805277471150,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9120685872592674274,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9109785559521267180,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-9060214117327732109,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [143]:
users_items_pivot_matrix = users_items_pivot_matrix_df.as_matrix()
users_items_pivot_matrix[:10]

  """Entry point for launching an IPython kernel.


array([[0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 2.5849625, 0.       , ..., 0.       , 0.       ,
        0.       ],
       ...,
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ]])

In [144]:
users_ids = list(users_items_pivot_matrix_df.index)
users_ids[:10]

[-9223121837663643404,
 -9212075797126931087,
 -9207251133131336884,
 -9199575329909162940,
 -9188188261933657343,
 -9172914609055320039,
 -9156344805277471150,
 -9120685872592674274,
 -9109785559521267180,
 -9060214117327732109]

In [145]:
users_items_pivot_sparse_matrix = csr_matrix(users_items_pivot_matrix)
users_items_pivot_sparse_matrix

<1070x2983 sparse matrix of type '<class 'numpy.float64'>'
	with 38672 stored elements in Compressed Sparse Row format>

In [146]:
#The number of factors to factor the user-item matrix.
NUMBER_OF_FACTORS_MF = 15
#Performs matrix factorization of the original user item matrix
#U, sigma, Vt = svds(users_items_pivot_matrix, k = NUMBER_OF_FACTORS_MF)
U, sigma, Vt = svds(users_items_pivot_sparse_matrix, k = NUMBER_OF_FACTORS_MF)

In [147]:
U.shape

(1070, 15)

In [148]:
Vt.shape

(15, 2983)

In [149]:
sigma = np.diag(sigma)
sigma.shape

(15, 15)

In [150]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 
all_user_predicted_ratings

array([[ 0.07744013,  0.00193721,  0.00516063, ...,  0.02161582,
         0.01877247,  0.00329719],
       [-0.00076648,  0.00054891, -0.01674541, ..., -0.00245276,
         0.00377253, -0.00076424],
       [-0.01716068,  0.01915315, -0.04000176, ..., -0.00651959,
        -0.03538629,  0.00636385],
       ...,
       [-0.04087491,  0.00533284, -0.06356695, ..., -0.03337506,
         0.00236133,  0.00415458],
       [ 0.01523417,  0.00658399,  0.01581059, ...,  0.00970053,
         0.00683775,  0.00058306],
       [-0.06148586,  0.00669025,  0.49163742, ..., -0.09885302,
         0.03175752,  0.03020554]])

In [151]:
all_user_predicted_ratings_norm = (all_user_predicted_ratings - all_user_predicted_ratings.min()) / (all_user_predicted_ratings.max() - all_user_predicted_ratings.min())

In [152]:
#Converting the reconstructed matrix back to a Pandas dataframe
cf_preds_df = pd.DataFrame(all_user_predicted_ratings_norm, columns = users_items_pivot_matrix_df.columns, index=users_ids).transpose()
cf_preds_df.head(10)

Unnamed: 0_level_0,-9223121837663643404,-9212075797126931087,-9207251133131336884,-9199575329909162940,-9188188261933657343,-9172914609055320039,-9156344805277471150,-9120685872592674274,-9109785559521267180,-9060214117327732109,...,9105269044962898535,9109075639526981934,9135582630122950040,9137372837662939523,9148269800512008413,9165571805999894845,9187866633451383747,9191849144618614467,9199170757466086545,9210530975708218054
contentId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9222795471790223670,0.199385,0.194501,0.193477,0.19742,0.19417,0.196551,0.196611,0.192871,0.197004,0.193715,...,0.195686,0.192655,0.194222,0.194509,0.190261,0.193964,0.194354,0.191996,0.1955,0.190709
-9216926795620865886,0.19467,0.194583,0.195745,0.194604,0.194611,0.194632,0.194705,0.194395,0.194681,0.194497,...,0.194732,0.196488,0.194689,0.194598,0.197332,0.194919,0.194937,0.194882,0.19496,0.194966
-9194572880052200111,0.194871,0.193503,0.19205,0.193216,0.19427,0.196472,0.193408,0.186712,0.196482,0.196432,...,0.194073,0.208786,0.194643,0.196479,0.2152,0.19935,0.195197,0.190579,0.195536,0.225252
-9192549002213406534,0.195774,0.19524,0.194525,0.194274,0.195553,0.19565,0.197632,0.197719,0.193544,0.194919,...,0.195652,0.203992,0.195046,0.194652,0.204588,0.196174,0.19579,0.193896,0.195442,0.201857
-9190737901804729417,0.195465,0.194511,0.195772,0.194979,0.194525,0.194684,0.194798,0.193176,0.195307,0.194201,...,0.195035,0.193887,0.194592,0.194566,0.193392,0.194958,0.194609,0.194617,0.194885,0.196696
-9189659052158407108,0.196398,0.194566,0.196242,0.194365,0.19468,0.194642,0.195218,0.197707,0.194936,0.194912,...,0.195156,0.20313,0.194294,0.194731,0.199489,0.196965,0.194555,0.196263,0.195365,0.207829
-9184137057748005562,0.194637,0.194549,0.194696,0.194547,0.194545,0.194575,0.194538,0.194543,0.194694,0.19455,...,0.1946,0.194828,0.194552,0.194546,0.194406,0.194563,0.194578,0.194652,0.194647,0.194568
-9176143510534135851,0.197068,0.194462,0.195594,0.19539,0.194364,0.195189,0.195589,0.198061,0.197459,0.194635,...,0.194906,0.198362,0.194151,0.194536,0.191126,0.193888,0.19438,0.196609,0.195299,0.19253
-9172673334835262304,0.194384,0.194709,0.194387,0.194535,0.194672,0.194605,0.194674,0.194265,0.194499,0.194554,...,0.194787,0.194715,0.194713,0.194436,0.195678,0.194484,0.194933,0.194364,0.194779,0.192776
-9171475473795142532,0.195238,0.194487,0.194149,0.194763,0.194806,0.194803,0.194222,0.192212,0.195794,0.19466,...,0.194785,0.194765,0.194821,0.19461,0.194595,0.19502,0.19454,0.193822,0.194759,0.19673


In [153]:
len(cf_preds_df.columns)

1070

In [154]:
class CFRecommender:
    
    MODEL_NAME = 'Collaborative Filtering'
    
    def __init__(self, cf_predictions_df, items_df=None):
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # Get and sort the user's predictions
        sorted_user_predictions = self.cf_predictions_df[user_id].sort_values(ascending=False) \
                                    .reset_index().rename(columns={user_id: 'recStrength'})

        # Recommend the highest predicted rating movies that the user hasn't seen yet.
        recommendations_df = sorted_user_predictions[~sorted_user_predictions['contentId'].isin(items_to_ignore)] \
                               .sort_values('recStrength', ascending = False) \
                               .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'contentId', 
                                                          right_on = 'contentId')[['recStrength', 'contentId', 'title', 'url', 'lang']]


        return recommendations_df
    
cf_recommender_model = CFRecommender(cf_preds_df, shared_articles_df)

In [155]:
print('Evaluating Collaborative Filtering (SVD Matrix Factorization) model...')
cf_global_metrics, cf_detailed_results_df = model_evaluator.evaluate_model(cf_recommender_model)
print('\nGlobal metrics:\n%s' % cf_global_metrics)
cf_detailed_results_df.head(10)

Evaluating Collaborative Filtering (SVD Matrix Factorization) model...
686 users processed

Global metrics:
{'modelName': 'Collaborative Filtering', 'recall@5': 0.0, 'recall@10': 0.0}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
463,0,0,961,0.0,0.0,3609194402293569455
243,0,0,669,0.0,0.0,-2626634673110551643
293,0,0,648,0.0,0.0,-1032019229384696495
278,0,0,585,0.0,0.0,-1443636648652872475
233,0,0,437,0.0,0.0,-2979881261169775358
207,0,0,401,0.0,0.0,-3596626804281480007
371,0,0,366,0.0,0.0,1116121227607581999
355,0,0,346,0.0,0.0,692689608292948411
5,0,0,342,0.0,0.0,-9016528795238256703
466,0,0,339,0.0,0.0,3636910968448833585
