In [2]:
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# from scipy.sparse.linalg import svds{"cells":[{"metadata":{"_uuid":"2d65d52b-5c0e-44a0-8898-85efceedda2f","_cell_guid":"ae548cb7-2fee-47a9-a52d-ee921c015bfa","trusted":true},"cell_type":"code","source":"# %% [markdown]\n# # Recommender Systems in Python 101\n\n# %% [markdown]\n# This notebook is a practical introduction to the main [Recommender System](https://en.wikipedia.org/wiki/Recommender_system) (RecSys) techniques. The objective of a RecSys is to recommend relevant items for users, based on their preference. Preference and relevance are subjective, and they are generally inferred by items users have consumed previously.  \n# The main families of methods for RecSys are:  \n# - [**Collaborative Filtering**](https://en.wikipedia.org/wiki/Collaborative_filtering): This method makes automatic predictions (filtering) about the interests of a user by collecting preferences or taste information from many users (collaborating). The underlying assumption of the collaborative filtering approach is that if a person A has the same opinion as a person B on a set of items, A is more likely to have B's opinion for a given item than that of a randomly chosen person.   \n# - [**Content-Based Filtering**](http://recommender-systems.org/content-based-filtering/): This method uses only information about the description and attributes of the items users has previously consumed to model user's preferences. In other words, these algorithms try to recommend items that are similar to those that a user liked in the past (or is examining in the present). In particular, various candidate items are compared with items previously rated by the user and the best-matching items are recommended.  \n# - **Hybrid methods**:  Recent research has demonstrated that a hybrid approach, combining collaborative filtering and content-based filtering could be more effective than pure approaches in some cases. 
These methods can also be used to overcome some of the common problems in recommender systems such as cold start and the sparsity problem.\n\n# %% [markdown]\n# In this notebook, we use a dataset we've shared on Kaggle Datasets: [Articles Sharing and Reading from CI&T Deskdrop](https://www.kaggle.com/gspmoreira/articles-sharing-reading-from-cit-deskdrop).  \n# We will demonstrate how to implement **Collaborative Filtering**, **Content-Based Filtering** and **Hybrid methods** in Python, for the task of providing personalized recommendations to the users.\n\n# %% [code]\nimport numpy as np\nimport scipy\nimport pandas as pd\nimport math\nimport random\nimport sklearn\nfrom nltk.corpus import stopwords\nfrom scipy.sparse import csr_matrix\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.metrics.pairwise import cosine_similarity\nfrom scipy.sparse.linalg import svds\nfrom sklearn.preprocessing import MinMaxScaler\nimport matplotlib.pyplot as plt\n\n# %% [markdown]\n# # Loading data: CI&T Deskdrop dataset\n\n# %% [markdown]\n# In this section, we load the [Deskdrop dataset](https://www.kaggle.com/gspmoreira/articles-sharing-reading-from-cit-deskdrop), which contains a real sample of 12 months logs (Mar. 2016 - Feb. 2017) from CI&T's Internal Communication platform (DeskDrop). It contains about 73k logged users interactions on more than 3k public articles shared in the platform.\n# It is composed of two CSV files:  \n# - **shared_articles.csv**\n# - **users_interactions.csv**\n# \n# Take a look in this kernels for a better picture of the dataset: \n# - Deskdrop datasets EDA \n# - DeskDrop Articles Topic Modeling\n\n# %% [markdown]\n# ## shared_articles.csv\n\n# %% [markdown]\n# Contains information about the articles shared in the platform. 
Each article has its sharing date (timestamp), the original url, title, content in plain text, the article's language (Portuguese: pt or English: en) and information about the user who shared the article (author).\n# \n# There are two possible event types at a given timestamp: \n# - CONTENT SHARED: The article was shared in the platform and is available for users. \n# - CONTENT REMOVED: The article was removed from the platform and not available for further recommendation.\n# \n# For the sake of simplicity, we only consider here the \"CONTENT SHARED\" event type, assuming (naively) that all articles were available during the whole one year period. For a more precise evaluation (and higher accuracy), only articles that were available at a given time should be recommended, but we leave this exercise to you.\n\n# %% [code]\narticles_df = pd.read_csv('../input/shared_articles.csv')\narticles_df = articles_df[articles_df['eventType'] == 'CONTENT SHARED']\narticles_df.head(10)\n\n# %% [markdown]\n# ## users_interactions.csv\n\n# %% [markdown]\n# Contains logs of user interactions on shared articles. It can be joined to **shared_articles.csv** by the **contentId** column.\n# \n# The eventType values are:  \n# - **VIEW**: The user has opened the article. \n# - **LIKE**: The user has liked the article. \n# - **COMMENT CREATED**: The user created a comment in the article. \n# - **FOLLOW**: The user chose to be notified on any new comment in the article. 
\n# - **BOOKMARK**: The user has bookmarked the article for easy return in the future.\n\n# %% [code]\ninteractions_df = pd.read_csv('../input/users_interactions.csv')\ninteractions_df.head(20)\n\n# %% [markdown]\n# ## Data munging\n\n# %% [markdown]\n# As there are different interaction types, we associate them with a weight or strength, assuming that, for example, a comment in an article indicates a higher interest of the user on the item than a like, or than a simple view.\n\n# %% [code]\nevent_type_strength = {\n   'VIEW': 1.0,\n   'LIKE': 2.0, \n   'BOOKMARK': 2.5, \n   'FOLLOW': 3.0,\n   'COMMENT CREATED': 4.0,  \n}\n\n# interactions_df['eventStrength'] = interactions_df['eventType'].apply(lambda x: event_type_strength[x])\ninteractions_df['eventStrength'] = interactions_df['eventType']\nprint(interactions_df)\n\n# %% [markdown]\n# Recommender systems have a problem known as ***user cold-start***, in which it is hard to provide personalized recommendations for users with no or very few consumed items, due to the lack of information to model their preferences.  
\n# For this reason, we are keeping in the dataset only users with at least 5 interactions.\n\n# %% [code]\nusers_interactions_count_df = interactions_df.groupby(['personId', 'contentId']).size().groupby('personId').size()\nprint('# users: %d' % len(users_interactions_count_df))\nusers_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= 5].reset_index()[['personId']]\nprint('# users with at least 5 interactions: %d' % len(users_with_enough_interactions_df))\n\n# %% [code]\nprint('# of interactions: %d' % len(interactions_df))\ninteractions_from_selected_users_df = interactions_df.merge(users_with_enough_interactions_df, \n               how = 'right',\n               left_on = 'personId',\n               right_on = 'personId')\nprint('# of interactions from users with at least 5 interactions: %d' % len(interactions_from_selected_users_df))\n\n# %% [markdown]\n# In Deskdrop, users are allowed to view an article many times, and interact with them in different ways (eg. like or comment). Thus, to model the user interest on a given article, we aggregate all the interactions the user has performed in an item by a weighted sum of interaction type strength and apply a log transformation to smooth the distribution.\n\n# %% [code]\ndef smooth_user_preference(x):\n    return math.log(1+x, 2)\n    \ninteractions_full_df = interactions_from_selected_users_df \\\n                    .groupby(['personId', 'contentId'])['eventStrength'].sum() \\\n                    .apply(smooth_user_preference).reset_index()\nprint('# of unique user/item interactions: %d' % len(interactions_full_df))\ninteractions_full_df.head(10)\n\n# %% [markdown]\n# # Evaluation\n\n# %% [markdown]\n# Evaluation is important for machine learning projects, because it allows to compare objectivelly different algorithms and hyperparameter choices for models.  
\n# One key aspect of evaluation is to ensure that the trained model generalizes for data it was not trained on, using **Cross-validation** techniques. We are using here a simple cross-validation approach named **holdout**, in which a random data sample (20% in this case) are kept aside in the training process, and exclusively used for evaluation. All evaluation metrics reported here are computed using the **test set**.\n# \n# Ps. A more robust evaluation approach could be to split train and test sets by a reference date, where the train set is composed by all interactions before that date, and the test set are interactions after that date. For the sake of simplicity, we chose the first random approach for this notebook, but you may want to try the second approach to better simulate how the recsys would perform in production predicting \"future\" users interactions.\n\n# %% [code]\ninteractions_train_df, interactions_test_df = train_test_split(interactions_full_df,\n                                   stratify=interactions_full_df['personId'], \n                                   test_size=0.20,\n                                   random_state=42)\n\nprint('# interactions on Train set: %d' % len(interactions_train_df))\nprint('# interactions on Test set: %d' % len(interactions_test_df))\n\n# %% [markdown]\n# In Recommender Systems, there are a set metrics commonly used for evaluation. We chose to work with **Top-N accuracy metrics**, which evaluates the accuracy of the top recommendations provided to a user, comparing to the items the user has actually interacted in test set.  \n# This evaluation method works as follows:\n# \n# * For each user\n#     * For each item the user has interacted in test set\n#         * Sample 100 other items the user has never interacted.   \n#         Ps. Here we naively assume those non interacted items are not relevant to the user, which might not be true, as the user may simply not be aware of those not interacted items. 
But let's keep this assumption.\n#         * Ask the recommender model to produce a ranked list of recommended items, from a set composed one interacted item and the 100 non-interacted (\"non-relevant!) items\n#         * Compute the Top-N accuracy metrics for this user and interacted item from the recommendations ranked list\n# * Aggregate the global Top-N accuracy metrics\n\n# %% [markdown]\n# The Top-N accuracy metric choosen was **Recall@N** which evaluates whether the interacted item is among the top N items (hit) in the ranked list of 101 recommendations for a user.  \n# Ps. Other popular ranking metrics are **NDCG@N** and **MAP@N**, whose score calculation takes into account the position of the relevant item in the ranked list (max. value if relevant item is in the first position). You can find a reference to implement this metrics in this [post](http://fastml.com/evaluating-recommender-systems/).\n\n# %% [code]\n#Indexing by personId to speed up the searches during evaluation\ninteractions_full_indexed_df = interactions_full_df.set_index('personId')\ninteractions_train_indexed_df = interactions_train_df.set_index('personId')\ninteractions_test_indexed_df = interactions_test_df.set_index('personId')\n\n# %% [code]\ndef get_items_interacted(person_id, interactions_df):\n    # Get the user's data and merge in the movie information.\n    interacted_items = interactions_df.loc[person_id]['contentId']\n    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])\n\n# %% [code]\n#Top-N accuracy metrics consts\nEVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100\n\nclass ModelEvaluator:\n\n\n    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42):\n        interacted_items = get_items_interacted(person_id, interactions_full_indexed_df)\n        all_items = set(articles_df['contentId'])\n        non_interacted_items = all_items - interacted_items\n\n        random.seed(seed)\n        non_interacted_items_sample = 
random.sample(non_interacted_items, sample_size)\n        return set(non_interacted_items_sample)\n\n    def _verify_hit_top_n(self, item_id, recommended_items, topn):        \n            try:\n                index = next(i for i, c in enumerate(recommended_items) if c == item_id)\n            except:\n                index = -1\n            hit = int(index in range(0, topn))\n            return hit, index\n\n    def evaluate_model_for_user(self, model, person_id):\n        #Getting the items in test set\n        interacted_values_testset = interactions_test_indexed_df.loc[person_id]\n        if type(interacted_values_testset['contentId']) == pd.Series:\n            person_interacted_items_testset = set(interacted_values_testset['contentId'])\n        else:\n            person_interacted_items_testset = set([int(interacted_values_testset['contentId'])])  \n        interacted_items_count_testset = len(person_interacted_items_testset) \n\n        #Getting a ranked recommendation list from a model for a given user\n        person_recs_df = model.recommend_items(person_id, \n                                               items_to_ignore=get_items_interacted(person_id, \n                                                                                    interactions_train_indexed_df), \n                                               topn=10000000000)\n\n        hits_at_5_count = 0\n        hits_at_10_count = 0\n        #For each item the user has interacted in test set\n        for item_id in person_interacted_items_testset:\n            #Getting a random sample (100) items the user has not interacted \n            #(to represent items that are assumed to be no relevant to the user)\n            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id, \n                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, \n                                                                     
     seed=item_id%(2**32))\n\n            #Combining the current interacted item with the 100 random items\n            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))\n\n            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items\n            valid_recs_df = person_recs_df[person_recs_df['contentId'].isin(items_to_filter_recs)]                    \n            valid_recs = valid_recs_df['contentId'].values\n            #Verifying if the current interacted item is among the Top-N recommended items\n            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)\n            hits_at_5_count += hit_at_5\n            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)\n            hits_at_10_count += hit_at_10\n\n        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, \n        #when mixed with a set of non-relevant items\n        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)\n        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)\n\n        person_metrics = {'hits@5_count':hits_at_5_count, \n                          'hits@10_count':hits_at_10_count, \n                          'interacted_count': interacted_items_count_testset,\n                          'recall@5': recall_at_5,\n                          'recall@10': recall_at_10}\n        return person_metrics\n\n    def evaluate_model(self, model):\n        #print('Running evaluation for users')\n        people_metrics = []\n        for idx, person_id in enumerate(list(interactions_test_indexed_df.index.unique().values)):\n            #if idx % 100 == 0 and idx > 0:\n            #    print('%d users processed' % idx)\n            person_metrics = self.evaluate_model_for_user(model, person_id)  \n            person_metrics['_person_id'] = person_id\n            
people_metrics.append(person_metrics)\n        print('%d users processed' % idx)\n\n        detailed_results_df = pd.DataFrame(people_metrics) \\\n                            .sort_values('interacted_count', ascending=False)\n        \n        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())\n        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())\n        \n        global_metrics = {'modelName': model.get_model_name(),\n                          'recall@5': global_recall_at_5,\n                          'recall@10': global_recall_at_10}    \n        return global_metrics, detailed_results_df\n    \nmodel_evaluator = ModelEvaluator()    \n\n# %% [markdown]\n# # Popularity model\n\n# %% [markdown]\n# A common (and usually hard-to-beat) baseline approach is the Popularity model. This model is not actually personalized - it simply recommends to a user the most popular items that the user has not previously consumed. As the popularity accounts for the \"wisdom of the crowds\", it usually provides good recommendations, generally interesting for most people.   \n# Ps. 
The main objective of a recommender system is to leverage the long-tail items to the users with very specific interests, which goes far beyond this simple technique.\n\n# %% [code]\n#Computes the most popular items\nitem_popularity_df = interactions_full_df.groupby('contentId')['eventStrength'].sum().sort_values(ascending=False).reset_index()\nitem_popularity_df.head(10)\n\n# %% [code]\nclass PopularityRecommender:\n    \n    MODEL_NAME = 'Popularity'\n    \n    def __init__(self, popularity_df, items_df=None):\n        self.popularity_df = popularity_df\n        self.items_df = items_df\n        \n    def get_model_name(self):\n        return self.MODEL_NAME\n        \n    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):\n        # Recommend the more popular items that the user hasn't seen yet.\n        recommendations_df = self.popularity_df[~self.popularity_df['contentId'].isin(items_to_ignore)] \\\n                               .sort_values('eventStrength', ascending = False) \\\n                               .head(topn)\n\n        if verbose:\n            if self.items_df is None:\n                raise Exception('\"items_df\" is required in verbose mode')\n\n            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', \n                                                          left_on = 'contentId', \n                                                          right_on = 'contentId')[['eventStrength', 'contentId', 'title', 'url', 'lang']]\n\n\n        return recommendations_df\n    \npopularity_model = PopularityRecommender(item_popularity_df, articles_df)\n\n# %% [markdown]\n# Here we perform the evaluation of the Popularity model, according to the method described above.  \n# It achieved the **Recall@5** of **0.2417**, which means that about **24%** of interacted items in test set were ranked by Popularity model among the top-5 items (from lists with 100 random items). 
And **Recall@10** was even higher (**37%**), as expected.  \n# It might be surprising to you that usually Popularity models could perform so well!\n\n# %% [code]\nprint('Evaluating Popularity recommendation model...')\npop_global_metrics, pop_detailed_results_df = model_evaluator.evaluate_model(popularity_model)\nprint('\\nGlobal metrics:\\n%s' % pop_global_metrics)\npop_detailed_results_df.head(10)\n\n# %% [markdown]\n# # Content-Based Filtering model\n\n# %% [markdown]\n# Content-based filtering approaches leverage description or attributes from items the user has interacted with to recommend similar items. It depends only on the user's previous choices, making this method robust to avoid the *cold-start* problem.\n# For textual items, like articles, news and books, it is simple to use the raw text to build item profiles and user profiles.  \n# Here we are using a very popular technique in information retrieval (search engines) named [TF-IDF](https://en.wikipedia.org/wiki/Tf%E2%80%93idf). This technique converts unstructured text into a vector structure, where each word is represented by a position in the vector, and the value measures how relevant a given word is for an article. As all items will be represented in the same [Vector Space Model](https://en.wikipedia.org/wiki/Vector_space_model), it is possible to compute similarity between articles.  
\n# See this [presentation](https://www.slideshare.net/gabrielspmoreira/discovering-users-topics-of-interest-in-recommender-systems-tdc-sp-2016) (from slide 30) for more information on TF-IDF and Cosine similarity.\n\n# %% [code]\n#Ignoring stopwords (words with no semantics) from English and Portuguese (as we have a corpus with mixed languages)\nstopwords_list = stopwords.words('english') + stopwords.words('portuguese')\n\n#Trains a model whose vectors size is 5000, composed by the main unigrams and bigrams found in the corpus, ignoring stopwords\nvectorizer = TfidfVectorizer(analyzer='word',\n                     ngram_range=(1, 2),\n                     min_df=0.003,\n                     max_df=0.5,\n                     max_features=5000,\n                     stop_words=stopwords_list)\n\nitem_ids = articles_df['contentId'].tolist()\ntfidf_matrix = vectorizer.fit_transform(articles_df['title'] + \"\" + articles_df['text'])\ntfidf_feature_names = vectorizer.get_feature_names()\ntfidf_matrix\n\n# %% [markdown]\n# To model the user profile, we take all the item profiles the user has interacted and average them. The average is weighted by the interaction strength, in other words, the articles the user has interacted the most (eg. liked or commented) will have a higher strength in the final user profile.   
\n\n# %% [code]\ndef get_item_profile(item_id):\n    idx = item_ids.index(item_id)\n    item_profile = tfidf_matrix[idx:idx+1]\n    return item_profile\n\ndef get_item_profiles(ids):\n    item_profiles_list = [get_item_profile(x) for x in ids]\n    item_profiles = scipy.sparse.vstack(item_profiles_list)\n    return item_profiles\n\ndef build_users_profile(person_id, interactions_indexed_df):\n    interactions_person_df = interactions_indexed_df.loc[person_id]\n    user_item_profiles = get_item_profiles(interactions_person_df['contentId'])\n    \n    user_item_strengths = np.array(interactions_person_df['eventStrength']).reshape(-1,1)\n    #Weighted average of item profiles by the interactions strength\n    user_item_strengths_weighted_avg = np.sum(user_item_profiles.multiply(user_item_strengths), axis=0) / np.sum(user_item_strengths)\n    user_profile_norm = sklearn.preprocessing.normalize(user_item_strengths_weighted_avg)\n    return user_profile_norm\n\ndef build_users_profiles(): \n    interactions_indexed_df = interactions_train_df[interactions_train_df['contentId'] \\\n                                                   .isin(articles_df['contentId'])].set_index('personId')\n    user_profiles = {}\n    for person_id in interactions_indexed_df.index.unique():\n        user_profiles[person_id] = build_users_profile(person_id, interactions_indexed_df)\n    return user_profiles\n\n# %% [code]\nuser_profiles = build_users_profiles()\nlen(user_profiles)\n\n# %% [markdown]\n# Let's take a look in the profile. It is a [unit vector](https://en.wikipedia.org/wiki/Unit_vector) of 5000 length. The value in each position represents how relevant is a token (unigram or bigram) for me.  \n# Looking my profile, it appears that the top relevant tokens really represent my professional interests in **machine learning**, **deep learning**, **artificial intelligence** and **google cloud platform**! 
So we might expect good recommendations here!\n\n# %% [code]\nmyprofile = user_profiles[-1479311724257856983]\nprint(myprofile.shape)\npd.DataFrame(sorted(zip(tfidf_feature_names, \n                        user_profiles[-1479311724257856983].flatten().tolist()), key=lambda x: -x[1])[:20],\n             columns=['token', 'relevance'])\n\n# %% [code]\nclass ContentBasedRecommender:\n    \n    MODEL_NAME = 'Content-Based'\n    \n    def __init__(self, items_df=None):\n        self.item_ids = item_ids\n        self.items_df = items_df\n        \n    def get_model_name(self):\n        return self.MODEL_NAME\n        \n    def _get_similar_items_to_user_profile(self, person_id, topn=1000):\n        #Computes the cosine similarity between the user profile and all item profiles\n        cosine_similarities = cosine_similarity(user_profiles[person_id], tfidf_matrix)\n        #Gets the top similar items\n        similar_indices = cosine_similarities.argsort().flatten()[-topn:]\n        #Sort the similar items by similarity\n        similar_items = sorted([(item_ids[i], cosine_similarities[0,i]) for i in similar_indices], key=lambda x: -x[1])\n        return similar_items\n        \n    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):\n        similar_items = self._get_similar_items_to_user_profile(user_id)\n        #Ignores items the user has already interacted\n        similar_items_filtered = list(filter(lambda x: x[0] not in items_to_ignore, similar_items))\n        \n        recommendations_df = pd.DataFrame(similar_items_filtered, columns=['contentId', 'recStrength']) \\\n                                    .head(topn)\n\n        if verbose:\n            if self.items_df is None:\n                raise Exception('\"items_df\" is required in verbose mode')\n\n            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', \n                                                          left_on = 'contentId', \n             
                                             right_on = 'contentId')[['recStrength', 'contentId', 'title', 'url', 'lang']]\n\n\n        return recommendations_df\n    \ncontent_based_recommender_model = ContentBasedRecommender(articles_df)\n\n# %% [markdown]\n# With personalized recommendations of content-based filtering model, we have a **Recall@5** of about **0.162**, which means that about **16%** of interacted items in test set were ranked by this model among the top-5 items (from lists with 100 random items).\n# And **Recall@10** was **0.261 (26%)**.\n# The lower performance of the Content-Based model compared to the Popularity model may indicate that users are not that fixated on content very similar to their previous reads.\n\n# %% [code]\nprint('Evaluating Content-Based Filtering model...')\ncb_global_metrics, cb_detailed_results_df = model_evaluator.evaluate_model(content_based_recommender_model)\nprint('\\nGlobal metrics:\\n%s' % cb_global_metrics)\ncb_detailed_results_df.head(10)\n\n# %% [markdown]\n# # Collaborative Filtering model\n\n# %% [markdown]\n# Collaborative Filtering (CF) has two main implementation strategies:  \n# - **Memory-based**: This approach uses the memory of previous users interactions to compute users similarities based on items they've interacted (user-based approach) or compute items similarities based on the users that have interacted with them (item-based approach).  \n# A typical example of this approach is User Neighbourhood-based CF, in which the top-N similar users (usually computed using Pearson correlation) for a user are selected and used to recommend items those similar users liked, but the current user has not interacted with yet. This approach is very simple to implement, but usually does not scale well for many users. 
A nice Python implementation of this approach is available in [Crab](http://muricoca.github.io/crab/).\n# - **Model-based**: In this approach, models are developed using different machine learning algorithms to recommend items to users. There are many model-based CF algorithms, like neural networks, bayesian networks, clustering models, and latent factor models such as Singular Value Decomposition (SVD) and probabilistic latent semantic analysis.\n\n# %% [markdown]\n# ## Matrix Factorization\n\n# %% [markdown]\n# Latent factor models compress user-item matrix into a low-dimensional representation in terms of latent factors. One advantage of using this approach is that instead of having a high dimensional matrix containing an abundant number of missing values we will be dealing with a much smaller matrix in lower-dimensional space.  \n# A reduced representation could be utilized for either user-based or item-based neighborhood algorithms that are presented in the previous section. There are several advantages with this paradigm. It handles the sparsity of the original matrix better than memory based ones. Also comparing similarity on the resulting matrix is much more scalable especially in dealing with large sparse datasets.  \n\n# %% [markdown]\n# Here we use a popular latent factor model named [Singular Value Decomposition (SVD)](https://en.wikipedia.org/wiki/Singular_value_decomposition). There are other matrix factorization frameworks more specific to CF you might try, like [surprise](https://github.com/NicolasHug/Surprise), [mrec](https://github.com/Mendeley/mrec) or [python-recsys](https://github.com/ocelma/python-recsys). We chose a [SciPy](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.linalg.svds.html) implementation of SVD because it is available on Kaggle kernels. \n# P.s. See an example of SVD on a movies dataset in this [blog post](https://beckernick.github.io/matrix-factorization-recommender/).   
\n\n# %% [markdown]\n# An important decision is the number of factors to factor the user-item matrix. The higher the number of factors, the more precise is the factorization in the original matrix reconstructions. Therefore, if the model is allowed to  memorize too much details of the original matrix, it may not generalize well for data it was not trained on. Reducing the number of factors increases the model generalization.\n\n# %% [code]\n#Creating a sparse pivot table with users in rows and items in columns\nusers_items_pivot_matrix_df = interactions_train_df.pivot(index='personId', \n                                                          columns='contentId', \n                                                          values='eventStrength').fillna(0)\n\nusers_items_pivot_matrix_df.head(10)\n\n# %% [code]\nusers_items_pivot_matrix = users_items_pivot_matrix_df.as_matrix()\nusers_items_pivot_matrix[:10]\n\n# %% [code]\nusers_ids = list(users_items_pivot_matrix_df.index)\nusers_ids[:10]\n\n# %% [code]\nusers_items_pivot_sparse_matrix = csr_matrix(users_items_pivot_matrix)\nusers_items_pivot_sparse_matrix\n\n# %% [code]\n#The number of factors to factor the user-item matrix.\nNUMBER_OF_FACTORS_MF = 15\n#Performs matrix factorization of the original user item matrix\n#U, sigma, Vt = svds(users_items_pivot_matrix, k = NUMBER_OF_FACTORS_MF)\nU, sigma, Vt = svds(users_items_pivot_sparse_matrix, k = NUMBER_OF_FACTORS_MF)\n\n# %% [code]\nU.shape\n\n# %% [code]\nVt.shape\n\n# %% [code]\nsigma = np.diag(sigma)\nsigma.shape\n\n# %% [markdown]\n# After the factorization, we try to to reconstruct the original matrix by multiplying its factors. The resulting matrix is not sparse any more. 
It was generated predictions for items the user have not yet interaction, which we will exploit for recommendations.\n\n# %% [code]\nall_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) \nall_user_predicted_ratings\n\n# %% [code]\nall_user_predicted_ratings_norm = (all_user_predicted_ratings - all_user_predicted_ratings.min()) / (all_user_predicted_ratings.max() - all_user_predicted_ratings.min())\n\n# %% [code]\n#Converting the reconstructed matrix back to a Pandas dataframe\ncf_preds_df = pd.DataFrame(all_user_predicted_ratings_norm, columns = users_items_pivot_matrix_df.columns, index=users_ids).transpose()\ncf_preds_df.head(10)\n\n# %% [code]\nlen(cf_preds_df.columns)\n\n# %% [code]\nclass CFRecommender:\n    \n    MODEL_NAME = 'Collaborative Filtering'\n    \n    def __init__(self, cf_predictions_df, items_df=None):\n        self.cf_predictions_df = cf_predictions_df\n        self.items_df = items_df\n        \n    def get_model_name(self):\n        return self.MODEL_NAME\n        \n    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):\n        # Get and sort the user's predictions\n        sorted_user_predictions = self.cf_predictions_df[user_id].sort_values(ascending=False) \\\n                                    .reset_index().rename(columns={user_id: 'recStrength'})\n\n        # Recommend the highest predicted rating movies that the user hasn't seen yet.\n        recommendations_df = sorted_user_predictions[~sorted_user_predictions['contentId'].isin(items_to_ignore)] \\\n                               .sort_values('recStrength', ascending = False) \\\n                               .head(topn)\n\n        if verbose:\n            if self.items_df is None:\n                raise Exception('\"items_df\" is required in verbose mode')\n\n            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', \n                                                          left_on = 'contentId', \n                     
                                     right_on = 'contentId')[['recStrength', 'contentId', 'title', 'url', 'lang']]\n\n\n        return recommendations_df\n    \ncf_recommender_model = CFRecommender(cf_preds_df, articles_df)\n\n# %% [markdown]\n# > Evaluating the Collaborative Filtering model (SVD matrix factorization), we observe that we got **Recall@5 (33%)** and **Recall@10 (46%)** values, much higher than Popularity model and Content-Based model.\n\n# %% [code]\nprint('Evaluating Collaborative Filtering (SVD Matrix Factorization) model...')\ncf_global_metrics, cf_detailed_results_df = model_evaluator.evaluate_model(cf_recommender_model)\nprint('\\nGlobal metrics:\\n%s' % cf_global_metrics)\ncf_detailed_results_df.head(10)\n\n# %% [markdown]\n# ## Hybrid Recommender\n\n# %% [markdown]\n# What if we combine Collaborative Filtering and Content-Based Filtering approaches?    \n# Would that provide us with more accurate recommendations?    \n# In fact, hybrid methods have performed better than individual approaches in many studies and have being extensively used by researchers and practioners.  \n# Let's build a simple hybridization method, as an ensemble that takes the weighted average of the normalized CF scores with the Content-Based scores, and ranking by resulting score. 
In this case, as the CF model is much more accurate than the CB model, the weights for the CF and CB models are 100.0 and 1.0, respectivelly.\n\n# %% [code]\nclass HybridRecommender:\n    \n    MODEL_NAME = 'Hybrid'\n    \n    def __init__(self, cb_rec_model, cf_rec_model, items_df, cb_ensemble_weight=1.0, cf_ensemble_weight=1.0):\n        self.cb_rec_model = cb_rec_model\n        self.cf_rec_model = cf_rec_model\n        self.cb_ensemble_weight = cb_ensemble_weight\n        self.cf_ensemble_weight = cf_ensemble_weight\n        self.items_df = items_df\n        \n    def get_model_name(self):\n        return self.MODEL_NAME\n        \n    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):\n        #Getting the top-1000 Content-based filtering recommendations\n        cb_recs_df = self.cb_rec_model.recommend_items(user_id, items_to_ignore=items_to_ignore, verbose=verbose,\n                                                           topn=1000).rename(columns={'recStrength': 'recStrengthCB'})\n        \n        #Getting the top-1000 Collaborative filtering recommendations\n        cf_recs_df = self.cf_rec_model.recommend_items(user_id, items_to_ignore=items_to_ignore, verbose=verbose, \n                                                           topn=1000).rename(columns={'recStrength': 'recStrengthCF'})\n        \n        #Combining the results by contentId\n        recs_df = cb_recs_df.merge(cf_recs_df,\n                                   how = 'outer', \n                                   left_on = 'contentId', \n                                   right_on = 'contentId').fillna(0.0)\n        \n        #Computing a hybrid recommendation score based on CF and CB scores\n        #recs_df['recStrengthHybrid'] = recs_df['recStrengthCB'] * recs_df['recStrengthCF'] \n        recs_df['recStrengthHybrid'] = (recs_df['recStrengthCB'] * self.cb_ensemble_weight) \\\n                                     + (recs_df['recStrengthCF'] * 
self.cf_ensemble_weight)\n        \n        #Sorting recommendations by hybrid score\n        recommendations_df = recs_df.sort_values('recStrengthHybrid', ascending=False).head(topn)\n\n        if verbose:\n            if self.items_df is None:\n                raise Exception('\"items_df\" is required in verbose mode')\n\n            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', \n                                                          left_on = 'contentId', \n                                                          right_on = 'contentId')[['recStrengthHybrid', 'contentId', 'title', 'url', 'lang']]\n\n\n        return recommendations_df\n    \nhybrid_recommender_model = HybridRecommender(content_based_recommender_model, cf_recommender_model, articles_df,\n                                             cb_ensemble_weight=1.0, cf_ensemble_weight=100.0)\n\n# %% [markdown]\n# **We have a new champion!**  \n# Our simple hybrid approach surpasses Content-Based filtering with its combination with Collaborative Filtering. 
Now we have a **Recall@5** of **34.2%** and **Recall@10** of **47.9%**\n\n# %% [code]\nprint('Evaluating Hybrid model...')\nhybrid_global_metrics, hybrid_detailed_results_df = model_evaluator.evaluate_model(hybrid_recommender_model)\nprint('\\nGlobal metrics:\\n%s' % hybrid_global_metrics)\nhybrid_detailed_results_df.head(10)\n\n# %% [markdown]\n# ## Comparing the methods\n\n# %% [code]\nglobal_metrics_df = pd.DataFrame([cb_global_metrics, pop_global_metrics, cf_global_metrics, hybrid_global_metrics]) \\\n                        .set_index('modelName')\nglobal_metrics_df\n\n# %% [code]\n%matplotlib inline\nax = global_metrics_df.transpose().plot(kind='bar', figsize=(15,8))\nfor p in ax.patches:\n    ax.annotate(\"%.3f\" % p.get_height(), (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', xytext=(0, 10), textcoords='offset points')\n\n# %% [markdown]\n# # Testing\n\n# %% [markdown]\n# Let's test the best model (Hybrid) for my user.\n\n# %% [code]\ndef inspect_interactions(person_id, test_set=True):\n    if test_set:\n        interactions_df = interactions_test_indexed_df\n    else:\n        interactions_df = interactions_train_indexed_df\n    return interactions_df.loc[person_id].merge(articles_df, how = 'left', \n                                                      left_on = 'contentId', \n                                                      right_on = 'contentId') \\\n                          .sort_values('eventStrength', ascending = False)[['eventStrength', \n                                                                          'contentId',\n                                                                          'title', 'url', 'lang']]\n\n# %% [markdown]\n# Here we see some articles I interacted in Deskdrop from train set. 
It can be easily observed that among my main interests are **machine learning**, **deep learning**, **artificial intelligence**, and **google cloud platform**.\n\n# %% [code]\ninspect_interactions(-1479311724257856983, test_set=False).head(20)\n\n# %% [markdown]\n# **The recommendations really match my interests, as I would read all of them!**\n\n# %% [code]\nhybrid_recommender_model.recommend_items(-1479311724257856983, topn=20, verbose=True)\n\n# %% [markdown]\n# # Conclusion\n\n# %% [markdown]\n# In this notebook, we've explored and compared the main Recommender Systems techniques on the [CI&T Deskdrop](https://www.kaggle.com/gspmoreira/articles-sharing-reading-from-cit-deskdrop) dataset. It could be observed that, for article recommendation, Collaborative Filtering and the hybrid method performed better than Content-Based filtering alone.  \n# \n# There is plenty of room to improve the results. Here are some tips:\n# - In this example, we've completely ignored the time, considering that all articles were available to be recommended to users at any time. A better approach would be to filter only articles that were available for users at a given time.\n# - You could leverage the available contextual information to model users' preferences across time (period of day, day of week, month), location (country and state/district) and devices (browser, mobile native app).  
\n# This contextual information can be easily incorporated in [Learn-to-Rank](https://en.wikipedia.org/wiki/Learning_to_rank) models (like XGBoost Gradient Boosting Decision Trees with ranking objective), Logistic models (with categorical features [One-Hot encoded](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html) or [Feature Hashed](https://en.wikipedia.org/wiki/Feature_hashing)), and [Wide & Deep models](https://ai.googleblog.com/2016/06/wide-deep-learning-better-together-with.html), which are implemented in [TensorFlow](https://docs.w3cub.com/tensorflow~guide/tutorials/wide_and_deep/). Take a look at the summary of my solution shared for the [Outbrain Click Prediction](https://www.kaggle.com/c/outbrain-click-prediction/discussion/27897#157215) competition. \n# - Those basic techniques were used for didactic purposes. There are more advanced techniques in the RecSys research community, especially advanced Matrix Factorization and Deep Learning models.  \n# \n# You can learn more about state-of-the-art methods published in Recommender Systems at the [ACM RecSys conference](https://recsys.acm.org/).  \n# If you are more of a practitioner than a researcher, you might try some Collaborative Filtering frameworks on this dataset, like [surprise](https://github.com/NicolasHug/Surprise), [mrec](https://github.com/Mendeley/mrec),  [python-recsys](https://github.com/ocelma/python-recsys) and [Spark ALS Matrix Factorization](https://spark.apache.org/docs/latest/mllib-collaborative-filtering.html) (distributed implementation for large datasets).  
\n# Take a look at this [presentation](https://www.slideshare.net/gabrielspmoreira/discovering-users-topics-of-interest-in-recommender-systems-tdc-sp-2016) where I describe a production recommender system, focused on Content-Based Filtering and Topic Modeling techniques.\n\n# %% [code]\n","execution_count":0,"outputs":[]}],"metadata":{"language_info":{"nbconvert_exporter":"python","pygments_lexer":"ipython3","mimetype":"text/x-python","name":"python","version":"3.6.3","codemirror_mode":{"version":3,"name":"ipython"},"file_extension":".py"},"kernelspec":{"display_name":"Python 3","name":"python3","language":"python"}},"nbformat":4,"nbformat_minor":1}
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [3]:
# Read the shared-articles export and keep only the rows whose eventType
# is 'CONTENT SHARED'; the last line previews the first five of them.
articles_df = (
    pd.read_csv('shared_articles.csv')
      .loc[lambda frame: frame['eventType'] == 'CONTENT SHARED']
)
articles_df.head(5)

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,,,,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en
3,1459194474,CONTENT SHARED,-6151852268067518688,3891637997717104548,-1457532940883382585,,,,HTML,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en
4,1459194497,CONTENT SHARED,2448026894306402386,4340306774493623681,8940341205206233829,,,,HTML,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...,en
5,1459194522,CONTENT SHARED,-2826566343807132236,4340306774493623681,8940341205206233829,,,,HTML,http://www.coindesk.com/ieee-blockchain-oxford...,IEEE to Talk Blockchain at Cloud Computing Oxf...,One of the largest and oldest organizations fo...,en


In [4]:
# Load the user/article interaction log and preview its first five rows.
interactions_df = pd.read_csv(filepath_or_buffer='users_interactions.csv')
interactions_df.head(n=5)

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,


In [5]:
# Numeric weight attached to each interaction type.
event_type_strength = {
    'VIEW': 1.0,
    'LIKE': 2.0,
    'BOOKMARK': 2.5,
    'FOLLOW': 3.0,
    'COMMENT CREATED': 4.0,
}

# Translate every row's eventType into its weight. Indexing the dict directly
# (instead of using .get) keeps the KeyError for an event type with no entry.
interactions_df['eventStrength'] = interactions_df['eventType'].map(event_type_strength.__getitem__)
interactions_df

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry,eventStrength
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,,1.0
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US,1.0
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,,1.0
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,,3.0
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,,1.0
...,...,...,...,...,...,...,...,...,...
72307,1485190425,LIKE,-6590819806697898649,-9016528795238256703,8614469745607949425,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4...,MG,BR,2.0
72308,1485190425,VIEW,-5813211845057621660,102305705598210278,5527770709392883642,Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,SP,BR,1.0
72309,1485190072,VIEW,-1999468346928419252,-9196668942822132778,-8300596454915870873,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,SP,BR,1.0
72310,1485190434,VIEW,-6590819806697898649,-9016528795238256703,8614469745607949425,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4...,MG,BR,1.0


In [11]:
# Per person, count the distinct (personId, contentId) pairs: the inner
# groupby collapses repeated events on the same article, the outer one
# counts how many such pairs each person has.
users_interactions_count_df = (
    interactions_df
    .groupby(['personId', 'contentId'])
    .size()
    .groupby('personId')
    .size()
)
print('# users: %d' % len(users_interactions_count_df))

# Retain only the ids of people whose count is 5 or more, as a one-column frame.
users_with_enough_interactions_df = (
    users_interactions_count_df
    .loc[lambda counts: counts >= 5]
    .reset_index()
    [['personId']]
)
print('# users with at least 5 interactions: %d' % len(users_with_enough_interactions_df))

# users: 1895
# users with at least 5 interactions: 1140


In [12]:
# Size of the interaction log before restricting it to the retained users.
print('# of interactions: %d' % len(interactions_df))

# Right join on personId against the one-column frame of retained user ids.
interactions_from_selected_users_df = pd.merge(
    interactions_df,
    users_with_enough_interactions_df,
    how='right',
    on='personId',
)
print('# of interactions from users with at least 5 interactions: %d' % len(interactions_from_selected_users_df))

# of interactions: 72312
# of interactions from users with at least 5 interactions: 69868


In [None]:
def smooth_user_preference(x):
    """Return the base-2 logarithm of ``1 + x``.

    The ``1 +`` shift maps an input of 0 to 0 and the logarithm compresses
    large inputs.

    Args:
        x: A real number greater than -1.

    Returns:
        float: ``log2(1 + x)``.

    Raises:
        ValueError: If ``1 + x`` is not positive.
    """
    # math.log2 is the dedicated base-2 routine. The two-argument form
    # math.log(v, 2) divides two natural logarithms and may differ from the
    # correctly rounded result in the last digit for some inputs.
    return math.log2(1 + x)
    
# Sum eventStrength over every (personId, contentId) pair, pass each total
# through smooth_user_preference, then turn the grouped index back into columns.
interactions_full_df = (
    interactions_from_selected_users_df
    .groupby(['personId', 'contentId'])['eventStrength']
    .sum()
    .apply(smooth_user_preference)
    .reset_index()
)
print('# of unique user/item interactions: %d' % len(interactions_full_df))
interactions_full_df.head(10)