In [1]:
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
raw_df = pd.read_csv('cat_de_3year.csv', encoding = "utf-8")#, nrows = 100000)
#raw_df = pd.read_csv('cat_de_15-18.csv', encoding = "utf-8")
raw_df.head(20)

Unnamed: 0,SCM_N_ORDER_POS_NR,SCM_N_ORDER_NR,SCM_N_ORDER_NR_NC,TDT_T_KEY_ORDER_DATE,ART_N_KEY_ARTICLE,PPR_N_KEY_PHYS_PRODUCT,CUS_N_KEY_CUSTOMER,SCM_D_QUANTITY,PPR_N_PHY_ARTICLE_ID,PPR_N_PHY_PRODUCT_ID
0,248535075,58965327,0,11-OCT-15,874052,38614853,9225464,1.0,51215,38891
1,209091109,49807309,0,16-FEB-15,590805,15330165,6412112,1.0,35915,25969
2,209091110,49807309,0,16-FEB-15,1111650,53626606,6412112,1.0,56881,38333
3,209091111,49807309,0,16-FEB-15,16715,15665,6412112,1.0,4220,4097
4,248535073,58965327,0,11-OCT-15,1363350,67607358,9225464,1.0,62098,44993
5,203263548,48465719,0,12-JAN-15,1323999,11459,3116936,1.0,26073,19403
6,203263549,48465719,0,12-JAN-15,1324000,11460,3116936,1.0,26072,19403
7,203263550,48465719,0,12-JAN-15,579289,13857179,3116936,1.0,35376,25635
8,226948693,53944802,0,06-JUN-15,1674384,70644899,7438119,0.8047,63222,9826
9,226948693,53944802,0,06-JUN-15,1674384,70644897,7438119,0.1953,63221,9826


In [4]:
print('min date is {}'.format(min(raw_df['TDT_T_KEY_ORDER_DATE'])))
print('max date is {}'.format(max(raw_df['TDT_T_KEY_ORDER_DATE'])))

min date is 01-APR-15
max date is 31-OCT-17


# Let us create a data frame that contains only required data for future basic recommender system
### As of now we are not taking into account session information
> Logic for defining the strength of interaction of the user with the item would be
1. Bought an item $(0,2]$ times - weight will be "1.0"
2. Bought an item $(2,5]$ times - weight will be "2.0"
3. Bought an item $(5, +\infty)$ times - weight will be "3.0"
> I can think of a functional weight for the counts of times the item was boubht. TODO

### Required columns are
* CUS_N_KEY_CUSTOMER == personId
* PPR_N_PHY_PRODUCT_ID = contentId
* eventStrength = depends on the overall history of the customer-item interactions. _no basket or date informaiton is used_


In [5]:
## to start let us take a sample of our data
sample_df = raw_df.sample(10000000)
sample_df.head()

Unnamed: 0,SCM_N_ORDER_POS_NR,SCM_N_ORDER_NR,SCM_N_ORDER_NR_NC,TDT_T_KEY_ORDER_DATE,ART_N_KEY_ARTICLE,PPR_N_KEY_PHYS_PRODUCT,CUS_N_KEY_CUSTOMER,SCM_D_QUANTITY,PPR_N_PHY_ARTICLE_ID,PPR_N_PHY_PRODUCT_ID
31388053,372891227,87420763,0,16-MAY-17,1436135,14708,10062173,2.0,14902,11542
5607452,229891847,54629732,0,24-JUN-15,546435,9257023,1166481,1.0,33163,5477
6741529,235061131,55831184,0,25-JUL-15,417195,11377,11641008,1.0,26927,11834
27237213,353961058,83126772,0,26-FEB-17,1571831,13987,12485837,0.4999,26663,19704
31223325,371325355,87056065,0,10-MAY-17,1886302,53365386,12636544,1.0,56512,42948


In [6]:
# remove non-used columns
sample_df.drop(['SCM_N_ORDER_POS_NR','SCM_N_ORDER_NR_NC', 'SCM_N_ORDER_NR_NC', 'PPR_N_KEY_PHYS_PRODUCT', 'SCM_N_ORDER_NR', 'ART_N_KEY_ARTICLE','PPR_N_PHY_ARTICLE_ID'], axis=1, inplace=True)
sample_df.head()

Unnamed: 0,TDT_T_KEY_ORDER_DATE,CUS_N_KEY_CUSTOMER,SCM_D_QUANTITY,PPR_N_PHY_PRODUCT_ID
31388053,16-MAY-17,10062173,2.0,11542
5607452,24-JUN-15,1166481,1.0,5477
6741529,25-JUL-15,11641008,1.0,11834
27237213,26-FEB-17,12485837,0.4999,19704
31223325,10-MAY-17,12636544,1.0,42948


In [229]:
# this is not needed since we are simply counting prod id
# sample_more_1_interaction = sample_df.groupby(['CUS_N_KEY_CUSTOMER','PPR_N_PHY_PRODUCT_ID']).filter(lambda x: x['SCM_D_QUANTITY'].unique().size > 1 )

In [231]:
#sample_more_1_interaction.head()

In [7]:
tmp = sample_df.groupby(['CUS_N_KEY_CUSTOMER', 'PPR_N_PHY_PRODUCT_ID']).count()
tmp.reset_index(level=tmp.index.names, inplace=True)
interactions_df = tmp.rename(index=str, columns = {'CUS_N_KEY_CUSTOMER': 'personId', 
                                                   'PPR_N_PHY_PRODUCT_ID': 'contentId', 
                                                   'TDT_T_KEY_ORDER_DATE': 'date', 
                                                   'SCM_D_QUANTITY': 'eventStrength'}).drop('date', axis=1)
display(interactions_df)

Unnamed: 0,personId,contentId,eventStrength
0,22,10673,1
1,22,15896,2
2,22,17576,3
3,22,17987,1
4,22,19223,2
5,22,19703,1
6,22,19997,2
7,22,21311,1
8,22,25635,2
9,22,27721,1


In [None]:
# Visualize pairplot of df
# sns.pairplot(interactions_df, hue='eventStrength');

Recommender systems have a problem known as user cold-start, in which is hard do provide personalized recommendations for users with none or a very few number of consumed items, due to the lack of information to model their preferences.
For this reason, we are keeping in the dataset only users with at leas 5 interactions.

In [8]:
users_interactions_count_df = interactions_df.groupby(['personId', 'contentId']).size().groupby('personId').size()
print('# users: %d' % len(users_interactions_count_df))
users_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= 5].reset_index()[['personId']]
print('# users with at least 5 interactions: %d' % len(users_with_enough_interactions_df))

# users: 904562
# users with at least 5 interactions: 330105


In [9]:
print('# of interactions: %d' % len(interactions_df))
interactions_from_selected_users_df = interactions_df.merge(users_with_enough_interactions_df, 
               how = 'right',
               left_on = 'personId',
               right_on = 'personId')
print('# of interactions from users with at least 5 interactions: %d' % len(interactions_from_selected_users_df))

# of interactions: 5855757
# of interactions from users with at least 5 interactions: 4818146


In [10]:
####
# This must be changed
###
interactions_full_df = interactions_from_selected_users_df

## Evaluation
Evaluation is important for machine learning projects, because it allows to compare objectivelly different algorithms and hyperparameter choices for models.

One key aspect of evaluation is to ensure that the trained model generalizes for data it was not trained on, using *Cross-validation techniques*. We are using here a simple cross-validation approach named *holdout*, in which a random data sample (20% in this case) are kept aside in the training process, and exclusively used for evaluation. All evaluation metrics reported here are computed using the *test set*.

_Ps. A more robust evaluation approach could be to split train and test sets by a reference date, where the train set is composed by all interactions before that date, and the test set are interactions after that date. For the sake of simplicity, we chose the first random approach for this notebook, but you may want to try the second approach to better simulate how the recsys would perform in production predicting "future" users interactions._

In [11]:
interactions_train_df, interactions_test_df = train_test_split(interactions_full_df,
                                   stratify=interactions_full_df['personId'], 
                                   test_size=0.20,
                                   random_state=42)

print('# interactions on Train set: %d' % len(interactions_train_df))
print('# interactions on Test set: %d' % len(interactions_test_df))

# interactions on Train set: 3854516
# interactions on Test set: 963630


In Recommender Systems, there are a set metrics commonly used for evaluation. We chose to work with *Top-N* accuracy metrics, which evaluates the accuracy of the top recommendations provided to a user, comparing to the items the user has actually interacted in test set.

This evaluation method works as follows:

* For each user
    * For each item the user has interacted in test set
        * Sample 100 other items the user has never interacted. Ps. Here we naively assume those non interacted items are not relevant to the user, which might not be true, as the user may simply not be aware of those not interacted items. But let's keep this assumption.
        * Ask the recommender model to produce a ranked list of recommended items, from a set composed one interacted item and the 100 non-interacted ("non-relevant!) items
        * Compute the Top-N accuracy metrics for this user and interacted item from the recommendations ranked list
* Aggregate the global Top-N accuracy metrics

The Top-N accuracy metric choosen was *Recall@N* which evaluates whether the interacted item is among the top N items (hit) in the ranked list of 101 recommendations for a user.
Ps. Other popular ranking metrics are *NDCG@N* and *MAP@N*, whose score calculation takes into account the position of the relevant item in the ranked list (max. value if relevant item is in the first position). You can find a reference to implement this metrics in this post.

In [12]:
#Indexing by personId to speed up the searches during evaluation
interactions_full_indexed_df = interactions_full_df.set_index('personId')
interactions_train_indexed_df = interactions_train_df.set_index('personId')
interactions_test_indexed_df = interactions_test_df.set_index('personId')

In [13]:
def get_items_interacted(person_id, interactions_df):
    # Get the user's data and merge in the movie information.
    interacted_items = interactions_df.loc[person_id]['contentId']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

In [14]:
#Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

class ModelEvaluator:


    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42):
        interacted_items = get_items_interacted(person_id, interactions_full_indexed_df)
        all_items = set(articles_df['contentId'])
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):        
            try:
                index = next(i for i, c in enumerate(recommended_items) if c == item_id)
            except:
                index = -1
            hit = int(index in range(0, topn))
            return hit, index

    def evaluate_model_for_user(self, model, person_id):
        #Getting the items in test set
        interacted_values_testset = interactions_test_indexed_df.loc[person_id]
        if type(interacted_values_testset['contentId']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['contentId'])
        else:
            person_interacted_items_testset = set([int(interacted_values_testset['contentId'])])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_items(person_id, 
                                               items_to_ignore=get_items_interacted(person_id, 
                                                                                    interactions_train_indexed_df), 
                                               topn=10000000000)

        hits_at_5_count = 0
        hits_at_10_count = 0
        #For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            non_interacted_items_sample = self.get_not_interacted_items_sample(person_id, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                          seed=item_id%(2**32))

            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['contentId'].isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df['contentId'].values
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        #when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self, model):
        #print('Running evaluation for users')
        people_metrics = []
        for idx, person_id in enumerate(list(interactions_test_indexed_df.index.unique().values)):
            #if idx % 100 == 0 and idx > 0:
            #    print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(model, person_id)  
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    
model_evaluator = ModelEvaluator()  

## Popularity model

A common (and usually hard-to-beat) baseline approach is the Popularity model. This model is not actually personalized - it simply recommends to a user the most popular items that the user has not previously consumed. As the popularity accounts for the "wisdom of the crowds", it usually provides good recommendations, generally interesting for most people.
Ps. The main objective of a recommender system is to leverage the long-tail items to the users with very specific interests, which goes far beyond this simple technique.

In [16]:
#Computes the most popular items
item_popularity_df = interactions_full_df.groupby('contentId')['eventStrength'].sum().sort_values(ascending=False).reset_index()
item_popularity_df.head(10)

Unnamed: 0,contentId,eventStrength
0,3187,128516
1,38891,115094
2,38333,112624
3,39043,102060
4,39751,99399
5,10178,91057
6,44561,87116
7,42948,84087
8,46365,75847
9,23103,74426


In [19]:
class PopularityRecommender:
    
    MODEL_NAME = 'Popularity'
    
    def __init__(self, popularity_df, items_df=None):
        self.popularity_df = popularity_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # Recommend the more popular items that the user hasn't seen yet.
        recommendations_df = self.popularity_df[~self.popularity_df['contentId'].isin(items_to_ignore)] \
                               .sort_values('eventStrength', ascending = False) \
                               .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'contentId', 
                                                          right_on = 'contentId')[['eventStrength', 'contentId', 'title', 'url', 'lang']]


        return recommendations_df
    
# sample_df
# popularity_model = PopularityRecommender(item_popularity_df, articles_df)
articles_df = sample_df.rename(index=str, columns = {'CUS_N_KEY_CUSTOMER': 'personId', 
                                                   'PPR_N_PHY_PRODUCT_ID': 'contentId'})
popularity_model = PopularityRecommender(item_popularity_df, articles_df)

Here we perform the evaluation of the Popularity model, according to the method described above.
It achieved the Recall@5 of ????, which means that about ??% of interacted items in test set were ranked by Popularity model among the top-5 items (from lists with 100 random items). And Recall@10 was even higher (???%), as expected.
It might be surprising to you that usually Popularity models could perform so well!

In [None]:
print('Evaluating Popularity recommendation model...')
pop_global_metrics, pop_detailed_results_df = model_evaluator.evaluate_model(popularity_model)
print('\nGlobal metrics:\n%s' % pop_global_metrics)
pop_detailed_results_df.head(10)

Evaluating Popularity recommendation model...


#### Content-Based Filtering model is NOT applicable to the data we are working on with right now
Content-based filtering approaches leverage description or attributes from items the user has interacted to recommend similar items. It depends only on the user previous choices, making this method robust to avoid the cold-start problem. For textual items, like articles, news and books, it is simple to use the raw text to build item profiles and user profiles.
Here we are using a very popular technique in information retrieval (search engines) named TF-IDF. This technique converts unstructured text into a vector structure, where each word is represented by a position in the vector, and the value measures how relevant a given word is for an article. As all items will be represented in the same Vector Space Model, it is to compute similarity between articles.

See this presentation (from slide 30) for more information on TF-IDF and Cosine similarity.

## Collaborative Filtering model

Collaborative Filtering (CF) has two main implementation strategies:

* Memory-based: This approach uses the memory of previous users interactions to compute users similarities based on items they've interacted (user-based approach) or compute items similarities based on the users that have interacted with them (item-based approach).
A typical example of this approach is User Neighbourhood-based CF, in which the top-N similar users (usually computed using Pearson correlation) for a user are selected and used to recommend items those similar users liked, but the current user have not interacted yet. This approach is very simple to implement, but usually do not scale well for many users. A nice Python implementation of this approach in available in Crab.
* Model-based: This approach, models are developed using different machine learning algorithms to recommend items to users. There are many model-based CF algorithms, like neural networks, bayesian networks, clustering models, and latent factor models such as Singular Value Decomposition (SVD) and, probabilistic latent semantic analysis.

### Matrix Factorization

Latent factor models compress user-item matrix into a low-dimensional representation in terms of latent factors. One advantage of using this approach is that instead of having a high dimensional matrix containing abundant number of missing values we will be dealing with a much smaller matrix in lower-dimensional space.
A reduced presentation could be utilized for either user-based or item-based neighborhood algorithms that are presented in the previous section. There are several advantages with this paradigm. It handles the sparsity of the original matrix better than memory based ones. Also comparing similarity on the resulting matrix is much more scalable especially in dealing with large sparse datasets.

Here we a use popular latent factor model named Singular Value Decomposition (SVD). There are other matrix factorization frameworks more specific to CF you might try, like surprise, mrec or python-recsys. We chose a SciPy implemenation of SVD because it is available on Kaggle kernels.
Ps. See an example of SVD on a movies dataset in this blog post.

An important decision is the number of factors to factor the user-item matrix. The higher the number of factors, the more precise is the factorization in the original matrix reconstructions. Therefore, if the model is allowed to memorize too much details of the original matrix, it may not generalize well for data it was not trained on. Reducing the number of factors increases the model generalization.