# Menu <a class="anchor" id="menu"></a>

* [1. Candidate generation](#candidates)
    * [1.1 Modèle baseline *(random recommendations)*](#candidates_baseline)
    * [1.2 Collaborative Filtering](#candidates_collaborative_filtering)
    * [1.3 Content Based Filtering](#candidates_content_based_filtering)
    * [1.4 Quelques variantes](#candidates_others)
    * [1.5 Revue des scores](#candidates_scores)
* [2. Préparons le modèle Hybrid à déployer](#prepare_hybrid)

# Préparatifs 

In [1]:
import math
import time
import pickle 
import random
import joblib
import pathlib
import multiprocessing
from multiprocessing import Pool
from collections import defaultdict

import pandas as pd
import numpy as np
from numpy.linalg import norm

import scipy.sparse as sparse
from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.metrics import ndcg_score
# from sklearn.preprocessing import MultiLabelBinarizer

import implicit
from implicit.nearest_neighbours import bm25_weight
from implicit import evaluation

import ml_metrics as metrics

random_seed = 0
np.random.seed(random_seed)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
scores_df = pd.DataFrame([], columns=['model_name', 'mean_cosine_similarity', 'map@k', 'training_time'])
scores_df.set_index('model_name', inplace=True)
scores_df.head()

Unnamed: 0_level_0,mean_cosine_similarity,map@k,training_time
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


### Chargeons les jeux de données `training` et `validation`

In [3]:
data_train = pd.read_csv('data/data_train.csv')
display(data_train.head(3), data_train.shape)

Unnamed: 0,user_id,article_id,category_id,score
0,59,234853,375,-0.328045
1,79,159359,281,-0.327253
2,154,96663,209,-0.363316


(1577295, 4)

In [4]:
data_valid = pd.read_csv('data/data_valid.csv')
display(data_valid.head(3), data_valid.shape)

Unnamed: 0,user_id,article_id,category_id,score
0,279777,96210,209,-0.387729
1,29634,284773,412,-0.209496
2,55,162605,281,0.213075


(241105, 4)

In [5]:
data_train.user_id.nunique(), data_valid.user_id.nunique()

(297141, 84041)

### Préparons une sparse matrix pour entrainer nos algorithmes de collaborative filtering

On va convertir nos user_id et article_id en categorical poiur réduire la taille des sparse matrix, mais pour nous assurer que les deux jeux de données (train & valid) on des index raccords entre les users et les articles, nous allons d'abord les rassembler pour produire et extraire les nouveaux index

#### Rassemblons les deux jeux de données pour produire et extraires les nouveaux index

In [6]:
data_full = pd.concat([data_train, data_valid], axis=0)
data_full.shape, data_train.shape, data_valid.shape

data_full['user_cat'] = data_full['user_id'].astype('category').cat.codes
data_full['article_cat'] = data_full['article_id'].astype('category').cat.codes

num_rows = data_full.user_cat.max()+1
num_cols = data_full.article_cat.max()+1

#### Préparons des lookup lists pour retrouver facilement la correspondance entre les `user_id` / `user_cat` et `article_id` / `article_cat`

In [7]:
user_lookup_id_cat = pd.DataFrame(data_full.groupby('user_id')['user_cat'].apply(lambda x: list(x)[0])).to_dict()
user_lookup_cat_id = pd.DataFrame(data_full.groupby('user_cat')['user_id'].apply(lambda x: list(x)[0])).to_dict()

In [8]:
article_lookup_id_cat = pd.DataFrame(data_full.groupby('article_id')['article_cat'].apply(lambda x: list(x)[0])).to_dict()
article_lookup_cat_id = pd.DataFrame(data_full.groupby('article_cat')['article_id'].apply(lambda x: list(x)[0])).to_dict()

#### Ajoutons les nouveaux index `user_cat` et `article_cat` aux jeux de données `data_train` et `data_valid`

In [9]:
data_train['user_cat'] = data_train.user_id.apply(lambda x: user_lookup_id_cat['user_cat'][x])
data_valid['user_cat'] = data_valid.user_id.apply(lambda x: user_lookup_id_cat['user_cat'][x])

In [10]:
data_train['article_cat'] = data_train.article_id.apply(lambda x: article_lookup_id_cat['article_cat'][x])
data_valid['article_cat'] = data_valid.article_id.apply(lambda x: article_lookup_id_cat['article_cat'][x])

#### Préparons à présent les sparse matrix sur la base de ces nouvelles colonnes `user_cat` et `article_cat` *(qui sont plus compactes que user_id et article_id)*

In [11]:
train_sparse_user_item = sparse.csr_matrix((data_train['score'].astype(float), (data_train['user_cat'], data_train['article_cat'])), shape=(num_rows, num_cols))
display(train_sparse_user_item.shape)

valid_sparse_user_item = sparse.csr_matrix((data_valid['score'].astype(float), (data_valid['user_cat'], data_valid['article_cat'])), shape=(num_rows, num_cols))
display(valid_sparse_user_item.shape)

(297141, 32595)

(297141, 32595)

### Chargons les embeddings

In [12]:
file = open('data/news-portal-user-interactions-by-globocom/articles_embeddings.pickle',"rb")
article_embedding = pickle.load(file)

In [13]:
display(article_embedding[:5], article_embedding.shape)

array([[-0.16118301, -0.95723313, -0.13794445, ..., -0.231686  ,
         0.5974159 ,  0.40962312],
       [-0.52321565, -0.974058  ,  0.73860806, ...,  0.18282819,
         0.39708954, -0.83436364],
       [-0.61961854, -0.9729604 , -0.20736018, ..., -0.44758022,
         0.8059317 , -0.28528407],
       [-0.7408434 , -0.97574896,  0.39169782, ..., -0.5378381 ,
         0.24354108, -0.8853287 ],
       [-0.2790515 , -0.97231525,  0.68537366, ..., -0.42406067,
         0.18548405, -0.5802922 ]], dtype=float32)

(364047, 250)

## Définissons des fonctions d'évaluation pour nos modèles

### D'abord des functions pour évaluer des métriques classiques de recommender systesm *(map@k, precision@k, recall@k, ndcg@k...)*

> **Nous devons nous rappeler que la recommandation n'est pas une prédiction.**<br>
> S'appuyer sur des métriques ML pour déterminer la performance d'un système de recommandation n'est pas suffisant.<br>
> Seul **le retour des utilisateurs apporte des résultats valables et c'est pourquoi les tests A/B devraient toujours être priviligiés**.

> Dans la mesure ou notre jeu de données **ne dispose pas de scores explicites**, et où nous voulons **également évaluer le model de Content-Based Filtering** *(qui n'utilise pas de scores)* il ne parrait pas souhaitable d'utiliser des métrique du type `MAE` ou `RMSE`. Mais nous pourrions donc nous tourner vers la `MAP@K`, la`nDCG@k`, la `Precision@k`, le `Recall@K`.
>
> * Les `Recall@K`, `Precision@K` et `Average Precision @k (ap@k)` semblent plus appropriées pour évaluer **les recommendations faite à 1 utilisateur**
> * La `nDCG@k` semblent être appropriée lorsque l'on a des 'pertinences' non binaires. Mais ici on va se contenter d'avoir des news qui sont pertinentes ou pas.
> * La `Mean Average Precision @k (map@k)` est approprié pour évaluer l'ensemble des recommendations faites aux utilisateurs de notre jeu de données de validation.
>>
> #### Nous allons donc utiliser la **Map@k**

In [14]:
def evaluate_collaborative_filtering(train_sparse, valid_sparse, reco_size, model=None, rec_function=None, sample_size=None):
    
    user_cats, article_cats = valid_sparse_user_item.nonzero()
    user_cats_unique = pd.DataFrame(user_cats, columns=['cat']).groupby('cat').first()
    user_cats_unique = list(user_cats_unique.index)
    
    if sample_size is not None and len(user_cats_unique) > sample_size:
        user_cats_unique = user_cats_unique[:sample_size]
        
    print("Number of users in the validation set: ", len(user_cats_unique))
    
    # --- Get y_reco
    if model is not None:
        y_reco, _ = model.recommend(user_cats_unique, train_sparse[user_cats_unique], N=reco_size, filter_already_liked_items=True)
    if rec_function is not None:
        y_reco = rec_function(user_cats_unique, None, train_sparse, valid_sparse, reco_size)
    
    # --- Get y_true
    d = defaultdict(list)
    for user_cat, article_cat in zip(user_cats, article_cats):
        d[user_cat].append(article_cat)
    y_true = [d[x] for x in d]
    
    if sample_size is not None and len(user_cats) > sample_size:
        y_true = y_true[:sample_size]
    
    return metrics.mapk(y_true, y_reco, k=reco_size)

In [15]:
def get_content_based_recommendations( user_id, reco_size=5, histo_size=None ):
    #print(user_id)
    
    # --- Compute viewed mean_embedding
    if histo_size is not None:
        viewed_train = data_train[data_train.user_id == user_id]['article_id'].iloc[-histo_size:]
    else:
        viewed_train = data_train[data_train.user_id == user_id]['article_id']
    if len(viewed_train) < 1: return []
    mean_vector_viewed_train = get_mean_vector(viewed_train.values)
    
    # --- Compute cosine similarity the mean of the viewed articles and the rest
    A = article_embedding.copy()
    B = mean_vector_viewed_train
    A[viewed_train.values] = -B # discard read articles
    cosine = cosine_similarity(A, B.reshape(1, -1))
    
    recommendations =  recommend_articles(cosine, reco_size)
    return recommendations

### Ensuite, puisque nous disposons d'embeddings pour tout nos articles, nous pourrions également tenter de créer une métrique basé sur leur similarité

> Nous pourrions comparer l'embedding moyen des articles lus APRÈS *(donc les interactions contenues dans data_valid)* avec l'embedding moyen des article recommandés.
>>
> Cette métrique ne nous indiquera pas si l'on a effectivement recommandé des articles lus par l'utilisateur, mais pourra nous indiquer à quel point le sujet des articles recommandés est proche de ceux lus.

In [16]:
def get_mean_cosine_similarity(data_ref, train_df, valid_df, reco_model, reco_size = 5, top_users=None):

    # --- for each user, get the ids of the articles he/she has read
    select = data_ref.groupby('user_cat')['article_id'].apply(list).reset_index(name='article_ids')
    
    # --- remove user if he/she has only one article in the data_ref (probably data_valid)
    select = select[select.article_ids.map(len) > 1]

    # --- crop selection if specified
    if top_users is not None:
        select = select[:top_users]
        
    # print("users_ids:", select.user_id)

    # --- for each user, compute the mean embedding vectors of the articles he/she has read
    select['read_mean_embedding'] = select.apply(lambda x : article_embedding[np.array(x[1])].mean(axis=0), axis=1)

    # --- for each user, make recommendations
    user_cats = select['user_cat'].values
    select['reco_article_ids'] = reco_model(user_cats, select.read_mean_embedding, train_df, valid_df, reco_size)

    # --- for each user, compute the mean embedding vectors of the recommended articles
    select['reco_mean_embedding'] = select['reco_article_ids'].apply(lambda x : article_embedding[np.array(x)].mean(axis=0))

    # --- for each user, compute the cosine similarity between the read_mean_embedding and the pred_mean_embedding
    select['cosine'] = select.apply(lambda x: cosine_similarity(x['read_mean_embedding'].reshape(1, -1), x['reco_mean_embedding'].reshape(1, -1))[0][0], axis=1)

    # --- reset the index column
    select = select.set_index('user_cat')

    # --- Compute & return overall mean cosine similarity
    return select.cosine.mean(), select

### Calculons également la cosine_similarity entre le data_train et le data_valid *(pour chaque utilisateur, puis la moyenne)*
#### pour voir a quel point les lectures du training set sont alignées avec les lectures du validation set par les mêmes utilisateurs...

In [17]:
def get_train_valid_similarity( data_t, data_v ):

    select = pd.DataFrame()
    
    # --- select the validation user_ids
    user_idx = data_v.user_id.unique()
    
    # --- for each user, get the ids of the articles he/she has read in the TRAINING set et le VALIDATION set
    select['article_ids_train'] = data_t[data_t.user_id.isin(user_idx)].groupby('user_id')['article_id'].apply(list)
    select['article_ids_valid'] = data_v[data_v.user_id.isin(user_idx)].groupby('user_id')['article_id'].apply(list)
    
    # --- remove user_id with only 1 article either the TRAINING or the VALIDATION set
    select = select[select['article_ids_train'].map(len) > 1]
    select = select[select['article_ids_valid'].map(len) > 1]
    select.dropna(axis=0, inplace=True)
    
    # --- for each user, compute the mean embedding vectors of the articles he/she has read
    select['mean_embedding_train'] = select['article_ids_train'].apply(lambda x : article_embedding[x].mean(axis=0))
    select['mean_embedding_valid'] = select['article_ids_valid'].apply(lambda x : article_embedding[x].mean(axis=0))

    # --- for each user, compute the cosine similarity between the user's mean_embeddings in the TRAINING set and in the VALIDATION set
    select['cosine'] = select.apply(lambda x: cosine_similarity(x['mean_embedding_train'].reshape(1, -1), x['mean_embedding_valid'].reshape(1, -1))[0][0], axis=1)

    # --- Compute & return overall mean cosine similarity
    return select.cosine.mean(), select

cosine_train_valid, cosine_train_valid_df = get_train_valid_similarity(data_train, data_valid)

print(f"\nmean_cosine_similarity (de 1 à -1) entre 'viewed_in_train' & 'viewed_in_valid' = {cosine_train_valid:.2f}")


mean_cosine_similarity (de 1 à -1) entre 'viewed_in_train' & 'viewed_in_valid' = 0.64


In [18]:
cosine_train_valid_df.head(3)

Unnamed: 0_level_0,article_ids_train,article_ids_valid,mean_embedding_train,mean_embedding_valid,cosine
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,"[168868, 156543, 284664, 156672, 59929, 160974...","[225010, 69353, 161872, 205845, 57748, 157815,...","[-0.17539436, -0.9664814, -0.0456632, -0.30515...","[-0.024590481, -0.9645934, -0.06022447, -0.023...",0.917003
7,"[235840, 156624, 123757, 64409, 336221, 183176]","[199474, 87223, 352979, 284470, 36162, 156279]","[-0.32671222, -0.97076577, -0.31606814, -0.361...","[-0.11400774, -0.96468645, -0.32728586, -0.201...",0.739961
8,"[332114, 284847, 272660, 273464, 313504, 23513...","[331116, 96141, 234481]","[-0.47689843, -0.9678452, -0.2180024, -0.47295...","[-0.32962552, -0.97033435, 0.20547153, -0.0870...",0.741852


---
---
# 1. Candidate generation <a class="anchor" id="candidates"></a> [⇪](#menu)

In [19]:
reco_size = 5

## 1.1 Modèle baseline *(random recommendations)* <a class="anchor" id="candidates_baseline"></a> [⇪](#menu)
#### Établissons une baseline en utilisant nos functions d'évaluations sur un modèle faisant des recommendations au hasard

In [20]:
model_name = "Baseline model - Random"

In [21]:
def reco_random(user_cats, mean_embeddings, train_df, valid_df, reco_size):
    # --- ensure to get the same random matrix for reproductibility
    np.random.seed(random_seed)
    
    # --- set 5 random article_id for each users
    return list(np.random.randint(len(article_embedding), size=(len(user_cats), reco_size)))

### D'abord avec notre function d'évaluation classique qui évalue la `map@k`

#### Pour tout le jeu de données

In [22]:
map_k = evaluate_collaborative_filtering(train_sparse_user_item, valid_sparse_user_item, reco_size, rec_function=reco_random)
print(f"Mean Average Precision @ {reco_size} (ALL users): {map_k:.10f}")

Number of users in the validation set:  84041
Mean Average Precision @ 5 (ALL users): 0.0000031731


#### Pour les 1000 premiers utilisateurs *(car j'ai du mal à calculer le score sur l'ensemble du jeu de données avec le Content Based Filtering)*

In [23]:
map_k = evaluate_collaborative_filtering(train_sparse_user_item, valid_sparse_user_item, reco_size, rec_function=reco_random, sample_size=1000)

print(f"Mean Average Precision @ {reco_size} (first 1000 users): {map_k:.10f}")
scores_df.at[model_name,'map@k'] = map_k

Number of users in the validation set:  1000
Mean Average Precision @ 5 (first 1000 users): 0.0000000000


### Ensuite avec notre métrique maisons la `mean_cosine_similarity`

#### Pour tout le jeu de données

In [24]:
MCS, MCS_df = get_mean_cosine_similarity(data_valid, train_sparse_user_item, valid_sparse_user_item, reco_random, 5)
display(MCS_df.head(), MCS_df.shape)
print(f"\nmean_cosine_similarity: {MCS:.2f} (sachant que la cosine similarity va de 1 à -1)")

Unnamed: 0_level_0,article_ids,read_mean_embedding,reco_article_ids,reco_mean_embedding,cosine
user_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,"[225010, 69353, 161872, 205845, 57748, 157815,...","[-0.024590481, -0.9645934, -0.06022447, -0.023...","[305711, 117952, 152315, 358083, 359783]","[-0.15132096, -0.956843, 0.125719, -0.27082413...",0.557499
7,"[199474, 87223, 352979, 284470, 36162, 156279]","[-0.11400774, -0.96468645, -0.32728586, -0.201...","[304137, 122579, 86293, 211543, 212038]","[-0.4073767, -0.959381, -0.1800201, -0.0457417...",0.672065
8,"[331116, 96141, 234481]","[-0.32962552, -0.97033435, 0.20547153, -0.0870...","[310744, 170584, 314764, 80186, 17089]","[-0.4528594, -0.96688926, 0.4099636, -0.472056...",0.585191
10,"[196588, 193449, 195689, 65991, 100931, 58556,...","[-0.23681411, -0.96241695, -0.08872973, -0.346...","[150055, 220760, 363345, 255653, 82457]","[-0.3074916, -0.96819973, 0.09544015, -0.04938...",0.642696
11,"[208582, 96877, 195177, 205824]","[-0.15431535, -0.9709704, 0.069619074, 0.13036...","[329843, 332752, 7877, 346110, 73135]","[0.16090424, -0.96745205, -0.013877422, -0.109...",0.418073


(46638, 5)


mean_cosine_similarity: 0.46 (sachant que la cosine similarity va de 1 à -1)


#### Pour les 1000 premiers utilisateurs *(car j'ai du mal à calculer le score sur l'ensemble du jeu de données avec le Content Based Filtering)*

In [25]:
MCS, MCS_df = get_mean_cosine_similarity(data_valid, train_sparse_user_item, valid_sparse_user_item, reco_random, 5, 1000)
print(f"\nmean_cosine_similarity: {MCS:.2f} (sachant que la cosine similarity va de 1 à -1)")

scores_df.at[model_name,'mean_cosine_similarity'] = MCS
scores_df.at[model_name,'training_time'] = 0


mean_cosine_similarity: 0.48 (sachant que la cosine similarity va de 1 à -1)


#### Vérifions la distribution des cosine_similarity calculés

In [26]:
MCS_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cosine,1000.0,0.482212,0.13109,0.04751,0.394346,0.495873,0.57634,0.826292


#### Comparons

In [27]:
scores_df

Unnamed: 0_level_0,mean_cosine_similarity,map@k,training_time
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Baseline model - Random,0.482212,0.0,0


## 1.2 Collaborative Filtering <a class="anchor" id="candidates_collaborative_filtering"></a> [⇪](#menu)

### Entrainons un premier modèle pour calculer les embeddings utilisateurs et faire des recommandations

In [28]:
model_als = implicit.als.AlternatingLeastSquares(
    factors=32, 
    regularization=0.05, 
    iterations=50,
    alpha=40
)

model_name = model_als.__class__.__name__
t0 = time.perf_counter()
model_als.fit(train_sparse_user_item)
scores_df.at[model_name,'training_time'] = time.perf_counter() - t0

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [02:18<00:00,  2.77s/it]


### Testons une `recommandation sur la base d'un ou plusieurs utilisateurs`

In [29]:
# Make recommendations for the first 10 users in the dataset
user_ids = [59, 1024] # liste d'ID utilisateurs
user_cat = [ user_lookup_id_cat['user_cat'][x] for x in user_ids ]

codes, scores = model_als.recommend(user_cat, valid_sparse_user_item[user_cat], N=reco_size, filter_already_liked_items=True) 

for i, user_id in enumerate(user_ids):
    print(f"\n --- Liste d'articles candidats pour l'utilisateur {user_id} --- \n")
    
    for cat, score in zip(codes[i], scores[i]):
        idx = article_lookup_cat_id['article_id'][cat]
        print(f"catCode: {cat:10} \t article_id: {idx:10} \t score: {score}")


 --- Liste d'articles candidats pour l'utilisateur 59 --- 

catCode:      19109 	 article_id:     205832 	 score: 0.8376213312149048
catCode:      15145 	 article_id:     160417 	 score: 0.6153063178062439
catCode:      15815 	 article_id:     166581 	 score: 0.6120333671569824
catCode:      22029 	 article_id:     236338 	 score: 0.4914150536060333
catCode:      18486 	 article_id:     199197 	 score: 0.47491663694381714

 --- Liste d'articles candidats pour l'utilisateur 1024 --- 

catCode:      11936 	 article_id:     119592 	 score: 0.7874321341514587
catCode:      30456 	 article_id:     336245 	 score: 0.40986379981040955
catCode:      21773 	 article_id:     235132 	 score: 0.39906179904937744
catCode:      19460 	 article_id:     207994 	 score: 0.39259636402130127
catCode:      28808 	 article_id:     313504 	 score: 0.38978150486946106


### Testons une `recommandation sur la base d'un article` *(ce n'est pas le but d'un Collaborative Filtering, mais on peut le faire alors autant l'essayer)*

In [30]:
article_id = 162605
article_cat = article_lookup_id_cat['article_cat'][article_id]
reco_size = 5

# Get similar items.
codes, scores = model_als.similar_items(article_cat, N=reco_size , filter_items=[article_cat])

print(f"\n --- Liste d'articles candidats sur la base de l'article {article_id} --- \n")
for cat, score in zip(codes, scores):
    idx = article_lookup_cat_id['article_id'][cat]
    print(f"catCode: {cat:10} \t article_id: {idx:10} \t score: {score:.2f}")


 --- Liste d'articles candidats sur la base de l'article 162605 --- 

catCode:      11770 	 article_id:     118856 	 score: 0.70
catCode:      10778 	 article_id:     107024 	 score: 0.69
catCode:      28912 	 article_id:     314431 	 score: 0.69
catCode:      19767 	 article_id:     211351 	 score: 0.69
catCode:      28969 	 article_id:     315052 	 score: 0.69


### Evaluons le modèle

#### Regardons le `MAP@k`

In [31]:
map_k = evaluate_collaborative_filtering(train_sparse_user_item, valid_sparse_user_item, reco_size, model=model_als)
print(f"Mean Average Precision @ {reco_size} (ALL users): {map_k:.10f}")

Number of users in the validation set:  84041
Mean Average Precision @ 5 (ALL users): 0.0009399083


In [32]:
map_k = evaluate_collaborative_filtering(train_sparse_user_item, valid_sparse_user_item, reco_size, model=model_als, sample_size=1000)

print(f"Mean Average Precision @ {reco_size} (first 1000 users): {map_k:.10f}")
scores_df.at[model_name,'map@k'] = map_k

Number of users in the validation set:  1000
Mean Average Precision @ 5 (first 1000 users): 0.0003000000


#### Regardons la `mean_cosine_similarity`

In [33]:
def lookup_articles_from_cat(cat):
    try:
        return article_lookup_cat_id['article_id'][cat]
    except KeyError:
        return -1

In [34]:
def reco_collaborative_filtering_als(user_cats, mean_embeddings, train_df, valid_df, reco_size):
    
    apply_numpy2 = lambda x : list(map(lookup_articles_from_cat, x))
    apply_numpy = lambda x : list(map(apply_numpy2,x))
    
    reco_codes, reco_score = model_als.recommend(user_cats, train_df[user_cats], N=reco_size, filter_already_liked_items=True) 
    
    return apply_numpy(reco_codes)

#### Pour tout le jeu de données

In [35]:
MCS, MCS_df = get_mean_cosine_similarity(data_valid, train_sparse_user_item, valid_sparse_user_item, reco_collaborative_filtering_als, 5)

display(MCS_df.head(), MCS_df.shape)
print(f"\nmean_cosine_similarity: {MCS:.2f} (sachant que la cosine similarity va de 1 à -1)")

Unnamed: 0_level_0,article_ids,read_mean_embedding,reco_article_ids,reco_mean_embedding,cosine
user_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,"[225010, 69353, 161872, 205845, 57748, 157815,...","[-0.024590481, -0.9645934, -0.06022447, -0.023...","[124749, 156624, 160417, 64329, 348093]","[-0.07110355, -0.96495885, -0.13800652, -0.050...",0.782454
7,"[199474, 87223, 352979, 284470, 36162, 156279]","[-0.11400774, -0.96468645, -0.32728586, -0.201...","[124749, 42883, 156964, 284096, 168623]","[-0.5024486, -0.95664835, -0.1012487, -0.32630...",0.626999
8,"[331116, 96141, 234481]","[-0.32962552, -0.97033435, 0.20547153, -0.0870...","[129434, 162655, 123757, 234267, 336223]","[-0.24365985, -0.9682046, -0.41410017, -0.3402...",0.59226
10,"[196588, 193449, 195689, 65991, 100931, 58556,...","[-0.23681411, -0.96241695, -0.08872973, -0.346...","[59057, 288320, 285663, 160974, 233717]","[-0.4927772, -0.970342, -0.17037562, -0.212060...",0.726268
11,"[208582, 96877, 195177, 205824]","[-0.15431535, -0.9709704, 0.069619074, 0.13036...","[277491, 225019, 206168, 235616, 36399]","[-0.40128607, -0.97201157, -0.043656655, -0.07...",0.729266


(46638, 5)


mean_cosine_similarity: 0.54 (sachant que la cosine similarity va de 1 à -1)


#### Pour les 1000 premiers utilisateurs *(car j'ai du mal à calculer le score sur l'ensemble du jeu de données avec le Content Based Filtering)*

In [36]:
MCS1, MCS1_df = get_mean_cosine_similarity(data_valid, train_sparse_user_item, valid_sparse_user_item, reco_collaborative_filtering_als, 5, 1000)
print(f"\nmean_cosine_similarity: {MCS1:.2f} (sachant que la cosine similarity va de 1 à -1)")
scores_df.at[model_name,'mean_cosine_similarity'] = MCS1


mean_cosine_similarity: 0.58 (sachant que la cosine similarity va de 1 à -1)


#### Comparons

In [37]:
scores_df

Unnamed: 0_level_0,mean_cosine_similarity,map@k,training_time
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Baseline model - Random,0.482212,0.0,0.0
AlternatingLeastSquares,0.575309,0.0003,138.599869


## 1.3 Content Based Filtering <a class="anchor" id="candidates_content_based_filtering"></a> [⇪](#menu)

articles_embeddings.pickle Pickle (Python 3) of a NumPy matrix containing the Article Content Embeddings (250-dimensional vectors), trained upon articles' text and metadata by the CHAMELEON's ACR module (see paper for details) for 364047 published articles.
P.s. The full text of news articles could not be provided due to license restrictions, but those embeddings can be used by Neural Networks to represent their content. See this paper for a t-SNE visualization of these embeddings, colored by category.

### Trouvons l'embedding moyen des articles lus par un utilisateur donnée

In [38]:
user_id = 20137
history_size = 5

def get_mean_vector(articles_idx):
    apply_numpy = lambda x : article_embedding[x].mean(axis=0)
    return apply_numpy(articles_idx)            

user_articles_idx = data_train[data_train.user_id == user_id]['article_id']
last_articles_idx = user_articles_idx.iloc[-history_size:].values #.sort_values('click_timestamp')
mean_vector = get_mean_vector(last_articles_idx)

print(f"Articles utilisés dans le mean embedding: {last_articles_idx}")

Articles utilisés dans le mean embedding: [288440 337441 202476 250043 284583]


### Calculons la similarité de cet embedding avec les embeddings des articles présents dans note fichier

In [39]:
def get_cosine(article_embedding, mean_vector, user_articles_idx=None):
    
    A = article_embedding.copy()
    B = mean_vector

    # --- Assurons nous de ne pas recommander les article déjà lus par cet utilisateur
    if user_articles_idx is not None:
        A[user_articles_idx] = -B # je donne aux articles lus un embedding inverse de celui que l'on cible
 
    # --- Calculons la cosine similarity entre l'embedding moyen de l'utilisateur et les articles connus
    return np.dot(A,B)/(norm(A, axis=1)*norm(B))

# --- Calculons la cosine similarity entre l'embedding moyen de l'utilisateur et les articles connus
cosine = get_cosine(article_embedding, mean_vector, user_articles_idx)
print("Cosine Similarity:", cosine, cosine.shape)

Cosine Similarity: [0.32582363 0.2783536  0.31348327 ... 0.38568467 0.2742523  0.42668363] (364047,)


### Recommandons 5 articles à l'utilisateur

In [40]:
def recommend_articles(cosine, reco_size=5):

    cos = pd.DataFrame(cosine, columns=['cosine_sim'])
    selection = cos.sort_values('cosine_sim', ascending=False)[:reco_size]
    selection.reset_index(inplace=True)
    selection.rename(columns={'index':'article_id'}, inplace=True)
    return selection

reco = recommend_articles(cosine, 5)
reco

Unnamed: 0,article_id,cosine_sim
0,284768,0.830968
1,345593,0.821919
2,285424,0.821205
3,345566,0.818335
4,283576,0.816523


### Comparons avec les articles consultés par cet utilisateur dans le validation_set

In [41]:
mean_vector_recommended = get_mean_vector(reco.article_id)

In [42]:
viewed = data_valid[data_valid.user_id == user_id]['article_id'].values
mean_vector_viewed = get_mean_vector(viewed)

#### Similarité entre les articles lus dans le `valid_set` et les articles prédis

In [43]:
def get_cosine_similarity(A, B):
    return np.dot(A,B)/(norm(A)*norm(B))

cosine_similarity_score = get_cosine_similarity(mean_vector_recommended, mean_vector_viewed)
print("Cosine Similarity:", cosine_similarity_score)

Cosine Similarity: 0.71754134


#### Similarité entre les articles lus dans le `train_set` et les articles lus dans le `valid_set`

In [44]:
cosine_similarity_score = get_cosine_similarity(mean_vector, mean_vector_viewed)
print("Cosine Similarity:", cosine_similarity_score)

Cosine Similarity: 0.7984939


#### Calculons la cosine similarity moyenne et la map@k 

In [45]:
%%time

def f(user_id):
    
    # --- Get recommendations
    reco =  get_content_based_recommendations(user_id, reco_size, None)
    mean_vector_reco = get_mean_vector(reco.article_id)
    
    # --- Compute viewed mean_embedding VALID
    viewed_valid = data_valid[data_valid.user_id == user_id]['article_id']
    if len(viewed_valid) < 1: return -1, [], []
    mean_vector_viewed_valid = get_mean_vector(viewed_valid.values)
    
    # --- Compute similarities
    cosine_similarity_pred_viewed = cosine_similarity(mean_vector_reco.reshape(1, -1), mean_vector_viewed_valid.reshape(1, -1))
    
    # --- Return values
    return (cosine_similarity_pred_viewed[0][0], list(viewed_valid.values), list(reco.article_id))
    
results = []
user_ids = np.sort(data_valid.user_id.unique())

with Pool(multiprocessing.cpu_count()) as p:
    results.append(p.map(f, user_ids[:1000]))

CPU times: user 181 ms, sys: 170 ms, total: 350 ms
Wall time: 4min 53s


In [46]:
mean_cosine_similarity = np.array([x for x,y,z in results[0]]).mean()
mean_cosine_similarity

print(f"\nmean_cosine_similarity (de 1 à -1) entre 'recommended' & 'viewed_in_valid' = {mean_cosine_similarity:.2f} (que l'on compare donc avec les 0.57 du Collaborative Filtering ALS)")

model_name = "Content Based Filtering"
scores_df.at[model_name,'mean_cosine_similarity'] = mean_cosine_similarity
scores_df.at[model_name,'training_time'] = 0


mean_cosine_similarity (de 1 à -1) entre 'recommended' & 'viewed_in_valid' = 0.58 (que l'on compare donc avec les 0.57 du Collaborative Filtering ALS)


In [47]:
y_true = [y for x,y,z in results[0]]
y_reco = [z for x,y,z in results[0]]
map_k = metrics.mapk(y_true, y_reco, k=reco_size)

print(f"Mean Average Precision @ {reco_size} (first 1000 users): {map_k:.10f}")
scores_df.at[model_name,'map@k'] = map_k

Mean Average Precision @ 5 (first 1000 users): 0.0002066667


#### Comparons

In [48]:
scores_df

Unnamed: 0_level_0,mean_cosine_similarity,map@k,training_time
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Baseline model - Random,0.482212,0.0,0.0
AlternatingLeastSquares,0.575309,0.0003,138.599869
Content Based Filtering,0.576889,0.000207,0.0


> #### Si l'on se fie au score moyen de similarité entre les recommandations et ce qui a été effectivemet lu (dans le validation set), le `Collaborative filtering` semble plus précis.
> Mais ce n'est pas le seul élément à prendre en compte *(d'autant que cette mesure n'est pas très significative)*.

> `Collaborative Filtering`:
> * plus en accord avec ce qui est dans le validation set par les utilisateurs,
> * plus rapide,
> * limité aux articles visités.<br>

> `Content Based Filtering`:
> * plus lent *(du moins je n'ai pas réussi à le rendre rapide)*,
> * moins prévis *(mais je n'ai pu évaluer que les 100 premiers utlisateurs)*
> * peut recommander n'importe quel article dont on a l'embedding, y compris ceux jamais visités par un utilisateur.

> Dans les deux cas, les modèles employés ne permettent pas de prendre en compte des features supplémentaires comme par exemple la catégorie de l'article etc. *(Il faudrait un Collaborative Filtering en DNN)*

> L'idéal est donc probablement d'utiliser une combinaison des deux approches; le Collaboravie filtering pour être le plus proche possible des intérêts de l'utilisateur, et le Content Based Filtering pour apporter de la diversité sans pour autant trop s'éloigner des sujets de prédilection de l'utilisateur.

## 1.4 Essayons quelques variantes <a class="anchor" id="candidates_others"></a> [⇪](#menu)

### AlternatingLeastSquares avec BM25

#### Entrainons un premier modèle pour calculer les embeddings utilisateurs et faire des recommandations
> Utilisons un système permettant d'essayer de rééquilibrer les notes implicites *(pour éviter de donner trop d'importance aux articles qui ont un très gros ratio temps de lecture / nombre de mots)*

In [49]:
train_sparse_user_item_bm25 = bm25_weight(train_sparse_user_item, K1=100, B=0.9).tocsr() # Implicit veut des matrices [user x item]
valid_sparse_user_item_bm25 = bm25_weight(valid_sparse_user_item, K1=100, B=0.9).tocsr() # Implicit veut des matrices [user x item]

display(train_sparse_user_item_bm25.shape)
display(valid_sparse_user_item_bm25.shape)

(297141, 32595)

(297141, 32595)

In [50]:
model_bm25 = implicit.als.AlternatingLeastSquares(
    factors=32, 
    regularization=0.05, 
    iterations=50,
    alpha=40
)

model_name = f"{model_bm25.__class__.__name__}_with_BM25"
t0 = time.perf_counter()
model_bm25.fit(train_sparse_user_item_bm25)
scores_df.at[model_name,'training_time'] = time.perf_counter() - t0

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [01:25<00:00,  1.71s/it]


#### Regardons la `mean_cosine_similarity`

In [51]:
def reco_collaborative_filtering_bm25(user_cats, mean_embeddings, train_df, valid_df, reco_size):
    
    apply_numpy2 = lambda x : list(map(lookup_articles_from_cat, x))
    apply_numpy = lambda x : list(map(apply_numpy2,x))
    
    reco_codes, reco_score = model_bm25.recommend(user_cats, train_df[user_cats], N=reco_size, filter_already_liked_items=True) 
    
    return apply_numpy(reco_codes)

Pour tout le jeu de données

In [52]:
MCS1als, MCS1als_df = get_mean_cosine_similarity(data_valid, train_sparse_user_item_bm25, valid_sparse_user_item_bm25, reco_collaborative_filtering_bm25, 5)
print(f"\nmean_cosine_similarity: {MCS1als:.2f} (sachant que la cosine similarity va de 1 à -1)")


mean_cosine_similarity: 0.47 (sachant que la cosine similarity va de 1 à -1)


Pour les 1000 premiers utilisateurs *(car j'ai du mal à calculer le score sur l'ensemble du jeu de données avec le Content Based Filtering)*

In [53]:
MCS1als, MCS1als_df = get_mean_cosine_similarity(data_valid, train_sparse_user_item_bm25, valid_sparse_user_item_bm25, reco_collaborative_filtering_bm25, 5, 1000)
print(f"\nmean_cosine_similarity: {MCS1als:.2f} (sachant que la cosine similarity va de 1 à -1)")
scores_df.at[model_name,'mean_cosine_similarity'] = MCS1als


mean_cosine_similarity: 0.51 (sachant que la cosine similarity va de 1 à -1)


#### Regardons le `MAP@k`

In [54]:
map_k = evaluate_collaborative_filtering(train_sparse_user_item_bm25, valid_sparse_user_item_bm25, reco_size, model=model_bm25)
print(f"Mean Average Precision @ {reco_size} (ALL users): {map_k:.10f}")

Number of users in the validation set:  84041
Mean Average Precision @ 5 (ALL users): 0.0000000000


In [55]:
map_k = evaluate_collaborative_filtering(train_sparse_user_item_bm25, valid_sparse_user_item_bm25, reco_size, model=model_bm25, sample_size=1000)

print(f"Mean Average Precision @ {reco_size} (first 1000 users): {map_k:.10f}")
scores_df.at[model_name,'map@k'] = map_k

Number of users in the validation set:  1000
Mean Average Precision @ 5 (first 1000 users): 0.0000000000


#### Comparons

In [56]:
scores_df

Unnamed: 0_level_0,mean_cosine_similarity,map@k,training_time
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Baseline model - Random,0.482212,0.0,0.0
AlternatingLeastSquares,0.575309,0.0003,138.599869
Content Based Filtering,0.576889,0.000207,0.0
AlternatingLeastSquares_with_BM25,0.505538,0.0,85.557782


### Logistic Matrix Factorization

#### Entrainons un premier modèle pour calculer les embeddings utilisateurs et faire des recommandations

In [57]:
model_lmf = implicit.cpu.lmf.LogisticMatrixFactorization(
    factors=32,
    learning_rate=0.05,
    regularization=0.05, 
    iterations=50,
)

model_name = model_lmf.__class__.__name__
t0 = time.perf_counter()
model_lmf.fit(train_sparse_user_item)
scores_df.at[model_name,'training_time'] = time.perf_counter() - t0

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:31<00:00,  1.61it/s]


#### Regardons la `mean_cosine_similarity`

In [58]:
def reco_collaborative_filtering_lmf(user_cats, mean_embeddings, train_df, valid_df, reco_size):
    
    apply_numpy2 = lambda x : list(map(lookup_articles_from_cat, x))
    apply_numpy = lambda x : list(map(apply_numpy2,x))
    
    reco_codes, reco_score = model_lmf.recommend(user_cats, train_df[user_cats], N=reco_size, filter_already_liked_items=True) 
    
    return apply_numpy(reco_codes)

Pour tout le jeu de données

In [59]:
MCS1lmf, MCS1lmf_df = get_mean_cosine_similarity(data_valid, train_sparse_user_item, valid_sparse_user_item, reco_collaborative_filtering_lmf, 5)
print(f"\nmean_cosine_similarity: {MCS1lmf:.2f} (sachant que la cosine similarity va de 1 à -1)")


mean_cosine_similarity: 0.53 (sachant que la cosine similarity va de 1 à -1)


Pour les 1000 premiers utilisateurs *(car j'ai du mal à calculer le score sur l'ensemble du jeu de données avec le Content Based Filtering)*

In [60]:
MCS1lmf, MCS1lmf_df = get_mean_cosine_similarity(data_valid, train_sparse_user_item, valid_sparse_user_item, reco_collaborative_filtering_lmf, 5, 1000)
print(f"\nmean_cosine_similarity: {MCS1lmf:.2f} (sachant que la cosine similarity va de 1 à -1)")
scores_df.at[model_name,'mean_cosine_similarity'] = MCS1lmf


mean_cosine_similarity: 0.56 (sachant que la cosine similarity va de 1 à -1)


#### Regardons le `MAP@k`

In [61]:
map_k = evaluate_collaborative_filtering(train_sparse_user_item_bm25, valid_sparse_user_item_bm25, reco_size, model=model_lmf)
print(f"Mean Average Precision @ {reco_size} (ALL users): {map_k:.10f}")

Number of users in the validation set:  84041
Mean Average Precision @ 5 (ALL users): 0.0002485923


In [62]:
map_k = evaluate_collaborative_filtering(train_sparse_user_item_bm25, valid_sparse_user_item_bm25, reco_size, model=model_lmf, sample_size=1000)

print(f"Mean Average Precision @ {reco_size} (first 1000 users): {map_k:.10f}")
scores_df.at[model_name,'map@k'] = map_k

Number of users in the validation set:  1000
Mean Average Precision @ 5 (first 1000 users): 0.0001000000


#### Comparons

In [63]:
scores_df

Unnamed: 0_level_0,mean_cosine_similarity,map@k,training_time
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Baseline model - Random,0.482212,0.0,0.0
AlternatingLeastSquares,0.575309,0.0003,138.599869
Content Based Filtering,0.576889,0.000207,0.0
AlternatingLeastSquares_with_BM25,0.505538,0.0,85.557782
LogisticMatrixFactorization,0.563861,0.0001,31.612296


### Bayesian Personalized Ranking

#### Entrainons un premier modèle pour calculer les embeddings utilisateurs et faire des recommandations

In [64]:
model_bpr = implicit.cpu.bpr.BayesianPersonalizedRanking(
    factors=32,
    learning_rate=0.05,
    regularization=0.05, 
    iterations=50,
)

model_name = model_bpr.__class__.__name__
t0 = time.perf_counter()
model_bpr.fit(train_sparse_user_item)
scores_df.at[model_name,'training_time'] = time.perf_counter() - t0

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:09<00:00,  5.43it/s, train_auc=88.16%, skipped=4.26%]


#### Regardons la `mean_cosine_similarity`

In [65]:
def reco_collaborative_filtering_bpr(user_cats, mean_embeddings, train_df, valid_df, reco_size):
    
    apply_numpy2 = lambda x : list(map(lookup_articles_from_cat, x))
    apply_numpy = lambda x : list(map(apply_numpy2,x))
    
    reco_codes, reco_score = model_bpr.recommend(user_cats, train_df[user_cats], N=reco_size, filter_already_liked_items=True) 
    
    return apply_numpy(reco_codes)

Pour tout le jeu de données

In [66]:
MCS1bpr, MCS1bpr_df = get_mean_cosine_similarity(data_valid, train_sparse_user_item, valid_sparse_user_item, reco_collaborative_filtering_bpr, 5)
print(f"\nmean_cosine_similarity: {MCS1bpr:.2f} (sachant que la cosine similarity va de 1 à -1)")


mean_cosine_similarity: 0.55 (sachant que la cosine similarity va de 1 à -1)


Pour les 1000 premiers utilisateurs *(car j'ai du mal à calculer le score sur l'ensemble du jeu de données avec le Content Based Filtering)*

In [67]:
MCS1bpr, MCS1bpr_df = get_mean_cosine_similarity(data_valid, train_sparse_user_item, valid_sparse_user_item, reco_collaborative_filtering_bpr, 5, 1000)
print(f"\nmean_cosine_similarity: {MCS1bpr:.2f} (sachant que la cosine similarity va de 1 à -1)")
scores_df.at[model_name,'mean_cosine_similarity'] = MCS1bpr


mean_cosine_similarity: 0.59 (sachant que la cosine similarity va de 1 à -1)


#### Regardons le `MAP@k`

In [68]:
map_k = evaluate_collaborative_filtering(train_sparse_user_item_bm25, valid_sparse_user_item_bm25, reco_size, model=model_bpr)
print(f"Mean Average Precision @ {reco_size} (ALL users): {map_k:.10f}")

Number of users in the validation set:  84041
Mean Average Precision @ 5 (ALL users): 0.0004158552


In [69]:
map_k = evaluate_collaborative_filtering(train_sparse_user_item_bm25, valid_sparse_user_item_bm25, reco_size, model=model_bpr, sample_size=1000)

print(f"Mean Average Precision @ {reco_size} (first 1000 users): {map_k:.10f}")
scores_df.at[model_name,'map@k'] = map_k

Number of users in the validation set:  1000
Mean Average Precision @ 5 (first 1000 users): 0.0003400000


#### Comparons

In [70]:
scores_df

Unnamed: 0_level_0,mean_cosine_similarity,map@k,training_time
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Baseline model - Random,0.482212,0.0,0.0
AlternatingLeastSquares,0.575309,0.0003,138.599869
Content Based Filtering,0.576889,0.000207,0.0
AlternatingLeastSquares_with_BM25,0.505538,0.0,85.557782
LogisticMatrixFactorization,0.563861,0.0001,31.612296
BayesianPersonalizedRanking,0.589526,0.00034,9.405733


### Nearest Neighbour - Item Item Recommender

#### Entrainons un premier modèle pour calculer les embeddings utilisateurs et faire des recommandations

In [71]:
model_iir = implicit.nearest_neighbours.ItemItemRecommender()

model_name = model_iir.__class__.__name__
t0 = time.perf_counter()
model_iir.fit(train_sparse_user_item)
scores_df.at[model_name,'training_time'] = time.perf_counter() - t0

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32595/32595 [00:00<00:00, 94771.53it/s]


#### Regardons la `mean_cosine_similarity`

In [72]:
def reco_collaborative_filtering_iir(user_cats, mean_embeddings, train_df, valid_df, reco_size):
    
    apply_numpy2 = lambda x : list(map(lookup_articles_from_cat, x))
    apply_numpy = lambda x : list(map(apply_numpy2,x))
    
    reco_codes, reco_score = model_iir.recommend(user_cats, train_df[user_cats], N=reco_size, filter_already_liked_items=True) 
    
    return apply_numpy(reco_codes)

Pour tout le jeu de données

In [73]:
MCS1iir, MCS1iir_df = get_mean_cosine_similarity(data_valid, train_sparse_user_item, valid_sparse_user_item, reco_collaborative_filtering_iir, 5)
print(f"\nmean_cosine_similarity: {MCS1iir:.2f} (sachant que la cosine similarity va de 1 à -1)")


mean_cosine_similarity: 0.55 (sachant que la cosine similarity va de 1 à -1)


Pour les 1000 premiers utilisateurs *(car j'ai du mal à calculer le score sur l'ensemble du jeu de données avec le Content Based Filtering)*

In [74]:
MCS1iir, MCS1iir_df = get_mean_cosine_similarity(data_valid, train_sparse_user_item, valid_sparse_user_item, reco_collaborative_filtering_iir, 5, 1000)
print(f"\nmean_cosine_similarity: {MCS1iir:.2f} (sachant que la cosine similarity va de 1 à -1)")
scores_df.at[model_name,'mean_cosine_similarity'] = MCS1iir


mean_cosine_similarity: 0.57 (sachant que la cosine similarity va de 1 à -1)


#### Regardons le `MAP@k`

In [75]:
map_k = evaluate_collaborative_filtering(train_sparse_user_item_bm25, valid_sparse_user_item_bm25, reco_size, model=model_iir)
print(f"Mean Average Precision @ {reco_size} (ALL users): {map_k:.10f}")

Number of users in the validation set:  84041
Mean Average Precision @ 5 (ALL users): 0.0006338077


In [76]:
map_k = evaluate_collaborative_filtering(train_sparse_user_item_bm25, valid_sparse_user_item_bm25, reco_size, model=model_iir, sample_size=1000)

print(f"Mean Average Precision @ {reco_size} (first 1000 users): {map_k:.10f}")
scores_df.at[model_name,'map@k'] = map_k

Number of users in the validation set:  1000
Mean Average Precision @ 5 (first 1000 users): 0.0017750000


#### Comparons

In [77]:
scores_df

Unnamed: 0_level_0,mean_cosine_similarity,map@k,training_time
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Baseline model - Random,0.482212,0.0,0.0
AlternatingLeastSquares,0.575309,0.0003,138.599869
Content Based Filtering,0.576889,0.000207,0.0
AlternatingLeastSquares_with_BM25,0.505538,0.0,85.557782
LogisticMatrixFactorization,0.563861,0.0001,31.612296
BayesianPersonalizedRanking,0.589526,0.00034,9.405733
ItemItemRecommender,0.573461,0.001775,0.393621


### Content Based Filtering basé sur le dernier article lu uniquement

In [78]:
%%time
def f(user_id):
    
    histo_size = 1
    
    # --- Get recommendations
    reco =  get_content_based_recommendations(user_id, reco_size, histo_size)
    mean_vector_reco = get_mean_vector(reco.article_id)
    
    # --- Compute viewed mean_embedding VALID
    viewed_valid = data_valid[data_valid.user_id == user_id]['article_id']
    if len(viewed_valid) < 1: return -1, [], []
    mean_vector_viewed_valid = get_mean_vector(viewed_valid.values)
    
    # --- Compute similarities
    cosine_similarity_pred_viewed = cosine_similarity(mean_vector_reco.reshape(1, -1), mean_vector_viewed_valid.reshape(1, -1))
    
    # --- Return values
    return (cosine_similarity_pred_viewed[0][0], list(viewed_valid.values), list(reco.article_id))
    
results = []
user_ids = np.sort(data_valid.user_id.unique())

with Pool(multiprocessing.cpu_count()) as p:
    results.append(p.map(f, user_ids[:1000]))

CPU times: user 138 ms, sys: 197 ms, total: 336 ms
Wall time: 5min 26s


In [79]:
mean_cosine_similarity = np.array([x for x,y,z in results[0]]).mean()
mean_cosine_similarity

print(f"\nmean_cosine_similarity (de 1 à -1) entre 'recommended' & 'viewed_in_valid' = {mean_cosine_similarity:.2f} (que l'on compare donc avec les 0.57 du Collaborative Filtering ALS)")

model_name = "Content Based Filtering based on last read article"
scores_df.at[model_name,'mean_cosine_similarity'] = mean_cosine_similarity
scores_df.at[model_name,'training_time'] = 0


mean_cosine_similarity (de 1 à -1) entre 'recommended' & 'viewed_in_valid' = 0.41 (que l'on compare donc avec les 0.57 du Collaborative Filtering ALS)


In [80]:
y_true = [y for x,y,z in results[0]]
y_reco = [z for x,y,z in results[0]]
map_k = metrics.mapk(y_true, y_reco, k=reco_size)

print(f"Mean Average Precision @ {reco_size} (first 1000 users): {map_k:.10f}")
scores_df.at[model_name,'map@k'] = map_k

Mean Average Precision @ 5 (first 1000 users): 0.0006000000


### Content Based Filtering basé sur les 5 derniers articles lus uniquement

In [81]:
%%time
def f(user_id):
    
    histo_size = 5
    
    # --- Get recommendations
    reco =  get_content_based_recommendations(user_id, reco_size, histo_size)
    mean_vector_reco = get_mean_vector(reco.article_id)
    
    # --- Compute viewed mean_embedding VALID
    viewed_valid = data_valid[data_valid.user_id == user_id]['article_id']
    if len(viewed_valid) < 1: return -1, [], []
    mean_vector_viewed_valid = get_mean_vector(viewed_valid.values)
    
    # --- Compute similarities
    cosine_similarity_pred_viewed = cosine_similarity(mean_vector_reco.reshape(1, -1), mean_vector_viewed_valid.reshape(1, -1))
    
    # --- Return values
    return (cosine_similarity_pred_viewed[0][0], list(viewed_valid.values), list(reco.article_id))
    
results = []
user_ids = np.sort(data_valid.user_id.unique())

with Pool(multiprocessing.cpu_count()) as p:
    results.append(p.map(f, user_ids[:1000]))

CPU times: user 250 ms, sys: 247 ms, total: 497 ms
Wall time: 5min 21s


In [82]:
mean_cosine_similarity = np.array([x for x,y,z in results[0]]).mean()
mean_cosine_similarity

print(f"\nmean_cosine_similarity (de 1 à -1) entre 'recommended' & 'viewed_in_valid' = {mean_cosine_similarity:.2f} (que l'on compare donc avec les 0.57 du Collaborative Filtering ALS)")

model_name = "Content Based Filtering based on 5 last read articles"
scores_df.at[model_name,'mean_cosine_similarity'] = mean_cosine_similarity
scores_df.at[model_name,'training_time'] = 0


mean_cosine_similarity (de 1 à -1) entre 'recommended' & 'viewed_in_valid' = 0.52 (que l'on compare donc avec les 0.57 du Collaborative Filtering ALS)


In [83]:
y_true = [y for x,y,z in results[0]]
y_reco = [z for x,y,z in results[0]]
map_k = metrics.mapk(y_true, y_reco, k=reco_size)

print(f"Mean Average Precision @ {reco_size} (first 1000 users): {map_k:.10f}")
scores_df.at[model_name,'map@k'] = map_k

Mean Average Precision @ 5 (first 1000 users): 0.0000000000


## 1.5 Revue des scores <a class="anchor" id="candidates_scores"></a> [⇪](#menu)

In [84]:
scores_df

Unnamed: 0_level_0,mean_cosine_similarity,map@k,training_time
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Baseline model - Random,0.482212,0.0,0.0
AlternatingLeastSquares,0.575309,0.0003,138.599869
Content Based Filtering,0.576889,0.000207,0.0
AlternatingLeastSquares_with_BM25,0.505538,0.0,85.557782
LogisticMatrixFactorization,0.563861,0.0001,31.612296
BayesianPersonalizedRanking,0.589526,0.00034,9.405733
ItemItemRecommender,0.573461,0.001775,0.393621
Content Based Filtering based on last read article,0.412783,0.0006,0.0
Content Based Filtering based on 5 last read articles,0.518529,0.0,0.0


---
---
# 2. Préparons le modèle Hybrid à déployer <a class="anchor" id="prepare_hybrid"></a> [⇪](#menu)

### Sauvegardons le modèle choisi

In [104]:
joblib.dump((model_bpr, train_sparse_user_item, article_lookup_cat_id, user_lookup_id_cat), pathlib.Path('azure_function','data','collaborative_recommender.pkl'))

['azure_function/data/collaborative_recommender.pkl']

In [105]:
joblib.dump((article_embedding, data_train), pathlib.Path('azure_function','data','content_based_recommender.pkl'))

['azure_function/data/content_based_recommender.pkl']

### Chargeons le modèle exporté

In [106]:
(reco_als, sparse_matrix, article_lookup_cat_id, user_lookup_id_cat) = joblib.load(pathlib.Path('azure_function','data','collaborative_recommender.pkl'))
(article_embedding, data_train) = joblib.load(pathlib.Path('azure_function','data','content_based_recommender.pkl'))

In [107]:
user_id = 299017

### Préparons une fonction de recommandation pour le Collaborative filtering

In [108]:
def get_collaborative_recommendations( user_id, reco_size ):
    
    user_codes = user_lookup_id_cat['user_cat'][user_id]
    codes, scores = reco_als.recommend(user_codes, sparse_matrix[user_codes], N=reco_size, filter_already_liked_items=True)
    recommendations = pd.DataFrame(np.vstack((codes, scores)).T, columns=['article_id', 'score'])
    recommendations['article_id'] = recommendations['article_id'].apply(lambda x: article_lookup_cat_id['article_id'][x])
    return recommendations

candidates_collaborative_filtering = get_collaborative_recommendations(user_id, 5)
candidates_collaborative_filtering

Unnamed: 0,article_id,score
0,234698,0.888624
1,250043,0.828257
2,215674,0.810039
3,138930,0.792689
4,277067,0.792326


### Préparons une fonction de recommandation pour le Content Based Filtering

In [109]:
def get_content_based_recommendations( user_id, reco_size=5 ):
    print(user_id)
    
    # --- Compute viewed mean_embedding
    viewed_train = data_train[data_train.user_id == user_id]['article_id']
    # if len(viewed_train) < 1: return
    mean_vector_viewed_train = get_mean_vector(viewed_train.values)
    
    # --- Compute cosine similarity the mean of the viewed articles and the rest
    A = article_embedding.copy()
    B = mean_vector_viewed_train
    A[viewed_train.values] = -B # Cancel already read articles
    cosine = cosine_similarity(A, B.reshape(1, -1))
    
    recommendations =  recommend_articles(cosine, reco_size)
    return recommendations
    
candidates_content_based_filtering = get_content_based_recommendations(user_id, 5)
candidates_content_based_filtering

299017


Unnamed: 0,article_id,cosine_sim
0,312610,0.926546
1,324693,0.923426
2,313922,0.922881
3,314113,0.918606
4,313556,0.918562


### Préparons une fonction de recommandation unissant le Collaborative Filtering et le Content Based Filtering

In [110]:
def get_reco( user_id, reco_size ):
    
    # Get Collaborative Filtering Recommendations
    candidates_collaborative_filtering = get_collaborative_recommendations(user_id, reco_size)
    
    # Get Content Based Recommendations
    candidates_content_based_filtering = get_content_based_recommendations(user_id, reco_size)
    
    # Select some of them at random
    cf_size, cbf_size = math.floor(reco_size/2), math.ceil(reco_size/2)  # 1/2 vs 1/2 avec priorité cbf
    #cf_size, cbf_size = round(reco_size/3*1), round(reco_size/3*2)  # 1/3 vs 2/3
    p_cf = candidates_collaborative_filtering.sample(cf_size)
    p_cbf = candidates_content_based_filtering.sample(cbf_size)
    
    # Return the selected articles_id
    return np.concatenate((p_cf['article_id'].values, p_cbf['article_id'].values)). astype(int)

get_reco( user_id, 5 )

299017


array([138930, 250043, 314113, 312610, 313922])