In [1]:
import time
import pickle 
import random

import pandas as pd
import numpy as np
from numpy.linalg import norm

import scipy.sparse as sparse
from sklearn.metrics.pairwise import cosine_similarity

import implicit
from implicit.nearest_neighbours import bm25_weight
from implicit import evaluation

import ml_metrics

random_seed = 0
np.random.seed(random_seed)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
scores_df = pd.DataFrame([], columns=['model_name', 'mean_cosine_similarity', 'precision@k', 'map@k', 'ndcg@k', 'training_time'])
scores_df.set_index('model_name', inplace=True)
scores_df.head()

Unnamed: 0_level_0,mean_cosine_similarity,precision@k,map@k,ndcg@k,training_time
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


### Chargeons les jeux de données `training` et `validation`

In [3]:
data_train = pd.read_csv('data/data_train.csv')
display(data_train.head(3), data_train.shape)

Unnamed: 0,user_id,article_id,score
0,59,234853,0.214286
1,79,159359,0.215827
2,154,96663,0.145631


(1577295, 3)

In [4]:
data_valid = pd.read_csv('data/data_valid.csv')
display(data_valid.head(3), data_valid.shape)

Unnamed: 0,user_id,article_id,score
0,279777,96210,0.109489
1,29634,284773,0.469863
2,55,162605,1.324273


(241105, 3)

### Chargons les embeddings

In [5]:
file = open('data/news-portal-user-interactions-by-globocom/articles_embeddings.pickle',"rb")
article_embedding = pickle.load(file)

In [6]:
display(article_embedding[:5], article_embedding.shape)

array([[-0.16118301, -0.95723313, -0.13794445, ..., -0.231686  ,
         0.5974159 ,  0.40962312],
       [-0.52321565, -0.974058  ,  0.73860806, ...,  0.18282819,
         0.39708954, -0.83436364],
       [-0.61961854, -0.9729604 , -0.20736018, ..., -0.44758022,
         0.8059317 , -0.28528407],
       [-0.7408434 , -0.97574896,  0.39169782, ..., -0.5378381 ,
         0.24354108, -0.8853287 ],
       [-0.2790515 , -0.97231525,  0.68537366, ..., -0.42406067,
         0.18548405, -0.5802922 ]], dtype=float32)

(364047, 250)

# 1. Candidate generation

## 1.1 Collaborative Filtering

### Préparons une sparse matrix pour entrainer nos algorithmes de collaborative filtering

In [7]:
# --- Train ---
data_train['user_cat_code'] = data_train['user_id'].astype('category').cat.codes
data_train['article_cat_code'] = data_train['article_id'].astype('category').cat.codes

# train_sparse_item_user = sparse.csr_matrix((data_train['score'].astype(float), (data_train['article_cat_code'], data_train['user_cat_code'])))
train_sparse_user_item = sparse.csr_matrix((data_train['score'].astype(float), (data_train['user_cat_code'], data_train['article_cat_code'])))
display(train_sparse_user_item.shape)

# --- Validation ---
data_valid['user_cat_code'] = data_valid['user_id'].astype('category').cat.codes
data_valid['article_cat_code'] = data_valid['article_id'].astype('category').cat.codes

# valid_sparse_item_user = sparse.csr_matrix((data_valid['score'].astype(float), (data_valid['article_cat_code'], data_valid['user_cat_code'])))
valid_sparse_user_item = sparse.csr_matrix((data_valid['score'].astype(float), (data_valid['user_cat_code'], data_valid['article_cat_code'])))

display(valid_sparse_user_item.shape)

(297141, 28002)

(84041, 7576)

### Utilisons un système permettant de rééquilibrer les notes implicites *(pour éviter de donner trop d'importance aux articles qui ont un très gros ratio `temps de lecture` / `nombre de mots`)*

In [8]:
# train_sparse_item_user_bm25 = bm25_weight(sparse_item_user, K1=100, B=0.9).tocsr()
train_sparse_user_item_bm25 = bm25_weight(train_sparse_user_item, K1=100, B=0.9).tocsr() # Implicit veut des matrices [user x item]
valid_sparse_user_item_bm25 = bm25_weight(valid_sparse_user_item, K1=100, B=0.9).tocsr() # Implicit veut des matrices [user x item]

display(train_sparse_user_item_bm25.shape)
display(valid_sparse_user_item_bm25.shape)

(297141, 28002)

(84041, 7576)

### Entrainons un premier modèle pour calculer les embeddings utilisateurs et faire des recommandations

In [9]:
model_bm25 = implicit.als.AlternatingLeastSquares(
    factors=32, 
    regularization=0.05, 
    iterations=50,
    alpha=40
)

model_name = f"{model_bm25.__class__.__name__}_with_BM25"
t0 = time.perf_counter()
model_bm25.fit(train_sparse_user_item)
scores_df.at[model_name,'training_time'] = time.perf_counter() - t0

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [03:38<00:00,  4.37s/it]


### Testons une `recommandation sur la base d'un ou plusieurs utilisateurs`

In [10]:
# Make recommendations for the first 10 users in the dataset
userids = [59, 1024] # liste d'ID utilisateurs
rec_size = 5

codes, scores = model_bm25.recommend(userids, valid_sparse_user_item_bm25[userids], N=rec_size, filter_already_liked_items=True) 

for i, user_id in enumerate(userids):
    print(f"\n --- Liste d'articles candidats pour l'utilisateur {user_id} --- \n")
    
    for code, score in zip(codes[i], scores[i]):
        idx = data_train[data_train.article_cat_code==code]['article_id'].iloc[0]
        print(f"catCode: {code:10} \t article_id: {idx:10} \t score: {score:.2f}")


 --- Liste d'articles candidats pour l'utilisateur 59 --- 

catCode:       9556 	 article_id:     108854 	 score: 0.43
catCode:       5718 	 article_id:      68866 	 score: 0.37
catCode:      13935 	 article_id:     168868 	 score: 0.35
catCode:      10394 	 article_id:     119592 	 score: 0.35
catCode:       8217 	 article_id:      96663 	 score: 0.32

 --- Liste d'articles candidats pour l'utilisateur 1024 --- 

catCode:      26186 	 article_id:     336221 	 score: 0.50
catCode:      18722 	 article_id:     234698 	 score: 0.48
catCode:       8014 	 article_id:      95716 	 score: 0.39
catCode:      18926 	 article_id:     235870 	 score: 0.38
catCode:      26187 	 article_id:     336223 	 score: 0.38


### Testons une `recommandation sur la base d'un article` *(ce n'est pas le but d'un Collaborative Filtering, mais on peut le faire alors autant l'essayer)*

In [11]:
article_id = 162605
article_code = data_train[data_train.article_id == article_id]['article_cat_code'].iloc[0]
rec_size = 5

# Get similar items.
codes, scores = model_bm25.similar_items(article_code, N=rec_size , filter_items=[article_code])

print(f"\n --- Liste d'articles candidats sur la base de l'article {article_id} --- \n")
for code, score in zip(codes, scores):
    idx = data_train[data_train.article_cat_code==code]['article_id'].iloc[0]
    print(f"catCode: {code:10} \t article_id: {idx:10} \t score: {score:.2f}")


 --- Liste d'articles candidats sur la base de l'article 162605 --- 

catCode:       6359 	 article_id:      74112 	 score: 0.93
catCode:      13314 	 article_id:     162107 	 score: 0.82
catCode:      22526 	 article_id:     285045 	 score: 0.79
catCode:       9266 	 article_id:     106664 	 score: 0.71
catCode:      13000 	 article_id:     159197 	 score: 0.71


### Evaluons le modèle

> **Nous devons nous rappeler que la recommandation n'est pas une prédiction.**<br>
> S'appuyer sur des métriques ML pour déterminer la performance d'un système de recommandation n'est pas suffisant.<br>
> Seul **le retour des utilisateurs apporte des résultats valables et c'est pourquoi les tests A/B devraient toujours être priviligiés**.

- Dans la mesure ou notre jeu de données **ne dispose pas de scores explicites**, il ne parrait pas souhaitable d'utiliser des métrique du type `MAE` ou `RMSE`.
- Dans la mesure ou l'on **ne cherche pas particulièrement à obtenir un ordre précis**, il ne parrait pas souhaitable d'utiliser des métriques de ranking comme le `MAP@K` ou le `nDCCG`.
- Nous pourrions donc nous tourner vers la `Precision@k`, le `Recall@K` et donc le `F1@k`, mais il est probable que ce ne soit pas très représentatif.

#### Regardons la precision@k

In [12]:
precision_k = evaluation.precision_at_k(model_bm25, train_sparse_user_item_bm25, valid_sparse_user_item_bm25, K=5, show_progress=True, num_threads=1)
print(precision_k)
scores_df.at[model_name,'precision@k'] = precision_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:24<00:00, 3453.41it/s]

0.00015757212864188583





> Le probleme c'est même si c'est l'une des métriques disponible les plus adaptée, elle reste peu adaptée à notre problème...<br>
> Ici `Precision = (# of top k recommendations that are relevant)/(# of items that are recommended)`<br>
> Mais malgré un nombre d'article assez large, on ne recommande que 5 articles et les utilisateurs ont un historique assez faible dans notre jeu de données. Donc les chances de recommander un article parmi 5 qui a effectivement été lu ensuite par l'utilisateur sont vraiment faible.

#### Regardons le MAP@k

In [13]:
map_k = evaluation.mean_average_precision_at_k(model_bm25, train_sparse_user_item_bm25, valid_sparse_user_item_bm25, K=5, show_progress=True, num_threads=1)
print(map_k)
scores_df.at[model_name,'map@k'] = map_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:24<00:00, 3457.78it/s]

5.749508507091116e-05





#### Regardons le nDCG@k

In [14]:
ndcg_k = evaluation.ndcg_at_k(model_bm25, train_sparse_user_item_bm25, valid_sparse_user_item_bm25, K=5, show_progress=True, num_threads=1)
print(ndcg_k)
scores_df.at[model_name,'ndcg@k'] = ndcg_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:23<00:00, 3516.78it/s]

9.955403366734356e-05





### Construisons une métrique sur mesure

Pour avoir une idée une métrique utilsable pour comparer nos différents modèles, nous pourrions comparer l'embedding moyen des articles lus APRÈS *(donc les actions contenues dans data_valid)* avec l'embedding moyen des article recommandés ET avec l'embedding moyen des articles recommandés.

#### Calculons la cosine similarity moyenne sur le jeu de validation

In [15]:
article_lookup = pd.DataFrame(data_train.groupby('article_cat_code')['article_id'].apply(lambda x: list(x)[0])).to_dict()

def lookup_articles(x):
    try:
        return article_lookup['article_id'][x]
    except Exception:
        return -1

In [16]:
def get_mean_cosine_similarity(data_ref, reco_ref, reco_model, reco_size = 5, top_users=None):

    # --- for each user, get the ids of the articles he/she has read
    select = data_ref.groupby('user_id')['article_id'].apply(list).reset_index(name='article_ids')
    select = select[select.article_ids.map(len) > 1]
    if top_users is not None:
        select = select[:top_users]

    # --- for each user, compute the mean embedding vectors of the articles he/she has read
    select['read_mean_embedding'] = select.apply(lambda x : article_embedding[np.array(x[1])].mean(axis=0), axis=1)

    # --- for each user, make recommendations
    select['reco_article_ids'] = reco_model(select.index, select.read_mean_embedding, reco_ref, reco_size)

    # --- for each user, compute the mean embedding vectors of the recommended articles
    select['reco_mean_embedding'] = select['reco_article_ids'].apply(lambda x : article_embedding[np.array(x)].mean(axis=0))

    # --- for each user, compute the cosine similarity between the read_mean_embedding and the pred_mean_embedding
    select['cosine'] = select.apply(lambda x: cosine_similarity(x['read_mean_embedding'].reshape(1, -1), x['reco_mean_embedding'].reshape(1, -1))[0][0], axis=1)

    # --- reset the index column
    select = select.set_index('user_id')

    # --- Compute & return overall mean cosine similarity
    return select.cosine.mean(), select

In [17]:
def reco_collaborative_filtering_bm25(user_ids, mean_embeddings, reco_ref, reco_size):
    
    apply_numpy2 = lambda x : list(map(lookup_articles,x))
    apply_numpy = lambda x : list(map(apply_numpy2,x))
    
    reco_codes, reco_score = model_bm25.recommend(user_ids, reco_ref[user_ids], N=reco_size, filter_already_liked_items=True) 
    return apply_numpy(reco_codes)

#### Pour tout le jeu de données

In [18]:
MCS, MCS_df = get_mean_cosine_similarity(data_valid, valid_sparse_user_item_bm25, reco_collaborative_filtering_bm25, 5)

display(MCS_df.head(), MCS_df.shape)
print(f"\nmean_cosine_similarity: {MCS:.2f} (sachant que la cosine similarity va de 1 à -1)")

Unnamed: 0_level_0,article_ids,read_mean_embedding,reco_article_ids,reco_mean_embedding,cosine
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,"[225010, 69353, 161872, 205845, 57748, 157815,...","[-0.024590481, -0.9645934, -0.06022447, -0.023...","[119592, 96663, 160474, 284463, 108854]","[-0.43367648, -0.96880037, -0.12601939, -0.163...",0.718369
7,"[199474, 87223, 352979, 284470, 36162, 156279]","[-0.11400774, -0.96468645, -0.32728586, -0.201...","[336499, 336223, 271261, 87224, 225019]","[-0.21747354, -0.9656499, -0.51685613, -0.3808...",0.692803
8,"[331116, 96141, 234481]","[-0.32962552, -0.97033435, 0.20547153, -0.0870...","[284096, 348113, 160417, 161191, 352902]","[-0.2786653, -0.96174604, -0.08292996, -0.0697...",0.687489
10,"[196588, 193449, 195689, 65991, 100931, 58556,...","[-0.23681411, -0.96241695, -0.08872973, -0.346...","[234698, 336221, 156964, 64409, 235870]","[-0.17372157, -0.9607512, -0.47276193, -0.2004...",0.715739
11,"[208582, 96877, 195177, 205824]","[-0.15431535, -0.9709704, 0.069619074, 0.13036...","[336223, 272660, 129434, 59057, 336220]","[-0.29166535, -0.9653824, -0.52180314, -0.4616...",0.413324


(46638, 5)


mean_cosine_similarity: 0.51 (sachant que la cosine similarity va de 1 à -1)


#### Pour les 1000 premiers utilisateurs *(car j'ai du mal à calculer le score sur l'ensemble du jeu de données avec le Content Based Filtering)*

In [19]:
MCS1, MCS1_df = get_mean_cosine_similarity(data_valid, valid_sparse_user_item_bm25, reco_collaborative_filtering_bm25, 5, 1000)
print(f"\nmean_cosine_similarity: {MCS1:.2f} (sachant que la cosine similarity va de 1 à -1)")
scores_df.at[model_name,'mean_cosine_similarity'] = MCS1


mean_cosine_similarity: 0.55 (sachant que la cosine similarity va de 1 à -1)


#### Comparons

In [20]:
scores_df

Unnamed: 0_level_0,mean_cosine_similarity,precision@k,map@k,ndcg@k,training_time
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AlternatingLeastSquares_with_BM25,0.546562,0.000158,5.7e-05,0.0001,218.431039


## 1.2 Content Based Filtering

articles_embeddings.pickle Pickle (Python 3) of a NumPy matrix containing the Article Content Embeddings (250-dimensional vectors), trained upon articles' text and metadata by the CHAMELEON's ACR module (see paper for details) for 364047 published articles.
P.s. The full text of news articles could not be provided due to license restrictions, but those embeddings can be used by Neural Networks to represent their content. See this paper for a t-SNE visualization of these embeddings, colored by category.

### Trouvons l'embedding moyen des articles lus par un utilisateur donnée

In [21]:
user_id = 20137
history_size = 5

def get_mean_vector(articles_idx):
    apply_numpy = lambda x : article_embedding[x].mean(axis=0)
    return apply_numpy(articles_idx)            

user_articles_idx = data_train[data_train.user_id == user_id]['article_id']
last_articles_idx = user_articles_idx.iloc[-history_size:].values #.sort_values('click_timestamp')
mean_vector = get_mean_vector(last_articles_idx)

print(f"Articles utilisés dans le mean embedding: {last_articles_idx}")

Articles utilisés dans le mean embedding: [288440 337441 202476 250043 284583]


### Calculons la similarité de cet embedding avec les embeddings des articles présents dans note fichier

In [22]:
def get_cosine(article_embedding, mean_vector, user_articles_idx=None):
    
    A = article_embedding.copy()
    B = mean_vector

    # --- Assurons nous de ne pas recommander les article déjà lus par cet utilisateur
    if user_articles_idx is not None:
        A[user_articles_idx] = -B # je donne aux articles lus un embedding inverse de celui que l'on cible
 
    # --- Calculons la cosine similarity entre l'embedding moyen de l'utilisateur et les articles connus
    return np.dot(A,B)/(norm(A, axis=1)*norm(B))

# --- Calculons la cosine similarity entre l'embedding moyen de l'utilisateur et les articles connus
cosine = get_cosine(article_embedding, mean_vector, user_articles_idx)
print("Cosine Similarity:", cosine, cosine.shape)

Cosine Similarity: [0.32582363 0.2783536  0.31348327 ... 0.38568467 0.2742523  0.42668363] (364047,)


### Recommandons 5 articles à l'utilisateur

In [23]:
def recommend_articles(cosine, pred_size=5):
    cos = pd.DataFrame(cosine, columns=['cosine_sim'])
    selection = cos.sort_values('cosine_sim', ascending=False)[:pred_size]
    selection.reset_index(inplace=True)
    selection.rename(columns={'index':'article_id'}, inplace=True)
    return selection

reco = recommend_articles(cosine, 5)
reco

Unnamed: 0,article_id,cosine_sim
0,284768,0.830968
1,345593,0.821919
2,285424,0.821205
3,345566,0.818335
4,283576,0.816523


### Comparons avec les articles consultés par cet utilisateur dans le validation_set

In [24]:
mean_vector_recommended = get_mean_vector(reco.article_id)

In [25]:
viewed = data_valid[data_valid.user_id == user_id]['article_id'].values
mean_vector_viewed = get_mean_vector(viewed)

#### Similarité entre les articles lus dans le `valid_set` et les articles prédis

In [26]:
def get_cosine_similarity(A, B):
    return np.dot(A,B)/(norm(A)*norm(B))

cosine_similarity_score = get_cosine_similarity(mean_vector_recommended, mean_vector_viewed)
print("Cosine Similarity:", cosine_similarity_score)

Cosine Similarity: 0.71754134


#### Similarité entre les articles lus dans le `train_set` et les articles lus dans le `valid_set`

In [27]:
cosine_similarity_score = get_cosine_similarity(mean_vector, mean_vector_viewed)
print("Cosine Similarity:", cosine_similarity_score)

Cosine Similarity: 0.7984939


#### Calculons la cosine similarity moyenne sur l'ensemble du jeu de validation *(⚠️ en fait une fraction car c'est beaucoup trop lent)*

In [28]:
cosine_pred_viewed = []
cosine_hist_viewed = []

# --- for each user, get the ids of the articles he/she has read
select = data_valid.groupby('user_id')['article_id'].apply(list).reset_index(name='article_ids')
select = select[select.article_ids.map(len) > 1]

# --- for each user, compute the mean embedding vectors of the articles he/she has read
select['read_mean_embedding'] = select.apply(lambda x : article_embedding[np.array(x[1])].mean(axis=0), axis=1)

for user_id in select.index[:1000]:
    # print(user_id)
    
    # Compute viewed mean_embedding
    viewed = data_valid[data_valid.user_id == user_id]['article_id']
    if len(viewed) < 2: continue # 
    mean_vector_viewed = get_mean_vector(viewed)
    
    # Compute recommended mean_embedding
    A = article_embedding.copy()
    B = select['read_mean_embedding'].iloc[user_id]
    A[select['article_ids'].iloc[user_id]] = -B # On supprime les articles déjà lus
    cosine = cosine_similarity(A, B.reshape(1, -1))
    
    pred = recommend_articles(cosine, 5)
    mean_vector_pred = get_mean_vector(pred.article_id)
    
    # Compute similarities
    cosine_similarity_pred_viewed = cosine_similarity(mean_vector_pred.reshape(1, -1), mean_vector_viewed.reshape(1, -1)) # get_cosine_similarity(mean_vector_pred, mean_vector_viewed)
    cosine_similarity_hist_viewed = cosine_similarity(B.reshape(1, -1), mean_vector_viewed.reshape(1, -1)) # get_cosine_similarity(mean_vector_hist, mean_vector_viewed)
    
    cosine_pred_viewed.append(cosine_similarity_pred_viewed)
    cosine_hist_viewed.append(cosine_similarity_hist_viewed)

In [29]:
print(f"\nmean_cosine_similarity (de 1 à -1) entre 'recommended' & 'viewed_in_valid' = {np.mean(cosine_pred_viewed):.2f} (que l'on compare donc avec les 0.55 du Collaborative Filtering ALS)")
print(f"\nmean_cosine_similarity (de 1 à -1) entre 'viewed_in_train' & 'viewed_in_valid' = {np.mean(cosine_hist_viewed):.2f} (pour donner une idée de la variabilité des centres d'intérêts utilisateurs")

model_name = "Content Based Filtering"
scores_df.at[model_name,'mean_cosine_similarity'] = np.mean(cosine_pred_viewed)
scores_df.at[model_name,'training_time'] = 0


mean_cosine_similarity (de 1 à -1) entre 'recommended' & 'viewed_in_valid' = 0.48 (que l'on compare donc avec les 0.55 du Collaborative Filtering ALS)

mean_cosine_similarity (de 1 à -1) entre 'viewed_in_train' & 'viewed_in_valid' = 0.54 (pour donner une idée de la variabilité des centres d'intérêts utilisateurs


#### Comparons

In [30]:
scores_df

Unnamed: 0_level_0,mean_cosine_similarity,precision@k,map@k,ndcg@k,training_time
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AlternatingLeastSquares_with_BM25,0.546562,0.000158,5.7e-05,0.0001,218.431039
Content Based Filtering,0.479843,,,,0.0


> #### Si l'on se fie au score moyen de similarité entre les recommandations et ce qui a été effectivemet lu (dans le validation set), le `Collaborative filtering` semble plus précis.
> Mais ce n'est pas le seul élément à prendre en compte *(d'autant que cette mesure n'est pas très significative)*.

> `Collaborative Filtering`:
> * plus en accord avec ce qui est dans le validation set par les utilisateurs,
> * plus rapide,
> * limité aux articles visités.<br>

> `Content Based Filtering`:
> * plus lent *(du moins je n'ai pas réussi à le rendre rapide)*,
> * moins prévis *(mais je n'ai pu évaluer que les 100 premiers utlisateurs)*
> * peut recommander n'importe quel article dont on a l'embedding, y compris ceux jamais visités par un utilisateur.

> Dans les deux cas, les modèles employés ne permettent pas de prendre en compte des features supplémentaires comme par exemple la catégorie de l'article etc. *(Il faudrait un Collaborative Filtering en DNN)*

> L'idéal est donc probablement d'utiliser une combinaison des deux approches; le Collaboravie filtering pour être le plus proche possible des intérêts de l'utilisateur, et le Content Based Filtering pour apporter de la diversité sans pour autant trop s'éloigner des sujets de prédilection de l'utilisateur.

## 1.3 Essayons quelques variantes

### AlternatingLeastSquares sans BM25

#### Entrainons un premier modèle pour calculer les embeddings utilisateurs et faire des recommandations

In [31]:
model_als = implicit.als.AlternatingLeastSquares(
    factors=32, 
    regularization=0.05, 
    iterations=50,
    alpha=40
)

model_name = model_als.__class__.__name__
t0 = time.perf_counter()
model_als.fit(train_sparse_user_item)
scores_df.at[model_name,'training_time'] = time.perf_counter() - t0

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [04:10<00:00,  5.00s/it]


#### Calculons la cosine similarity moyenne sur le jeu de validation

In [32]:
def reco_collaborative_filtering_als(user_ids, mean_embeddings, reco_ref, reco_size):
    
    apply_numpy2 = lambda x : list(map(lookup_articles,x))
    apply_numpy = lambda x : list(map(apply_numpy2,x))
    
    reco_codes, reco_score = model_als.recommend(user_ids, reco_ref[user_ids], N=reco_size, filter_already_liked_items=True) 
    return apply_numpy(reco_codes)

Pour tout le jeu de données

In [33]:
MCS1als, MCS1als_df = get_mean_cosine_similarity(data_valid, valid_sparse_user_item, reco_collaborative_filtering_als, 5)
print(f"\nmean_cosine_similarity: {MCS1als:.2f} (sachant que la cosine similarity va de 1 à -1)")


mean_cosine_similarity: 0.51 (sachant que la cosine similarity va de 1 à -1)


Pour les 1000 premiers utilisateurs *(car j'ai du mal à calculer le score sur l'ensemble du jeu de données avec le Content Based Filtering)*

In [34]:
MCS1als, MCS1als_df = get_mean_cosine_similarity(data_valid, valid_sparse_user_item, reco_collaborative_filtering_als, 5, 1000)
print(f"\nmean_cosine_similarity: {MCS1als:.2f} (sachant que la cosine similarity va de 1 à -1)")
scores_df.at[model_name,'mean_cosine_similarity'] = MCS1als


mean_cosine_similarity: 0.55 (sachant que la cosine similarity va de 1 à -1)


#### Regardons la precision@k

In [35]:
precision_k = evaluation.precision_at_k(model_als, train_sparse_user_item, valid_sparse_user_item, K=5, show_progress=True, num_threads=1)
print(precision_k)
scores_df.at[model_name,'precision@k'] = precision_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:23<00:00, 3506.93it/s]

0.00013656251148963438





#### Regardons le MAP@k

In [36]:
map_k = evaluation.mean_average_precision_at_k(model_als, train_sparse_user_item, valid_sparse_user_item, K=5, show_progress=True, num_threads=1)
print(map_k)
scores_df.at[model_name,'map@k'] = map_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:24<00:00, 3370.93it/s]

5.931628609845195e-05





#### Regardons le nDCG@k

In [37]:
ndcg_k = evaluation.ndcg_at_k(model_als, train_sparse_user_item, valid_sparse_user_item, K=5, show_progress=True, num_threads=1)
print(ndcg_k)
scores_df.at[model_name,'ndcg@k'] = ndcg_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:24<00:00, 3368.34it/s]

9.426374434527173e-05





#### Comparons

In [38]:
scores_df

Unnamed: 0_level_0,mean_cosine_similarity,precision@k,map@k,ndcg@k,training_time
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AlternatingLeastSquares_with_BM25,0.546562,0.000158,5.7e-05,0.0001,218.431039
Content Based Filtering,0.479843,,,,0.0
AlternatingLeastSquares,0.549646,0.000137,5.9e-05,9.4e-05,250.276323


### Logistic Matrix Factorization

#### Entrainons un premier modèle pour calculer les embeddings utilisateurs et faire des recommandations

In [39]:
model_lmf = implicit.cpu.lmf.LogisticMatrixFactorization(
    factors=32,
    learning_rate=0.05,
    regularization=0.05, 
    iterations=50,
)

model_name = model_lmf.__class__.__name__
t0 = time.perf_counter()
model_lmf.fit(train_sparse_user_item)
scores_df.at[model_name,'training_time'] = time.perf_counter() - t0

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:26<00:00,  1.91it/s]


#### Calculons la cosine similarity moyenne sur le jeu de validation

In [40]:
def reco_collaborative_filtering_lmf(user_ids, mean_embeddings, reco_ref, reco_size):
    
    apply_numpy2 = lambda x : list(map(lookup_articles,x))
    apply_numpy = lambda x : list(map(apply_numpy2,x))
    
    reco_codes, reco_score = model_lmf.recommend(user_ids, reco_ref[user_ids], N=reco_size, filter_already_liked_items=True) 
    return apply_numpy(reco_codes)

Pour tout le jeu de données

In [41]:
MCS1lmf, MCS1lmf_df = get_mean_cosine_similarity(data_valid, valid_sparse_user_item, reco_collaborative_filtering_lmf, 5)
print(f"\nmean_cosine_similarity: {MCS1lmf:.2f} (sachant que la cosine similarity va de 1 à -1)")


mean_cosine_similarity: 0.49 (sachant que la cosine similarity va de 1 à -1)


Pour les 1000 premiers utilisateurs *(car j'ai du mal à calculer le score sur l'ensemble du jeu de données avec le Content Based Filtering)*

In [42]:
MCS1lmf, MCS1lmf_df = get_mean_cosine_similarity(data_valid, valid_sparse_user_item, reco_collaborative_filtering_lmf, 5, 1000)
print(f"\nmean_cosine_similarity: {MCS1lmf:.2f} (sachant que la cosine similarity va de 1 à -1)")
scores_df.at[model_name,'mean_cosine_similarity'] = MCS1lmf


mean_cosine_similarity: 0.52 (sachant que la cosine similarity va de 1 à -1)


#### Regardons la precision@k

In [43]:
precision_k = evaluation.precision_at_k(model_lmf, train_sparse_user_item, valid_sparse_user_item, K=5, show_progress=True, num_threads=1)
print(precision_k)
scores_df.at[model_name,'precision@k'] = precision_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:25<00:00, 3340.76it/s]

0.0003309014701479602





#### Regardons le MAP@k

In [44]:
map_k = evaluation.mean_average_precision_at_k(model_lmf, train_sparse_user_item, valid_sparse_user_item, K=5, show_progress=True, num_threads=1)
print(map_k)
scores_df.at[model_name,'map@k'] = map_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:25<00:00, 3332.78it/s]

0.00012547116552899442





#### Regardons le nDCG@k

In [45]:
ndcg_k = evaluation.ndcg_at_k(model_lmf, train_sparse_user_item, valid_sparse_user_item, K=5, show_progress=True, num_threads=1)
print(ndcg_k)
scores_df.at[model_name,'ndcg@k'] = ndcg_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:25<00:00, 3312.53it/s]

0.00021629735503866384





#### Comparons

In [46]:
scores_df

Unnamed: 0_level_0,mean_cosine_similarity,precision@k,map@k,ndcg@k,training_time
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AlternatingLeastSquares_with_BM25,0.546562,0.000158,5.7e-05,0.0001,218.431039
Content Based Filtering,0.479843,,,,0.0
AlternatingLeastSquares,0.549646,0.000137,5.9e-05,9.4e-05,250.276323
LogisticMatrixFactorization,0.515754,0.000331,0.000125,0.000216,26.603471


### Bayesian Personalized Ranking

#### Entrainons un premier modèle pour calculer les embeddings utilisateurs et faire des recommandations

In [47]:
model_bpr = implicit.cpu.bpr.BayesianPersonalizedRanking(
    factors=32,
    learning_rate=0.05,
    regularization=0.05, 
    iterations=50,
)

model_name = model_bpr.__class__.__name__
t0 = time.perf_counter()
model_bpr.fit(train_sparse_user_item)
scores_df.at[model_name,'training_time'] = time.perf_counter() - t0

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:07<00:00,  6.56it/s, train_auc=88.18%, skipped=4.26%]


#### Calculons la cosine similarity moyenne sur le jeu de validation

In [48]:
def reco_collaborative_filtering_bpr(user_ids, mean_embeddings, reco_ref, reco_size):
    
    apply_numpy2 = lambda x : list(map(lookup_articles,x))
    apply_numpy = lambda x : list(map(apply_numpy2,x))
    
    reco_codes, reco_score = model_bpr.recommend(user_ids, reco_ref[user_ids], N=reco_size, filter_already_liked_items=True) 
    return apply_numpy(reco_codes)

Pour tout le jeu de données

In [49]:
MCS1bpr, MCS1bpr_df = get_mean_cosine_similarity(data_valid, valid_sparse_user_item, reco_collaborative_filtering_bpr, 5)
print(f"\nmean_cosine_similarity: {MCS1bpr:.2f} (sachant que la cosine similarity va de 1 à -1)")


mean_cosine_similarity: 0.49 (sachant que la cosine similarity va de 1 à -1)


Pour les 1000 premiers utilisateurs *(car j'ai du mal à calculer le score sur l'ensemble du jeu de données avec le Content Based Filtering)*

In [50]:
MCS1bpr, MCS1bpr_df = get_mean_cosine_similarity(data_valid, valid_sparse_user_item, reco_collaborative_filtering_bpr, 5, 1000)
print(f"\nmean_cosine_similarity: {MCS1bpr:.2f} (sachant que la cosine similarity va de 1 à -1)")
scores_df.at[model_name,'mean_cosine_similarity'] = MCS1bpr


mean_cosine_similarity: 0.53 (sachant que la cosine similarity va de 1 à -1)


#### Regardons la precision@k

In [51]:
precision_k = evaluation.precision_at_k(model_bpr, train_sparse_user_item, valid_sparse_user_item, K=5, show_progress=True, num_threads=1)
print(precision_k)
scores_df.at[model_name,'precision@k'] = precision_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:25<00:00, 3316.40it/s]

9.454327718513149e-05





#### Regardons le MAP@k

In [52]:
map_k = evaluation.mean_average_precision_at_k(model_bpr, train_sparse_user_item, valid_sparse_user_item, K=5, show_progress=True, num_threads=1)
print(map_k)
scores_df.at[model_name,'map@k'] = map_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:25<00:00, 3287.23it/s]

2.3563234347256432e-05





#### Regardons le nDCG@k

In [53]:
ndcg_k = evaluation.ndcg_at_k(model_bpr, train_sparse_user_item, valid_sparse_user_item, K=5, show_progress=True, num_threads=1)
print(ndcg_k)
scores_df.at[model_name,'ndcg@k'] = ndcg_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:25<00:00, 3343.85it/s]

4.9678635050222066e-05





#### Comparons

In [54]:
scores_df

Unnamed: 0_level_0,mean_cosine_similarity,precision@k,map@k,ndcg@k,training_time
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AlternatingLeastSquares_with_BM25,0.546562,0.000158,5.7e-05,0.0001,218.431039
Content Based Filtering,0.479843,,,,0.0
AlternatingLeastSquares,0.549646,0.000137,5.9e-05,9.4e-05,250.276323
LogisticMatrixFactorization,0.515754,0.000331,0.000125,0.000216,26.603471
BayesianPersonalizedRanking,0.525987,9.5e-05,2.4e-05,5e-05,7.832134


### Item Item Recommender

In [55]:
model_name = "ItemItemRecommender"

#### Entrainons un premier modèle pour calculer les embeddings utilisateurs et faire des recommandations

In [56]:
model_iir = implicit.nearest_neighbours.ItemItemRecommender()

model_name = model_iir.__class__.__name__
t0 = time.perf_counter()
model_iir.fit(train_sparse_user_item)
scores_df.at[model_name,'training_time'] = time.perf_counter() - t0

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28002/28002 [00:00<00:00, 115543.13it/s]


#### Calculons la cosine similarity moyenne sur le jeu de validation

In [57]:
def reco_collaborative_filtering_iir(user_ids, mean_embeddings, reco_ref, reco_size):
    
    apply_numpy2 = lambda x : list(map(lookup_articles,x))
    apply_numpy = lambda x : list(map(apply_numpy2,x))
    
    reco_codes, reco_score = model_iir.recommend(user_ids, reco_ref[user_ids], N=reco_size, filter_already_liked_items=True) 
    return apply_numpy(reco_codes)

Pour tout le jeu de données

In [58]:
MCS1iir, MCS1iir_df = get_mean_cosine_similarity(data_valid, valid_sparse_user_item, reco_collaborative_filtering_iir, 5)
print(f"\nmean_cosine_similarity: {MCS1iir:.2f} (sachant que la cosine similarity va de 1 à -1)")


mean_cosine_similarity: 0.49 (sachant que la cosine similarity va de 1 à -1)


Pour les 1000 premiers utilisateurs *(car j'ai du mal à calculer le score sur l'ensemble du jeu de données avec le Content Based Filtering)*

In [59]:
MCS1iir, MCS1iir_df = get_mean_cosine_similarity(data_valid, valid_sparse_user_item, reco_collaborative_filtering_iir, 5, 1000)
print(f"\nmean_cosine_similarity: {MCS1iir:.2f} (sachant que la cosine similarity va de 1 à -1)")
scores_df.at[model_name,'mean_cosine_similarity'] = MCS1iir


mean_cosine_similarity: 0.51 (sachant que la cosine similarity va de 1 à -1)


#### Regardons la precision@k

In [60]:
precision_k = evaluation.precision_at_k(model_iir, train_sparse_user_item, valid_sparse_user_item, K=5, show_progress=True, num_threads=1)
print(precision_k)
scores_df.at[model_name,'precision@k'] = precision_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:09<00:00, 8909.93it/s]

0.0005567548545346632





#### Regardons le MAP@k

In [61]:
map_k = evaluation.mean_average_precision_at_k(model_iir, train_sparse_user_item, valid_sparse_user_item, K=5, show_progress=True, num_threads=1)
print(map_k)
scores_df.at[model_name,'map@k'] = map_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:08<00:00, 9699.66it/s]

0.0002331963631507901





#### Regardons le nDCG@k

In [62]:
ndcg_k = evaluation.ndcg_at_k(model_iir, train_sparse_user_item, valid_sparse_user_item, K=5, show_progress=True, num_threads=1)
print(ndcg_k)
scores_df.at[model_name,'ndcg@k'] = ndcg_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:09<00:00, 8868.24it/s]

0.000389925374540457





#### Comparons

In [63]:
scores_df

Unnamed: 0_level_0,mean_cosine_similarity,precision@k,map@k,ndcg@k,training_time
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AlternatingLeastSquares_with_BM25,0.546562,0.000158,5.7e-05,0.0001,218.431039
Content Based Filtering,0.479843,,,,0.0
AlternatingLeastSquares,0.549646,0.000137,5.9e-05,9.4e-05,250.276323
LogisticMatrixFactorization,0.515754,0.000331,0.000125,0.000216,26.603471
BayesianPersonalizedRanking,0.525987,9.5e-05,2.4e-05,5e-05,7.832134
ItemItemRecommender,0.509033,0.000557,0.000233,0.00039,0.311624


## 1.4 Modèle baseline <a class="anchor" id="models_baseline"></a> [⇪](#menu)

Qu'obtenons-nous en donnant des ID d'articles au hasard plutôt qu'en cherchant les plus proches ?

In [64]:
def reco_random(user_ids, mean_embeddings, reco_ref, reco_size):
    np.random.seed(random_seed)
    return list(np.random.randint(len(article_embedding), size=(len(user_ids), rec_size)))

#### Pour tout le jeu de données

In [65]:
MCSbl, MCSbl_df = get_mean_cosine_similarity(data_valid, valid_sparse_user_item, reco_random, 5)

display(MCSbl_df.head(), MCSbl_df.shape)
print(f"\nmean_cosine_similarity: {MCSbl:.2f} (sachant que la cosine similarity va de 1 à -1)")

Unnamed: 0_level_0,article_ids,read_mean_embedding,reco_article_ids,reco_mean_embedding,cosine
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,"[225010, 69353, 161872, 205845, 57748, 157815,...","[-0.024590481, -0.9645934, -0.06022447, -0.023...","[305711, 117952, 152315, 358083, 359783]","[-0.15132096, -0.956843, 0.125719, -0.27082413...",0.557499
7,"[199474, 87223, 352979, 284470, 36162, 156279]","[-0.11400774, -0.96468645, -0.32728586, -0.201...","[304137, 122579, 86293, 211543, 212038]","[-0.4073767, -0.959381, -0.1800201, -0.0457417...",0.672065
8,"[331116, 96141, 234481]","[-0.32962552, -0.97033435, 0.20547153, -0.0870...","[310744, 170584, 314764, 80186, 17089]","[-0.4528594, -0.96688926, 0.4099636, -0.472056...",0.585191
10,"[196588, 193449, 195689, 65991, 100931, 58556,...","[-0.23681411, -0.96241695, -0.08872973, -0.346...","[150055, 220760, 363345, 255653, 82457]","[-0.3074916, -0.96819973, 0.09544015, -0.04938...",0.642696
11,"[208582, 96877, 195177, 205824]","[-0.15431535, -0.9709704, 0.069619074, 0.13036...","[329843, 332752, 7877, 346110, 73135]","[0.16090424, -0.96745205, -0.013877422, -0.109...",0.418073


(46638, 5)


mean_cosine_similarity: 0.46 (sachant que la cosine similarity va de 1 à -1)


#### Pour les 1000 premiers utilisateurs *(car j'ai du mal à calculer le score sur l'ensemble du jeu de données avec le Content Based Filtering)*

In [66]:
MCSbl, MCSbl_df = get_mean_cosine_similarity(data_valid, valid_sparse_user_item, reco_random, 5, 1000)
print(f"\nmean_cosine_similarity: {MCSbl:.2f} (sachant que la cosine similarity va de 1 à -1)")

model_name = "Content Based Filtering - Random"
scores_df.at[model_name,'mean_cosine_similarity'] = MCSbl
scores_df.at[model_name,'training_time'] = 0


mean_cosine_similarity: 0.48 (sachant que la cosine similarity va de 1 à -1)


#### Vérifions la distribution des conine_similarity calculés

In [67]:
MCSbl_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cosine,1000.0,0.482212,0.13109,0.04751,0.394346,0.495873,0.57634,0.826292


## 1.4 Revue des scores <a class="anchor" id="models_scores"></a> [⇪](#menu)

In [68]:
scores_df

Unnamed: 0_level_0,mean_cosine_similarity,precision@k,map@k,ndcg@k,training_time
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AlternatingLeastSquares_with_BM25,0.546562,0.000158,5.7e-05,0.0001,218.431039
Content Based Filtering,0.479843,,,,0.0
AlternatingLeastSquares,0.549646,0.000137,5.9e-05,9.4e-05,250.276323
LogisticMatrixFactorization,0.515754,0.000331,0.000125,0.000216,26.603471
BayesianPersonalizedRanking,0.525987,9.5e-05,2.4e-05,5e-05,7.832134
ItemItemRecommender,0.509033,0.000557,0.000233,0.00039,0.311624
Content Based Filtering - Random,0.482212,,,,0.0
