In [1]:
import math
import time
import pickle 
import random
import joblib
import pathlib

import pandas as pd
import numpy as np
from numpy.linalg import norm

import scipy.sparse as sparse
from sklearn.metrics.pairwise import cosine_similarity

import implicit
from implicit.nearest_neighbours import bm25_weight
from implicit import evaluation

import ml_metrics

random_seed = 0
np.random.seed(random_seed)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
scores_df = pd.DataFrame([], columns=['model_name', 'mean_cosine_similarity', 'precision@k', 'map@k', 'ndcg@k', 'training_time'])
scores_df.set_index('model_name', inplace=True)
scores_df.head()

Unnamed: 0_level_0,mean_cosine_similarity,precision@k,map@k,ndcg@k,training_time
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


### Chargeons les jeux de données `training` et `validation`

In [3]:
data_train = pd.read_csv('data/data_train.csv')
display(data_train.head(3), data_train.shape)

Unnamed: 0,user_id,article_id,category_id,score
0,59,234853,375,-0.328045
1,79,159359,281,-0.327253
2,154,96663,209,-0.363316


(1577295, 4)

In [4]:
data_valid = pd.read_csv('data/data_valid.csv')
display(data_valid.head(3), data_valid.shape)

Unnamed: 0,user_id,article_id,category_id,score
0,279777,96210,209,-0.387729
1,29634,284773,412,-0.209496
2,55,162605,281,0.213075


(241105, 4)

### Préparons une sparse matrix pour entrainer nos algorithmes de collaborative filtering

In [5]:
# --- Train ---
data_train['user_cat_code'] = data_train['user_id'].astype('category').cat.codes
data_train['article_cat_code'] = data_train['article_id'].astype('category').cat.codes

# train_sparse_item_user = sparse.csr_matrix((data_train['score'].astype(float), (data_train['article_cat_code'], data_train['user_cat_code'])))
train_sparse_user_item = sparse.csr_matrix((data_train['score'].astype(float), (data_train['user_cat_code'], data_train['article_cat_code'])))
display(train_sparse_user_item.shape)

# --- Validation ---
data_valid['user_cat_code'] = data_valid['user_id'].astype('category').cat.codes
data_valid['article_cat_code'] = data_valid['article_id'].astype('category').cat.codes

# valid_sparse_item_user = sparse.csr_matrix((data_valid['score'].astype(float), (data_valid['article_cat_code'], data_valid['user_cat_code'])))
valid_sparse_user_item = sparse.csr_matrix((data_valid['score'].astype(float), (data_valid['user_cat_code'], data_valid['article_cat_code'])))

display(valid_sparse_user_item.shape)

(297141, 28002)

(84041, 7576)

#### Et préparons une fonction lookup pour retrouver les article_id à partir des article_cat_code...

In [6]:
article_lookup = pd.DataFrame(data_train.groupby('article_cat_code')['article_id'].apply(lambda x: list(x)[0])).to_dict()

def lookup_articles(x):
    try:
        return article_lookup['article_id'][x]
    except Exception:
        return -1

#### et les user_id à partir des user_cat_code...

In [7]:
user_lookup = pd.DataFrame(data_train.groupby('user_cat_code')['user_id'].apply(lambda x: list(x)[0])).to_dict()

def lookup_users(x):
    try:
        return user_lookup['user_id'][x]
    except Exception:
        return -1

### Chargons les embeddings

In [8]:
file = open('data/news-portal-user-interactions-by-globocom/articles_embeddings.pickle',"rb")
article_embedding = pickle.load(file)

In [9]:
display(article_embedding[:5], article_embedding.shape)

array([[-0.16118301, -0.95723313, -0.13794445, ..., -0.231686  ,
         0.5974159 ,  0.40962312],
       [-0.52321565, -0.974058  ,  0.73860806, ...,  0.18282819,
         0.39708954, -0.83436364],
       [-0.61961854, -0.9729604 , -0.20736018, ..., -0.44758022,
         0.8059317 , -0.28528407],
       [-0.7408434 , -0.97574896,  0.39169782, ..., -0.5378381 ,
         0.24354108, -0.8853287 ],
       [-0.2790515 , -0.97231525,  0.68537366, ..., -0.42406067,
         0.18548405, -0.5802922 ]], dtype=float32)

(364047, 250)

## Calculons la cosine_similarity entre le data_train et le data_valid *(pour chaque utilisateur, puis la moyenne)*

In [10]:
def get_train_valid_similarity( data_t, data_v ):

    select = pd.DataFrame()
    
    # --- select the validation user_ids
    user_idx = data_v.user_id.unique()
    
    # --- for each user, get the ids of the articles he/she has read in the TRAINING set et le VALIDATION set
    select['article_ids_train'] = data_t[data_t.user_id.isin(user_idx)].groupby('user_id')['article_id'].apply(list)
    select['article_ids_valid'] = data_v[data_v.user_id.isin(user_idx)].groupby('user_id')['article_id'].apply(list)
    
    # --- remove user_id with only 1 article either the TRAINING or the VALIDATION set
    select = select[select['article_ids_train'].map(len) > 1]
    select = select[select['article_ids_valid'].map(len) > 1]
    select.dropna(axis=0, inplace=True)
    
    # --- for each user, compute the mean embedding vectors of the articles he/she has read
    select['mean_embedding_train'] = select['article_ids_train'].apply(lambda x : article_embedding[x].mean(axis=0))
    select['mean_embedding_valid'] = select['article_ids_valid'].apply(lambda x : article_embedding[x].mean(axis=0))

    # --- for each user, compute the cosine similarity between the user's mean_embeddings in the TRAINING set and in the VALIDATION set
    select['cosine'] = select.apply(lambda x: cosine_similarity(x['mean_embedding_train'].reshape(1, -1), x['mean_embedding_valid'].reshape(1, -1))[0][0], axis=1)

    # --- Compute & return overall mean cosine similarity
    return select.cosine.mean(), select

cosine_train_valid, cosine_train_valid_df = get_train_valid_similarity(data_train, data_valid)

print(f"\nmean_cosine_similarity (de 1 à -1) entre 'viewed_in_train' & 'viewed_in_valid' = {cosine_train_valid:.2f}")


mean_cosine_similarity (de 1 à -1) entre 'viewed_in_train' & 'viewed_in_valid' = 0.64


In [11]:
cosine_train_valid_df.head(3)

Unnamed: 0_level_0,article_ids_train,article_ids_valid,mean_embedding_train,mean_embedding_valid,cosine
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,"[168868, 156543, 284664, 156672, 59929, 160974...","[225010, 69353, 161872, 205845, 57748, 157815,...","[-0.17539436, -0.9664814, -0.0456632, -0.30515...","[-0.024590481, -0.9645934, -0.06022447, -0.023...",0.917003
7,"[235840, 156624, 123757, 64409, 336221, 183176]","[199474, 87223, 352979, 284470, 36162, 156279]","[-0.32671222, -0.97076577, -0.31606814, -0.361...","[-0.11400774, -0.96468645, -0.32728586, -0.201...",0.739961
8,"[332114, 284847, 272660, 273464, 313504, 23513...","[331116, 96141, 234481]","[-0.47689843, -0.9678452, -0.2180024, -0.47295...","[-0.32962552, -0.97033435, 0.20547153, -0.0870...",0.741852


## Calculons ce même score de cosine similarity si l'on donne des articles au hasard aux utilisateurs *(baseline)*

In [12]:
def get_mean_cosine_similarity(data_ref, train_df, valid_df, reco_model, reco_size = 5, top_users=None):

    # --- for each user, get the ids of the articles he/she has read
    select = data_ref.groupby('user_id')['article_id'].apply(list).reset_index(name='article_ids')
    
    # --- remove user if he/she has only one article in the data_ref (probably data_valid)
    select = select[select.article_ids.map(len) > 1]
    
    # --- crop selection if specified
    if top_users is not None:
        select = select[:top_users]
        
    # print("users_ids:", select.user_id)

    # --- for each user, compute the mean embedding vectors of the articles he/she has read
    select['read_mean_embedding'] = select.apply(lambda x : article_embedding[np.array(x[1])].mean(axis=0), axis=1)

    # --- for each user, make recommendations
    select['reco_article_ids'] = reco_model(select.index, select.read_mean_embedding, train_df, valid_df, reco_size)

    # --- for each user, compute the mean embedding vectors of the recommended articles
    select['reco_mean_embedding'] = select['reco_article_ids'].apply(lambda x : article_embedding[np.array(x)].mean(axis=0))

    # --- for each user, compute the cosine similarity between the read_mean_embedding and the pred_mean_embedding
    select['cosine'] = select.apply(lambda x: cosine_similarity(x['read_mean_embedding'].reshape(1, -1), x['reco_mean_embedding'].reshape(1, -1))[0][0], axis=1)

    # --- reset the index column
    select = select.set_index('user_id')

    # --- Compute & return overall mean cosine similarity
    return select.cosine.mean(), select

In [13]:
def reco_random(user_ids, mean_embeddings, train_df, valid_df, reco_size):
    # --- ensure to get the same random matrix for reproductibility
    np.random.seed(random_seed)
    
    # --- set 5 random article_id for each users
    return list(np.random.randint(len(article_embedding), size=(len(user_ids), reco_size)))

#### Pour tout le jeu de données

In [14]:
MCS, MCS_df = get_mean_cosine_similarity(data_valid, train_sparse_user_item, valid_sparse_user_item, reco_random, 5)

display(MCS_df.head(), MCS_df.shape)
print(f"\nmean_cosine_similarity: {MCS:.2f} (sachant que la cosine similarity va de 1 à -1)")

Unnamed: 0_level_0,article_ids,read_mean_embedding,reco_article_ids,reco_mean_embedding,cosine
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,"[225010, 69353, 161872, 205845, 57748, 157815,...","[-0.024590481, -0.9645934, -0.06022447, -0.023...","[305711, 117952, 152315, 358083, 359783]","[-0.15132096, -0.956843, 0.125719, -0.27082413...",0.557499
7,"[199474, 87223, 352979, 284470, 36162, 156279]","[-0.11400774, -0.96468645, -0.32728586, -0.201...","[304137, 122579, 86293, 211543, 212038]","[-0.4073767, -0.959381, -0.1800201, -0.0457417...",0.672065
8,"[331116, 96141, 234481]","[-0.32962552, -0.97033435, 0.20547153, -0.0870...","[310744, 170584, 314764, 80186, 17089]","[-0.4528594, -0.96688926, 0.4099636, -0.472056...",0.585191
10,"[196588, 193449, 195689, 65991, 100931, 58556,...","[-0.23681411, -0.96241695, -0.08872973, -0.346...","[150055, 220760, 363345, 255653, 82457]","[-0.3074916, -0.96819973, 0.09544015, -0.04938...",0.642696
11,"[208582, 96877, 195177, 205824]","[-0.15431535, -0.9709704, 0.069619074, 0.13036...","[329843, 332752, 7877, 346110, 73135]","[0.16090424, -0.96745205, -0.013877422, -0.109...",0.418073


(46638, 5)


mean_cosine_similarity: 0.46 (sachant que la cosine similarity va de 1 à -1)


#### Pour les 1000 premiers utilisateurs *(car j'ai du mal à calculer le score sur l'ensemble du jeu de données avec le Content Based Filtering)*

In [15]:
MCS, MCS_df = get_mean_cosine_similarity(data_valid, train_sparse_user_item, valid_sparse_user_item, reco_random, 5, 1000)
print(f"\nmean_cosine_similarity: {MCS:.2f} (sachant que la cosine similarity va de 1 à -1)")

model_name = "Content Based Filtering - Random"
scores_df.at[model_name,'mean_cosine_similarity'] = MCS
scores_df.at[model_name,'training_time'] = 0


mean_cosine_similarity: 0.48 (sachant que la cosine similarity va de 1 à -1)


#### Vérifions la distribution des cosine_similarity calculés

In [16]:
MCS_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
cosine,1000.0,0.482212,0.13109,0.04751,0.394346,0.495873,0.57634,0.826292


# 1. Candidate generation

## 1.1 Collaborative Filtering

### Utilisons un système permettant de rééquilibrer les notes implicites *(pour éviter de donner trop d'importance aux articles qui ont un très gros ratio `temps de lecture` / `nombre de mots`)*

In [17]:
# train_sparse_item_user_bm25 = bm25_weight(sparse_item_user, K1=100, B=0.9).tocsr()
train_sparse_user_item_bm25 = bm25_weight(train_sparse_user_item, K1=100, B=0.9).tocsr() # Implicit veut des matrices [user x item]
valid_sparse_user_item_bm25 = bm25_weight(valid_sparse_user_item, K1=100, B=0.9).tocsr() # Implicit veut des matrices [user x item]

display(train_sparse_user_item_bm25.shape)
display(valid_sparse_user_item_bm25.shape)

(297141, 28002)

(84041, 7576)

### Entrainons un premier modèle pour calculer les embeddings utilisateurs et faire des recommandations

In [18]:
model_bm25 = implicit.als.AlternatingLeastSquares(
    factors=32, 
    regularization=0.05, 
    iterations=50,
    alpha=40
)

model_name = f"{model_bm25.__class__.__name__}_with_BM25"
t0 = time.perf_counter()
model_bm25.fit(train_sparse_user_item_bm25)
scores_df.at[model_name,'training_time'] = time.perf_counter() - t0

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [01:17<00:00,  1.55s/it]


### Testons une `recommandation sur la base d'un ou plusieurs utilisateurs`

In [19]:
# Make recommendations for the first 10 users in the dataset
userids = [59, 1024] # liste d'ID utilisateurs
rec_size = 5

codes, scores = model_bm25.recommend(userids, valid_sparse_user_item_bm25[userids], N=rec_size, filter_already_liked_items=True) 

for i, user_id in enumerate(userids):
    print(f"\n --- Liste d'articles candidats pour l'utilisateur {user_id} --- \n")
    
    for code, score in zip(codes[i], scores[i]):
        idx = data_train[data_train.article_cat_code==code]['article_id'].iloc[0]
        print(f"catCode: {code:10} \t article_id: {idx:10} \t score: {score:.2f}")


 --- Liste d'articles candidats pour l'utilisateur 59 --- 

catCode:      19529 	 article_id:     242815 	 score: 0.00
catCode:      14993 	 article_id:     184332 	 score: 0.00
catCode:      17246 	 article_id:     214207 	 score: 0.00
catCode:      23071 	 article_id:     289386 	 score: 0.00
catCode:       6939 	 article_id:      81903 	 score: 0.00

 --- Liste d'articles candidats pour l'utilisateur 1024 --- 

catCode:      14993 	 article_id:     184332 	 score: 0.00
catCode:      14498 	 article_id:     177475 	 score: 0.00
catCode:      19801 	 article_id:     250102 	 score: 0.00
catCode:      15478 	 article_id:     193452 	 score: 0.00
catCode:        519 	 article_id:       7908 	 score: 0.00


### Testons une `recommandation sur la base d'un article` *(ce n'est pas le but d'un Collaborative Filtering, mais on peut le faire alors autant l'essayer)*

In [20]:
article_id = 162605
article_code = data_train[data_train.article_id == article_id]['article_cat_code'].iloc[0]
rec_size = 5

# Get similar items.
codes, scores = model_bm25.similar_items(article_code, N=rec_size , filter_items=[article_code])

print(f"\n --- Liste d'articles candidats sur la base de l'article {article_id} --- \n")
for code, score in zip(codes, scores):
    idx = data_train[data_train.article_cat_code==code]['article_id'].iloc[0]
    print(f"catCode: {code:10} \t article_id: {idx:10} \t score: {score:.2f}")


 --- Liste d'articles candidats sur la base de l'article 162605 --- 

catCode:      19604 	 article_id:     244192 	 score: 0.70
catCode:       8807 	 article_id:     102859 	 score: 0.63
catCode:      18409 	 article_id:     231890 	 score: 0.63
catCode:      20058 	 article_id:     254419 	 score: 0.61
catCode:      19229 	 article_id:     237585 	 score: 0.61


### Evaluons le modèle

> **Nous devons nous rappeler que la recommandation n'est pas une prédiction.**<br>
> S'appuyer sur des métriques ML pour déterminer la performance d'un système de recommandation n'est pas suffisant.<br>
> Seul **le retour des utilisateurs apporte des résultats valables et c'est pourquoi les tests A/B devraient toujours être priviligiés**.

- Dans la mesure ou notre jeu de données **ne dispose pas de scores explicites**, il ne parrait pas souhaitable d'utiliser des métrique du type `MAE` ou `RMSE`.
- Dans la mesure ou l'on **ne cherche pas particulièrement à obtenir un ordre précis**, il ne parrait pas souhaitable d'utiliser des métriques de ranking comme le `MAP@K` ou le `nDCCG`.
- Nous pourrions donc nous tourner vers la `Precision@k`, le `Recall@K` et donc le `F1@k`, mais il est probable que ce ne soit pas très représentatif.

#### Regardons la precision@k

In [21]:
precision_k = evaluation.precision_at_k(model_bm25, train_sparse_user_item_bm25, valid_sparse_user_item_bm25, K=5, show_progress=True, num_threads=1)
print(precision_k)
scores_df.at[model_name,'precision@k'] = precision_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:24<00:00, 3402.37it/s]

0.0007615986217691148





> Le probleme c'est même si c'est l'une des métriques disponible les plus adaptée, elle reste peu adaptée à notre problème...<br>
> Ici `Precision = (# of top k recommendations that are relevant)/(# of items that are recommended)`<br>
> Mais malgré un nombre d'article assez large, on ne recommande que 5 articles et les utilisateurs ont un historique assez faible dans notre jeu de données. Donc les chances de recommander un article parmi 5 qui a effectivement été lu ensuite par l'utilisateur sont vraiment faible.

#### Regardons le MAP@k

In [22]:
map_k = evaluation.mean_average_precision_at_k(model_bm25, train_sparse_user_item_bm25, valid_sparse_user_item_bm25, K=5, show_progress=True, num_threads=1)
print(map_k)
scores_df.at[model_name,'map@k'] = map_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:24<00:00, 3441.00it/s]

0.0003265866263688755





#### Regardons le nDCG@k

In [23]:
ndcg_k = evaluation.ndcg_at_k(model_bm25, train_sparse_user_item_bm25, valid_sparse_user_item_bm25, K=5, show_progress=True, num_threads=1)
print(ndcg_k)
scores_df.at[model_name,'ndcg@k'] = ndcg_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:22<00:00, 3693.59it/s]

0.0005376861866706927





### Construisons une métrique sur mesure

Pour avoir une idée une métrique utilsable pour comparer nos différents modèles, nous pourrions comparer l'embedding moyen des articles lus APRÈS *(donc les actions contenues dans data_valid)* avec l'embedding moyen des article recommandés ET avec l'embedding moyen des articles recommandés.

#### Calculons la cosine similarity moyenne sur le jeu de validation

In [24]:
def reco_collaborative_filtering_bm25(user_ids, mean_embeddings, train_df, valid_df, reco_size):
    
    apply_numpy2 = lambda x : list(map(lookup_articles,x))
    apply_numpy = lambda x : list(map(apply_numpy2,x))
    
    user_codes = [lookup_users(x) for x in user_ids]
    
    reco_codes, reco_score = model_bm25.recommend(user_codes, train_df[user_codes], N=reco_size, filter_already_liked_items=True) 
    return apply_numpy(reco_codes)

#### Pour tout le jeu de données

In [25]:
MCS, MCS_df = get_mean_cosine_similarity(data_valid, train_sparse_user_item_bm25, valid_sparse_user_item_bm25, reco_collaborative_filtering_bm25, 5)

display(MCS_df.head(), MCS_df.shape)
print(f"\nmean_cosine_similarity: {MCS:.2f} (sachant que la cosine similarity va de 1 à -1)")

Unnamed: 0_level_0,article_ids,read_mean_embedding,reco_article_ids,reco_mean_embedding,cosine
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,"[225010, 69353, 161872, 205845, 57748, 157815,...","[-0.024590481, -0.9645934, -0.06022447, -0.023...","[327691, 227622, 73183, 227453, 286259]","[-0.42431584, -0.9727848, 0.39563924, -0.09649...",0.59579
7,"[199474, 87223, 352979, 284470, 36162, 156279]","[-0.11400774, -0.96468645, -0.32728586, -0.201...","[282462, 242815, 100358, 340854, 309540]","[-0.497061, -0.9597889, 0.35031134, -0.4517836...",0.595449
8,"[331116, 96141, 234481]","[-0.32962552, -0.97033435, 0.20547153, -0.0870...","[227453, 19208, 204888, 192062, 153842]","[0.16150436, -0.9692475, 0.20166084, 0.1541895...",0.590344
10,"[196588, 193449, 195689, 65991, 100931, 58556,...","[-0.23681411, -0.96241695, -0.08872973, -0.346...","[140615, 128379, 254496, 117759, 313082]","[-0.3815019, -0.97024024, 0.5806314, -0.448627...",0.640554
11,"[208582, 96877, 195177, 205824]","[-0.15431535, -0.9709704, 0.069619074, 0.13036...","[68719, 213536, 99786, 34025, 230256]","[0.07979739, -0.96122915, 0.19083002, 0.061166...",0.454397


(46638, 5)


mean_cosine_similarity: 0.47 (sachant que la cosine similarity va de 1 à -1)


#### Pour les 1000 premiers utilisateurs *(car j'ai du mal à calculer le score sur l'ensemble du jeu de données avec le Content Based Filtering)*

In [26]:
MCS1, MCS1_df = get_mean_cosine_similarity(data_valid, train_sparse_user_item_bm25, valid_sparse_user_item_bm25, reco_collaborative_filtering_bm25, 5, 1000)
print(f"\nmean_cosine_similarity: {MCS1:.2f} (sachant que la cosine similarity va de 1 à -1)")
scores_df.at[model_name,'mean_cosine_similarity'] = MCS1


mean_cosine_similarity: 0.50 (sachant que la cosine similarity va de 1 à -1)


#### Comparons

In [27]:
scores_df

Unnamed: 0_level_0,mean_cosine_similarity,precision@k,map@k,ndcg@k,training_time
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Content Based Filtering - Random,0.482212,,,,0.0
AlternatingLeastSquares_with_BM25,0.499395,0.000762,0.000327,0.000538,77.47612


## 1.2 Content Based Filtering

articles_embeddings.pickle Pickle (Python 3) of a NumPy matrix containing the Article Content Embeddings (250-dimensional vectors), trained upon articles' text and metadata by the CHAMELEON's ACR module (see paper for details) for 364047 published articles.
P.s. The full text of news articles could not be provided due to license restrictions, but those embeddings can be used by Neural Networks to represent their content. See this paper for a t-SNE visualization of these embeddings, colored by category.

### Trouvons l'embedding moyen des articles lus par un utilisateur donnée

In [28]:
user_id = 20137
history_size = 5

def get_mean_vector(articles_idx):
    apply_numpy = lambda x : article_embedding[x].mean(axis=0)
    return apply_numpy(articles_idx)            

user_articles_idx = data_train[data_train.user_id == user_id]['article_id']
last_articles_idx = user_articles_idx.iloc[-history_size:].values #.sort_values('click_timestamp')
mean_vector = get_mean_vector(last_articles_idx)

print(f"Articles utilisés dans le mean embedding: {last_articles_idx}")

Articles utilisés dans le mean embedding: [288440 337441 202476 250043 284583]


### Calculons la similarité de cet embedding avec les embeddings des articles présents dans note fichier

In [29]:
def get_cosine(article_embedding, mean_vector, user_articles_idx=None):
    
    A = article_embedding.copy()
    B = mean_vector

    # --- Assurons nous de ne pas recommander les article déjà lus par cet utilisateur
    if user_articles_idx is not None:
        A[user_articles_idx] = -B # je donne aux articles lus un embedding inverse de celui que l'on cible
 
    # --- Calculons la cosine similarity entre l'embedding moyen de l'utilisateur et les articles connus
    return np.dot(A,B)/(norm(A, axis=1)*norm(B))

# --- Calculons la cosine similarity entre l'embedding moyen de l'utilisateur et les articles connus
cosine = get_cosine(article_embedding, mean_vector, user_articles_idx)
print("Cosine Similarity:", cosine, cosine.shape)

Cosine Similarity: [0.32582363 0.2783536  0.31348327 ... 0.38568467 0.2742523  0.42668363] (364047,)


### Recommandons 5 articles à l'utilisateur

In [30]:
def recommend_articles(cosine, reco_size=5):

    cos = pd.DataFrame(cosine, columns=['cosine_sim'])
    selection = cos.sort_values('cosine_sim', ascending=False)[:reco_size]
    selection.reset_index(inplace=True)
    selection.rename(columns={'index':'article_id'}, inplace=True)
    return selection

reco = recommend_articles(cosine, 5)
reco

Unnamed: 0,article_id,cosine_sim
0,284768,0.830968
1,345593,0.821919
2,285424,0.821205
3,345566,0.818335
4,283576,0.816523


### Comparons avec les articles consultés par cet utilisateur dans le validation_set

In [31]:
mean_vector_recommended = get_mean_vector(reco.article_id)

In [32]:
viewed = data_valid[data_valid.user_id == user_id]['article_id'].values
mean_vector_viewed = get_mean_vector(viewed)

#### Similarité entre les articles lus dans le `valid_set` et les articles prédis

In [33]:
def get_cosine_similarity(A, B):
    return np.dot(A,B)/(norm(A)*norm(B))

cosine_similarity_score = get_cosine_similarity(mean_vector_recommended, mean_vector_viewed)
print("Cosine Similarity:", cosine_similarity_score)

Cosine Similarity: 0.71754134


#### Similarité entre les articles lus dans le `train_set` et les articles lus dans le `valid_set`

In [34]:
cosine_similarity_score = get_cosine_similarity(mean_vector, mean_vector_viewed)
print("Cosine Similarity:", cosine_similarity_score)

Cosine Similarity: 0.7984939


#### Calculons la cosine similarity moyenne sur l'ensemble du jeu de validation *(⚠️ en fait une fraction car c'est beaucoup trop lent)*

In [35]:
cosine_pred_viewed = []
reco_size = 5

# --- get users of the validation set
user_idx = np.sort(data_valid.user_id.unique())

for user_id in user_idx[:1000]:
    #print(user_id)
    
    # --- Compute viewed mean_embedding
    viewed_train = data_train[data_train.user_id == user_id]['article_id']
    if len(viewed_train) < 1: continue
    mean_vector_viewed_train = get_mean_vector(viewed_train.values)
    
    # --- Compute viewed mean_embedding 
    viewed_valid = data_valid[data_valid.user_id == user_id]['article_id']
    if len(viewed_valid) < 1: continue
    mean_vector_viewed_valid = get_mean_vector(viewed_valid.values)
    
    # --- Compute cosine similarity the mean of the viewed articles and the rest
    A = article_embedding.copy()
    B = mean_vector_viewed_train
    A[viewed_train.values] = -B # Cancel already read articles
    cosine = cosine_similarity(A, B.reshape(1, -1))
    
    recommendations =  recommend_articles(cosine, reco_size)
    mean_vector_reco = get_mean_vector(recommendations.article_id)
    
    # --- Compute similarities
    cosine_similarity_pred_viewed = cosine_similarity(mean_vector_reco.reshape(1, -1), mean_vector_viewed_valid.reshape(1, -1))
    
    cosine_pred_viewed.append(cosine_similarity_pred_viewed)

In [36]:
print(f"\nmean_cosine_similarity (de 1 à -1) entre 'recommended' & 'viewed_in_valid' = {np.mean(cosine_pred_viewed):.2f} (que l'on compare donc avec les 0.55 du Collaborative Filtering ALS)")

model_name = "Content Based Filtering"
scores_df.at[model_name,'mean_cosine_similarity'] = np.mean(cosine_pred_viewed)
scores_df.at[model_name,'training_time'] = 0


mean_cosine_similarity (de 1 à -1) entre 'recommended' & 'viewed_in_valid' = 0.58 (que l'on compare donc avec les 0.55 du Collaborative Filtering ALS)


#### Comparons

In [37]:
scores_df

Unnamed: 0_level_0,mean_cosine_similarity,precision@k,map@k,ndcg@k,training_time
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Content Based Filtering - Random,0.482212,,,,0.0
AlternatingLeastSquares_with_BM25,0.499395,0.000762,0.000327,0.000538,77.47612
Content Based Filtering,0.576889,,,,0.0


> #### Si l'on se fie au score moyen de similarité entre les recommandations et ce qui a été effectivemet lu (dans le validation set), le `Collaborative filtering` semble plus précis.
> Mais ce n'est pas le seul élément à prendre en compte *(d'autant que cette mesure n'est pas très significative)*.

> `Collaborative Filtering`:
> * plus en accord avec ce qui est dans le validation set par les utilisateurs,
> * plus rapide,
> * limité aux articles visités.<br>

> `Content Based Filtering`:
> * plus lent *(du moins je n'ai pas réussi à le rendre rapide)*,
> * moins prévis *(mais je n'ai pu évaluer que les 100 premiers utlisateurs)*
> * peut recommander n'importe quel article dont on a l'embedding, y compris ceux jamais visités par un utilisateur.

> Dans les deux cas, les modèles employés ne permettent pas de prendre en compte des features supplémentaires comme par exemple la catégorie de l'article etc. *(Il faudrait un Collaborative Filtering en DNN)*

> L'idéal est donc probablement d'utiliser une combinaison des deux approches; le Collaboravie filtering pour être le plus proche possible des intérêts de l'utilisateur, et le Content Based Filtering pour apporter de la diversité sans pour autant trop s'éloigner des sujets de prédilection de l'utilisateur.

## 1.3 Essayons quelques variantes

### AlternatingLeastSquares sans BM25

#### Entrainons un premier modèle pour calculer les embeddings utilisateurs et faire des recommandations

In [38]:
model_als = implicit.als.AlternatingLeastSquares(
    factors=32, 
    regularization=0.05, 
    iterations=50,
    alpha=40
)

model_name = model_als.__class__.__name__
t0 = time.perf_counter()
model_als.fit(train_sparse_user_item)
scores_df.at[model_name,'training_time'] = time.perf_counter() - t0

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [02:15<00:00,  2.71s/it]


#### Calculons la cosine similarity moyenne sur le jeu de validation

In [39]:
def reco_collaborative_filtering_als(user_ids, mean_embeddings, train_df, valid_df, reco_size):
    
    apply_numpy2 = lambda x : list(map(lookup_articles,x))
    apply_numpy = lambda x : list(map(apply_numpy2,x))
    
    user_codes = [lookup_users(x) for x in user_ids]

    reco_codes, reco_score = model_als.recommend(user_codes, train_df[user_codes], N=reco_size, filter_already_liked_items=True) 
    return apply_numpy(reco_codes)

Pour tout le jeu de données

In [40]:
MCS1als, MCS1als_df = get_mean_cosine_similarity(data_valid, train_sparse_user_item, valid_sparse_user_item, reco_collaborative_filtering_als, 5)
print(f"\nmean_cosine_similarity: {MCS1als:.2f} (sachant que la cosine similarity va de 1 à -1)")


mean_cosine_similarity: 0.53 (sachant que la cosine similarity va de 1 à -1)


Pour les 1000 premiers utilisateurs *(car j'ai du mal à calculer le score sur l'ensemble du jeu de données avec le Content Based Filtering)*

In [41]:
MCS1als, MCS1als_df = get_mean_cosine_similarity(data_valid, train_sparse_user_item, valid_sparse_user_item, reco_collaborative_filtering_als, 5, 1000)
print(f"\nmean_cosine_similarity: {MCS1als:.2f} (sachant que la cosine similarity va de 1 à -1)")
scores_df.at[model_name,'mean_cosine_similarity'] = MCS1als


mean_cosine_similarity: 0.57 (sachant que la cosine similarity va de 1 à -1)


#### Regardons la precision@k

In [42]:
precision_k = evaluation.precision_at_k(model_als, train_sparse_user_item, valid_sparse_user_item, K=5, show_progress=True, num_threads=1)
print(precision_k)
scores_df.at[model_name,'precision@k'] = precision_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:23<00:00, 3649.63it/s]

0.0005409976416704747





#### Regardons le MAP@k

In [43]:
map_k = evaluation.mean_average_precision_at_k(model_als, train_sparse_user_item, valid_sparse_user_item, K=5, show_progress=True, num_threads=1)
print(map_k)
scores_df.at[model_name,'map@k'] = map_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:23<00:00, 3587.86it/s]

0.00016936177976622518





#### Regardons le nDCG@k

In [44]:
ndcg_k = evaluation.ndcg_at_k(model_als, train_sparse_user_item, valid_sparse_user_item, K=5, show_progress=True, num_threads=1)
print(ndcg_k)
scores_df.at[model_name,'ndcg@k'] = ndcg_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:23<00:00, 3579.92it/s]

0.00031228233986141987





#### Comparons

In [45]:
scores_df

Unnamed: 0_level_0,mean_cosine_similarity,precision@k,map@k,ndcg@k,training_time
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Content Based Filtering - Random,0.482212,,,,0.0
AlternatingLeastSquares_with_BM25,0.499395,0.000762,0.000327,0.000538,77.47612
Content Based Filtering,0.576889,,,,0.0
AlternatingLeastSquares,0.566251,0.000541,0.000169,0.000312,135.519352


### Logistic Matrix Factorization

#### Entrainons un premier modèle pour calculer les embeddings utilisateurs et faire des recommandations

In [46]:
model_lmf = implicit.cpu.lmf.LogisticMatrixFactorization(
    factors=32,
    learning_rate=0.05,
    regularization=0.05, 
    iterations=50,
)

model_name = model_lmf.__class__.__name__
t0 = time.perf_counter()
model_lmf.fit(train_sparse_user_item)
scores_df.at[model_name,'training_time'] = time.perf_counter() - t0

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:25<00:00,  2.00it/s]


#### Calculons la cosine similarity moyenne sur le jeu de validation

In [47]:
def reco_collaborative_filtering_lmf(user_ids, mean_embeddings, train_df, valid_df, reco_size):
    
    apply_numpy2 = lambda x : list(map(lookup_articles,x))
    apply_numpy = lambda x : list(map(apply_numpy2,x))
    
    user_codes = [lookup_users(x) for x in user_ids]
    
    reco_codes, reco_score = model_lmf.recommend(user_codes, train_df[user_codes], N=reco_size, filter_already_liked_items=True) 
    return apply_numpy(reco_codes)

Pour tout le jeu de données

In [48]:
MCS1lmf, MCS1lmf_df = get_mean_cosine_similarity(data_valid, train_sparse_user_item, valid_sparse_user_item, reco_collaborative_filtering_lmf, 5)
print(f"\nmean_cosine_similarity: {MCS1lmf:.2f} (sachant que la cosine similarity va de 1 à -1)")


mean_cosine_similarity: 0.52 (sachant que la cosine similarity va de 1 à -1)


Pour les 1000 premiers utilisateurs *(car j'ai du mal à calculer le score sur l'ensemble du jeu de données avec le Content Based Filtering)*

In [49]:
MCS1lmf, MCS1lmf_df = get_mean_cosine_similarity(data_valid, train_sparse_user_item, valid_sparse_user_item, reco_collaborative_filtering_lmf, 5, 1000)
print(f"\nmean_cosine_similarity: {MCS1lmf:.2f} (sachant que la cosine similarity va de 1 à -1)")
scores_df.at[model_name,'mean_cosine_similarity'] = MCS1lmf


mean_cosine_similarity: 0.55 (sachant que la cosine similarity va de 1 à -1)


#### Regardons la precision@k

In [50]:
precision_k = evaluation.precision_at_k(model_lmf, train_sparse_user_item, valid_sparse_user_item, K=5, show_progress=True, num_threads=1)
print(precision_k)
scores_df.at[model_name,'precision@k'] = precision_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:23<00:00, 3629.25it/s]

0.0001733293415060744





#### Regardons le MAP@k

In [51]:
map_k = evaluation.mean_average_precision_at_k(model_lmf, train_sparse_user_item, valid_sparse_user_item, K=5, show_progress=True, num_threads=1)
print(map_k)
scores_df.at[model_name,'map@k'] = map_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:23<00:00, 3646.08it/s]

5.39088725212165e-05





#### Regardons le nDCG@k

In [52]:
ndcg_k = evaluation.ndcg_at_k(model_lmf, train_sparse_user_item, valid_sparse_user_item, K=5, show_progress=True, num_threads=1)
print(ndcg_k)
scores_df.at[model_name,'ndcg@k'] = ndcg_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:23<00:00, 3562.27it/s]

0.00010117728775452379





#### Comparons

In [53]:
scores_df

Unnamed: 0_level_0,mean_cosine_similarity,precision@k,map@k,ndcg@k,training_time
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Content Based Filtering - Random,0.482212,,,,0.0
AlternatingLeastSquares_with_BM25,0.499395,0.000762,0.000327,0.000538,77.47612
Content Based Filtering,0.576889,,,,0.0
AlternatingLeastSquares,0.566251,0.000541,0.000169,0.000312,135.519352
LogisticMatrixFactorization,0.552624,0.000173,5.4e-05,0.000101,25.511894


### Bayesian Personalized Ranking

#### Entrainons un premier modèle pour calculer les embeddings utilisateurs et faire des recommandations

In [54]:
model_bpr = implicit.cpu.bpr.BayesianPersonalizedRanking(
    factors=32,
    learning_rate=0.05,
    regularization=0.05, 
    iterations=50,
)

model_name = model_bpr.__class__.__name__
t0 = time.perf_counter()
model_bpr.fit(train_sparse_user_item)
scores_df.at[model_name,'training_time'] = time.perf_counter() - t0

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:07<00:00,  6.43it/s, train_auc=88.08%, skipped=4.27%]


#### Calculons la cosine similarity moyenne sur le jeu de validation

In [55]:
def reco_collaborative_filtering_bpr(user_ids, mean_embeddings, train_df, valid_df, reco_size):
    
    apply_numpy2 = lambda x : list(map(lookup_articles,x))
    apply_numpy = lambda x : list(map(apply_numpy2,x))
    
    user_codes = [lookup_users(x) for x in user_ids]
    
    reco_codes, reco_score = model_bpr.recommend(user_codes, train_df[user_codes], N=reco_size, filter_already_liked_items=True) 
    return apply_numpy(reco_codes)

Pour tout le jeu de données

In [56]:
MCS1bpr, MCS1bpr_df = get_mean_cosine_similarity(data_valid, train_sparse_user_item, valid_sparse_user_item, reco_collaborative_filtering_bpr, 5)
print(f"\nmean_cosine_similarity: {MCS1bpr:.2f} (sachant que la cosine similarity va de 1 à -1)")


mean_cosine_similarity: 0.49 (sachant que la cosine similarity va de 1 à -1)


Pour les 1000 premiers utilisateurs *(car j'ai du mal à calculer le score sur l'ensemble du jeu de données avec le Content Based Filtering)*

In [57]:
MCS1bpr, MCS1bpr_df = get_mean_cosine_similarity(data_valid, train_sparse_user_item, valid_sparse_user_item, reco_collaborative_filtering_bpr, 5, 1000)
print(f"\nmean_cosine_similarity: {MCS1bpr:.2f} (sachant que la cosine similarity va de 1 à -1)")
scores_df.at[model_name,'mean_cosine_similarity'] = MCS1bpr


mean_cosine_similarity: 0.52 (sachant que la cosine similarity va de 1 à -1)


#### Regardons la precision@k

In [58]:
precision_k = evaluation.precision_at_k(model_bpr, train_sparse_user_item, valid_sparse_user_item, K=5, show_progress=True, num_threads=1)
print(precision_k)
scores_df.at[model_name,'precision@k'] = precision_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:22<00:00, 3719.04it/s]

0.00011030049004932008





#### Regardons le MAP@k

In [59]:
map_k = evaluation.mean_average_precision_at_k(model_bpr, train_sparse_user_item, valid_sparse_user_item, K=5, show_progress=True, num_threads=1)
print(map_k)
scores_df.at[model_name,'map@k'] = map_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:24<00:00, 3500.40it/s]

4.323947702774704e-05





#### Regardons le nDCG@k

In [60]:
ndcg_k = evaluation.ndcg_at_k(model_bpr, train_sparse_user_item, valid_sparse_user_item, K=5, show_progress=True, num_threads=1)
print(ndcg_k)
scores_df.at[model_name,'ndcg@k'] = ndcg_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:24<00:00, 3487.60it/s]

7.25009254368096e-05





#### Comparons

In [61]:
scores_df

Unnamed: 0_level_0,mean_cosine_similarity,precision@k,map@k,ndcg@k,training_time
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Content Based Filtering - Random,0.482212,,,,0.0
AlternatingLeastSquares_with_BM25,0.499395,0.000762,0.000327,0.000538,77.47612
Content Based Filtering,0.576889,,,,0.0
AlternatingLeastSquares,0.566251,0.000541,0.000169,0.000312,135.519352
LogisticMatrixFactorization,0.552624,0.000173,5.4e-05,0.000101,25.511894
BayesianPersonalizedRanking,0.523876,0.00011,4.3e-05,7.3e-05,7.977837


### Item Item Recommender

In [62]:
model_name = "ItemItemRecommender"

#### Entrainons un premier modèle pour calculer les embeddings utilisateurs et faire des recommandations

In [63]:
model_iir = implicit.nearest_neighbours.ItemItemRecommender()

model_name = model_iir.__class__.__name__
t0 = time.perf_counter()
model_iir.fit(train_sparse_user_item)
scores_df.at[model_name,'training_time'] = time.perf_counter() - t0

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 28002/28002 [00:00<00:00, 102486.85it/s]


#### Calculons la cosine similarity moyenne sur le jeu de validation

In [64]:
def reco_collaborative_filtering_iir(user_ids, mean_embeddings, train_df, valid_df, reco_size):
    
    apply_numpy2 = lambda x : list(map(lookup_articles,x))
    apply_numpy = lambda x : list(map(apply_numpy2,x))
    
    user_codes = [lookup_users(x) for x in user_ids]
    
    reco_codes, reco_score = model_iir.recommend(user_codes, train_df[user_codes], N=reco_size, filter_already_liked_items=True) 
    return apply_numpy(reco_codes)

Pour tout le jeu de données

In [65]:
MCS1iir, MCS1iir_df = get_mean_cosine_similarity(data_valid, train_sparse_user_item, valid_sparse_user_item, reco_collaborative_filtering_iir, 5)
print(f"\nmean_cosine_similarity: {MCS1iir:.2f} (sachant que la cosine similarity va de 1 à -1)")


mean_cosine_similarity: 0.52 (sachant que la cosine similarity va de 1 à -1)


Pour les 1000 premiers utilisateurs *(car j'ai du mal à calculer le score sur l'ensemble du jeu de données avec le Content Based Filtering)*

In [66]:
MCS1iir, MCS1iir_df = get_mean_cosine_similarity(data_valid, train_sparse_user_item, valid_sparse_user_item, reco_collaborative_filtering_iir, 5, 1000)
print(f"\nmean_cosine_similarity: {MCS1iir:.2f} (sachant que la cosine similarity va de 1 à -1)")
scores_df.at[model_name,'mean_cosine_similarity'] = MCS1iir


mean_cosine_similarity: 0.55 (sachant que la cosine similarity va de 1 à -1)


#### Regardons la precision@k

In [67]:
precision_k = evaluation.precision_at_k(model_iir, train_sparse_user_item, valid_sparse_user_item, K=5, show_progress=True, num_threads=1)
print(precision_k)
scores_df.at[model_name,'precision@k'] = precision_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:08<00:00, 9506.22it/s]

0.00042019234304502886





#### Regardons le MAP@k

In [68]:
map_k = evaluation.mean_average_precision_at_k(model_iir, train_sparse_user_item, valid_sparse_user_item, K=5, show_progress=True, num_threads=1)
print(map_k)
scores_df.at[model_name,'map@k'] = map_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:08<00:00, 9481.83it/s]

0.00019785316426241682





#### Regardons le nDCG@k

In [69]:
ndcg_k = evaluation.ndcg_at_k(model_iir, train_sparse_user_item, valid_sparse_user_item, K=5, show_progress=True, num_threads=1)
print(ndcg_k)
scores_df.at[model_name,'ndcg@k'] = ndcg_k

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:08<00:00, 9584.77it/s]

0.0003075531886240062





#### Comparons

In [70]:
scores_df

Unnamed: 0_level_0,mean_cosine_similarity,precision@k,map@k,ndcg@k,training_time
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Content Based Filtering - Random,0.482212,,,,0.0
AlternatingLeastSquares_with_BM25,0.499395,0.000762,0.000327,0.000538,77.47612
Content Based Filtering,0.576889,,,,0.0
AlternatingLeastSquares,0.566251,0.000541,0.000169,0.000312,135.519352
LogisticMatrixFactorization,0.552624,0.000173,5.4e-05,0.000101,25.511894
BayesianPersonalizedRanking,0.523876,0.00011,4.3e-05,7.3e-05,7.977837
ItemItemRecommender,0.552734,0.00042,0.000198,0.000308,0.321904


### Content Based Filtering basé sur le dernier article lu uniquement

In [71]:
cosine_pred_viewed = []
reco_size = 5

# --- get users of the validation set
user_idx = np.sort(data_valid.user_id.unique())

for user_id in user_idx[:1000]:
    #print(user_id)
    
    # --- Compute viewed mean_embedding
    viewed_train = data_train[data_train.user_id == user_id]['article_id'].iloc[-1:]
    if len(viewed_train) < 1: continue
    mean_vector_viewed_train = get_mean_vector(viewed_train.values)
    
    # --- Compute viewed mean_embedding
    viewed_train = data_train[data_train.user_id == user_id]['article_id'].iloc[-1:]
    if len(viewed_train) < 1: continue
    mean_vector_viewed_train = get_mean_vector(viewed_train.values)
    
    # --- Compute cosine similarity the mean of the viewed articles and the rest
    A = article_embedding.copy()
    B = mean_vector_viewed_train
    A[viewed_train.values] = -B # Cancel already read articles
    cosine = cosine_similarity(A, B.reshape(1, -1))
    
    recommendations =  recommend_articles(cosine, reco_size)
    mean_vector_reco = get_mean_vector(recommendations.article_id)
    
    # --- Compute similarities
    cosine_similarity_pred_viewed = cosine_similarity(mean_vector_reco.reshape(1, -1), mean_vector_viewed_valid.reshape(1, -1))
    
    cosine_pred_viewed.append(cosine_similarity_pred_viewed)

In [72]:
print(f"\nmean_cosine_similarity (de 1 à -1) entre 'recommended' & 'viewed_in_valid' = {np.mean(cosine_pred_viewed):.2f} (que l'on compare donc avec les 0.55 du Collaborative Filtering ALS)")

model_name = "Content Based Filtering based on last read article"
scores_df.at[model_name,'mean_cosine_similarity'] = np.mean(cosine_pred_viewed)
scores_df.at[model_name,'training_time'] = 0


mean_cosine_similarity (de 1 à -1) entre 'recommended' & 'viewed_in_valid' = 0.19 (que l'on compare donc avec les 0.55 du Collaborative Filtering ALS)


## 1.3 Revue des scores <a class="anchor" id="models_scores"></a> [⇪](#menu)

In [73]:
scores_df

Unnamed: 0_level_0,mean_cosine_similarity,precision@k,map@k,ndcg@k,training_time
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Content Based Filtering - Random,0.482212,,,,0.0
AlternatingLeastSquares_with_BM25,0.499395,0.000762,0.000327,0.000538,77.47612
Content Based Filtering,0.576889,,,,0.0
AlternatingLeastSquares,0.566251,0.000541,0.000169,0.000312,135.519352
LogisticMatrixFactorization,0.552624,0.000173,5.4e-05,0.000101,25.511894
BayesianPersonalizedRanking,0.523876,0.00011,4.3e-05,7.3e-05,7.977837
ItemItemRecommender,0.552734,0.00042,0.000198,0.000308,0.321904
Content Based Filtering based on last read article,0.186063,,,,0.0


# 2. Préparons le modèle Hybrid à déployer

### Sauvegardons le modèle choisi

In [74]:
joblib.dump((model_als, train_sparse_user_item, article_lookup, user_lookup), pathlib.Path('azure_function','data','collaborative_recommender.pkl'))

['azure_function/data/collaborative_recommender.pkl']

In [75]:
joblib.dump((article_embedding, data_train), pathlib.Path('azure_function','data','content_based_recommender.pkl'))

['azure_function/data/content_based_recommender.pkl']

### Chargeons le modèle exporté

In [76]:
(reco_als, sparse_matrix, article_lookup, user_lookup) = joblib.load(pathlib.Path('azure_function','data','collaborative_recommender.pkl'))
(article_embedding, data_train) = joblib.load(pathlib.Path('azure_function','data','content_based_recommender.pkl'))

In [77]:
user_id = 299017

### Préparons une fonction de recommandation pour le Collaborative filtering

In [79]:
def get_collaborative_recommendations( user_id, reco_size ):
    
    user_codes = lookup_users(user_id)
    codes, scores = reco_als.recommend(user_codes, sparse_matrix[user_codes], N=reco_size, filter_already_liked_items=True)
    recommendations = pd.DataFrame(np.vstack((codes, scores)).T, columns=['article_id', 'score'])
    recommendations['article_id'] = recommendations['article_id'].apply(lambda x: lookup_articles(x))
    return recommendations

candidates_collaborative_filtering = get_collaborative_recommendations(user_id, 5)
candidates_collaborative_filtering

Unnamed: 0,article_id,score
0,348093,4.359293e-12
1,224354,3.515467e-12
2,288320,3.435705e-12
3,352901,3.384959e-12
4,284547,3.295397e-12


### Préparons une fonction de recommandation pour le Content Based Filtering

In [80]:
def get_content_based_recommendations( user_id, reco_size=5 ):
    print(user_id)
    
    # --- Compute viewed mean_embedding
    viewed_train = data_train[data_train.user_id == user_id]['article_id']
    # if len(viewed_train) < 1: return
    mean_vector_viewed_train = get_mean_vector(viewed_train.values)
    
    # --- Compute cosine similarity the mean of the viewed articles and the rest
    A = article_embedding.copy()
    B = mean_vector_viewed_train
    A[viewed_train.values] = -B # Cancel already read articles
    cosine = cosine_similarity(A, B.reshape(1, -1))
    
    recommendations =  recommend_articles(cosine, reco_size)
    return recommendations
    
candidates_content_based_filtering = get_content_based_recommendations(user_id, 5)
candidates_content_based_filtering

299017


Unnamed: 0,article_id,cosine_sim
0,312610,0.926546
1,324693,0.923426
2,313922,0.922881
3,314113,0.918606
4,313556,0.918562


### Préparons une fonction de recommandation unissant le Collaborative Filtering et le Content Based Filtering

In [81]:
def get_reco( user_id, reco_size ):
    
    # Get Collaborative Filtering Recommendations
    candidates_collaborative_filtering = get_collaborative_recommendations(user_id, reco_size)
    
    # Get Content Based Recommendations
    candidates_content_based_filtering = get_content_based_recommendations(user_id, reco_size)
    
    # Select some of them at random
    cf_size, cbf_size = math.floor(reco_size/2), math.ceil(reco_size/2)  # 1/2 vs 1/2 avec priorité cbf
    #cf_size, cbf_size = round(reco_size/3*1), round(reco_size/3*2)  # 1/3 vs 2/3
    p_cf = candidates_collaborative_filtering.sample(cf_size)
    p_cbf = candidates_content_based_filtering.sample(cbf_size)
    
    # Return the selected articles_id
    return np.concatenate((p_cf['article_id'].values, p_cbf['article_id'].values)). astype(int)

get_reco( user_id, 5 )

299017


array([284547, 224354, 313922, 312610, 324693])