In [1]:
import pickle 

import pandas as pd
import numpy as np
from numpy.linalg import norm

import scipy.sparse as sparse
from sklearn.metrics.pairwise import cosine_similarity

import implicit
from implicit.nearest_neighbours import bm25_weight
from implicit import evaluation

import ml_metrics

  from .autonotebook import tqdm as notebook_tqdm


### Chargeons les jeux de données `training` et `validation`

In [2]:
data_train = pd.read_csv('data/data_train.csv')
display(data_train.head(3), data_train.shape)

Unnamed: 0,user_id,article_id,score
0,59,234853,0.214286
1,79,159359,0.215827
2,154,96663,0.145631


(1577295, 3)

In [3]:
data_valid = pd.read_csv('data/data_valid.csv')
display(data_valid.head(3), data_valid.shape)

Unnamed: 0,user_id,article_id,score
0,279777,96210,0.109489
1,29634,284773,0.469863
2,55,162605,1.324273


(241105, 3)

### Chargons les embeddings

In [4]:
file = open('data/news-portal-user-interactions-by-globocom/articles_embeddings.pickle',"rb")
article_embedding = pickle.load(file)

In [5]:
display(article_embedding[:5], article_embedding.shape)

array([[-0.16118301, -0.95723313, -0.13794445, ..., -0.231686  ,
         0.5974159 ,  0.40962312],
       [-0.52321565, -0.974058  ,  0.73860806, ...,  0.18282819,
         0.39708954, -0.83436364],
       [-0.61961854, -0.9729604 , -0.20736018, ..., -0.44758022,
         0.8059317 , -0.28528407],
       [-0.7408434 , -0.97574896,  0.39169782, ..., -0.5378381 ,
         0.24354108, -0.8853287 ],
       [-0.2790515 , -0.97231525,  0.68537366, ..., -0.42406067,
         0.18548405, -0.5802922 ]], dtype=float32)

(364047, 250)

# 1. Candidate generation

# 1.1 Collaborative Filtering

### Préparons une sparse matrix pour entrainer nos algorithmes de collaborative filtering

In [6]:
# --- Train ---
data_train['user_cat_code'] = data_train['user_id'].astype('category').cat.codes
data_train['article_cat_code'] = data_train['article_id'].astype('category').cat.codes

# train_sparse_item_user = sparse.csr_matrix((data_train['score'].astype(float), (data_train['article_cat_code'], data_train['user_cat_code'])))
train_sparse_user_item = sparse.csr_matrix((data_train['score'].astype(float), (data_train['user_cat_code'], data_train['article_cat_code'])))
display(train_sparse_user_item.shape)

# --- Validation ---
data_valid['user_cat_code'] = data_valid['user_id'].astype('category').cat.codes
data_valid['article_cat_code'] = data_valid['article_id'].astype('category').cat.codes

# valid_sparse_item_user = sparse.csr_matrix((data_valid['score'].astype(float), (data_valid['article_cat_code'], data_valid['user_cat_code'])))
valid_sparse_user_item = sparse.csr_matrix((data_valid['score'].astype(float), (data_valid['user_cat_code'], data_valid['article_cat_code'])))

display(valid_sparse_user_item.shape)

(297141, 28002)

(84041, 7576)

### Utilisons un système permettant de rééquilibrer les notes implicites *(pour éviter de donner trop d'importance aux articles qui ont un très gros ratio `temps de lecture` / `nombre de mots`)*

In [7]:
# train_sparse_item_user_bm25 = bm25_weight(sparse_item_user, K1=100, B=0.9).tocsr()
train_sparse_user_item_bm25 = bm25_weight(train_sparse_user_item, K1=100, B=0.9).tocsr() # Implicit veut des matrices [user x item]
valid_sparse_user_item_bm25 = bm25_weight(valid_sparse_user_item, K1=100, B=0.9).tocsr() # Implicit veut des matrices [user x item]

display(train_sparse_user_item_bm25.shape)
display(valid_sparse_user_item_bm25.shape)

(297141, 28002)

(84041, 7576)

### Entrainons un premier modèle pour calculer les embeddings utilisateurs et faire des recommandations

In [8]:
model_bm25 = implicit.als.AlternatingLeastSquares(
    factors=32, 
    regularization=0.05, 
    iterations=50,
    alpha=40
)

model_bm25.fit(train_sparse_user_item_bm25)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [04:20<00:00,  5.21s/it]


### Testons une `recommandation sur la base d'un ou plusieurs utilisateurs`

In [9]:
# Make recommendations for the first 10 users in the dataset
userids = [59, 1024] # liste d'ID utilisateurs
rec_size = 5

codes, scores = model_bm25.recommend(userids, valid_sparse_user_item_bm25[userids], N=rec_size, filter_already_liked_items=True) 

for i, user_id in enumerate(userids):
    print(f"\n --- Liste d'articles candidats pour l'utilisateur {user_id} --- \n")
    
    for code, score in zip(codes[i], scores[i]):
        idx = data_train[data_train.article_cat_code==code]['article_id'].iloc[0]
        print(f"catCode: {code:10} \t article_id: {idx:10} \t score: {score:.2f}")


 --- Liste d'articles candidats pour l'utilisateur 59 --- 

catCode:      10394 	 article_id:     119592 	 score: 1.07
catCode:       5718 	 article_id:      68866 	 score: 0.99
catCode:       9556 	 article_id:     108854 	 score: 0.96
catCode:       8217 	 article_id:      96663 	 score: 0.93
catCode:      13935 	 article_id:     168868 	 score: 0.92

 --- Liste d'articles candidats pour l'utilisateur 1024 --- 

catCode:       8129 	 article_id:      96210 	 score: 1.33
catCode:      26190 	 article_id:     336245 	 score: 1.28
catCode:       1430 	 article_id:      20691 	 score: 1.18
catCode:      14884 	 article_id:     183176 	 score: 1.17
catCode:      18548 	 article_id:     233688 	 score: 1.06


### Testons une `recommandation sur la base d'un article` *(ce n'est pas le but d'un Collaborative Filtering, mais on peut le faire alors autant l'essayer)*

In [10]:
article_id = 162605
article_code = data_train[data_train.article_id == article_id]['article_cat_code'].iloc[0]
rec_size = 5

# Get similar items.
codes, scores = model_bm25.similar_items(article_code, N=rec_size , filter_items=[article_code])

print(f"\n --- Liste d'articles candidats sur la base de l'article {article_id} --- \n")
for code, score in zip(codes, scores):
    idx = data_train[data_train.article_cat_code==code]['article_id'].iloc[0]
    print(f"catCode: {code:10} \t article_id: {idx:10} \t score: {score:.2f}")


 --- Liste d'articles candidats sur la base de l'article 162605 --- 

catCode:       6359 	 article_id:      74112 	 score: 0.96
catCode:       3766 	 article_id:      47888 	 score: 0.81
catCode:      22526 	 article_id:     285045 	 score: 0.80
catCode:      13314 	 article_id:     162107 	 score: 0.80
catCode:      16610 	 article_id:     206429 	 score: 0.79


### Evaluons le modèle

> **Nous devons nous rappeler que la recommandation n'est pas une prédiction.**<br>
> S'appuyer sur des métriques ML pour déterminer la performance d'un système de recommandation n'est pas suffisant.<br>
> Seul le retour des utilisateurs apporte des résultats valables et c'est pourquoi les tests A/B devraient toujours être priviligiés.

- Dans la mesure ou notre jeu de données **ne dispose pas de scores explicites**, il ne parrait pas souhaitable d'utiliser des métrique du type `MAE` ou `RMSE`.
- Dans la mesure ou l'on **ne cherche pas particulièrement à obtenir un ordre précis**, il ne parrait pas souhaitable d'utiliser des métriques de ranking comme le `MAP@K` ou le `nDCCG`.
- Nous pourrions donc nous tourner vers la `Precision@k`, le `Recall@K` et donc le `F1@k`, mais il est probable que ce ne soit pas très représentatif.

#### Regardons la precision@k

In [11]:
evaluation.precision_at_k(model_bm25, train_sparse_user_item_bm25, valid_sparse_user_item_bm25, K=5, show_progress=True, num_threads=1)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:27<00:00, 3048.35it/s]


0.00013656251148963438

> Le probleme c'est même si c'est l'une des métriques disponible les plus adaptée, elle reste peu adaptée à notre problème...<br>
> Ici `Precision = (# of top k recommendations that are relevant)/(# of items that are recommended)`<br>
> Mais malgré un nombre d'article assez large, on ne recommande que 5 articles et les utilisateurs ont un historique assez faible dans notre jeu de données. Donc les chances de recommander un article parmi 5 qui a effectivement été lu ensuite par l'utilisateur sont vraiment faible.

### Construisons une métrique sur mesure

Pour avoir une idée plus globale, nous pourrions comparer l'embedding moyen des articles lus APRÈS *(donc les actions contenues dans data_valid)* avec l'embedding moyen des article recommandés ET avec l'embedding moyen des articles lus AVANT *(donc les actions contenues dans le data_train)*.

#### Récuperons la liste des articles lus par chaque utilisateur du validation set

In [12]:
article_lookup = pd.DataFrame(data_train.groupby('article_cat_code')['article_id'].apply(lambda x: list(x)[0])).to_dict()

In [13]:
def get_mean_cosine_similarity(data_ref, pred_ref, verbose=1):

    # --- for each user, get the ids of the articles he/she has read
    select = data_ref.groupby('user_id')['article_id'].apply(list).reset_index(name='article_ids')

    # --- for each user, compute the mean embedding vectors of the articles he/she has read
    select['read_mean_embedding'] = select.apply(lambda x : article_embedding[np.array(x[1])].mean(axis=0), axis=1)
    #select['size1'] = select['mean_embedding'].apply(lambda x : len(x))

    # --- for each user, make recommendations
    rec_size = 5
    pred_codes, pred_scores = model_bm25.recommend(select.index, pred_ref[select.index], N=rec_size, filter_already_liked_items=True) 
    select['pred_codes'] = pred_codes.tolist()

    # --- for each user, convert the article_cat_code to article_id
    def lookup_articles(x):
        return article_lookup['article_id'][x]

    pred_idx = select['pred_codes'].apply(lambda x : list(map(lookup_articles,x)))
    select['pred_idx'] = pred_idx.to_numpy().tolist()

    # --- for each user, compute the mean embedding vectors of the recommended articles
    select['pred_mean_embedding'] = select['pred_idx'].apply(lambda x : article_embedding[np.array(x)].mean(axis=0))

    # --- for each user, compute the cosine similarity between the read_mean_embedding and the pred_mean_embedding
    select['cosine'] = select.apply(lambda x: cosine_similarity(x['read_mean_embedding'].reshape(1, -1), x['pred_mean_embedding'].reshape(1, -1))[0][0], axis=1)

    # --- reset the index column
    select = select.set_index('user_id')

    # --- Display sample
    if verbose > 0:
        display(select.head(), select.shape)

    # --- Compute & return overall mean cosine similarity
    return select.cosine.mean(), select

MCS, MCS_df = get_mean_cosine_similarity(data_valid, valid_sparse_user_item_bm25, verbose=1)

print(f"mean_cosine_similarity: {MCS:.2f}")

Unnamed: 0_level_0,article_ids,read_mean_embedding,pred_codes,pred_idx,pred_mean_embedding,cosine
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,[36162],"[0.3944709, -0.9545699, -0.12643021, -0.031527...","[22509, 12972, 8237, 7534, 13106]","[284985, 158906, 96755, 87224, 160158]","[-0.1399826, -0.9593972, -0.44222125, 0.071453...",0.598711
2,[30760],"[-0.3254827, -0.96189374, -0.32057792, -0.7900...","[18554, 9648, 10394, 22265, 26768]","[233717, 111043, 119592, 283505, 348112]","[-0.50335515, -0.9651922, -0.1767746, -0.50258...",0.326945
5,"[225010, 69353, 161872, 205845, 57748, 157815,...","[-0.024590481, -0.9645934, -0.06022447, -0.023...","[10394, 8217, 13141, 9556, 22427]","[119592, 96663, 160474, 108854, 284463]","[-0.43367648, -0.96880037, -0.12601939, -0.163...",0.718369
6,[202355],"[-0.09313608, -0.9619413, -0.22228149, 0.77629...","[18894, 18646, 18548, 19039, 22509]","[235689, 234269, 233688, 236444, 284985]","[-0.43889603, -0.96931684, -0.22122476, -0.144...",0.435389
7,"[199474, 87223, 352979, 284470, 36162, 156279]","[-0.11400774, -0.96468645, -0.32728586, -0.201...","[7534, 18543, 7541, 7536, 7537]","[87224, 233658, 87236, 87231, 87232]","[-0.24903056, -0.9495894, -0.20430413, 0.59792...",0.604813


(84041, 6)

mean_cosine_similarity: 0.45


# 1.2 Content Based Filtering

articles_embeddings.pickle Pickle (Python 3) of a NumPy matrix containing the Article Content Embeddings (250-dimensional vectors), trained upon articles' text and metadata by the CHAMELEON's ACR module (see paper for details) for 364047 published articles.
P.s. The full text of news articles could not be provided due to license restrictions, but those embeddings can be used by Neural Networks to represent their content. See this paper for a t-SNE visualization of these embeddings, colored by category.

### Test: Calculons la cosine similarité entre deux vecteurs

In [None]:
A = article_embedding[162605]
B = article_embedding[300884]

In [None]:
cosine_similarity = np.dot(A,B)/(norm(A)*norm(B))
print("Cosine Similarity:", cosine_similarity)

In [None]:
cosine_similarity(A.reshape(1, -1),B.reshape(1, -1))

### Trouvons les articles lus par un utilisateur donnée

In [None]:
user_id = 20137
history_size = 5

def get_mean_vector(articles_idx, verbose=1):
    
    mean_vector = np.zeros(article_embedding[0].shape)
    for article_id in articles_idx:
        if verbose > 0:
            print(article_id)
        mean_vector += article_embedding[article_id]
    mean_vector /= len(articles_idx)
    return mean_vector

last_articles_idx = data_train[data_train.user_id == user_id]['article_id'].iloc[-history_size:].values #.sort_values('click_timestamp')
mean_vector = get_mean_vector(last_articles_idx)

### Calculons la similarité de ce vector avec les autres articles

In [None]:
A = article_embedding
#### ICI on drop les articles déjà lus par l'utilisateur
B = mean_vector
print("A:", A.shape, "B:", B.shape, '\n')
 
# compute cosine similarity
cosine = np.dot(A,B)/(norm(A, axis=1)*norm(B))
print("Cosine Similarity:", cosine, cosine.shape)

### Recommandons 5 articles à l'utilisateur

In [None]:
def predict_articles(cosine, pred_size=5):
    cos = pd.DataFrame(cosine, columns=['cosine_sim'])
    selection = cos.sort_values('cosine_sim', ascending=False)[:pred_size]
    selection.reset_index(inplace=True)
    selection.rename(columns={'index':'article_id'}, inplace=True)
    return selection

pred = predict_articles(cosine, 5)
pred

### Comparons avec les articles consultés par cet utilisateur dans le validation_set

In [None]:
mean_vector_predicted = get_mean_vector(pred.article_id)

In [None]:
viewed = data_valid[data_valid.user_id == user_id]['article_id'].values
mean_vector_viewed = get_mean_vector(viewed)

#### Similarité entre les articles lus dans le `valid_set` et les articles prédis

In [None]:
def get_cosine_similarity(A, B):
    return np.dot(A,B)/(norm(A)*norm(B))

cosine_similarity = get_cosine_similarity(mean_vector_predicted, mean_vector_viewed)
print("Cosine Similarity:", cosine_similarity)

#### Similarité entre les articles lus dans le `train_set` et les articles lus dans le `valid_set`

In [None]:
cosine_similarity = get_cosine_similarity(mean_vector, mean_vector_viewed)
print("Cosine Similarity:", cosine_similarity)

### RMSE entre la moyenne des embeddings des articles prédits et la moyenne des embeddings des articles lus

In [None]:
ml_metrics.rmse(mean_vector_viewed, mean_vector_predicted)

In [None]:
cosine_pred_viewed = []
cosine_hist_viewed = []
# rmse_pred_viewed = []
mean_vectors_viewed = []
mean_vectors_predicted = []

for user_id in data_valid.user_id[:1000]:
    # print(user_id)
    last_articles_idx = data_train[data_train.user_id == user_id]['article_id'].iloc[-history_size:].values #.sort_values('click_timestamp')
    mean_vector_hist = get_mean_vector(last_articles_idx, verbose=0)
    
    A = article_embedding
    B = mean_vector_hist
    cosine = np.dot(A,B)/(norm(A, axis=1)*norm(B))
    
    pred = predict_articles(cosine, 5)
    mean_vector_pred = get_mean_vector(pred.article_id, verbose=0)
    
    viewed = data_valid[data_valid.user_id == user_id]['article_id'].values
    mean_vector_viewed = get_mean_vector(viewed, verbose=0)
    
    cosine_similarity_pred_viewed = get_cosine_similarity(mean_vector_pred, mean_vector_viewed)
    cosine_similarity_hist_viewed = get_cosine_similarity(mean_vector_hist, mean_vector_viewed)
    # print(f"Cosine Similarity :: pred/viewed={cosine_similarity_pred_viewed} | hist/viewed={cosine_similarity_hist_viewed}")
    #rmse = ml_metrics.rmse(mean_vector_viewed, mean_vector_predicted)
    
    cosine_pred_viewed.append(cosine_similarity_pred_viewed)
    cosine_hist_viewed.append(cosine_similarity_hist_viewed)
    #rmse_pred_viewed.append(rmse)
    mean_vectors_viewed.append(mean_vector_viewed)
    mean_vectors_predicted.append(mean_vector_predicted)

In [None]:
print(f"MEAN Cosine Similarity :: pred/viewed={np.mean(cosine_pred_viewed)} | hist/viewed={np.mean(cosine_hist_viewed)}")
#print(f"RMSE :: pred/viewed={np.mean(rmse_pred_viewed)}")
print(f"RMSE :: pred/viewed={ml_metrics.rmse(mean_vectors_viewed, mean_vectors_predicted)}")

In [None]:
data_valid.user_id