In [260]:
import pickle 

import pandas as pd
import numpy as np
from numpy.linalg import norm

import scipy.sparse as sparse

import implicit
from implicit.nearest_neighbours import bm25_weight
from implicit import evaluation

import ml_metrics

# 1. Collaborative Filtering

In [2]:
data_train = pd.read_csv('data/data_train.csv')
display(data_train.head(3), data_train.shape)

Unnamed: 0,user_id,article_id,score
0,59,234853,0.214286
1,79,159359,0.215827
2,154,96663,0.145631


(1577295, 3)

In [3]:
data_valid = pd.read_csv('data/data_valid.csv')
display(data_valid.head(3), data_valid.shape)

Unnamed: 0,user_id,article_id,score
0,279777,96210,0.109489
1,29634,284773,0.469863
2,55,162605,1.324273


(241105, 3)

In [4]:
data_train['user_cat_code'] = data_train['user_id'].astype('category').cat.codes
data_train['article_cat_code'] = data_train['article_id'].astype('category').cat.codes

# sparse_item_user = sparse.csr_matrix((data_train['score'].astype(float), (data_train['article_cat_code'], data_train['user_cat_code'])))
sparse_user_item = sparse.csr_matrix((data_train['score'].astype(float), (data_train['user_cat_code'], data_train['article_cat_code'])))

display(sparse_user_item.shape)

(297141, 28002)

In [21]:
data_train.user_id.nunique(), data_train.article_id.nunique()

(297141, 28002)

In [20]:
data_train.head(2)

Unnamed: 0,user_id,article_id,score,user_cat_code,article_cat_code
0,59,234853,0.214286,58,18751
1,79,159359,0.215827,78,13017


In [24]:
sparse_user_item.eliminate_zeros()
sparse_user_item.data = np.ones(len(sparse_user_item.data))
display(sparse_user_item.shape)

(297141, 28002)

In [29]:
# sparse_item_user_bm25 = bm25_weight(sparse_item_user, K1=100, B=0.8)
sparse_user_item_bm25 = bm25_weight(sparse_user_item, K1=100, B=0.9)

# sparse_item_user_bm25 = sparse_item_user_bm25.tocsr() #.T.tocsr() 
sparse_user_item_bm25 = sparse_user_item_bm25.tocsr() #.T.tocsr() X
#sparse_user_item_bm25 = sparse_user_item_bm25.T.tocsr()

display(sparse_user_item_bm25.shape)

(297141, 28002)

In [30]:
model_bm25 = implicit.als.AlternatingLeastSquares(
    factors=64, 
    regularization=0.05, 
    iterations=50, 
)

alpha = 40
data_alpha_bm25 = (sparse_user_item_bm25 * alpha).astype('double')
model_bm25.fit(data_alpha_bm25)

#model_bm25.fit(sparse_user_item_bm25)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [05:57<00:00,  7.16s/it]


In [32]:
# Similar item - Method 2

article_id = 162605
item_id = data_train[data_train.article_id == article_id]['article_cat_code'].iloc[0]
print(f"article_id: {article_id} | article_cat_code: {item_id}")

# Gget similar items.
ids, scores = model_bm25.similar_items(item_id, N=5 , filter_items=[item_id]) # ⚠️ On pourrait filtrer les articles déjà lus par l'utilisateur

for code, score in zip(ids, scores):
    idx = data_train[data_train.article_cat_code==code]['article_id'].iloc[0]
    print(f"catCode: {code:10} \t article_id: {idx:10} \t score: {score:.2f}")

article_id: 162605 | article_cat_code: 13358
catCode:      13314 	 article_id:     162107 	 score: 0.73
catCode:      12669 	 article_id:     156473 	 score: 0.66
catCode:      12851 	 article_id:     157974 	 score: 0.65
catCode:      23969 	 article_id:     300884 	 score: 0.64
catCode:      22253 	 article_id:     283402 	 score: 0.64


> ### ⚠️⚠️⚠️ Pour l'évaluation on pourrait utiliser la proximité des embeddings correspondant

In [171]:
# Make recommendations for the first 10 users in the dataset
userids = [59] # np.arange(10)
codes, scores = model_bm25.recommend(userids, sparse_user_item_bm25[userids], N=5, filter_already_liked_items=True) 
display(codes, codes.shape, scores, scores.shape) ## les scores ne devraient ils pas être sous 1 ?

array([[10381,  8217, 12240, 13017, 16531]], dtype=int32)

(1, 5)

array([[0.95670205, 0.8148098 , 0.8054829 , 0.7729543 , 0.76639163]],
      dtype=float32)

(1, 5)

In [9]:
data_valid['user_cat_code'] = data_valid['user_id'].astype('category').cat.codes
data_valid['article_cat_code'] = data_valid['article_id'].astype('category').cat.codes

#valid_sparse_item_user = sparse.csr_matrix((data_valid['score'].astype(float), (data_valid['article_cat_code'], data_valid['user_cat_code'])))
valid_sparse_user_item = sparse.csr_matrix((data_valid['score'].astype(float), (data_valid['user_cat_code'], data_valid['article_cat_code'])))

display(valid_sparse_user_item.shape)

(84041, 7576)

In [10]:
# valid_sparse_item_user_bm25 = bm25_weight(valid_sparse_item_user, K1=100, B=0.8)
valid_sparse_user_item_bm25 = bm25_weight(valid_sparse_user_item, K1=100, B=0.8)

# valid_sparse_item_user_bm25 = valid_sparse_item_user_bm25.tocsr()  #.T.tocsr()
valid_sparse_user_item_bm25 = valid_sparse_user_item_bm25.tocsr()  #.T.tocsr() X

display(valid_sparse_user_item_bm25.shape)

(84041, 7576)

In [11]:
evaluation.mean_average_precision_at_k(model_bm25, sparse_user_item_bm25, valid_sparse_user_item_bm25, K=5, show_progress=True, num_threads=1)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:26<00:00, 3148.30it/s]


3.6807431293455975e-05

In [12]:
evaluation.ndcg_at_k(model_bm25, sparse_user_item_bm25, valid_sparse_user_item_bm25, K=5, show_progress=True, num_threads=1)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:25<00:00, 3273.48it/s]


7.206724289875208e-05

In [13]:
evaluation.precision_at_k(model_bm25, sparse_user_item_bm25, valid_sparse_user_item_bm25, K=5, show_progress=True, num_threads=1)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:26<00:00, 3183.82it/s]


0.00013131010720157153

In [14]:
data_train.user_id.nunique(), data_valid.user_id.nunique()

(297141, 84041)

In [15]:
data_train.article_id.nunique(), data_valid.article_id.nunique()

(28002, 7576)

In [None]:
data_train.user_id.shape

(1577295,)

# 2. Content Based Filtering

articles_embeddings.pickle Pickle (Python 3) of a NumPy matrix containing the Article Content Embeddings (250-dimensional vectors), trained upon articles' text and metadata by the CHAMELEON's ACR module (see paper for details) for 364047 published articles.
P.s. The full text of news articles could not be provided due to license restrictions, but those embeddings can be used by Neural Networks to represent their content. See this paper for a t-SNE visualization of these embeddings, colored by category.

### Chargons les embeddings

In [None]:
file = open('data/news-portal-user-interactions-by-globocom/articles_embeddings.pickle',"rb")
article_embedding = pickle.load(file)

In [None]:
display(article_embedding, article_embedding.shape)

array([[-0.16118301, -0.95723313, -0.13794445, ..., -0.231686  ,
         0.5974159 ,  0.40962312],
       [-0.52321565, -0.974058  ,  0.73860806, ...,  0.18282819,
         0.39708954, -0.83436364],
       [-0.61961854, -0.9729604 , -0.20736018, ..., -0.44758022,
         0.8059317 , -0.28528407],
       ...,
       [-0.25139043, -0.9762427 ,  0.58609664, ..., -0.14372464,
         0.06809307, -0.7050104 ],
       [ 0.22434181, -0.92328775, -0.38174152, ...,  0.6871319 ,
        -0.5315117 ,  0.01072566],
       [-0.25713393, -0.9946313 ,  0.9837918 , ...,  0.98387307,
        -0.8381829 , -0.1792827 ]], dtype=float32)

(364047, 250)

### Test: Calculons la cosine similarité entre deux vecteurs

In [None]:
A = article_embedding[162605]
B = article_embedding[300884]

In [None]:
cosine_similarity = np.dot(A,B)/(norm(A)*norm(B))
print("Cosine Similarity:", cosine_similarity)

Cosine Similarity: 0.04703806


### Trouvons les articles lus par un utilisateur donnée

In [287]:
user_id = 20137
history_size = 5

def get_mean_vector(articles_idx, verbose=1):
    
    mean_vector = np.zeros(article_embedding[0].shape)
    for article_id in articles_idx:
        if verbose > 0:
            print(article_id)
        mean_vector += article_embedding[article_id]
    mean_vector /= len(articles_idx)
    return mean_vector

last_articles_idx = data_train[data_train.user_id == user_id]['article_id'].iloc[-history_size:].values #.sort_values('click_timestamp')
mean_vector = get_mean_vector(last_articles_idx)

288440
337441
202476
250043
284583


### Calculons la similarité de ce vector avec les autres articles

In [288]:
A = article_embedding
#### ICI on drop les articles déjà lus par l'utilisateur
B = mean_vector
print("A:", A.shape, "B:", B.shape, '\n')
 
# compute cosine similarity
cosine = np.dot(A,B)/(norm(A, axis=1)*norm(B))
print("Cosine Similarity:", cosine, cosine.shape)

A: (364047, 250) B: (250,) 

Cosine Similarity: [0.32582364 0.27835361 0.31348326 ... 0.38568463 0.27425226 0.42668363] (364047,)


### Recommandons 5 articles à l'utilisateur

In [289]:
def predict_articles(cosine, pred_size=5):
    cos = pd.DataFrame(cosine, columns=['cosine_sim'])
    selection = cos.sort_values('cosine_sim', ascending=False)[:pred_size]
    selection.reset_index(inplace=True)
    selection.rename(columns={'index':'article_id'}, inplace=True)
    return selection

pred = predict_articles(cosine, 5)
pred

Unnamed: 0,article_id,cosine_sim
0,284768,0.830968
1,345593,0.821919
2,285424,0.821205
3,345566,0.818335
4,283576,0.816523


### Comparons avec les articles consultés par cet utilisateur dans le validation_set

In [290]:
mean_vector_predicted = get_mean_vector(pred.article_id)

284768
345593
285424
345566
283576


In [291]:
viewed = data_valid[data_valid.user_id == user_id]['article_id'].values
mean_vector_viewed = get_mean_vector(viewed)

225010
313920
236566
285675
271061
199474
162378
87215
87560
87552
87555
87552
87554
87215
87181
87199
206785
285849
289003
336254
157478
206415
209122
205958
205824
50644
224730
30760


#### Similarité entre les articles lus dans le `valid_set` et les articles prédis

In [292]:
def get_cosine_similarity(A, B):
    return np.dot(A,B)/(norm(A)*norm(B))

cosine_similarity = get_cosine_similarity(mean_vector_predicted, mean_vector_viewed)
print("Cosine Similarity:", cosine_similarity)

Cosine Similarity: 0.7175413733059302


#### Similarité entre les articles lus dans le `train_set` et les articles lus dans le `valid_set`

In [293]:
cosine_similarity = get_cosine_similarity(mean_vector, mean_vector_viewed)
print("Cosine Similarity:", cosine_similarity)

Cosine Similarity: 0.798493936241232


### RMSE entre la moyenne des embeddings des articles prédits et la moyenne des embeddings des articles lus

In [294]:
ml_metrics.rmse(mean_vector_viewed, mean_vector_predicted)

0.315302836858221

In [312]:
cosine_pred_viewed = []
cosine_hist_viewed = []
# rmse_pred_viewed = []
mean_vectors_viewed = []
mean_vectors_predicted = []

for user_id in data_valid.user_id[:1000]:
    # print(user_id)
    last_articles_idx = data_train[data_train.user_id == user_id]['article_id'].iloc[-history_size:].values #.sort_values('click_timestamp')
    mean_vector_hist = get_mean_vector(last_articles_idx, verbose=0)
    
    A = article_embedding
    B = mean_vector_hist
    cosine = np.dot(A,B)/(norm(A, axis=1)*norm(B))
    
    pred = predict_articles(cosine, 5)
    mean_vector_pred = get_mean_vector(pred.article_id, verbose=0)
    
    viewed = data_valid[data_valid.user_id == user_id]['article_id'].values
    mean_vector_viewed = get_mean_vector(viewed, verbose=0)
    
    cosine_similarity_pred_viewed = get_cosine_similarity(mean_vector_pred, mean_vector_viewed)
    cosine_similarity_hist_viewed = get_cosine_similarity(mean_vector_hist, mean_vector_viewed)
    # print(f"Cosine Similarity :: pred/viewed={cosine_similarity_pred_viewed} | hist/viewed={cosine_similarity_hist_viewed}")
    #rmse = ml_metrics.rmse(mean_vector_viewed, mean_vector_predicted)
    
    cosine_pred_viewed.append(cosine_similarity_pred_viewed)
    cosine_hist_viewed.append(cosine_similarity_hist_viewed)
    #rmse_pred_viewed.append(rmse)
    mean_vectors_viewed.append(mean_vector_viewed)
    mean_vectors_predicted.append(mean_vector_predicted)

In [313]:
print(f"MEAN Cosine Similarity :: pred/viewed={np.mean(cosine_pred_viewed)} | hist/viewed={np.mean(cosine_hist_viewed)}")
#print(f"RMSE :: pred/viewed={np.mean(rmse_pred_viewed)}")
print(f"RMSE :: pred/viewed={ml_metrics.rmse(mean_vectors_viewed, mean_vectors_predicted)}")

MEAN Cosine Similarity :: pred/viewed=0.526694617616008 | hist/viewed=0.5662065310553823
RMSE :: pred/viewed=0.42283893207973633


In [None]:
data_valid.user_id

0         279777
1          29634
2             55
3          20137
4         152940
           ...  
241100    243409
241101    215206
241102    201738
241103    201738
241104    252642
Name: user_id, Length: 241105, dtype: int64