In [1]:
import pickle 

import pandas as pd
import numpy as np
from numpy.linalg import norm

import scipy.sparse as sparse
from sklearn.metrics.pairwise import cosine_similarity

import implicit
from implicit.nearest_neighbours import bm25_weight
from implicit import evaluation

import ml_metrics

  from .autonotebook import tqdm as notebook_tqdm


### Chargeons les jeux de données `training` et `validation`

In [2]:
data_train = pd.read_csv('data/data_train.csv')
display(data_train.head(3), data_train.shape)

Unnamed: 0,user_id,article_id,score
0,59,234853,0.214286
1,79,159359,0.215827
2,154,96663,0.145631


(1577295, 3)

In [3]:
data_valid = pd.read_csv('data/data_valid.csv')
display(data_valid.head(3), data_valid.shape)

Unnamed: 0,user_id,article_id,score
0,279777,96210,0.109489
1,29634,284773,0.469863
2,55,162605,1.324273


(241105, 3)

### Chargons les embeddings

In [4]:
file = open('data/news-portal-user-interactions-by-globocom/articles_embeddings.pickle',"rb")
article_embedding = pickle.load(file)

In [5]:
display(article_embedding[:5], article_embedding.shape)

array([[-0.16118301, -0.95723313, -0.13794445, ..., -0.231686  ,
         0.5974159 ,  0.40962312],
       [-0.52321565, -0.974058  ,  0.73860806, ...,  0.18282819,
         0.39708954, -0.83436364],
       [-0.61961854, -0.9729604 , -0.20736018, ..., -0.44758022,
         0.8059317 , -0.28528407],
       [-0.7408434 , -0.97574896,  0.39169782, ..., -0.5378381 ,
         0.24354108, -0.8853287 ],
       [-0.2790515 , -0.97231525,  0.68537366, ..., -0.42406067,
         0.18548405, -0.5802922 ]], dtype=float32)

(364047, 250)

# 1. Candidate generation

# 1.1 Collaborative Filtering

### Préparons une sparse matrix pour entrainer nos algorithmes de collaborative filtering

In [140]:
# --- Train ---
data_train['user_cat_code'] = data_train['user_id'].astype('category').cat.codes
data_train['article_cat_code'] = data_train['article_id'].astype('category').cat.codes

# train_sparse_item_user = sparse.csr_matrix((data_train['score'].astype(float), (data_train['article_cat_code'], data_train['user_cat_code'])))
train_sparse_user_item = sparse.csr_matrix((data_train['score'].astype(float), (data_train['user_cat_code'], data_train['article_cat_code'])))
display(train_sparse_user_item.shape)

# --- Validation ---
data_valid['user_cat_code'] = data_valid['user_id'].astype('category').cat.codes
data_valid['article_cat_code'] = data_valid['article_id'].astype('category').cat.codes

# valid_sparse_item_user = sparse.csr_matrix((data_valid['score'].astype(float), (data_valid['article_cat_code'], data_valid['user_cat_code'])))
valid_sparse_user_item = sparse.csr_matrix((data_valid['score'].astype(float), (data_valid['user_cat_code'], data_valid['article_cat_code'])))

display(valid_sparse_user_item.shape)

(297141, 28002)

(84041, 7576)

### Utilisons un système permettant de rééquilibrer les notes implicites *(pour éviter de donner trop d'importance aux articles qui ont un très gros ratio `temps de lecture` / `nombre de mots`)*

In [141]:
# train_sparse_item_user_bm25 = bm25_weight(sparse_item_user, K1=100, B=0.9).tocsr()
train_sparse_user_item_bm25 = bm25_weight(train_sparse_user_item, K1=100, B=0.9).tocsr() # Implicit veut des matrices [user x item]
valid_sparse_user_item_bm25 = bm25_weight(valid_sparse_user_item, K1=100, B=0.9).tocsr() # Implicit veut des matrices [user x item]

display(train_sparse_user_item_bm25.shape)
display(valid_sparse_user_item_bm25.shape)

(297141, 28002)

(84041, 7576)

### Entrainons un premier modèle pour calculer les embeddings utilisateurs et faire des recommandations

In [142]:
model_bm25 = implicit.als.AlternatingLeastSquares(
    factors=32, 
    regularization=0.05, 
    iterations=50,
    alpha=40
)

model_bm25.fit(train_sparse_user_item_bm25)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [04:25<00:00,  5.31s/it]


### Testons une `recommandation sur la base d'un ou plusieurs utilisateurs`

In [143]:
# Make recommendations for the first 10 users in the dataset
userids = [59, 1024] # liste d'ID utilisateurs
rec_size = 5

codes, scores = model_bm25.recommend(userids, valid_sparse_user_item_bm25[userids], N=rec_size, filter_already_liked_items=True) 

for i, user_id in enumerate(userids):
    print(f"\n --- Liste d'articles candidats pour l'utilisateur {user_id} --- \n")
    
    for code, score in zip(codes[i], scores[i]):
        idx = data_train[data_train.article_cat_code==code]['article_id'].iloc[0]
        print(f"catCode: {code:10} \t article_id: {idx:10} \t score: {score:.2f}")


 --- Liste d'articles candidats pour l'utilisateur 59 --- 

catCode:       9556 	 article_id:     108854 	 score: 0.97
catCode:       6440 	 article_id:      74722 	 score: 0.96
catCode:      13935 	 article_id:     168868 	 score: 0.95
catCode:       5718 	 article_id:      68866 	 score: 0.91
catCode:       9429 	 article_id:     107073 	 score: 0.88

 --- Liste d'articles candidats pour l'utilisateur 1024 --- 

catCode:       8129 	 article_id:      96210 	 score: 1.23
catCode:      26190 	 article_id:     336245 	 score: 1.11
catCode:       1430 	 article_id:      20691 	 score: 1.10
catCode:      26306 	 article_id:     337441 	 score: 1.00
catCode:      22906 	 article_id:     288440 	 score: 0.94


### Testons une `recommandation sur la base d'un article` *(ce n'est pas le but d'un Collaborative Filtering, mais on peut le faire alors autant l'essayer)*

In [144]:
article_id = 162605
article_code = data_train[data_train.article_id == article_id]['article_cat_code'].iloc[0]
rec_size = 5

# Get similar items.
codes, scores = model_bm25.similar_items(article_code, N=rec_size , filter_items=[article_code])

print(f"\n --- Liste d'articles candidats sur la base de l'article {article_id} --- \n")
for code, score in zip(codes, scores):
    idx = data_train[data_train.article_cat_code==code]['article_id'].iloc[0]
    print(f"catCode: {code:10} \t article_id: {idx:10} \t score: {score:.2f}")


 --- Liste d'articles candidats sur la base de l'article 162605 --- 

catCode:       6359 	 article_id:      74112 	 score: 0.96
catCode:      13314 	 article_id:     162107 	 score: 0.85
catCode:      18658 	 article_id:     234377 	 score: 0.80
catCode:      13000 	 article_id:     159197 	 score: 0.79
catCode:       4077 	 article_id:      50569 	 score: 0.78


### Evaluons le modèle

> **Nous devons nous rappeler que la recommandation n'est pas une prédiction.**<br>
> S'appuyer sur des métriques ML pour déterminer la performance d'un système de recommandation n'est pas suffisant.<br>
> Seul **le retour des utilisateurs apporte des résultats valables et c'est pourquoi les tests A/B devraient toujours être priviligiés**.

- Dans la mesure ou notre jeu de données **ne dispose pas de scores explicites**, il ne parrait pas souhaitable d'utiliser des métrique du type `MAE` ou `RMSE`.
- Dans la mesure ou l'on **ne cherche pas particulièrement à obtenir un ordre précis**, il ne parrait pas souhaitable d'utiliser des métriques de ranking comme le `MAP@K` ou le `nDCCG`.
- Nous pourrions donc nous tourner vers la `Precision@k`, le `Recall@K` et donc le `F1@k`, mais il est probable que ce ne soit pas très représentatif.

#### Regardons la precision@k

In [145]:
evaluation.precision_at_k(model_bm25, train_sparse_user_item_bm25, valid_sparse_user_item_bm25, K=5, show_progress=True, num_threads=1)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 84041/84041 [00:28<00:00, 2972.14it/s]


0.00011030049004932008

> Le probleme c'est même si c'est l'une des métriques disponible les plus adaptée, elle reste peu adaptée à notre problème...<br>
> Ici `Precision = (# of top k recommendations that are relevant)/(# of items that are recommended)`<br>
> Mais malgré un nombre d'article assez large, on ne recommande que 5 articles et les utilisateurs ont un historique assez faible dans notre jeu de données. Donc les chances de recommander un article parmi 5 qui a effectivement été lu ensuite par l'utilisateur sont vraiment faible.

### Construisons une métrique sur mesure

Pour avoir une idée une métrique utilsable pour comparer nos différents modèles, nous pourrions comparer l'embedding moyen des articles lus APRÈS *(donc les actions contenues dans data_valid)* avec l'embedding moyen des article recommandés ET avec l'embedding moyen des articles recommandés.

#### Calculons la cosine similarity moyenne sur le jeu de validation

In [150]:
article_lookup = pd.DataFrame(data_train.groupby('article_cat_code')['article_id'].apply(lambda x: list(x)[0])).to_dict()

def lookup_articles(x):
    return article_lookup['article_id'][x]

In [185]:
def get_mean_cosine_similarity(data_ref, reco_ref, reco_model, reco_size = 5):

    # --- for each user, get the ids of the articles he/she has read
    select = data_ref.groupby('user_id')['article_id'].apply(list).reset_index(name='article_ids')
    select = select[select.article_ids.map(len) > 1]

    # --- for each user, compute the mean embedding vectors of the articles he/she has read
    select['read_mean_embedding'] = select.apply(lambda x : article_embedding[np.array(x[1])].mean(axis=0), axis=1)

    # --- for each user, make recommendations
    select['reco_article_ids'] = reco_model(select.index, reco_ref, reco_size)

    # --- for each user, compute the mean embedding vectors of the recommended articles
    select['reco_mean_embedding'] = select['reco_article_ids'].apply(lambda x : article_embedding[np.array(x)].mean(axis=0))

    # --- for each user, compute the cosine similarity between the read_mean_embedding and the pred_mean_embedding
    select['cosine'] = select.apply(lambda x: cosine_similarity(x['read_mean_embedding'].reshape(1, -1), x['reco_mean_embedding'].reshape(1, -1))[0][0], axis=1)

    # --- reset the index column
    select = select.set_index('user_id')

    # --- Compute & return overall mean cosine similarity
    return select.cosine.mean(), select

def reco_collaborative_filtering_bm25(user_ids, reco_ref, reco_size):
    
    apply_numpy2 = lambda x : list(map(lookup_articles,x))
    apply_numpy = lambda x : list(map(apply_numpy2,x))
    
    reco_codes, reco_score = model_bm25.recommend(user_ids, reco_ref[user_ids], N=reco_size, filter_already_liked_items=True) 
    return apply_numpy(reco_codes)

MCS, MCS_df = get_mean_cosine_similarity(data_valid, valid_sparse_user_item_bm25, reco_collaborative_filtering_bm25, 5)

display(MCS_df.head(), MCS_df.shape)
print(f"\nmean_cosine_similarity: {MCS:.2f} (sachant que la cosine similarity va de 1 à -1)")

Unnamed: 0_level_0,article_ids,read_mean_embedding,reco_article_ids,reco_mean_embedding,cosine
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,"[225010, 69353, 161872, 205845, 57748, 157815,...","[-0.024590481, -0.9645934, -0.06022447, -0.023...","[119592, 96663, 160474, 284463, 108854]","[-0.43367648, -0.96880037, -0.12601939, -0.163...",0.718369
7,"[199474, 87223, 352979, 284470, 36162, 156279]","[-0.11400774, -0.96468645, -0.32728586, -0.201...","[160474, 31836, 284463, 48403, 336223]","[-0.45764685, -0.9671749, -0.39627498, -0.4677...",0.70433
8,"[331116, 96141, 234481]","[-0.32962552, -0.97033435, 0.20547153, -0.0870...","[160974, 205832, 285533, 156381, 348113]","[-0.17988348, -0.9646123, 0.050530713, -0.0763...",0.654855
10,"[196588, 193449, 195689, 65991, 100931, 58556,...","[-0.23681411, -0.96241695, -0.08872973, -0.346...","[234698, 156964, 336221, 123909, 64409]","[-0.27939475, -0.9646348, -0.4417979, -0.46693...",0.756411
11,"[208582, 96877, 195177, 205824]","[-0.15431535, -0.9709704, 0.069619074, 0.13036...","[129434, 272660, 95716, 336221, 336220]","[-0.49827176, -0.9671997, -0.55224514, -0.6789...",0.344051


(46638, 5)


mean_cosine_similarity: 0.51 (sachant que la cosine similarity va de 1 à -1)


# 1.2 Content Based Filtering

articles_embeddings.pickle Pickle (Python 3) of a NumPy matrix containing the Article Content Embeddings (250-dimensional vectors), trained upon articles' text and metadata by the CHAMELEON's ACR module (see paper for details) for 364047 published articles.
P.s. The full text of news articles could not be provided due to license restrictions, but those embeddings can be used by Neural Networks to represent their content. See this paper for a t-SNE visualization of these embeddings, colored by category.

### Trouvons l'embedding moyen des articles lus par un utilisateur donnée

In [127]:
user_id = 20137
history_size = 5

def get_mean_vector(articles_idx):
    apply_numpy = lambda x : article_embedding[x].mean(axis=0)
    return apply_numpy(articles_idx)            

user_articles_idx = data_train[data_train.user_id == user_id]['article_id']
last_articles_idx = user_articles_idx.iloc[-history_size:].values #.sort_values('click_timestamp')
mean_vector = get_mean_vector(last_articles_idx)

print(f"Articles utilisés dans le mean embedding: {last_articles_idx}")

Articles utilisés dans le mean embedding: [288440 337441 202476 250043 284583]


### Calculons la similarité de cet embedding avec les embeddings des articles présents dans note fichier

In [200]:
def get_cosine(article_embedding, mean_vector, user_articles_idx=None):
    
    A = article_embedding.copy()
    B = mean_vector

    # --- Assurons nous de ne pas recommander les article déjà lus par cet utilisateur
    if user_articles_idx is not None:
        A[user_articles_idx] = -B # je donne aux articles lus un embedding inverse de celui que l'on cible
 
    # --- Calculons la cosine similarity entre l'embedding moyen de l'utilisateur et les articles connus
    return np.dot(A,B)/(norm(A, axis=1)*norm(B))

# --- Calculons la cosine similarity entre l'embedding moyen de l'utilisateur et les articles connus
cosine = get_cosine(article_embedding, mean_vector, user_articles_idx)
print("Cosine Similarity:", cosine, cosine.shape)

Cosine Similarity: [0.32582363 0.2783536  0.31348327 ... 0.38568467 0.2742523  0.42668363] (364047,)


### Recommandons 5 articles à l'utilisateur

In [26]:
def recommend_articles(cosine, pred_size=5):
    cos = pd.DataFrame(cosine, columns=['cosine_sim'])
    selection = cos.sort_values('cosine_sim', ascending=False)[:pred_size]
    selection.reset_index(inplace=True)
    selection.rename(columns={'index':'article_id'}, inplace=True)
    return selection

reco = recommend_articles(cosine, 5)
reco

ValueError: Shape of passed values is (1000, 46638), indices imply (1000, 1)

### Comparons avec les articles consultés par cet utilisateur dans le validation_set

In [130]:
mean_vector_recommended = get_mean_vector(reco.article_id)

In [131]:
viewed = data_valid[data_valid.user_id == user_id]['article_id'].values
mean_vector_viewed = get_mean_vector(viewed)

#### Similarité entre les articles lus dans le `valid_set` et les articles prédis

In [132]:
def get_cosine_similarity(A, B):
    return np.dot(A,B)/(norm(A)*norm(B))

cosine_similarity = get_cosine_similarity(mean_vector_predicted, mean_vector_viewed)
print("Cosine Similarity:", cosine_similarity)

Cosine Similarity: 0.71754134


#### Similarité entre les articles lus dans le `train_set` et les articles lus dans le `valid_set`

In [133]:
cosine_similarity = get_cosine_similarity(mean_vector, mean_vector_viewed)
print("Cosine Similarity:", cosine_similarity)

Cosine Similarity: 0.7984939


#### Calculons la cosine similarity moyenne sur l'ensemble du jeu de validation

In [202]:
def recommand_all_users(data_ref, user_ids, reco_size=5):
    print(user_ids)
    # --- for each user, get the ids of the articles he/she has read
    select = data_ref.groupby('user_id')['article_id'].apply(list).reset_index(name='article_ids')
    select = select[select.article_ids.map(len) > 1]

    # --- for each user, compute the mean embedding vectors of the articles he/she has read
    select['read_mean_embedding'] = select.apply(lambda x : article_embedding[np.array(x[1])].mean(axis=0), axis=1)
    display(select.head(), select.shape)

    # --- Calculons pour chaque utilisateur la similarité avec tous les articles et récupérons les premiers
    recos = dict() 
    for user_id in select.index:
        recos[user_id] = recommend_articles(get_cosine(article_embedding, mean_vector, user_articles_idx), reco_size)
        
    return recos
    
    
recos = recommand_all_users(data_valid, data_valid.user_id.values, 5)
print(recos, recos.shape)

[279777  29634     55 ... 201738 201738 252642]


ValueError: Cannot set a DataFrame with multiple columns to the single column read_mean_embedding

In [208]:
cosine_pred_viewed = []
cosine_hist_viewed = []
# rmse_pred_viewed = []
mean_vectors_viewed = []
mean_vectors_predicted = []

for user_id in data_valid.user_id[:100]:
    # print(user_id)
    last_articles_idx = data_train[data_train.user_id == user_id]['article_id'].iloc[-history_size:].values #.sort_values('click_timestamp')
    mean_vector_hist = get_mean_vector(last_articles_idx)
    
    A = article_embedding
    B = mean_vector_hist
    cosine = np.dot(A,B)/(norm(A, axis=1)*norm(B))
    
    pred = predict_articles(cosine, 5)
    mean_vector_pred = get_mean_vector(pred.article_id)
    
    viewed = data_valid[data_valid.user_id == user_id]['article_id'].values
    mean_vector_viewed = get_mean_vector(viewed)
    
    cosine_similarity_pred_viewed = get_cosine_similarity(mean_vector_pred, mean_vector_viewed)
    cosine_similarity_hist_viewed = get_cosine_similarity(mean_vector_hist, mean_vector_viewed)
    # print(f"Cosine Similarity :: pred/viewed={cosine_similarity_pred_viewed} | hist/viewed={cosine_similarity_hist_viewed}")
    #rmse = ml_metrics.rmse(mean_vector_viewed, mean_vector_predicted)
    
    cosine_pred_viewed.append(cosine_similarity_pred_viewed)
    cosine_hist_viewed.append(cosine_similarity_hist_viewed)
    #rmse_pred_viewed.append(rmse)
    mean_vectors_viewed.append(mean_vector_viewed)
    mean_vectors_predicted.append(mean_vector_predicted)

In [209]:
print(f"MEAN Cosine Similarity :: pred/viewed={np.mean(cosine_pred_viewed)} | hist/viewed={np.mean(cosine_hist_viewed)}")

MEAN Cosine Similarity :: pred/viewed=0.5353795289993286 | hist/viewed=0.5641014575958252


In [None]:
def XXX():
   
    A = article_embedding.copy()
    B = mean_vector

    # --- Assurons nous de ne pas recommander les article déjà lus par cet utilisateur
    if user_articles_idx is not None:
        A[user_articles_idx] = -B # je donne aux articles lus un embedding inverse de celui que l'on cible
 
    # --- Calculons la cosine similarity entre l'embedding moyen de l'utilisateur et les articles connus
    return np.dot(A,B)/(norm(A, axis=1)*norm(B))

    # --- Calculons la cosine similarity entre l'embedding moyen de l'utilisateur et les articles connus
    cosine = get_cosine(article_embedding, mean_vector, user_articles_idx)
    print("Cosine Similarity:", cosine, cosine.shape)
    
    cos = pd.DataFrame(cosine, columns=['cosine_sim'])
    selection = cos.sort_values('cosine_sim', ascending=False)[:pred_size]
    selection.reset_index(inplace=True)
    selection.rename(columns={'index':'article_id'}, inplace=True)
    return selection


In [21]:
A = [0,1,2,3,4,5,6,7,8,9,10]

step = 2
for x in range(0, article_embedding.shape[0], step):
    print(x, x+step)
    print(A[x:x+step])

0 2
[0, 1]
2 4
[2, 3]
4 6
[4, 5]
6 8
[6, 7]
8 10
[8, 9]
10 12
[10]
12 14
[]
14 16
[]


In [10]:
article_embedding.shape

(364047, 250)

In [9]:
import tables

In [16]:
# --- for each user, get the ids of the articles he/she has read
select = data_valid.groupby('user_id')['article_id'].apply(list).reset_index(name='article_ids')
select = select[select.article_ids.map(len) > 1]

# --- for each user, compute the mean embedding vectors of the articles he/she has read
select['read_mean_embedding'] = select.apply(lambda x : article_embedding[np.array(x[1])].mean(axis=0), axis=1)
display(select.head(), select.shape)

# --- for each user, recommend 5 articles
#cosines = np.empty(select.shape[0])

f = tables.open_file("cosines.h5", mode='w')
atom = tables.Float16Col()
cosines = f.create_earray(f.root, 'data', atom, (0, select.shape[0]))
#cosines = np.empty((0, select.shape[0]), int)

step = 1000
for x in range(0, article_embedding.shape[0], step):
    print(x, x+step)
    A = article_embedding[x:x+step].copy()
    B = pd.DataFrame.from_dict(dict(select.read_mean_embedding)).to_numpy(dtype='float16')
    cosine = cosine_similarity(A,B.T)
    cosines.append(cosine)
f.close()

Unnamed: 0,user_id,article_ids,read_mean_embedding
2,5,"[225010, 69353, 161872, 205845, 57748, 157815,...","[-0.024590481, -0.9645934, -0.06022447, -0.023..."
4,7,"[199474, 87223, 352979, 284470, 36162, 156279]","[-0.11400774, -0.96468645, -0.32728586, -0.201..."
5,8,"[331116, 96141, 234481]","[-0.32962552, -0.97033435, 0.20547153, -0.0870..."
7,10,"[196588, 193449, 195689, 65991, 100931, 58556,...","[-0.23681411, -0.96241695, -0.08872973, -0.346..."
8,11,"[208582, 96877, 195177, 205824]","[-0.15431535, -0.9709704, 0.069619074, 0.13036..."


(46638, 3)

0 1000
1000 2000
2000 3000
3000 4000
4000 5000
5000 6000
6000 7000
7000 8000
8000 9000
9000 10000
10000 11000
11000 12000
12000 13000
13000 14000
14000 15000
15000 16000
16000 17000
17000 18000
18000 19000
19000 20000
20000 21000
21000 22000
22000 23000
23000 24000
24000 25000
25000 26000
26000 27000
27000 28000
28000 29000
29000 30000
30000 31000
31000 32000
32000 33000
33000 34000
34000 35000
35000 36000
36000 37000
37000 38000
38000 39000
39000 40000
40000 41000
41000 42000
42000 43000
43000 44000
44000 45000
45000 46000
46000 47000
47000 48000
48000 49000
49000 50000
50000 51000
51000 52000
52000 53000
53000 54000
54000 55000
55000 56000
56000 57000
57000 58000
58000 59000
59000 60000
60000 61000
61000 62000
62000 63000
63000 64000
64000 65000
65000 66000
66000 67000
67000 68000
68000 69000
69000 70000
70000 71000
71000 72000
72000 73000
73000 74000
74000 75000
75000 76000
76000 77000
77000 78000
78000 79000
79000 80000
80000 81000
81000 82000
82000 83000
83000 84000
84000 85000
85

KeyboardInterrupt: 

In [17]:
f.close()

In [33]:
def recommend_articles2(cosine, pred_size=5):
    cos = pd.DataFrame(cosine, columns=['cosine_sim'])
    selection = cos.sort_values('cosine_sim', ascending=False)[:pred_size]
    selection.reset_index(inplace=True)
    selection.rename(columns={'index':'article_id'}, inplace=True)
    return selection

f = tables.open_file("cosines.h5", mode='r')
row = f.root.data[1:2,:]
print(row, row.shape)
recommend_articles2(row.T, 5)

[[ 0.2158   0.10645  0.4194  ... -0.04086  0.3286  -0.02745]] (1, 46638)


Unnamed: 0,article_id,cosine_sim
0,45597,0.65332
1,39110,0.651367
2,34418,0.646973
3,39773,0.639648
4,10191,0.630371
