In [1]:
import pandas as pd
import numpy as np

import scipy.sparse as sparse

import implicit

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Training interactions: the preview below shows user_id, article_id and score columns.
data_train = pd.read_csv(filepath_or_buffer='data/data_train.csv')
# Show the first three rows together with the (rows, columns) shape.
display(data_train.head(n=3), data_train.shape)

Unnamed: 0,user_id,article_id,score
0,59,234853,0.214286
1,79,159359,0.215827
2,154,96663,0.145631


(1577295, 3)

In [3]:
# Validation interactions: same column layout as the training file.
data_valid = pd.read_csv(filepath_or_buffer='data/data_valid.csv')
# Show the first three rows together with the (rows, columns) shape.
display(data_valid.head(n=3), data_valid.shape)

Unnamed: 0,user_id,article_id,score
0,279777,96210,0.109489
1,29634,284773,0.469863
2,55,162605,1.324273


(241105, 3)

In [4]:
# Encode users as a categorical and keep the integer code of each row; the codes
# are used as matrix indices below.
data_train['user_id'] = data_train['user_id'].astype('category')
data_train['user_cat_code'] = data_train['user_id'].cat.codes

# Same encoding for articles.
data_train['article_id'] = data_train['article_id'].astype('category')
data_train['article_cat_code'] = data_train['article_id'].cat.codes

# Shared ingredients of both sparse matrices.
interaction_scores = data_train['score'].astype(float)
user_codes = data_train['user_cat_code']
article_codes = data_train['article_cat_code']

# Article x user matrix (rows = article codes, columns = user codes).
sparse_article_user = sparse.csr_matrix(
    (interaction_scores, (article_codes, user_codes))
)
# User x article matrix (rows = user codes, columns = article codes).
sparse_user_article = sparse.csr_matrix(
    (interaction_scores, (user_codes, article_codes))
)

In [5]:
# ALS model with 20 latent factors. A fixed random_state seeds the random factor
# initialisation so that a fresh "Restart & Run All" trains from the same
# starting point instead of producing different recommendations on every run.
model = implicit.als.AlternatingLeastSquares(
    factors=20,
    regularization=0.1,
    iterations=50,
    random_state=42,
)

# Scale the raw scores by a constant factor before fitting
# (presumably the usual implicit-feedback confidence weight -- TODO confirm the value 15).
alpha = 15
# The model is fitted on the user x article matrix.
data = (sparse_user_article * alpha).astype('double')
model.fit(data)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [02:53<00:00,  3.48s/it]


In [63]:
# Similar item - Method 2

article_id = 96210

# The categories index of the categorical column maps article ids <-> cat codes
# directly (position in the index == cat code), so no boolean scan of the whole
# training frame is needed for each lookup.
article_categories = data_train['article_id'].cat.categories
if article_id not in article_categories:
    # Fail with an explicit message instead of an opaque positional-index error.
    raise KeyError(f"article_id {article_id} is not present in data_train")
item_id = article_categories.get_loc(article_id)
print(f"article_id: {article_id} | article_cat_code: {item_id}")
n_similar = 5

# Use implicit to get similar items; the query item itself is excluded.
# NOTE: articles already read by the user could also be filtered out here.
ids, scores = model.similar_items(item_id, n_similar, filter_items=[item_id])

# Translate each returned cat code back to its article id for display.
for code, score in zip(ids, scores):
    idx = article_categories[code]
    print(f"catCode: {code:10} \t article_id: {idx:10} \t score: {score:.2f}")

article_id: 96210 | article_cat_code: 8129
catCode:       1430 	 article_id:      20691 	 score: 0.98
catCode:      22906 	 article_id:     288440 	 score: 0.96
catCode:      13050 	 article_id:     159697 	 score: 0.96
catCode:      26190 	 article_id:     336245 	 score: 0.95
catCode:       9955 	 article_id:     114767 	 score: 0.95


In [81]:
# Recommend 5 articles for each of the first 10 rows of the user x article matrix.
# The integers below are row positions (user cat codes), not raw user_id values.
userids = np.arange(10)
codes, scores = model.recommend(
    userids,
    sparse_user_article[userids],
    N=5,
    filter_already_liked_items=True,
)
# Display the recommended article codes and the shape of the result.
codes, codes.shape

(array([[23354, 18543, 22509, 10747, 12658],
        [18554, 23390, 13210, 24787, 18792],
        [ 8217, 22427, 13210, 13141, 21438],
        [10747, 23354,  8237, 21400, 12972],
        [26187, 21286,  8014, 26186, 18014],
        [13646, 13129, 13335, 13141, 17273],
        [12694,  5227, 16819, 21389, 12678],
        [18722, 12726, 18926, 10650, 26187],
        [10998, 26187, 26185, 23028, 21822],
        [21438, 22482, 18792, 24787, 19070]], dtype=int32),
 (10, 5))

In [12]:
def auc_score(predictions, test):
    """Return the area under the ROC curve of ``predictions`` against ``test``.

    Implemented with NumPy only, so it does not depend on a ``metrics`` module
    having been imported elsewhere in the notebook. The value is computed with
    the rank-sum identity: AUC is the probability that a randomly chosen
    positive receives a higher score than a randomly chosen negative, with
    tied scores counting one half.

    Parameters
    ----------
    predictions : array-like
        Scores, one per item; a higher score means "more likely positive".
    test : array-like
        Ground-truth labels, same length as ``predictions``. Entries greater
        than zero are treated as positives, everything else as negatives.

    Returns
    -------
    float
        AUC in [0, 1], or ``nan`` when only one class is present (the ROC
        curve is undefined in that case).

    Raises
    ------
    ValueError
        If ``predictions`` and ``test`` do not have the same number of elements.
    """
    scores = np.asarray(predictions, dtype=float).ravel()
    is_positive = np.asarray(test).ravel() > 0
    if scores.shape != is_positive.shape:
        raise ValueError(
            "predictions and test must have the same length, got "
            f"{scores.size} and {is_positive.size}"
        )

    n_pos = int(is_positive.sum())
    n_neg = int(is_positive.size) - n_pos
    if n_pos == 0 or n_neg == 0:
        return float('nan')

    # Average 1-based rank of every distinct score; ties share their mean rank.
    _, inverse, counts = np.unique(scores, return_inverse=True, return_counts=True)
    mean_rank_per_value = np.cumsum(counts) - (counts - 1) / 2.0
    ranks = mean_rank_per_value[np.ravel(inverse)]

    # Mann-Whitney U statistic of the positives, normalised by the number of
    # (positive, negative) pairs.
    u_stat = ranks[is_positive].sum() - n_pos * (n_pos + 1) / 2.0
    return float(u_stat / (n_pos * n_neg))

In [11]:
def calc_mean_auc(training_set, altered_persons, predictions, test_set):
    """Average per-person AUC of the model and of a popularity baseline.

    Parameters
    ----------
    training_set : sparse matrix
        Indexed as ``[:, person]``, i.e. one column per person and one row per
        item.
    altered_persons : iterable of int
        Column indices of the persons to evaluate.
    predictions : sequence
        ``predictions[0]`` is row-indexed by person and ``predictions[1]`` is
        right-multiplied to it; the product must expose ``.toarray()``
        (presumably sparse person factors and transposed item factors --
        verify against the caller).
    test_set : sparse matrix
        Same layout as ``training_set``; supplies the ground truth and the
        per-item popularity.

    Returns
    -------
    tuple of float
        ``(mean model AUC, mean popularity AUC)``, each rounded to 3 decimals.
    """
    person_factors = predictions[0]
    content_factors = predictions[1]

    # Popularity of every item: its row sum in the test matrix.
    item_popularity = np.array(test_set.sum(axis=1)).reshape(-1)

    model_aucs = []
    baseline_aucs = []
    for person in altered_persons:
        # Only items with no interaction in the training column are scored.
        train_col = training_set[:, person].toarray().reshape(-1)
        unseen = np.flatnonzero(train_col == 0)

        # Predicted preference of this person for each unseen item.
        all_scores = person_factors[person, :].dot(content_factors).toarray()
        predicted = all_scores[0, unseen]

        # Ground truth for the same items, taken from the test matrix.
        observed = test_set[:, person].toarray()[unseen, 0]

        model_aucs.append(auc_score(predicted, observed))
        # Baseline: rank the same items by their overall popularity.
        baseline_aucs.append(auc_score(item_popularity[unseen], observed))

    mean_model = float('%.3f' % np.mean(model_aucs))
    mean_baseline = float('%.3f' % np.mean(baseline_aucs))
    return mean_model, mean_baseline