### Item based CF

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

### Read split train and valid data (based on timestamp using Quantile)

In [2]:
behaviour_encoded_train = pd.read_pickle('behaviour_encoded_train.df')
behaviour_encoded_valid = pd.read_pickle('behaviour_encoded_valid.df')

In [9]:
newsid_encoded_concat = pd.read_pickle('newsid_encoded_concat.df')

### Item-based CF implementation

- Find k similar items for last interacted item
- should not drop duplicate users, history might be different for same user depending on the impression Id. As impression id is based on time (given in Time column)
- Build user-item interaction matrix
- So, we need to include clicked articles in the impression in the history
- If an user has 3 impression ids then clicks of older impression ids should be included in History 
- Now we have No_clicks and Clicks column derived from impression column
- So we can use Clicks column to update user_item_history matrix.
- get all impressions of a user: check Clicks column value for them and then update in user_item_history matrix

In [4]:
behaviour_encoded_train = behaviour_encoded_train.dropna()

In [5]:
behaviour_encoded_valid = behaviour_encoded_valid.dropna()

In [17]:
newsids = newsid_encoded_concat[0].to_list()

In [20]:
# build user-item matrix using history and impressions
unique_users_train = len(behaviour_encoded_train['User ID'].unique())
unique_items = len(newsids)
print('unique_users_train', unique_users_train)
print('unique_items in news.tsv', unique_items)

user_item_history = np.zeros((unique_users_train, unique_items), dtype = int)
print(user_item_history.shape)

# whereever value is NaN, value in array is zero (we do change in user_item_history).
for i in range(unique_users_train):
    if isinstance(behaviour_encoded_train['History'].iloc[i], list):
        for item in behaviour_encoded_train['History'].iloc[i]:
            user_item_history[i][item] = 1
    user = behaviour_encoded_train['User ID'].iloc[i]
    user_time_list = list(behaviour_encoded_train['Time'][behaviour_encoded_train['User ID']==user])
    user_time_sorted = sorted(user_time_list, key = lambda x:float(x))
    if not len(user_time_sorted) == 1:
        for time in user_time_sorted:
            for item in behaviour_encoded_train['Clicks'][behaviour_encoded_train['User ID']==user][behaviour_encoded_train['Time']==time]:
                user_item_history[i][item[0]] = 1

unique_users_train 79547
unique_items in news.tsv 93698
(79547, 93698)


In [22]:
user_item_history_sparse = csr_matrix(user_item_history)
user_item_history_sparse

<79547x93698 sparse matrix of type '<class 'numpy.int64'>'
	with 3089280 stored elements in Compressed Sparse Row format>

In [23]:
item_user_history_sparse = user_item_history_sparse.T
item_based_sim_train = cosine_similarity(item_user_history_sparse)
item_based_sim_train.shape

(93698, 93698)

In [24]:
# Model 
item_user_interaction = user_item_history_sparse.T
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(item_user_interaction)

In [25]:
# This function finds k similar items given the item_id and ratings matrix M
def findksimilaritems(item_id, item_user_matrix, model, metric = 'cosine', k = 4):
    similarities = []
    indices = []  
    distances, indices = model.kneighbors(item_user_interaction[item_id-1], n_neighbors = k+1) # item_id-1 as item indexing starts from 0 & k+1 as kneighbors returns the item itself
    similarities = 1 - distances.flatten()
    ksimilarities = []
    ksimilarities_indices = []
    for i in range(0, len(indices.flatten())):
       if indices.flatten()[i]+1 == item_id:
           continue;
       else:
           ksimilarities_indices.append(indices.flatten()[i])
           ksimilarities.append(similarities.flatten()[i])
    return ksimilarities, ksimilarities_indices

In [29]:
def recommend_items(user_id, item_user_matrix, model, valid_set):
    # find last interacted article
    user_time_lt = list(valid_set['Time'][valid_set['User ID']==user_id])
    user_time_sort_lt = sorted(user_time_lt, key = lambda x:float(x))
    history_lt = valid_set['History'][valid_set['User ID']==user_id][valid_set['Time']==user_time_sort_lt[-1]]
    history_lt = history_lt.to_list()
    last_interacted_item = history_lt[0][-1]        
    # find k similar items (using similarity matrix) for this last interacted article
    k_similar_items, k_similar_indices = findksimilaritems(last_interacted_item, item_user_matrix, model, metric = 'cosine', k = 4)
    return k_similar_items, k_similar_indices

In [34]:
#recommend_items(49, item_user_interaction, model_knn, behaviour_encoded_valid)

In [33]:
# Get recommended_articles: List of lists. Each inner list contains integers of recommended articles for the corresponding user.
recommended_articles_users = []
for customer in behaviour_encoded_valid['User ID']:
    similarities, idxs = recommend_items(customer, item_user_interaction, model_knn, behaviour_encoded_valid)
    recommended_articles_users.append(idxs)
recommended_articles_users

[437173.0]
[437175.0, 437172.0]
[437173.0, 437174.0]
[437172.0]
[437176.0]
[437173.0]
[437173.0]
[437172.0]
[437175.0]
[437175.0]
[437179.0, 437178.0]
[437174.0]
[437174.0]
[437176.0]
[437172.0, 437172.0]
[437173.0, 437173.0]
[437174.0]
[437172.0]
[437175.0]
[437174.0, 437175.0]
[437175.0, 437173.0, 437175.0]
[437173.0]
[437172.0]
[437176.0]
[437172.0]
[437176.0]
[437175.0]
[437176.0, 437173.0]
[437174.0]
[437174.0]
[437173.0]
[437173.0, 437172.0]
[437174.0]
[437173.0]
[437172.0]
[437174.0]
[437177.0]
[437173.0]
[437181.0, 437181.0]
[437172.0]
[437172.0]
[437176.0]
[437174.0]
[437172.0]
[437172.0]
[437178.0]
[437173.0]
[437181.0]
[437176.0, 437175.0]
[437177.0]
[437174.0]
[437172.0, 437173.0]
[437174.0]
[437175.0]
[437176.0, 437174.0]
[437179.0]
[437172.0]
[437172.0, 437173.0]
[437182.0]
[437173.0]
[437176.0]
[437173.0]
[437174.0]
[437173.0]
[437178.0]
[437173.0]
[437172.0]
[437177.0, 437178.0, 437176.0, 437177.0, 437173.0, 437174.0, 437177.0, 437180.0, 437175.0]
[437180.0, 437177.0, 4

[[56944, 29602, 60027, 90310],
 [59762, 59490, 73372, 24380],
 [80358, 16403, 30639, 3994],
 [79926, 79912, 40480, 46157],
 [20751, 66814, 56308, 24679, 91417],
 [63807, 75551, 57110, 59282],
 [62466, 62468, 62464, 62465, 62462],
 [62466, 62468, 62464, 62465, 62462],
 [83689, 73285, 61342, 75548],
 [49597, 93047, 48498, 86349],
 [72061, 92835, 65664, 93651],
 [73327, 40860, 83372, 55613],
 [57357, 57548, 68989, 51039],
 [62466, 62468, 62464, 62465, 62462],
 [16875, 83849, 65031, 16042],
 [62466, 62468, 62464, 62465, 62462],
 [47300, 82673, 44274, 22659],
 [92277, 36386, 42997, 91746],
 [62466, 62468, 62464, 62465, 62462],
 [20014, 41422, 53177, 83211],
 [3109, 50998, 26557, 58909],
 [58724, 89459, 82816, 64428],
 [62466, 62468, 62464, 62465, 62462],
 [66015, 91736, 51317, 16108],
 [62466, 62468, 62464, 62465, 62462],
 [81046, 55676, 91254, 86364],
 [57357, 57548, 68989, 51039],
 [62466, 62468, 62464, 62465, 62462],
 [24253, 65029, 84806, 67911],
 [62466, 62468, 62464, 62465, 62462],
 [

In [36]:
len(recommended_articles_users)

25249

In [37]:
len(behaviour_encoded_valid)

25249

In [38]:
# behaviors_data: List of tuples/lists. Each tuple/list should contain:
#                      [integer user ID, list of article interactions as integers]
behaviors_data_users = []
for i in range(len(behaviour_encoded_valid['User ID'])):
    behaviors_data_users.append([behaviour_encoded_valid['User ID'].iloc[i]])
    behaviors_data_users[i].append(behaviour_encoded_valid['History'].iloc[i])

In [39]:
len(behaviors_data_users)

25249

In [41]:
def calculate_mrr_from_recommendations(behaviors_data, recommended_articles):
    """
    Calculate Mean Reciprocal Rank (MRR) for a set of user interactions and recommendations.

    Parameters:
    - behaviors_data: List of tuples/lists. Each tuple/list should contain:
                      [integer user ID, list of article interactions as integers]
    - recommended_articles: List of lists. Each inner list contains integers of recommended articles for the corresponding user.

    Returns:
    - MRR (float): The mean reciprocal rank of the first relevant recommendation.
    """
    reciprocal_ranks = []

    for (user, history), recommendations in zip(behaviors_data, recommended_articles):
        # History is already a set of integers
        history_set = set(history)

        # Check for the first occurrence of any article from the user's history in the recommended list
        rank = next((1 + idx for idx, article in enumerate(recommendations) if article in history_set), None)
        
        # If there's at least one history article in the recommendations, calculate its reciprocal rank
        if rank is not None:
            reciprocal_ranks.append(1 / rank)
    
    # Calculate the mean of the reciprocal ranks
    if reciprocal_ranks:
        mrr = sum(reciprocal_ranks) / len(reciprocal_ranks)
    else:
        mrr = 0

    return mrr

In [42]:
calculate_mrr_from_recommendations(behaviors_data_users, recommended_articles_users)

0.350765306122449