In [2]:
import numpy as np
import pandas as pd
import ast
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import time


# Data

In [9]:
# Load the encoded behaviour log and the news table that carries timestamps.
behaviours_data = pd.read_csv('behaviours_encoded_data.csv')
news_data = pd.read_csv('news_with_timestamp.csv')

# Behaviour rows ordered from the largest 'Time' value to the smallest.
last_clicks = behaviours_data.sort_values(by='Time', ascending=False)

# Very simple train/test split so the evaluation can be tried out without
# having to encode the separate evaluation set: the first 120000 rows of the
# sorted frame form the training set, the final 36965 rows the test set.
train = last_clicks.iloc[:120000]
test_set = last_clicks.iloc[-36965:]

last_clicks.head(6)


Unnamed: 0.1,Unnamed: 0,Impression ID,User ID,Time,History,Impressions,Noclicks,Click
90251,90251,90252,40037,1573775953000000000,"[22288, 20695, 301, 16639, 2393, 2706, 8657, 1...",N3678-0 N41934-0 N36786-0 N23535-0 N29490-0 N5...,"[35064, 43130, 28033, 30240, 39057, 39851, 391...","[45116, 36640]"
108432,108432,108433,43464,1573775946000000000,"[25721, 27147, 20912, 36920]",N29490-0 N22975-0 N27737-0 N6837-0 N47652-0 N1...,"[39057, 38173, 34263, 34286, 31885, 35104, 361...",[40780]
105362,105362,105363,8202,1573775931000000000,,N14478-0 N7342-0 N48487-0 N29490-0 N27737-0 N4...,"[35104, 44255, 43630, 39057, 34263, 37821, 149...",[40748]
36003,36003,36004,988,1573775927000000000,"[1881, 21916, 6014, 19940, 3459, 16765, 13644,...",N48487-0 N41934-0 N64037-0 N63913-0 N55322-0 N...,"[43630, 43130, 39697, 36295, 43344, 35104, 436...",[149]
66233,66233,66234,34269,1573775926000000000,"[919, 22416, 8032, 23913, 10314, 17267]",N7494-0 N46917-0 N62197-0 N2960-0 N22978-0 N57...,"[37954, 36146, 44529, 42112, 44550, 38165, 390...",[40780]
116390,116390,116391,3331,1573775912000000000,"[16319, 16123, 17638, 18235, 30705, 29826, 315...",N27737-0 N41934-0 N3678-0 N47652-0 N61233-0 N2...,"[34263, 43130, 35064, 31885, 40780, 42112, 302...",[31175]


# Most Popular

In [12]:
def remove_dups_and_old_articles(userID, list_most_popular, max_results=5,
                                 max_age_days=2,
                                 reference_time="11/15/2019 11:59:34 PM",
                                 behaviours=None, news=None):
    """Reduce a popularity ranking to fresh articles the user has not read yet.

    Parameters:
    - userID: ID matched against the 'User ID' column of the behaviour data.
    - list_most_popular: article IDs ordered from most to least popular.
    - max_results: maximum number of article IDs to return (default 5).
    - max_age_days: articles whose 'Time' is not strictly newer than
      ``reference_time - max_age_days`` are dropped (default 2 days).
    - reference_time: anything ``pd.to_datetime`` accepts. Should really be the
      moment the user accesses the system; the default is the timestamp known
      from the validation data.
    - behaviours: DataFrame with 'User ID' and 'History' columns; defaults to
      the notebook-level ``behaviours_data``.
    - news: DataFrame with 'Index' and 'Time' columns ('Time' in nanoseconds
      since the epoch, comparable with ``Timestamp.value``); defaults to the
      notebook-level ``news_data``.

    Returns:
    - list: up to ``max_results`` article IDs, in the order they appear in
      ``list_most_popular``.

    NOTE(review): an evaluation that scores these recommendations against the
    same 'History' column will find no hits now that read articles are really
    removed - confirm which ground truth the evaluation should use.
    """
    if behaviours is None:
        behaviours = behaviours_data
    if news is None:
        news = news_data

    # Oldest publication time (exclusive) that is still considered relevant.
    cutoff_time = (pd.to_datetime(reference_time) - pd.Timedelta(days=max_age_days)).value

    # Collect everything the user has read across ALL of their rows. Rows with
    # a missing History are skipped instead of crashing, and a user that is not
    # in the data simply yields an empty set.
    user_histories = behaviours.loc[behaviours['User ID'] == userID, 'History'].dropna()
    already_read = set()
    for raw_history in user_histories:
        # History is stored as the string form of a list, e.g. "[1, 2, 3]".
        if isinstance(raw_history, str):
            raw_history = ast.literal_eval(raw_history)
        already_read.update(raw_history)

    # One lookup table instead of scanning the news frame for every candidate;
    # drop_duplicates keeps the first row per article, like the old .iloc[0].
    publish_time = (
        news.drop_duplicates(subset='Index')
            .set_index('Index')['Time']
            .to_dict()
    )

    # The list with the relevant most popular articles
    list_relevant_most_popular = []

    for article_id in list_most_popular:
        if len(list_relevant_most_popular) >= max_results:
            break

        if article_id in already_read:
            continue

        published_at = publish_time.get(article_id)
        # Unknown article or missing timestamp: cannot prove it is fresh.
        if published_at is None or pd.isna(published_at):
            continue

        # Check that the article has a valid timestamp and is not too old
        if published_at > 0 and published_at > cutoff_time:
            list_relevant_most_popular.append(article_id)

    return list_relevant_most_popular





In [11]:

def most_popular(userID):
    """Return the most-clicked articles that are still relevant for ``userID``.

    Article IDs are counted over the 'History' and 'Click' columns of
    ``last_clicks`` (one row per user: the last one in the frame's current
    order), ranked by that count, and then handed to
    ``remove_dups_and_old_articles`` for the per-user filtering.
    """
    # One row per user so that a user's lists are counted only once.
    per_user = last_clicks.drop_duplicates(subset='User ID', keep='last')

    # article ID -> number of times it appears in the History/Click lists
    click_counts = {}
    for column in ('History', 'Click'):
        for raw_ids in per_user[column].dropna():
            # Each cell holds the string form of a list of article IDs.
            for article_id in ast.literal_eval(raw_ids):
                click_counts[article_id] = click_counts.get(article_id, 0) + 1

    # Most clicked first; the sort is stable, so ties keep first-seen order.
    ranked_ids = sorted(click_counts, key=click_counts.get, reverse=True)

    # Keep only the articles that are relevant for this user.
    return remove_dups_and_old_articles(userID, ranked_ids)
    
# Example call: display the recommendations produced for user ID 10000.
most_popular(10000)


[37435, 15347, 14332, 320, 18471]

## Evaluation


In [None]:
# Only rows with a recorded History are evaluated. Rebinding (instead of
# inplace=True) leaves the previously displayed frame object untouched.
last_clicks = last_clicks.dropna(subset=['History'])

# most_popular() makes a full pass over the click log on every call and its
# result depends only on the user ID, so compute it once per distinct user
# rather than once per row.
recommendations_by_user = {}
recommended_articles_users = []
for user in last_clicks['User ID']:
    if user not in recommendations_by_user:
        recommendations_by_user[user] = most_popular(user)
    # Copy so the rows of a repeated user do not share one list object.
    recommended_articles_users.append(list(recommendations_by_user[user]))

# Preview only: there is one entry per row of last_clicks, so showing the
# whole list would flood the notebook output.
recommended_articles_users[:5]

In [None]:
# Build [user ID, history] pairs, one per row of last_clicks, in row order.
# 'History' is stored as the string form of a list (e.g. "[1, 2, 3]"), so it
# is parsed into a real list of article IDs here; otherwise a later
# set(history) would produce a set of characters and never match an ID.
behaviors_data_users = []
for user_id, raw_history in zip(last_clicks['User ID'], last_clicks['History']):
    if isinstance(raw_history, str):
        history = ast.literal_eval(raw_history)
    elif isinstance(raw_history, (list, tuple, set)):
        history = list(raw_history)
    else:
        # Missing history (NaN/None): the user has no known interactions.
        history = []
    behaviors_data_users.append([user_id, history])

len(behaviors_data_users)

In [25]:

def calculate_mrr_from_recommendations(behaviors_data, recommended_articles):
    """
    Calculate Mean Reciprocal Rank (MRR) for a set of user interactions and recommendations.

    For every user the reciprocal rank is 1 / (1-based position of the first
    recommended article found in that user's history), or 0 when none of the
    recommendations is in the history. The MRR is the mean over ALL users, so
    users without a hit lower the score instead of being ignored.

    Parameters:
    - behaviors_data: List of tuples/lists. Each tuple/list should contain:
                      [integer user ID, list of article interactions as integers].
                      A history given as the string form of a list (e.g.
                      "[1, 2]") is parsed; None, NaN or an empty string is
                      treated as an empty history.
    - recommended_articles: List of lists. Each inner list contains integers of
                      recommended articles for the corresponding user.

    Returns:
    - MRR (float): The mean reciprocal rank of the first relevant
      recommendation; 0 when there are no users to evaluate.
    """
    def _history_ids(history):
        # Normalise one history value into a set of article IDs.
        if isinstance(history, str):
            history = ast.literal_eval(history) if history.strip() else []
        if history is None or isinstance(history, float):
            # None or NaN: no recorded interactions for this user.
            return set()
        return set(history)

    reciprocal_ranks = []

    # zip pairs users with recommendations by position and stops at the
    # shorter input, so both lists are expected to be aligned.
    for (user, history), recommendations in zip(behaviors_data, recommended_articles):
        history_set = _history_ids(history)

        # 1-based position of the first recommended article the user interacted with
        rank = next(
            (position
             for position, article in enumerate(recommendations, start=1)
             if article in history_set),
            None,
        )

        # A miss counts as reciprocal rank 0 rather than being left out
        reciprocal_ranks.append(1 / rank if rank is not None else 0.0)

    if not reciprocal_ranks:
        return 0

    return sum(reciprocal_ranks) / len(reciprocal_ranks)


