## Evaluation

In [1]:
# import libraries and datasets

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from datetime import datetime, timedelta

# load the dataset

behaviors_dev_df = pd.read_csv("data/MINDsmall_dev/behaviors.tsv", sep="\t", header=None, names=["Impression ID", "User ID", "Time", "History", "Impressions"])
news_dev_df = pd.read_csv("data/MINDsmall_dev/news.tsv", sep="\t", header=None, names=["News ID", "Category", "Subcategory", "Title", "Abstract", "URL", "Title Entities", "Abstract Entities", "Title Topics", "Abstract Topics"])

behaviors_train_df = pd.read_csv("data/MINDsmall_train/behaviors.tsv", sep="\t", header=None, names=["Impression ID", "User ID", "Time", "History", "Impressions"])
news_train_df = pd.read_csv("data/MINDsmall_train/news.tsv", sep="\t", header=None, names=["News ID", "Category", "Subcategory", "Title", "Abstract", "URL", "Title Entities", "Abstract Entities", "Title Topics", "Abstract Topics"])

#Fill missing abstracts with placeholder
news_dev_df['Abstract'].fillna('No abstract available', inplace=True)
news_train_df['Abstract'].fillna('No abstract available', inplace=True)


# if there are rows with no impressions, drop them
behaviors_dev_df = behaviors_dev_df.dropna(subset=['Impressions']) # this looses some user information, could instead manually overwrite and fill in the missing values based on the typo combining the impression and history columns
behaviors_train_df = behaviors_train_df.dropna(subset=['Impressions']) # this looses some user information, could instead manually overwrite and fill in the missing values based on the typo combining the impression and history columns


In [4]:
%run most_popular.ipynb

In [5]:
def dcg_at_k(scores, k):
    #Calculate DCG, given the relevance scores sorted by predicted ranking.
    scores = np.asfarray(scores)[:k]
    return np.sum((2**scores - 1) / np.log2(np.arange(2, len(scores) + 2)))

def ndcg_at_k(predicted_scores, true_scores, k):
    #Calculate NDCG at rank K.
    predicted_scores = [1 if id in true_scores else 0 for id in predicted_scores]
    true_scores = np.ones(len(true_scores))  # Since all true scores are clicked items
    best_dcg = dcg_at_k(true_scores, k)
    actual_dcg = dcg_at_k(predicted_scores, k)
    return actual_dcg / best_dcg if best_dcg != 0 else 0

def precision_at_k(predicted, true, k):
    #Calculate Precision at rank K using DataFrames.
    predicted_set = set(predicted['News ID'][:k])
    true_set = set(true['News ID'])
    true_positives = len(predicted_set.intersection(true_set))
    return true_positives / k

In [6]:
#Since there is no model to be trained in this method, chose to first use results from train
#Then check if the assumed most popular recommendations were good
#Then increment the train with the dev we just checked 

def extract_clicked_news_ids(behaviors_df):
    # Initialize a list to store the clicked article IDs
    clicked_articles = []
    
    # Iterate through the DataFrame to extract clicked articles
    for index, row in behaviors_df.iterrows():
        for impression in row["Impressions"].split(" "):
            article_id, clicked = impression.split("-")
            if clicked == "1":
                clicked_articles.append(article_id)
                
    # Convert the list to a DataFrame
    clicked_articles_df = pd.DataFrame(clicked_articles, columns=['News ID'])
    
    return clicked_articles_df



def evaluate_mind(behavior_train_df, behaviors_dev_df, window_duration='1H', k=5):
    # Combine datasets temporally
    combined_df = pd.concat([behavior_train_df, behaviors_dev_df])
    combined_df['Time'] = pd.to_datetime(combined_df['Time'])
    combined_df.sort_values('Time', inplace=True)

    #Convert to datetime
    behavior_train_df['Time'] = pd.to_datetime(behavior_train_df['Time'])
    behaviors_dev_df['Time'] = pd.to_datetime(behaviors_dev_df['Time'])
    
    # Initialize start and end times
    start_time = behaviors_dev_df['Time'].min()
    end_time = behaviors_dev_df['Time'].max()
    current_time = start_time
    
    ndcg_scores = []
    precision_scores = []
    
    while current_time < end_time:
        window_end = current_time + pd.Timedelta(window_duration)
        
        # Use updated training data up to current time
        train_df = combined_df[combined_df['Time'] < window_end]
        
        # Test on the next window in dev data
        test_df = behaviors_dev_df[(behaviors_dev_df['Time'] >= current_time) & (behaviors_dev_df['Time'] < window_end)]
        
        # Generate top-K recommendations from training data
        recommendations = find_most_popular_news(train_df, k)
        
        # Extract true positives from test_df
        true_positives = extract_clicked_news_ids(test_df)
        
        # Calculate NDCG and Precision at K for the recommendations against true positives
        ndcg_score = ndcg_at_k(recommendations, true_positives, k)
        precision_score = precision_at_k(recommendations, true_positives, k)
        
        ndcg_scores.append(ndcg_score)
        precision_scores.append(precision_score)
        
        # Move to the next window
        current_time = window_end
    
    # Calculate average scores
    avg_ndcg = np.mean(ndcg_scores)
    avg_precision = np.mean(precision_scores)
    
    return avg_precision, avg_ndcg

In [7]:
average_presition, average_ndcg = evaluate_mind(behavior_train_df=behaviors_train_df, behaviors_dev_df=behaviors_dev_df)

print(f"Average Precision@5: {average_presition}")
print(f"Average NDCG@5: {average_ndcg}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  behaviors_df['Time'] = pd.to_datetime(behaviors_df['Time'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  behaviors_df['Time'] = pd.to_datetime(behaviors_df['Time'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  behaviors_df['Time'] = pd.to_datetime(behaviors_df['Time'])
A value is trying to be s

Average Precision@5: 0.625
Average NDCG@5: 0.3391602052736162
