In [39]:
from collections import Counter
from itertools import chain
import json
import math
from pathlib import Path
from typing import Callable, Union

import pandas as pd

from dataset import DataMaps

In [40]:
def compute_entropy(category_counts):
    """
    Shannon entropy (base 2, i.e. in bits) of a discrete category distribution.

    Args:
        category_counts (dict): Maps each category name to the number of times it occurs.

    Returns:
        float: The entropy of the distribution. Categories with a count of zero
            contribute nothing, so an empty or all-zero mapping yields 0.0.
    """
    n_observations = sum(category_counts.values())
    result = 0.0

    for occurrences in category_counts.values():
        # Zero counts are skipped: log2(0) is undefined and their contribution is 0
        if occurrences != 0:
            share = occurrences / n_observations
            result -= share * math.log2(share)

    return result


def entropy_of_predictions(
    predictions_of_users: list[list[str]],
    category_map: dict[str, Union[str, list[str]]],
    top_k: int = 6,
    *,
    multi_category: bool = False,
) -> float:
    """
    Computes the entropy of ranked predictions averaged over multiple users.

    For every user the `top_k` highest ranked predictions are translated to their
    categories, the entropy of the resulting category distribution is computed with
    `compute_entropy`, and the per-user entropies are averaged.

    Args:
        predictions_of_users (list[list[str]]): A list of users that each have a list of
            predicted items, ordered from highest to lowest rank.

        category_map (dict[str, str | list[str]]): A dictionary with the mapping
            item2category. The keys are looked up as `str(item_id)`. A value is a single
            category, or an iterable of categories when `multi_category` is True.

        top_k (int): The number of highest ranked predictions that are used.

        multi_category (bool): Whether an item (a prediction) has multiple categories.
            If True, the categories of all top-k items are flattened into one
            distribution before the entropy is computed.

    Returns:
        float: The average entropy over all users, or NaN if `predictions_of_users`
            is empty (the average over zero users is undefined).

    Raises:
        KeyError: If a predicted item has no entry in `category_map`.
    """
    total_entropy = 0.0

    for user_predictions in predictions_of_users:
        # Only keep the top k predictions
        top_predictions = user_predictions[:top_k]
        # Transform the predicted ids to the actual categories
        categories_of_user_predictions = [
            category_map[str(item_id)] for item_id in top_predictions
        ]

        if multi_category:
            # Every item contributes all of its categories to the distribution
            categories_of_user_predictions = list(
                chain.from_iterable(categories_of_user_predictions)
            )

        category_counter = Counter(categories_of_user_predictions)
        total_entropy += compute_entropy(category_counter)

    n_users = len(predictions_of_users)
    # Guard against a ZeroDivisionError when there are no users at all
    avg_entropy = total_entropy / n_users if n_users else float("nan")

    # TODO: Remove this and put it somewhere else. Not in this function
    print(f"Average Entropy for Top-{top_k} recommendations: {avg_entropy}")
    return avg_entropy

In [41]:
def prediction_str2prediction_list(string: str, sep: str = ", "):
    """Split a serialized prediction string into the list of its entries.

    Args:
        string: The predictions joined into one string.
        sep: The separator between two entries.

    Returns:
        The entries as a list of strings, in their original order.
    """
    entries = string.split(sep)
    return entries

In [42]:
# All directories are relative to the project root, which is the relative path "."
PROJECT_ROOT = Path(".")
PREDICTION_DIR = PROJECT_ROOT.joinpath("output", "predictions")
CATEGORY_MAP_DIR = PROJECT_ROOT.joinpath("data", "category_maps")
DATA_MAP_DIR = PROJECT_ROOT.joinpath("data", "self_processed", "data_maps")

In [43]:
# Locations of the saved BSARec predictions, one CSV file per dataset
bsarec_beauty_predictions_path = PREDICTION_DIR.joinpath("BSARec_Beauty_best_predictions.csv")
bsarec_ml1m_predictions_path = PREDICTION_DIR.joinpath("BSARec_ML-1M_predictions.csv")
bsarec_lastfm_predictions_path = PREDICTION_DIR.joinpath("BSARec_LastFM_predictions.csv")

In [44]:
# Locations of the category maps (item -> category), e.g. "artist 1" -> "Rock"
lastfm_category_map_path = CATEGORY_MAP_DIR.joinpath("LastFM", "artist_category_mapping.json")
ml1m_category_map_path = CATEGORY_MAP_DIR.joinpath("ml-1m", "movie_category_mapping.json")

def load_category_map(path: Path):
    """Read an item2category mapping from a JSON file.

    Args:
        path: Location of the UTF-8 encoded JSON file that stores the mapping.

    Returns:
        The decoded JSON content. This is expected to be a dict that maps each item
        to a single category or to multiple categories.
    """
    with path.open("r", newline="", encoding="utf-8") as handle:
        raw_json = handle.read()
    return json.loads(raw_json)

In [45]:
# Per-dataset data maps (user2id, id2user, item2id, id2item), stored as JSON files
beauty_data_maps_path = DATA_MAP_DIR.joinpath("Beauty_maps.json")
ml1m_data_maps_path = DATA_MAP_DIR.joinpath("ML-1M_maps.json")
lastfm_data_maps_path = DATA_MAP_DIR.joinpath("LastFM_maps.json")

In [46]:
def load_predictions(
    prediction_path: Path, prediction_id2item_id: Callable
) -> list[list[Union[str, int]]]:
    """Read the saved predictions of all users and convert every prediction id.

    Args:
        prediction_path: CSV file with an "item_id_predictions" column. Each cell holds
            the predictions of one row as a single string, which is split with
            `prediction_str2prediction_list` (default separator).
        prediction_id2item_id: Called with every prediction id (a string after the
            split) and returns the value to store for it.

    Returns:
        One list per CSV row with the converted predictions, in file order.
    """
    prediction_frame = pd.read_csv(prediction_path)
    # One list of raw (string) prediction ids per row
    id_sequences = prediction_frame["item_id_predictions"].apply(prediction_str2prediction_list)
    return [
        [prediction_id2item_id(prediction_id) for prediction_id in id_sequence]
        for id_sequence in id_sequences
    ]

### Dataset: LastFM

In [47]:
# LastFM data maps
lastfm_data_maps = DataMaps.read_json(lastfm_data_maps_path)

# LastFM artist -> genre map
artist_category_map = load_category_map(path=lastfm_category_map_path)

# Predictions per model; `...` marks models whose predictions are not loaded here
sasrec_artist_id_predictions = ...
bert4rec_artist_id_predictions = ...
duorec_artist_id_predictions = ...
fearec_artist_id_predictions = ...
bsarec_artist_id_predictions = load_predictions(
    prediction_path=bsarec_lastfm_predictions_path,
    prediction_id2item_id=lastfm_data_maps.id2item,
)


In [48]:
# Average entropy of the BSARec recommendations on LastFM (multi_category keeps its default)
_ = entropy_of_predictions(
    predictions_of_users=bsarec_artist_id_predictions,
    category_map=artist_category_map,
)

Average Entropy for Top-6 recommendations: 1.2412944561506467


### Dataset: ML-1M

In [49]:
# ML-1M data maps
ml1m_data_maps = DataMaps.read_json(ml1m_data_maps_path)

# ML-1M movie -> genres map
movie_category_map = load_category_map(path=ml1m_category_map_path)


# Predictions per model; `...` marks models whose predictions are not loaded here
sasrec_movie_id_predictions = ...
bert4rec_movie_id_predictions = ...
duorec_movie_id_predictions = ...
fearec_movie_id_predictions = ...
bsarec_movie_id_predictions = load_predictions(
    prediction_path=bsarec_ml1m_predictions_path,
    prediction_id2item_id=ml1m_data_maps.id2item,
)

In [50]:
# Average entropy of the BSARec recommendations on ML-1M; every movie can have several genres
_ = entropy_of_predictions(
    predictions_of_users=bsarec_movie_id_predictions,
    category_map=movie_category_map,
    multi_category=True,
)

Average Entropy for Top-6 recommendations: 2.0451340462138994
