In [101]:
from collections import Counter
import csv
from itertools import chain
import json
import math
from pathlib import Path
from typing import Callable, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from dataset import DataMaps
from metrics import recall_at_k, ndcg_k

In [102]:
def prediction_str2prediction_list(string: str, sep: str = ", "):
    """Split a serialized prediction string into its individual tokens.

    Args:
        string: All predictions of one user joined into a single string,
            e.g. ``"3, 17, 42"``.
        sep: Delimiter placed between two predictions. Defaults to ``", "``.

    Returns:
        The list of substrings obtained by splitting ``string`` on ``sep``.
        Tokens stay strings; they are neither stripped nor converted.
    """
    tokens = string.split(sep)
    return tokens

In [103]:
# Directory layout of the project. Everything is relative to the current
# working directory, so the notebook must be started from the project root.
PROJECT_ROOT = Path(".")

# Model outputs: exported top-k predictions and raw rating matrices.
PREDICTION_DIR = PROJECT_ROOT.joinpath("output", "predictions")
RATINGS_DIR = PROJECT_ROOT.joinpath("output", "ratings")

# Input data: category maps, id maps and raw dataset files.
DATA_DIR = PROJECT_ROOT.joinpath("data")
CATEGORY_MAP_DIR = DATA_DIR.joinpath("category_maps")
DATA_MAP_DIR = DATA_DIR.joinpath("self_processed", "data_maps")
RAW_DATA_DIR = DATA_DIR.joinpath("raw")

In [104]:
# Locations of the JSON category maps (item -> category, e.g. "artist 1" -> "Rock"),
# grouped by dataset.

# LastFM
lastfm_category_map_path = CATEGORY_MAP_DIR.joinpath("LastFM", "artist_category_mapping.json")
lastfm_popularity_map_path = CATEGORY_MAP_DIR.joinpath("LastFM", "artist_popularity_mapping.json")

# MovieLens 1M
ml1m_category_map_path = CATEGORY_MAP_DIR.joinpath("ml-1m", "movie_category_mapping.json")

# Diginetica
diginetica_category_map_path = CATEGORY_MAP_DIR.joinpath("Diginetica", "product_category_mapping.json")
diginetica_popularity_map_path = CATEGORY_MAP_DIR.joinpath("Diginetica", "product_popularity_mapping.json")

def load_category_map(path: Path):
    """Read a JSON category map from disk.

    Args:
        path: Location of the UTF-8 encoded JSON file.

    Returns:
        The decoded JSON document. It is expected to be a dict mapping an
        item to its category, where the value may be a single category or
        several categories.
    """
    with path.open(mode="r", encoding="utf-8", newline="") as handle:
        category_map = json.load(handle)
    return category_map

In [105]:
# JSON file holding the LastFM data maps (read with DataMaps.read_json further down).
lastfm_data_maps_path = DATA_MAP_DIR.joinpath("LastFM_maps.json")

In [106]:
def load_predictions(prediction_path: Path, prediction_id2item_id: Callable) -> list[list[Union[str, int]]]:
    """Load per-user predictions from a CSV file and translate every prediction.

    The CSV must contain an ``item_id_predictions`` column in which each cell
    holds one user's predictions joined into a single string (split with
    ``prediction_str2prediction_list`` and its default separator).

    Args:
        prediction_path: CSV file to read.
        prediction_id2item_id: Callable applied to every predicted token
            (still a string at that point) to obtain the item to return.

    Returns:
        One list per CSV row, in file order, containing the translated items.
    """
    prediction_frame = pd.read_csv(prediction_path)
    token_lists = prediction_frame["item_id_predictions"].apply(prediction_str2prediction_list)
    return [
        [prediction_id2item_id(token) for token in tokens]
        for tokens in token_lists
    ]

def load_user_and_predictions(prediction_path: Path) -> dict[int, list]:
    """Load a prediction CSV as a mapping from user id to predicted item ids.

    The CSV must contain the columns ``user_id`` and ``item_id_predictions``;
    the latter stores one user's predictions joined into a single string.

    Args:
        prediction_path: CSV file to read.

    Returns:
        A dict whose keys are the ``user_id`` values and whose values are the
        user's predictions converted to ``int``, in their original order.
    """
    prediction_frame = pd.read_csv(prediction_path)

    # "3, 17, 42" -> ["3", "17", "42"]
    token_lists = prediction_frame["item_id_predictions"].apply(prediction_str2prediction_list)
    # ["3", "17", "42"] -> [3, 17, 42]
    prediction_frame["item_id_predictions"] = token_lists.apply(
        lambda tokens: list(map(int, tokens))
    )

    user2predictions = {}
    for record in prediction_frame.itertuples():
        user2predictions[record.user_id] = record.item_id_predictions
    return user2predictions

In [107]:
def load_raw_diginetica() -> pd.DataFrame:
    """Load the raw Diginetica training interactions.

    Reads the semicolon-delimited ``diginetica_train.csv`` from the raw data
    directory. The file's own header row is replaced by the column names
    ``userId``, ``itemId``, ``timeframe`` and ``eventdate``.

    Returns:
        The interactions as a DataFrame, without any row that has a missing
        value in one of the four columns.
    """
    column_names = ["userId", "itemId", "timeframe", "eventdate"]
    csv_path = RAW_DATA_DIR / "Diginetica" / "diginetica_train.csv"

    interactions = pd.read_csv(csv_path, delimiter=";", header=0, names=column_names)

    # Incomplete rows (NaN in any column, not only userId) are discarded.
    return interactions.dropna(subset=column_names)

def load_preprocessed_dataset(data_path: Path) -> dict[int, list[int]]:
    """Load a preprocessed interaction file into a ``user -> items`` mapping.

    Each non-blank line of the file has the form ``<user> <item> <item> ...``
    with whitespace-separated integers. Blank lines (for example an extra
    newline at the end of the file) are skipped, and repeated or trailing
    whitespace between tokens is tolerated.

    Args:
        data_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A dict mapping every user id to the list of item ids on its line, in
        file order. A line containing only a user id yields an empty list.

    Raises:
        ValueError: If a token cannot be parsed as an integer.
        AssertionError: If the same user id occurs on more than one line.
    """
    data: dict[int, list[int]] = {}
    with data_path.open("r", encoding="utf-8", newline="") as file:
        for line in file:
            # str.split() without arguments splits on any whitespace run and
            # drops the line terminator, so no empty tokens reach int().
            tokens = line.split()
            if not tokens:
                continue
            user, *items = map(int, tokens)
            assert user not in data, "User should not exist twice in the dictionary"
            data[user] = items
    return data

In [108]:
# LastFM artist -> popularity map, decoded from its JSON file.
artist_popularity_map = load_category_map(path=lastfm_popularity_map_path)

# LastFM data maps, deserialized by the project's DataMaps class.
lastfm_data_maps = DataMaps.read_json(lastfm_data_maps_path)

In [109]:
# Load the stored BSARec rating matrix for LastFM and discard its last column.
# NOTE(review): presumably rows are users and columns are items, with the last
# column being a non-item slot — confirm against the code that saved the file.
bsarec_lastfm_ratings = np.load(RATINGS_DIR / "BSARec_LastFM_ratings.npy")[:, :-1]

In [110]:
# Sanity check: show the dimensions of the trimmed rating matrix.
bsarec_lastfm_ratings.shape

(1090, 3646)

In [111]:
# Build the item/category indicator matrix M: one row per column of the rating
# matrix, one column per distinct popularity category, and a single 1 per row
# marking the category of that item.
categories = sorted(set(artist_popularity_map.values()))
n_categories = len(categories)
n_items = bsarec_lastfm_ratings.shape[1]

# Column of M that belongs to each category (same order as `categories`).
category_to_column = {category: column for column, category in enumerate(categories)}

# NOTE(review): row `item_idx` is filled for item id `item_idx + 1`, while the
# reranking cell emits raw column indices that are scored against the item ids
# read from LastFM.txt — confirm how rating columns map to item ids.
item_category_columns = np.asarray(
    [
        category_to_column[
            artist_popularity_map[str(lastfm_data_maps.id2item(item_idx + 1))]
        ]
        for item_idx in range(n_items)
    ],
    dtype=int,
)

M = np.zeros((n_items, n_categories), int)
M[np.arange(n_items), item_category_columns] = 1

# Every item must belong to exactly one category.
assert np.all(M.sum(axis=1) == 1)

In [112]:
# Greedy, user-by-user reranking: before ranking, every item receives a bonus
# that grows the less its category has been recommended so far. Users are
# processed in row order, so each list depends on all lists produced before it.
k = 20  # length of every recommendation list
lambda_ = 0.5  # weight of the category-balancing bonus
n_users = bsarec_lastfm_ratings.shape[0]

B_l = np.zeros(n_categories)  # items recommended so far, counted per category
rerank_list = []

for user_scores in bsarec_lastfm_ratings:
    # Share of each category among all recommendations made so far; the
    # epsilon keeps the division defined while B_l is still all zeros.
    category_share = B_l / (B_l.sum() + 1e-5)
    # M maps the per-category bonus (1 - share) onto the individual items.
    fairness_bonus = lambda_ * (M @ (1 - category_share))
    adjusted_scores = user_scores + fairness_bonus
    # Column indices of the k highest adjusted scores, best first.
    top_k_items = np.argsort(adjusted_scores)[::-1][:k]
    rerank_list.append(top_k_items)
    # Account for the categories of the items just recommended.
    B_l = B_l + M[top_k_items].sum(axis=0)

In [113]:
# Export the reranked lists as CSV: one row per user with the position of the
# user in `rerank_list` as `user_id` and the predictions joined by ", ".
pred_path = str(PREDICTION_DIR / "BSARec_LastFM_cpair_predictions.csv")
prediction_rows = [
    {"user_id": idx, "item_id_predictions": ", ".join(str(item) for item in pred)}
    for idx, pred in enumerate(rerank_list)
]
with open(pred_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["user_id", "item_id_predictions"])
    writer.writeheader()
    writer.writerows(prediction_rows)
    print(f"Saved predictions in `{pred_path}`")

Saved predictions in `output/predictions/BSARec_LastFM_cpair_predictions.csv`


In [114]:
def get_full_sort_score(answers, pred_list, ks=(5, 10, 20)):
    """Compute the reported ranking metrics for a set of top-k predictions.

    Only the requested cut-offs are evaluated (previously k=15 was computed
    and then thrown away), and each result is stored under a key derived from
    its own cut-off instead of being picked by list position.

    Args:
        answers: Ground truth, forwarded unchanged to ``recall_at_k`` and
            ``ndcg_k`` from the project's ``metrics`` module.
        pred_list: Predicted items per user, forwarded unchanged to the same
            metric functions.
        ks: Cut-offs to evaluate. The default reproduces the originally
            reported metrics (5, 10 and 20).

    Returns:
        A dict with, for every cut-off ``k`` in order, the entries
        ``"HR@k"`` (value of ``recall_at_k``) and ``"NDCG@k"`` (value of
        ``ndcg_k``), each formatted as a string with four decimals.
    """
    post_fix = {}
    for k in ks:
        post_fix[f"HR@{k}"] = "{:.4f}".format(recall_at_k(answers, pred_list, k))
        post_fix[f"NDCG@{k}"] = "{:.4f}".format(ndcg_k(answers, pred_list, k))
    return post_fix

In [115]:
# Ground truth for the evaluation: the last item of every user's sequence,
# in the order the users appear in the preprocessed LastFM file.
lastfm = load_preprocessed_dataset(DATA_DIR.joinpath("LastFM.txt"))
answers = [items[-1] for items in lastfm.values()]

In [116]:
# Score the reranked lists against the ground truth; the resulting dict is
# displayed as the cell output.
# NOTE(review): assumes row i of `rerank_list` belongs to the i-th user of
# `answers` — confirm the row order of the rating matrix.
get_full_sort_score(answers, rerank_list)

{'HR@5': '0.0514',
 'NDCG@5': '0.0337',
 'HR@10': '0.0734',
 'NDCG@10': '0.0407',
 'HR@20': '0.1110',
 'NDCG@20': '0.0501'}